In [1]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [6]:
df = pd.read_csv('./polish_names.csv')
df.sample(5)

Unnamed: 0,name,gender
1548,Zenona,f
736,Klaudyna,f
1362,Terencjusz,m
616,Innocenty,m
769,Krystiana,f


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    1705 non-null   object
 1   gender  1705 non-null   object
dtypes: object(2)
memory usage: 26.8+ KB


In [8]:
df['gender'].value_counts()

m    1033
f     672
Name: gender, dtype: int64

In [9]:
def transform_string_into_number(string):
    return string
    
df['gender'].head().map( transform_string_into_number )

0    m
1    m
2    m
3    m
4    m
Name: gender, dtype: object

In [10]:
def transform_string_into_number(string):
    return int(string == 'm')
    
df['gender'].head().map( transform_string_into_number )

0    1
1    1
2    1
3    1
4    1
Name: gender, dtype: int64

In [14]:
df['target'] = df['gender'].map( lambda x: int(x == 'm') )

In [15]:
df['len_name'] = df['name'].map( lambda x: len(x))
df.sample(5)

Unnamed: 0,name,gender,target,len_name
214,Bolemir,m,1,7
353,Długosz,m,1,7
1531,Zbylut,m,1,6
1634,Hubert,m,1,6
1497,Wojciecha,f,0,9


In [16]:
X = df[ ['len_name'] ].values
y = df['target'].values

model = DummyClassifier(strategy = 'stratified')
model.fit(X, y)
y_pred = model.predict(X)

In [17]:
df['gender_pred'] = y_pred
df['gender_pred'].value_counts()

1    989
0    716
Name: gender_pred, dtype: int64

In [18]:
df[ df.target != y_pred ].shape

(820, 5)

In [19]:
model = DummyClassifier(strategy = 'stratified', random_state=0)
model.fit(X, y)
y_pred = model.predict(X)
accuracy_score(y, y_pred)

0.5237536656891496

In [20]:
model = LogisticRegression(solver='lbfgs')
model.fit(X, y)
y_pred = model.predict(X)
accuracy_score(y, y_pred)

0.6058651026392962

In [21]:
df['gender_pred'] = y_pred
df['gender_pred'].value_counts()

1    1705
Name: gender_pred, dtype: int64

In [22]:
def train_and_predict_model(X, y, model, success_metric=accuracy_score):
    model.fit(X, y)
    y_pred = model.predict(X)
    
    print("Distribution:")
    print( pd.Series(y_pred).value_counts() )
    
    return success_metric(y, y_pred)

In [23]:
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'u', 'y']

def how_many_vowels(name):
    count = sum( map(lambda x: int(x in vowels), name.lower()) )
    
    return count
df['count_vowels'] = df['name'].map(how_many_vowels)
train_and_predict_model(df[['len_name', 'count_vowels'] ], y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1082
0     623
dtype: int64


0.7143695014662756

In [24]:
def first_is_vowel(name):
    return name.lower()[0] in vowels

df['first_is_vowel'] = df['name'].map(first_is_vowel)
train_and_predict_model(df[['len_name', 'first_is_vowel'] ], y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1705
dtype: int64


0.6058651026392962

In [25]:
X = df[['len_name', 'count_vowels', 'first_is_vowel'] ]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1106
0     599
dtype: int64


0.7296187683284457

In [26]:
df['last_letter'] = df['name'].map( lambda x: x.lower()[-1])
df['last_letter_cnt'] = df['last_letter'].factorize()[0]

X = df[['len_name', 'count_vowels', 'first_is_vowel', 'last_letter_cnt']]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs'))

Distribution:
1    1138
0     567
dtype: int64


0.7472140762463343

In [30]:
def last_is_vowel(name):
    return name.lower()[-1] in vowels

df['last_is_vowel'] = df['name'].map(last_is_vowel)
X = df[['last_is_vowel'] ]
train_and_predict_model(X, y, LogisticRegression(solver='lbfgs', max_iter=200))

Distribution:
1    964
0    741
dtype: int64


0.9524926686217009