In [2]:
import nltk
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from nltk.corpus import names
from random import shuffle
from sklearn.model_selection import train_test_split
from collections import Counter


In [3]:
# letters = "qwertyuiopasdfghjklzxcvbnm"
def extract_features(nameWords):
    def extract_feature_from_word(word):
        tword = word.strip().lower()
        letterCounter = Counter(nltk.bigrams("^" + tword + "$"))
        features = {
        #'lastLetter': tword[-1] if len(tword) > 0 else '',
        }
        features.update(dict(("bg{}".format(letter), letterCounter[letter]) for letter in letterCounter.keys()))
        return features
    return map(extract_feature_from_word, nameWords)

In [4]:
tagged_names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
shuffle(tagged_names)


In [5]:
X_names, y_names = list(zip(*tagged_names))
X_trainNames, X_testNames, y_trainNames, y_testNames = train_test_split(X_names, y_names)

In [6]:
pl_1 = Pipeline([('feature_extraction', FunctionTransformer(extract_features, validate=False)),
               ('encoder', DictVectorizer()), 
               ('classifier', DecisionTreeClassifier())])
pl_2 = Pipeline([('feature_extraction', FunctionTransformer(extract_features, validate=False)),
               ('encoder', DictVectorizer()), 
               ('classifier', LinearSVC())])
pl_3 = Pipeline([('feature_extraction', FunctionTransformer(extract_features, validate=False)),
               ('encoder', DictVectorizer(sparse=False)),
               ('select', SelectKBest(k=20)),
               ('classifier', DecisionTreeClassifier())])

In [7]:
pl_1.fit(X_trainNames, y_trainNames);
pl_2.fit(X_trainNames, y_trainNames);
pl_3.fit(X_trainNames, y_trainNames);

In [8]:
pl_1.score(X_testNames, y_testNames)

0.7502517623363545

In [9]:
pl_2.score(X_testNames, y_testNames)

0.8162134944612286

In [10]:
pl_3.score(X_testNames, y_testNames)

0.7623363544813696

In [11]:
tr = pl_1.named_steps['classifier']
dv = pl_1.named_steps['encoder']

In [None]:
print()
s = input()
while not s == '':
    print(s, "=>", pl_2.predict([s])[0])
    s = input()


chantal => male
charlotte => female
carolien => female
caroline => female
corinda => female


In [12]:
import pandas as pd

In [13]:
dfFeatureImportance = pd.DataFrame(list(zip(dv.feature_names_, tr.feature_importances_)))
dfFeatureImportance.sort_values(1, ascending=False)

Unnamed: 0,0,1
39,"bg('a', '$')",0.180248
120,"bg('e', '$')",0.084044
194,"bg('i', '$')",0.050033
472,"bg('y', '$')",0.023934
265,"bg('l', 'y')",0.020618
296,"bg('n', 'n')",0.013579
140,"bg('e', 't')",0.011922
252,"bg('l', 'i')",0.011793
243,"bg('l', '$')",0.010187
126,"bg('e', 'e')",0.009924


In [21]:
dfNames = pd.DataFrame(tagged_names)

In [23]:
dfNames.groupby(1).count()

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
female,5001
male,2943


In [24]:
2943/5001

0.5884823035392921

In [43]:
gb = dfNames.groupby(0).count().sort_values(1)

In [50]:
(gb >= 2).sum()

1    365
dtype: int64

In [49]:
gb.count()

1    7579
dtype: int64

In [52]:
1-365/7579

0.9518406122179707

In [None]:
# ^ Names which cannot be guessed right either.