In [30]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# The TSV has no header row, so the column names are supplied at read time.
df = pd.read_csv(
    "data/name_gender.tsv",
    sep="\t",
    header=None,
    names=["Name", "Total", "Female", "Male", "Confidence"],
)

In [278]:
def gender(row, thresh=0.9):
    """Map one name record to a single-letter gender label.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys "Confidence", "Female" and "Male".
    thresh : float, optional
        Minimum value a field must reach to count as decisive (default 0.9).

    Returns
    -------
    str
        'N' when "Confidence" is below ``thresh``; otherwise 'F' if "Female"
        reaches ``thresh``, else 'M' if "Male" reaches it, else 'O'.
    """
    if row["Confidence"] < thresh:
        return 'N'
    # Checked in order, so "Female" wins if both columns reach the threshold.
    for column, label in (("Female", 'F'), ("Male", 'M')):
        if row[column] >= thresh:
            return label
    return 'O'

In [279]:
# Label every row with the default threshold; gender() is passed directly as the row callback.
df["Gender"] = df.apply(gender, axis=1)

In [280]:
# Preview the first rows of the table together with the derived "Gender" column.
df.head()

Unnamed: 0,Name,Total,Female,Male,Confidence,Gender
0,Philomenia,5,1,0,1,F
1,Nashay,22,1,0,1,F
2,Tennile,5,1,0,1,F
3,Jeneane,6,1,0,1,F
4,Shahida,5,1,0,1,F


In [281]:
# Per-label summary: number of names, summed "Total", and the mean of the
# "Female", "Male" and "Confidence" columns.
df.groupby("Gender").agg({"Name": len, "Total": sum, "Female": np.mean, "Male": np.mean, "Confidence": np.mean})

Unnamed: 0_level_0,Male,Confidence,Total,Name,Female
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,0.000794,0.999206,135347171,17870,0.999206
M,0.998426,0.998426,148022945,11067,0.001574
N,0.502764,0.71824,12356949,891,0.497236


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

In [67]:
# Keep only the rows labelled M or F and work on an independent copy.
is_labelled = df["Gender"].isin(["M", "F"])
df_fm = df[is_labelled].copy()
# Wrap each name in ^...$ so character n-grams can tell where a name starts and ends.
df_fm["Mod_Name"] = ["^%s$" % name for name in df_fm["Name"]]
df_fm.head()

Unnamed: 0,Name,Total,Female,Male,Confidence,Gender,Mod_Name
0,Philomenia,5,1,0,1,F,^Philomenia$
1,Nashay,22,1,0,1,F,^Nashay$
2,Tennile,5,1,0,1,F,^Tennile$
3,Jeneane,6,1,0,1,F,^Jeneane$
4,Shahida,5,1,0,1,F,^Shahida$


In [126]:
# Character n-grams of length 2 to 5, recorded as presence/absence (binary=True).
features = CountVectorizer(ngram_range=(2, 5), analyzer="char", binary=True)

In [127]:
# Learn the n-gram vocabulary from the wrapped names and vectorise them in one step.
X = features.fit_transform(df_fm["Mod_Name"])

In [128]:
# Sanity check: the feature matrix should have one row per name.
X.shape, df_fm["Mod_Name"].shape

((28937, 78194), (28937,))

In [129]:
# SGD-trained classifier with an L1 penalty; the seed is fixed for repeatable fits.
clf = SGDClassifier(penalty='l1', random_state=1337)

In [130]:
# Fit on every M/F row; no data is held out at this stage.
clf.fit(X, df_fm["Gender"])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=1337, shuffle=True,
       verbose=0, warm_start=False)

In [131]:
# Spot-check one name; the input carries the same ^...$ wrapping as the training strings.
clf.predict(features.transform(["^Philomenia$"]))

array(['F'], 
      dtype='|S1')

In [132]:
# The ten largest coefficients, in ascending order.
np.sort(clf.coef_[0])[-10:]

array([  7.71006487,   7.88253764,   7.9692655 ,   8.31427367,
         8.38643986,   9.04355435,   9.24224734,  10.1969067 ,
        12.75150489,  12.91774746])

In [133]:
# Vocabulary terms as a NumPy array, used below as the index of the coefficient table.
feature_names = np.array(features.get_feature_names())

In [134]:
# First (only, for a two-class fit) row of the coefficient matrix, read once
# and reused instead of being looked up three times.
weights = clf.coef_[0]
feature_strength = weights ** 2   # squared weight: magnitude regardless of sign
feature_class = weights > 0       # True where the weight is positive

# One row per vocabulary term; index=feature_names requires the coefficient
# vector and the vocabulary to have the same length.
df_features = pd.DataFrame(data={"strength": feature_strength,
                                 "sign": feature_class,
                                 "weight": weights}, index=feature_names)
# NOTE: DataFrame.sort is the spelling used by the pandas version this notebook
# was written against; newer pandas exposes the same operation as sort_values.
df_features.sort("strength").tail(20)

Unnamed: 0,sign,strength,weight
^lil,False,56.786945,-7.535711
kade$,True,58.911442,7.675379
^vard,True,59.4451,7.710065
vard,True,59.4451,7.710065
ah$,False,59.937593,-7.741937
k$,True,62.1344,7.882538
uda$,True,63.509193,7.969265
ilin,False,63.988669,-7.999292
edm,True,69.127147,8.314274
dm,True,70.332374,8.38644


In [136]:
# List the M/F names that contain the substring "uda".
df_fm[df_fm.Name.str.contains("uda")]

Unnamed: 0,Name,Total,Female,Male,Confidence,Gender,Mod_Name
352,Yuda,5,0.0,1.0,1.0,M,^Yuda$
4290,Yudany,5,1.0,0.0,1.0,F,^Yudany$
5568,Gaudalupe,5,1.0,0.0,1.0,F,^Gaudalupe$
7213,Auda,5,1.0,0.0,1.0,F,^Auda$
7554,Shenouda,16,0.0,1.0,1.0,M,^Shenouda$
9084,Daouda,16,0.0,1.0,1.0,M,^Daouda$
10111,Judas,57,0.0,1.0,1.0,M,^Judas$
11131,Suda,5,1.0,0.0,1.0,F,^Suda$
14045,Gudalupe,5,1.0,0.0,1.0,F,^Gudalupe$
15536,Vauda,5,1.0,0.0,1.0,F,^Vauda$


In [138]:
# Accuracy on X / df_fm["Gender"], the same objects passed to clf.fit,
# so this is a training-set figure rather than a held-out estimate.
clf.score(X, df_fm["Gender"])

0.91211943186923317

In [146]:
from sklearn.grid_search import GridSearchCV
from time import time

In [164]:
# Hyper-parameter grid explored for the SGD classifier.
parameters = {
    'alpha': np.logspace(-3, -7, num=6),
    'penalty': ('l1', 'l2', 'elasticnet'),
    # linspace yields floats (100.0, 150.0, ...); an iteration count must be an
    # integer, so cast before handing the values to the estimator.
    'n_iter': np.linspace(100, 300, num=5).astype(int),
}

grid_search = GridSearchCV(clf, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(X, df_fm["Gender"])
print("done in %0.3fs" % (time() - t0))
# print("") emits a blank line on both Python 2 and 3, whereas a bare print()
# under the Python 2 print statement prints the empty tuple "()".
print("")

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Report only the parameters that were part of the search, in name order.
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
parameters:
{'penalty': ('l1', 'l2', 'elasticnet'), 'alpha': array([  1.00000000e-03,   1.58489319e-04,   2.51188643e-05,
         3.98107171e-06,   6.30957344e-07,   1.00000000e-07]), 'n_iter': array([ 100.,  150.,  200.,  250.,  300.])}
Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 224 out of 270 | elapsed:   41.2s remaining:    8.5s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:   47.0s finished


done in 49.500s
()
Best score: 0.899
Best parameters set:
	alpha: 0.00015848931924611142
	n_iter: 200.0
	penalty: 'elasticnet'


In [165]:
# Rebuild the classifier with an elastic-net penalty and 200 iterations.
# NOTE(review): alpha is hard-coded to 1e-4, while the grid-search printout
# reports ~1.58e-4 as the best value — confirm the rounding is intentional.
clf = SGDClassifier(penalty='elasticnet', alpha=1e-4, n_iter=200, random_state=1337)

In [166]:
# Fit and score on the same data: the displayed number is training accuracy.
clf.fit(X, df_fm["Gender"])
clf.score(X, df_fm["Gender"])

0.98044026678646712

In [157]:
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelBinarizer

In [161]:
class LastVowel(BaseEstimator):
    """Stateless transformer emitting one boolean feature per input string.

    The feature is True when the second-to-last character of the lower-cased
    string is one of "a", "i", "e", "o", "u". For strings wrapped as
    "^Name$" that character is the last letter of the name.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return self so the object fits the fit/transform protocol."""
        return self

    def transform(self, X):
        """Return a boolean array of shape (len(X), 1).

        X may be any iterable of strings (pandas Series, list, ...). Strings
        shorter than two characters have no second-to-last character and are
        flagged False instead of raising IndexError.
        """
        flags = np.array(
            [len(name) >= 2 and name.lower()[-2] in "aieou" for name in X],
            dtype=bool,
        )
        # Index the plain ndarray: multi-dimensional indexing on a pandas
        # Series is not supported by newer pandas releases.
        return flags[:, np.newaxis]

# Combine the character n-gram features with the last-letter-vowel flag.
# The "features" object passed here is the same CountVectorizer instance
# created earlier, not a copy.
is_vowel = LastVowel()
feature_all = FeatureUnion([("ngram", features), ("is_vowel", is_vowel)])

In [162]:
# Rebinds X to the combined (n-gram + vowel flag) matrix; anything that reads X
# after this line sees the wider matrix.
X = feature_all.fit_transform(df_fm["Mod_Name"])

In [211]:
class LabelVectors(BaseEstimator):
    """Encode labels as indicator columns, one column per distinct label.

    ``fit`` records the sorted unique labels in ``self.classes``;
    ``transform`` returns a float array with a 1 in column i wherever the
    input equals ``self.classes[i]`` and 0 elsewhere.
    """

    def fit(self, X, y=None):
        """Remember the distinct labels present in X (np.unique returns them sorted)."""
        self.classes = np.unique(X)
        return self

    def transform(self, X):
        """Return an array of shape (len(X), n_classes) holding 0.0 / 1.0 indicators."""
        # Work on a plain ndarray so lists and pandas Series are handled alike.
        values = np.asarray(X)
        X_t = np.zeros((values.shape[0], self.classes.shape[0]))
        for i, c in enumerate(self.classes):
            X_t[:, i] = (values == c)
        return X_t

    def fit_transform(self, X, y=None):
        """Fit on X and return its encoding."""
        return self.fit(X).transform(X)

In [192]:
# The shape is available on the sparse matrix itself; no need to build the
# full dense array just to read its dimensions.
X.shape

(28937, 78195)

In [267]:
# Binarise the gender labels into one column, then recode every row whose
# value is 0 to -1 so the targets become -1 / +1.
labels = LabelBinarizer()
y = labels.fit_transform(df_fm["Gender"])
is_zero = (y[:, 0] == 0)
y[is_zero] = -1
y[:10], y[-10:], y.shape, df_fm["Gender"].values[-10:]

(array([[-1],
        [-1],
        [-1],
        [-1],
        [-1],
        [-1],
        [-1],
        [-1],
        [-1],
        [-1]]), array([[-1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [-1]]), (28937,
  1), array(['F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F'], dtype=object))

In [246]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.regularizers import l2, activity_l2


In [268]:
# Network: input features -> 100 ReLU units -> tanh output with one unit per
# label column. Both dense layers use Glorot-uniform initialisation and an
# L2 weight penalty of 0.01.
layer_stack = [
    Dense(input_dim=X.shape[1], output_dim=100, init="glorot_uniform", W_regularizer=l2(0.01)),
    Activation("relu"),
    Dense(input_dim=100, output_dim=y.shape[1], init="glorot_uniform", W_regularizer=l2(0.01)),
    Activation("tanh"),
]
model = Sequential()
for layer in layer_stack:
    model.add(layer)

In [269]:
# Hinge loss, optimised with RMSprop.
model.compile(loss='hinge', optimizer='rmsprop')

In [270]:
# Train for 10 epochs in batches of 100 with 20% of the rows set aside for validation.
# X.toarray() materialises the whole sparse matrix as a dense array first.
model.fit(X.toarray(), y, nb_epoch=10, batch_size=100, shuffle=True, validation_split=0.2, show_accuracy=True)

Train on 23149 samples, validate on 5788 samples
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<keras.callbacks.History at 0x7f27b7df02d0>

In [272]:
# Evaluate on the last ten rows; slice and densify them once for both calls.
X_tail = X[-10:, :].toarray()
y_tail = y[-10:, :]
model.predict_classes(X_tail)
model.evaluate(X_tail, y_tail, show_accuracy=True)



[1.040812269458707, 1.0]

In [241]:
# The last ten gender labels, for comparison with the model output on the same rows.
df_fm["Gender"][-10:]

28927    F
28928    M
28929    M
28930    M
28931    M
28932    M
28933    M
28934    M
28935    M
28936    F
Name: Gender, dtype: object

In [242]:
# Loss and accuracy over all of X / y, i.e. the same data that was passed to model.fit.
model.evaluate(X.toarray(), y, batch_size=100, show_accuracy=True)



[0.10798193781124817, 0.97297577495939458]

In [274]:
# Peek at the names whose confidence is below 0.9.
df[df["Confidence"] < 0.9].head()

Unnamed: 0,Name,Total,Female,Male,Confidence,Gender
28937,Sawyer,25745,0.100058,0.899942,0.899942,N
28938,Nikita,8744,0.899931,0.100069,0.899931,N
28939,Caelan,714,0.10084,0.89916,0.89916,N
28940,Sheridan,2823,0.899044,0.100956,0.899044,N
28941,Ottie,99,0.89899,0.10101,0.89899,N


In [290]:
# Test set: names whose confidence lies strictly between 0.85 and 0.9.
confidence = df["Confidence"]
in_band = (confidence < 0.9) & (confidence > 0.85)
df_test = df[in_band].copy()
# Same ^...$ wrapping as the training names.
df_test["Mod_Name"] = ["^%s$" % name for name in df_test["Name"]]
# Relabel with the lower threshold so these rows receive a label other than 'N'.
df_test["Gender"] = df_test.apply(lambda row: gender(row, thresh=0.85), axis=1)

In [294]:
# Per-label summary of the test set: number of names, summed "Total", and the
# mean of the "Female", "Male" and "Confidence" columns.
df_test.groupby("Gender").agg({"Name": len, "Total": sum, "Female": np.mean, "Male": np.mean, "Confidence": np.mean})

Unnamed: 0_level_0,Male,Confidence,Total,Name,Female
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,0.11986,0.88014,2142817,89,0.88014
M,0.877389,0.877389,898031,75,0.122611


In [296]:
# Featurise and encode the test set with the already-fitted transformers,
# then recode every row whose label value is 0 to -1.
X_test = feature_all.transform(df_test["Mod_Name"])
y_test = labels.transform(df_test["Gender"])
zero_rows = np.flatnonzero(y_test[:, 0] == 0)
y_test[zero_rows] = -1

In [299]:
# First ten encoded test labels.
y_test[:10,:]

array([[ 1],
       [-1],
       [ 1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [ 1],
       [-1]])

In [301]:
# A parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 function call, and prints the same text in each.
print(df_test.shape)
df_test.head()

(164, 7)


Unnamed: 0,Name,Total,Female,Male,Confidence,Gender,Mod_Name
28937,Sawyer,25745,0.100058,0.899942,0.899942,M,^Sawyer$
28938,Nikita,8744,0.899931,0.100069,0.899931,F,^Nikita$
28939,Caelan,714,0.10084,0.89916,0.89916,M,^Caelan$
28940,Sheridan,2823,0.899044,0.100956,0.899044,F,^Sheridan$
28941,Ottie,99,0.89899,0.10101,0.89899,F,^Ottie$


In [300]:
# Loss and accuracy on the 0.85-0.9 confidence test set.
model.evaluate(X_test.toarray(), y_test, batch_size=100, show_accuracy=True)



[0.86797289785919429, 1.0]

In [417]:
# Hand-picked names, already wrapped in ^...$ like the training strings.
test_names = [
    "^Shivangi$", "^Pushpa$", "^Kshitij$", "^Jana$", "^Vetle$",
    "^Ingvald$", "^Shankar$", "^Vishnu$", "^Mala$", "^Jagdish$",
    "^Ming$", "^Julian$", "^Shadi$", "^Rezvane$", "^Rezvaneh$",
    "^Amir$", "^Motahhare$", "^Motahareh$", "^Enchuan$", "^Ada$",
    "^Barack$", "^Eugene$", "^Shubhanshu$", "^Alankrita$", "^Ridit$",
    "^Reyansh$", "^Agastya$", "^Girish$", "^Andrej$", "^Jinseok$",
    "^Harathi$", "^Mae$", "^Pei$", "^Arijit$", "^Ravi$", "^Atul$", "^Aseel$",
    "^Vivek$", "^Sudhanshu$", "^Mansi$", "^Harpreet$", "^Hadi$", "^Liang$",
    "^Arpit$", "^Geethika$", "^Kiumars$", "^Craig$", "^Rituraj$", "^Speranza$",
    "^Constanta$", "^Viorica$", "^Hooriyah$", "^Andrea$", "^Jan$", "^Jeanpaul$",
    "^Josemaria$",
]
# Featurise once and reuse the matrix for both models.
X_names = feature_all.transform(pd.Series(data=test_names))
output_lbl = model.predict(X_names.toarray())
df_output = pd.DataFrame(data={"Name": test_names, "score": output_lbl[:, 0]})
# Negative scores map to "F", everything else to "M".
df_output["Gender"] = df_output["score"].apply(lambda x: "F" if x < 0 else "M")
# NOTE(review): clf must have been fitted on the feature_all matrix (n-grams
# plus the vowel flag) for the column counts to match — confirm that clf is
# refitted after X is rebuilt when the notebook is run top to bottom.
df_output["Gender_SVM"] = clf.predict(X_names)
df_output

Unnamed: 0,Name,score,Gender,Gender_SVM
0,^Shivangi$,-0.22714,F,F
1,^Pushpa$,-0.826976,F,F
2,^Kshitij$,0.943411,M,M
3,^Jana$,-0.993774,F,F
4,^Vetle$,-0.850577,F,F
5,^Ingvald$,0.955292,M,M
6,^Shankar$,0.620609,M,M
7,^Vishnu$,0.69997,M,M
8,^Mala$,-0.988562,F,F
9,^Jagdish$,0.869108,M,M


In [331]:
# Write the model weights to "Gender.09242015.h5" (a path relative to the working directory).
model.save_weights("Gender.09242015.h5")

In [403]:
# Build the mask from df_fm itself: a mask computed on df has a different
# index and length from the frame being filtered.
df_fm[df_fm["Name"] == "Viorica".title()]

Unnamed: 0,Name,Total,Female,Male,Confidence,Gender,Mod_Name


In [416]:
# Look up a single name in the full table.
df[df["Name"] == "Josemaria"].head()

Unnamed: 0,Name,Total,Female,Male,Confidence,Gender
6516,Josemaria,215,0,1,1,M
