# Training the RF classifier

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training data

df = pd.read_csv('final_all_names.csv',index_col=False)
df = df.drop('Unnamed: 0',1)
df.name = df.name.str.replace('[^a-zA-Z]', '')
df['name'] = df['name'].str.lower()

In [3]:
# Remove names shorter than 2 letters

length_Sel = []
for everyname in df.name:
    if len(everyname) >= 2:
        length_Sel.append(1)
    else:
        length_Sel.append(0)
        
df['lengthselector'] = length_Sel
df = df[df.lengthselector == 1]

In [6]:
df.head()

Unnamed: 0,name,gender,lengthselector
0,aamir,m,1
1,aaron,m,1
2,abbey,m,1
3,abbie,m,1
4,abbot,m,1


In [7]:
# Feature Selection 

get_last_letter = lambda x: x[-1]
get_2ndlast_letter = lambda x: x[-2]
get_first_letter = lambda x: x[0]
get_2nd_letter = lambda x: x[1]
#vowels = 'AEIOUaeiou'
#no_of_vowel = lambda x : sum( in vowels for s in x)

df['factor1'] = df.name.map(get_last_letter)
df['factor2'] = df.name.map(get_2ndlast_letter)
df['factor3'] = df.name.map(get_first_letter)
df['factor4'] = df.name.map(get_2nd_letter)

In [8]:
# Function to encode the name features for input to scikit models

class Encoder(object):
    def __init__(self,tokens):
        uniquetokens = np.unique(tokens)
        self.tokendict = {uniquetoken: i for i, uniquetoken in enumerate(uniquetokens)}
        self.tokendictinv = {i: uniquetoken for i, uniquetoken in enumerate(uniquetokens)}
        self.ntokens = len(uniquetokens)
        
    def encode(self,tokens):
        n = len(tokens)
        encodearray = np.zeros((n,self.ntokens))
        for i, token in enumerate(tokens):
            tokenid = self.tokendict.get(token,None)
            if tokenid is not None:
                encodearray[i,tokenid] = 1
        return encodearray
    
    def colidx_to_token(self,colidx):
        return self.tokendictinv[colidx]

In [9]:
# Initialising the encoder

Lastl = Encoder(df.factor1)
Seclastl = Encoder(df.factor2)
Firstl = Encoder(df.factor3)
Secondl = Encoder(df.factor4)

In [10]:
# Encoding names

f1 = Lastl.encode(df.factor1)
f2 = Seclastl.encode(df.factor2)
f3 = Firstl.encode(df.factor3)
f4 = Secondl.encode(df.factor4)

In [20]:
# Preparing data for training model/models

x_input = np.concatenate([f1, f2, f3, f4], axis=1)
gender = df.gender
y_input = gender.apply(lambda s: 1 if s == 'f' else 0)

In [22]:
#from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
#from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(x_input, y_input, test_size=0.33)

In [24]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [139]:
important_features = []
for x,i in enumerate(rfc.feature_importances_):
    if i>np.average(rfc.feature_importances_):
        important_features.append((x))

In [140]:
print important_features

[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 24, 25, 29, 30, 33, 34, 37, 38, 39, 43, 44, 50, 67, 69, 70, 73, 81, 83, 89, 95, 97, 101, 108, 109, 110, 114, 115, 116, 121, 122, 133, 135, 144, 150, 166, 168, 170, 176, 177, 179, 180, 183, 184, 202, 210, 213, 215, 219, 225, 229, 232, 235, 245, 264, 267, 268, 271, 276, 277, 281, 284, 299, 304, 310, 314, 328, 331, 332, 336, 342, 347, 352, 354, 356, 357, 368, 370, 372, 374, 375, 379, 384, 390, 399, 404, 429, 446, 450, 458, 462, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 498, 499, 500, 502, 503, 505, 506, 508, 510, 518, 519, 520, 524, 525, 527, 531, 533, 535, 542, 543, 548, 551, 554, 557, 564, 567, 575, 579, 595, 596, 610, 620, 622, 634, 635, 659, 664, 672, 676, 679, 685, 696, 699, 701, 702, 707, 708, 715, 719, 722, 725, 728, 730, 731, 735, 738, 743, 746, 748, 754, 790, 813, 817, 823, 829, 832, 833, 834, 840, 845, 846, 851, 853, 855, 861, 864, 869, 930, 940, 948]


In [None]:
list1 = []
list2 = []
list3 = []
list4 = []

for every in important_features:
    if every in range(0,25):
        list1.append(every)
    elif every in range(25,455):
        list2.append(every)
    elif every in range(455,455+26):
        list3.append(every)
    else:
        list4.append(every)


In [108]:
print list2

[26, 30, 34, 39, 40, 43, 46, 50]


In [None]:
for every in list2:
    #print Lastl.colidx_to_token(every)
    print Seclastl.colidx_to_token(every-2)
#   print Firstl.colidx_to_token(every-26-26)
#    print Secondl.colidx_to_token(every-78)    

In [17]:

f1 = Lastl.colidx_to_token(list1)
f2 = Seclastl.colidx_to_token(list2)
f3 = Firstl.colidx_to_token(list3)
f4 = Secondl.colidx_to_token(list4)

NameError: name 'list1' is not defined

In [None]:
# Function to encode individual names 

def nametogender(aname):
    
    f1 = Lastl.encode(get_last_letter(aname))
    f2 = Seclastl.encode(get_2ndlast_letter(aname))
    f3 = Firstl.encode(get_first_letter(aname))
    f4 = Secondl.encode(get_2nd_letter(aname))
    cinput = np.concatenate([f1, f2, f3, f4], axis=1)
    return cinput

In [None]:
rfc.predict_proba(nametogender('Joseph'))

In [25]:

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier


# Compute Precision-Recall and plot curve
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y_test,
                                                        scores[:, i])
    average_precision[i] = average_precision_score(Y_test, scores[:, i])

# Compute micro-average ROC curve and ROC area
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(),
    y_score.ravel())
average_precision["micro"] = average_precision_score(y_test, y_score,
                                                     average="micro")

# Plot Precision-Recall curve
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
plt.show()

# Plot Precision-Recall curve for each class
plt.clf()
plt.plot(recall["micro"], precision["micro"],
         label='micro-average Precision-recall curve (area = {0:0.2f})'
               ''.format(average_precision["micro"]))
for i in range(n_classes):
    plt.plot(recall[i], precision[i],
             label='Precision-recall curve of class {0} (area = {1:0.2f})'
                   ''.format(i, average_precision[i]))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Extension of Precision-Recall curve to multi-class')
plt.legend(loc="lower right")
plt.show()

NameError: name 'n_classes' is not defined

In [54]:
X_train.shape,X_test.shape

((72570, 165), (35744, 165))

In [175]:
clf = BernoulliNB()
clf.fit(X_train, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [176]:
clf.score(X_test,Y_test)

0.81934924320846036

In [18]:
rfc.score(X_test,Y_test)

0.83317013121450356

In [19]:
rfc.score(X_train,Y_train)

0.88884754778342778

In [153]:
scores[:,1]

array([ 0.        ,  0.54404737,  0.78129682, ...,  0.26247184,
        0.80388781,  0.58583333])

In [22]:
scores = rfc.predict_proba(X_test)
scores
scores.shape,Y_test.shape

((35743, 2), (35743,))

In [29]:
Y_predict = rfc.predict(X_test)

In [31]:
Y_predict.shape,X_test.shape

((35743,), (35743, 104))

In [207]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(Y_test, scores[:,1])

In [23]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random

false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test,scores[:,1] )
roc_auc = auc(false_positive_rate, true_positive_rate)


In [20]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(rfc, X_test, Y_test, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)"
      % (scores.mean(), scores.std()*2))

Accuracy: 0.83 (+/- 0.01)


In [16]:
from sklearn.externals import joblib
 
rfc = joblib.load('randfor.pkl') 


In [24]:
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [27]:
metrics.roc_curve(Y_test, scores[:,1], pos_label=None, sample_weight=None, drop_intermediate=True)

NameError: name 'metrics' is not defined

In [28]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [26]:
import matplotlib.pyplot as plt

##############################################################################
# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr[2], tpr[2], label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1])
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()



NameError: name 'fpr' is not defined

Try Logistic regressiona and Naive Bayes

In [None]:
logreg = LogisticRegression(class_weight = "balanced")
logregr.fit(X_test,Y_test)

In [183]:
logregr.score(X_test,Y_test)

0.83135159331897157

In [179]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_test,Y_test)

GaussianNB()

In [180]:
gnb.score(X_test,Y_test)

0.68824664969364635

In [181]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [182]:
clf.score(X_test,Y_test)

0.82290238648126901