In [1]:
#utility
import pandas as pd
import numpy as np
import pickle
#ml packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0,sex,name,count
0,M,James,4924235
1,M,John,4818746
2,M,Robert,4703680
3,M,Michael,4280040
4,M,William,3811998


In [4]:
df.size

98040

In [5]:
df.columns

Index(['sex', 'name', 'count'], dtype='object')

In [6]:
df.isnull().isnull().sum()

sex      0
name     0
count    0
dtype: int64

In [7]:
df.dtypes

sex      object
name     object
count     int64
dtype: object

In [8]:
df[df.sex == 'M'].size

38799

In [9]:
df[df.sex == 'F'].size

59241

In [10]:
names = df
names.sex.replace({'F':0, 'M':1}, inplace = True)
names.head()

Unnamed: 0,sex,name,count
0,1,James,4924235
1,1,John,4818746
2,1,Robert,4703680
3,1,Michael,4280040
4,1,William,3811998


In [11]:
names.sex.unique()

array([1, 0], dtype=int64)

In [12]:
#to extract features
exFeatures = names['name']
cv = CountVectorizer()
X = cv.fit_transform(exFeatures)
cv.get_feature_names()

['aaban',
 'aadan',
 'aadarsh',
 'aaden',
 'aadhya',
 'aadi',
 'aadil',
 'aadin',
 'aadit',
 'aaditya',
 'aadya',
 'aadyn',
 'aahan',
 'aahana',
 'aahil',
 'aaiden',
 'aaima',
 'aakash',
 'aalayah',
 'aaleah',
 'aaleyah',
 'aalia',
 'aaliah',
 'aalijah',
 'aaliya',
 'aaliyah',
 'aaliyha',
 'aalliyah',
 'aalyah',
 'aalyiah',
 'aamina',
 'aaminah',
 'aamir',
 'aamira',
 'aamiyah',
 'aanchal',
 'aanika',
 'aaniya',
 'aaniyah',
 'aanya',
 'aaradhya',
 'aaralyn',
 'aaralynn',
 'aarav',
 'aaren',
 'aarian',
 'aariana',
 'aaric',
 'aarika',
 'aarin',
 'aarion',
 'aariyah',
 'aariz',
 'aarna',
 'aarnav',
 'aarohi',
 'aaron',
 'aaronjames',
 'aaronjoshua',
 'aaronmichael',
 'aarron',
 'aarti',
 'aarush',
 'aarushi',
 'aarya',
 'aaryan',
 'aaryn',
 'aasha',
 'aashi',
 'aashna',
 'aashritha',
 'aasia',
 'aasim',
 'aasiyah',
 'aastha',
 'aayan',
 'aayat',
 'aayden',
 'aayla',
 'aayush',
 'aayushi',
 'ab',
 'abagail',
 'abagayle',
 'abaigeal',
 'abanoub',
 'abayomi',
 'abba',
 'abbagail',
 'abbas',

In [13]:
#splitting the dataset into training and testing
y = names.sex #labels
X #features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
#using Naive Bayes Classification
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.541860465116279

In [15]:
#doing sample testing on real-time testcases
#53% is not appropriate
sample_names = ['mohil', 'harsh', 'zubin', 'prasuk', 'amey', 'sarthak']
v = cv.transform(sample_names).toarray()
clf.predict(v)

array([0, 1, 1, 0, 0, 0], dtype=int64)

In [16]:
#edges near 50% on real time test cases, trying out decision tree
#feeding in commonly known traits for better results
def features(name):
    name = name.lower()
    return {
        'FL':name[0],
        'F2L':name[0:2],
        'F3L':name[0:3],
        'LL':name[-1],
        'L2L':name[-2:],
        'L3L':name[-3:]
    }

In [17]:
features = np.vectorize(features)
print(features(['mohil', 'harsh', 'zubin', 'prasuk', 'amey', 'sarthak']))

[{'FL': 'm', 'F2L': 'mo', 'F3L': 'moh', 'LL': 'l', 'L2L': 'il', 'L3L': 'hil'}
 {'FL': 'h', 'F2L': 'ha', 'F3L': 'har', 'LL': 'h', 'L2L': 'sh', 'L3L': 'rsh'}
 {'FL': 'z', 'F2L': 'zu', 'F3L': 'zub', 'LL': 'n', 'L2L': 'in', 'L3L': 'bin'}
 {'FL': 'p', 'F2L': 'pr', 'F3L': 'pra', 'LL': 'k', 'L2L': 'uk', 'L3L': 'suk'}
 {'FL': 'a', 'F2L': 'am', 'F3L': 'ame', 'LL': 'y', 'L2L': 'ey', 'L3L': 'mey'}
 {'FL': 's', 'F2L': 'sa', 'F3L': 'sar', 'LL': 'k', 'L2L': 'ak', 'L3L': 'hak'}]


In [18]:
dfX = features(names['name'])
dfy = names['sex']

In [19]:
corpus = features(['mohil', 'harsh', 'zubin', 'prasuk', 'amey', 'sarthak'])
dv = DictVectorizer()
dv.fit(corpus)
changed = dv.transform(corpus)
print(changed)

  (0, 2)	1.0
  (0, 8)	1.0
  (0, 14)	1.0
  (0, 20)	1.0
  (0, 26)	1.0
  (0, 32)	1.0
  (1, 1)	1.0
  (1, 7)	1.0
  (1, 13)	1.0
  (1, 22)	1.0
  (1, 28)	1.0
  (1, 30)	1.0
  (2, 5)	1.0
  (2, 11)	1.0
  (2, 17)	1.0
  (2, 21)	1.0
  (2, 24)	1.0
  (2, 33)	1.0
  (3, 3)	1.0
  (3, 9)	1.0
  (3, 15)	1.0
  (3, 23)	1.0
  (3, 29)	1.0
  (3, 31)	1.0
  (4, 0)	1.0
  (4, 6)	1.0
  (4, 12)	1.0
  (4, 19)	1.0
  (4, 27)	1.0
  (4, 34)	1.0
  (5, 4)	1.0
  (5, 10)	1.0
  (5, 16)	1.0
  (5, 18)	1.0
  (5, 25)	1.0
  (5, 31)	1.0


In [20]:
dv.get_feature_names()

['F2L=am',
 'F2L=ha',
 'F2L=mo',
 'F2L=pr',
 'F2L=sa',
 'F2L=zu',
 'F3L=ame',
 'F3L=har',
 'F3L=moh',
 'F3L=pra',
 'F3L=sar',
 'F3L=zub',
 'FL=a',
 'FL=h',
 'FL=m',
 'FL=p',
 'FL=s',
 'FL=z',
 'L2L=ak',
 'L2L=ey',
 'L2L=il',
 'L2L=in',
 'L2L=sh',
 'L2L=uk',
 'L3L=bin',
 'L3L=hak',
 'L3L=hil',
 'L3L=mey',
 'L3L=rsh',
 'L3L=suk',
 'LL=h',
 'LL=k',
 'LL=l',
 'LL=n',
 'LL=y']

In [21]:
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(dfX, dfy, test_size=0.25, random_state=42)

In [22]:
dv.fit_transform(dfX_train)

<24510x5767 sparse matrix of type '<class 'numpy.float64'>'
	with 147060 stored elements in Compressed Sparse Row format>

In [23]:
#Model definition
from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [24]:
#Sampling on decision tree classifier
sample_names = ['kushagra']
transform_dv =dv.transform(features(sample_names))
v = transform_dv.toarray()

In [25]:
dclf.predict(v)

array([1], dtype=int64)

In [26]:
print(dclf.score(dv.transform(dfX_train), dfy_train))

0.9289269685842513


In [27]:
#print('Decision Tree Classifier Model Accuracy: ' + str(round(dclf.score(dv.transform(dfX_train), dfy_train)*100)) + '%')

In [28]:
def genderPredictDT(names):
    transform_dv = dv.transform(features(names))
    v = transform_dv.toarray()
    ans = dclf.predict(v)
    if ans == 0:
        print("Female")
    else:
        print("Male")

In [29]:
genderPredictDT(['Amay'])

Female


In [30]:
#saving the model for further use
decisionTreeModel = open('DecisionTreeGenderClassifier.pkl', 'wb')
pickle.dump(dclf, decisionTreeModel)
decisionTreeModel.close()