Gender Classification of Names Using Machine Learning
Sklearn, Pandas, Text Extraction

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [3]:
df = pd.read_csv('names_dataset.csv')

In [4]:
df.head()


Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [5]:
df.size

285075

In [6]:
#Data Cleaning
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [7]:
#data types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [8]:
#check missing values
# isnull() must be applied exactly once: applying it again to its own boolean
# result always yields False, so the per-column count would be 0 even when
# values are actually missing.
df.isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [9]:
df[df.sex == 'M'].size

103275

In [10]:
df[df.sex == 'F'].size

181800

In [11]:
df_names = df

In [12]:
# Encode the target labels: 'F' -> 0, 'M' -> 1.
# The result is assigned back to the column instead of calling an inplace
# method on the `df_names.sex` accessor: chained inplace updates are not
# propagated to the parent frame under pandas Copy-on-Write.
# astype('int64') makes the integer dtype explicit and fails loudly if any
# label other than 'F'/'M' is left in the column.
df_names['sex'] = df_names['sex'].replace({'F': 0, 'M': 1}).astype('int64')

In [13]:
df_names.sex.unique()

array([0, 1])

In [14]:
df.dtypes

index     int64
name     object
sex       int64
dtype: object

In [15]:
Xfeatures = df_names['name']

In [16]:
# Feature extraction: fit a CountVectorizer on the name column and build the
# sparse document-term matrix X.
# NOTE(review): the vocabulary listed in the next cell consists of whole
# lower-cased names, so each name appears to map to a single token. That would
# explain the 100% training accuracy versus ~64% test accuracy reported further
# down (unseen names share no features with the training set). Consider a
# character n-gram analyzer — TODO confirm.
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [17]:
# Show the vocabulary learned by the vectorizer.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); check the installed
# version before upgrading. This also prints the entire vocabulary — consider
# slicing the result to keep the output short.
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
#features
X
#labels
y = df_names.sex

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [21]:
#Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes model on the CountVectorizer features.
# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(X_train, y_train)
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)

0.6398163206734908

In [22]:
#Accuracy on training set
print("accuracy=", clf.score(X_train,y_train)*100,"%")

accuracy= 100.0 %


In [23]:
#Sample Prediction
sample_name = ['Mary']
vect = cv.transform(sample_name).toarray()

In [24]:
vect.dtype
vect

array([[0, 0, 0, ..., 0, 0, 0]])

In [25]:
clf.predict(vect)

array([0])

In [26]:

# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [27]:
(clf.predict(vect3)).dtype

dtype('int64')

In [28]:
#a function to predict gender based on input name
def genderpredictor(a):
    """Print "Female" or "Male" for the name `a`.

    The name is encoded with the fitted CountVectorizer `cv` and classified
    with `clf`; class 0 is reported as "Female", anything else as "Male".
    Nothing is returned.
    """
    encoded = cv.transform([a]).toarray()
    label = "Female" if clf.predict(encoded) == 0 else "Male"
    print(label)
        

In [29]:
genderpredictor('Rhea')

Female


In [30]:
genderpredictor('Romi')

Female


In [31]:
genderpredictor('Schineade')

Female


In [32]:
genderpredictor('Aarti')

Female


Using a custom function for feature analysis

We use the assumption that most female names end with, or have the sound of, 'A' or 'E'.

In [33]:
def features(name: str) -> dict:
    """Build prefix/suffix features for a single name.

    The name is lower-cased and its first one, two and three characters and
    its last one, two and three characters are returned under fixed keys.

    Slices are used for every entry (instead of single-index lookups) so that
    an empty string produces empty-string features rather than raising
    IndexError; for non-empty names the result is the same as indexing.

    Parameters
    ----------
    name : str
        The name to featurize.

    Returns
    -------
    dict
        Mapping with keys 'first-letter', 'first2-letters', 'first3-letters',
        'last-letter', 'last2-letters' and 'last3-letters'.
    """
    name = name.lower()
    return {
        'first-letter': name[:1],
        'first2-letters': name[:2],
        'first3-letters': name[:3],
        'last-letter': name[-1:],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [34]:
#vectorize the features function
# The name `features` is rebound to its vectorized wrapper. The isinstance
# guard keeps this cell idempotent: without it, re-running the cell would wrap
# the already-vectorized callable in a second np.vectorize layer.
if not isinstance(features, np.vectorize):
    features = np.vectorize(features)
print(features(["Anna","Varun","Anisha","Tanisha"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'v', 'first2-letters': 'va', 'first3-letters': 'var', 'last-letter': 'n', 'last2-letters': 'un', 'last3-letters': 'run'}
 {'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ani', 'last-letter': 'a', 'last2-letters': 'ha', 'last3-letters': 'sha'}
 {'first-letter': 't', 'first2-letters': 'ta', 'first3-letters': 'tan', 'last-letter': 'a', 'last2-letters': 'ha', 'last3-letters': 'sha'}]


In [35]:
#Extract features for the dataset
df_X = features(df_names['name'])

In [36]:
df_y = df_names['sex']

In [37]:
from sklearn.feature_extraction import DictVectorizer

# Small demonstration of DictVectorizer on two names: learn the feature
# mapping and encode the same two feature dicts in a single step.
corpus = features(["Mike","Julia"])
dv = DictVectorizer()
transformed = dv.fit_transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [38]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [39]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [40]:
dfX_train

array([{'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'ele', 'last-letter': 'a', 'last2-letters': 'ia', 'last3-letters': 'nia'},
       {'first-letter': 'a', 'first2-letters': 'ad', 'first3-letters': 'adi', 'last-letter': 'l', 'last2-letters': 'il', 'last3-letters': 'dil'},
       {'first-letter': 'k', 'first2-letters': 'ka', 'first3-letters': 'kad', 'last-letter': 'e', 'last2-letters': 'ze', 'last3-letters': 'nze'},
       ...,
       {'first-letter': 'j', 'first2-letters': 'ja', 'first3-letters': 'jaz', 'last-letter': 'y', 'last2-letters': 'ly', 'last3-letters': 'zly'},
       {'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'elv', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ina'},
       {'first-letter': 'l', 'first2-letters': 'le', 'first3-letters': 'led', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ger'}],
      dtype=object)

In [41]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<63666x8194 sparse matrix of type '<class 'numpy.float64'>'
	with 381996 stored elements in Compressed Sparse Row format>

In [42]:
custom_feature_clf = MultinomialNB()
my_xfeatures = dv.transform(dfX_train)
custom_feature_clf.fit(my_xfeatures, dfy_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [43]:
#build features and transform them
sample_name_eg = ['Yash']
transform_dv = dv.transform(features(sample_name_eg))

In [44]:
vect3 = transform_dv.toarray()

In [45]:
custom_feature_clf.predict(vect3)

array([1])

In [46]:
# A function to do it
def genderpredictor1(a):
    """Print "Female" or "Male" for the name `a` using the custom features.

    The name is turned into prefix/suffix features with `features`, encoded
    with the fitted DictVectorizer `dv`, and classified with
    `custom_feature_clf`; class 0 is reported as "Female", anything else as
    "Male". Nothing is returned.
    """
    test_name1 = [a]
    transform_dv = dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    # Use the classifier trained on the DictVectorizer features; the previous
    # reference to `dclf` pointed at a name that is never defined.
    if custom_feature_clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [47]:
random_name = "Alex"
# genderpredictor prints its verdict and returns None, so it is called
# directly; wrapping the call in print() would emit an extra "None" line.
genderpredictor(random_name)

Male
None


In [48]:
# Accuracy on training set
print(custom_feature_clf.score(dv.transform(dfX_train), dfy_train))

0.8655169164075016


In [49]:
# Accuracy on test set
print(custom_feature_clf.score(dv.transform(dfX_test), dfy_test))

0.8525463184412768


# Preserve the model

In [50]:
import pickle

In [51]:
NBmodel = open("naive_bayes_model.pkl","wb")

In [52]:
pickle.dump(clf,NBmodel)
# Close the handle so the pickle is fully flushed to disk before the file is
# opened again for reading.
NBmodel.close()

# testing with input from the web app form

In [53]:
# Use a dedicated name rather than `input`, which would shadow the built-in
# input() function for the rest of the session.
input_name = "Rhea"
data = [input_name]

In [54]:
# Fit a fresh CountVectorizer on the name column of `df` and build the
# corresponding document-term matrix.
# NOTE(review): a classifier loaded from disk needs inputs encoded with the
# same vocabulary and column order it was trained on. That presumably holds
# here because this vectorizer is fit on the same name column as `cv` —
# confirm, or persist the fitted vectorizer alongside the model instead of
# refitting it.
sample_df_X = df.name
sample_cv = CountVectorizer()
sample_X = sample_cv.fit_transform(sample_df_X)

In [57]:
import pickle
# Load the pickled classifier; the context manager closes the file handle as
# soon as the model has been read.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('naive_bayes_model.pkl', 'rb') as sample_dbfile:
    sample_clf = pickle.load(sample_dbfile)

In [58]:
sample_clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [66]:
vect = sample_cv.transform(data).toarray()

In [73]:
# A single sample must be a row vector of shape (1, n_features);
# reshape(-1, 1) would instead give a column of shape (n_features, 1), which
# does not match the recorded (1, 95025) output.
vect.reshape(1, -1).shape

(1, 95025)