# Simple Gender Classifier


## Libraries used.
- Pandas
- Sklearn
- Feature Extraction from Text

In [75]:
#Packages for data wrangling
import pandas as pd
import numpy as np

In [76]:
#ML packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [77]:
#Loading names dataset
df = pd.read_csv('gender_cleaned.csv')

In [78]:
#Check the headers
df.head()


Unnamed: 0,Index,Name,Gender
0,1,Abebi,F
1,2,Abel,M
2,3,Abena,F
3,4,Abeni,F
4,5,Abidemi,F


In [79]:
#Check the dataset size
df.size

289713

In [80]:
#Checking for the columns names and dat type.
df.columns
df.dtypes

Index      int64
Name      object
Gender    object
dtype: object

In [81]:
#chck for the missing values
df.isnull().isnull().sum()

Index     0
Name      0
Gender    0
dtype: int64

In [82]:
#Number of female names
df[df.Gender == 'F'].size

183867

In [83]:
# Number of male names
df[df.Gender == 'M'].size

105846

In [84]:
#Get df into new structure
df_names = df

In [85]:
# Replace all F and M with - and one respectively
df_names.Gender.replace({'F':0,'M':1},inplace = True)


In [86]:
df_names.isnull().isnull().sum()
df_names.Gender.unique()

array([0, 1], dtype=int64)

In [87]:
Xfeatures = df_names['Name']

In [88]:
# Feature extraction
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [89]:
#cv.get_feature_names()

In [90]:
# Splitting the datset into training and test models
from sklearn.model_selection import train_test_split

In [91]:
y = df_names.Gender

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.33, random_state = 42)

In [93]:
#Importing Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

In [94]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

0.6340330728921523

In [95]:
#Check the accuracy of model with test data
test_ac = classifier.score(X_test, y_test) * 100
train_ac = classifier.score(X_train, y_train) * 100
print('Accuracy of Test = {0:2f}\nAccuracy of Training = {1:2f}'.format(test_ac,train_ac))

Accuracy of Test = 63.403307
Accuracy of Training = 99.800624


## Predictions


In [96]:
#Sampel 
sample_name = ["Harris"]
vect = cv.transform(sample_name).toarray()

In [97]:
print(vect)

[[0 0 0 ... 0 0 0]]


In [98]:
classifier.predict(vect)

array([1], dtype=int64)

In [99]:
# A module to do the prediction.
def find_gender(name):
    test_name = [name]
    vector = cv.transform(test_name).toarray()
    if classifier.predict(vector) == 0:
        return "Female"
    else:
        return "Male"

In [101]:
names = ['Kathrine','Harry','Bibek','']

In [102]:
for name in names:
    if find_gender(name) == 'Female':
        print(name + " is Female.")
    else:
        print(name + " is Male.")

Kathrine is Female.
Harry is Male.
Bibek is Male.
