In [1]:
import pandas as pd
import numpy as np
 
names = pd.read_csv('names_dataset.csv')

 
print("There are %d names in dataset" % len(names)) 
names.head()

There are 95025 names in dataset


Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [2]:
names = names.as_matrix()[:, 1:]
TRAIN_SPLIT = 0.8 # set a split for Train and Test sets

print(names)

[['Mary' 'F']
 ['Anna' 'F']
 ['Emma' 'F']
 ...
 ['Ziyu' 'M']
 ['Zykir' 'M']
 ['Zyus' 'M']]


In [3]:
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }
 
print (features("John"))


{'last-letter': 'n', 'first-letter': 'j', 'first3-letters': 'joh', 'last3-letters': 'ohn', 'first2-letters': 'jo', 'last2-letters': 'hn'}


In [4]:
# Vectorize the features function
features = np.vectorize(features)
print (features(["Anna", "Hannah", "Paul"]))

# Extract the features for the whole dataset
X = features(names[:, 0])
 
# Get the gender column
y = names[:, 1] 
 
# check the dataset
print ("Name: %s, features=%s, gender=%s" % (names[0][0], X[0], y[0]))

[{'last-letter': 'a', 'first-letter': 'a', 'first3-letters': 'ann', 'last3-letters': 'nna', 'first2-letters': 'an', 'last2-letters': 'na'}
 {'last-letter': 'h', 'first-letter': 'h', 'first3-letters': 'han', 'last3-letters': 'nah', 'first2-letters': 'ha', 'last2-letters': 'ah'}
 {'last-letter': 'l', 'first-letter': 'p', 'first3-letters': 'pau', 'last3-letters': 'aul', 'first2-letters': 'pa', 'last2-letters': 'ul'}]
Name: Mary, features={'last-letter': 'y', 'first-letter': 'm', 'first3-letters': 'mar', 'last3-letters': 'ary', 'first2-letters': 'ma', 'last2-letters': 'ry'}, gender=F


In [5]:
from sklearn.utils import shuffle
X, y = shuffle(X, y)
X_train, X_test = X[:int(TRAIN_SPLIT * len(X))], X[int(TRAIN_SPLIT * len(X)):]
y_train, y_test = y[:int(TRAIN_SPLIT * len(y))], y[int(TRAIN_SPLIT * len(y)):]
 
# Check to see if the datasets add up
print (len(X_train), len(X_test), len(y_train), len(y_test))

76020 19005 76020 19005


In [6]:
from sklearn.feature_extraction import DictVectorizer
 
print (features(["Mary", "John"]))
vectorizer = DictVectorizer()
vectorizer.fit(X_train)
 
transformed = vectorizer.transform(features(["Mary", "John"]))
print (transformed)

 
print (type(transformed)) # <class 'scipy.sparse.csr.csr_matrix'>
print (transformed.toarray()[0][12])    # 1.0
print (vectorizer.feature_names_[12])  # first-letter=m

[{'last-letter': 'y', 'first-letter': 'm', 'first3-letters': 'mar', 'last3-letters': 'ary', 'first2-letters': 'ma', 'last2-letters': 'ry'}
 {'last-letter': 'n', 'first-letter': 'j', 'first3-letters': 'joh', 'last3-letters': 'ohn', 'first2-letters': 'jo', 'last2-letters': 'hn'}]
  (0, 12)	1.0
  (0, 244)	1.0
  (0, 2682)	1.0
  (0, 4463)	1.0
  (0, 4771)	1.0
  (0, 5097)	1.0
  (1, 9)	1.0
  (1, 199)	1.0
  (1, 2232)	1.0
  (1, 4452)	1.0
  (1, 4587)	1.0
  (1, 7152)	1.0
<class 'scipy.sparse.csr.csr_matrix'>
1.0
first-letter=m


In [7]:
from sklearn.tree import DecisionTreeClassifier
 
clf = DecisionTreeClassifier()
clf.fit(vectorizer.transform(X_train), y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
print (clf.predict(vectorizer.transform(features(["Alex", "Emma"]))))

['M' 'F']


In [9]:
# Accuracy on training set
print (clf.score(vectorizer.transform(X_train), y_train))   
 
# Accuracy on test set
print (clf.score(vectorizer.transform(X_test), y_test))

0.9879636937647988
0.8669297553275453


In [20]:
print (clf.predict(vectorizer.transform(features(["Mohamed"]))))

['M']
