## Max Wagner
## Week 9 - Project 3 - Data 620

In [1]:
import nltk
from nltk.corpus import names
import random

In [2]:
# returns the last letter
def gender_features(word):
    return {'last_letter': word[-1]}

In [3]:
# loads in names, shuffles them up
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [4]:
# break into a training and test set (500, 500, remaining)
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [5]:
# test with dev set for a starting level
print nltk.classify.accuracy(classifier, dev_set)

0.708


In [6]:
# whats the most imformative to start with
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = u'a'           female : male   =     17.3 : 1.0
             last_letter = u'o'             male : female =     12.6 : 1.0
             last_letter = u'r'             male : female =      7.4 : 1.0
             last_letter = u's'             male : female =      7.4 : 1.0
             last_letter = u'd'             male : female =      7.0 : 1.0
             last_letter = u'k'             male : female =      4.2 : 1.0
             last_letter = u'g'             male : female =      3.3 : 1.0
             last_letter = u'h'             male : female =      3.0 : 1.0
             last_letter = u'f'             male : female =      2.3 : 1.0
             last_letter = u'n'             male : female =      2.2 : 1.0


In [7]:
# a second gender features def from the book that will probably overfit
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [8]:
# break into a training and test set (500, 500, remaining)... again with gender2
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)
# looks like it's almost the same as it was before... overfitting

0.726


In [9]:
# print where the guesses were wrong
train_names = names[0:500]
devtest_names = names[500:1000]
test_names = names[1000:]

errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )
        
for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)

correct=male     guess=female   name=Abel                          
correct=male     guess=female   name=Abraham                       
correct=male     guess=female   name=Ahmed                         
correct=male     guess=female   name=Allah                         
correct=male     guess=female   name=Amos                          
correct=male     guess=female   name=Andonis                       
correct=male     guess=female   name=Anthony                       
correct=male     guess=female   name=Antonio                       
correct=male     guess=female   name=Ashley                        
correct=male     guess=female   name=Barrie                        
correct=male     guess=female   name=Bartel                        
correct=male     guess=female   name=Bearnard                      
correct=male     guess=female   name=Beau                          
correct=male     guess=female   name=Berchtold                     
correct=male     guess=female   name=Bernie     

In [10]:
# trying 2 letters instead of 1
def gender_features(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}

In [11]:
# break into a training and test set (500, 500, remaining)... again with 2 letter def
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.742


### Including 2 letters seemed to improve the accuracy, let's try with 3 letters instead.

In [12]:
# the 2 letters didn't seem to help much, let's try 3 letters
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features

In [13]:
# break into a training and test set (500, 500, remaining)... again with 3 letter def
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.758


### There was further improvement with the inclusion of 3 letters at the start and finish, but what about vowel count?

In [14]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features

In [25]:
import re

def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features

In [26]:
# break into a training and test set (500, 500, remaining)... again with 3 letter def and vowel count
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.762


### This raised the accuracy further. Maybe try more prefix and suffix letters along with it.

In [29]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["first4"] = name[:4].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    features["last4"] = name[-4:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features

In [30]:
# break into a training and test set (500, 500, remaining)... again with 3 letter def and vowel count
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.752


### This is worse than just choosing 3 letters, moving away from that option. Let's include a has statement this time, but with 3 letters.

In [31]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [32]:
# break into a training and test set (500, 500, remaining)... again with 4 letter def and has
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.754


### This is worse than not including a has statement. Let's try with a different type of classifier. In this case, a decision tree method.

In [47]:
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.DecisionTreeClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.616


### It seems worse than the naive bayes method by a fair amount. Maybe try something different than just letter sequences. Syllables seem to be a good route.

In [42]:
# using a mix of sources, a seemingly standard way to do it
def syllable_count(name):
    count = 0
    vowels = 'aeiouy'
    if len(name) <= 3:
        return 1
    if name[0] in vowels:
        count += 1
    for i in range(1, len(name)):
        if name[i] in vowels and name[i-1] not in vowels:
            count += 1
    if name.endswith('e'):
        count -= 1
    if name.endswith('le'):
        count+=1
    if name.endswith('bile'):
            count -= 1
    if count == 0:
        count +=1
    return count

In [43]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    features["syl_count"] = syllable_count(name)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [44]:
# break into a training and test set (500, 500, remaining)... with 3 letters and syllable count
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.752


### Unfortunately it's almost identical to before, maybe only 2 letters will give a better result.

In [45]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

# break into a training and test set (500, 500, remaining)... with 2 letters and syllable count
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.744


### Even worse! Back to 3 letters. And try the syllable method with a decision tree.

In [48]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["first2"] = name[:2].lower()
    features["first3"] = name[:3].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["last3"] = name[-3:].lower()
    features["syl_count"] = syllable_count(name)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.DecisionTreeClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.616


### Still awful in comparison, maybe some combo of first + last and syllables.

In [58]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.77


### Some improvement, maybe check if the last or first letter is a vowel?

In [57]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    features["vowel_last"] = name[-1] in 'aeiouy'
    features["vowel_first"] = name[1] in 'aeiouy'
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.764


### Worse, but not by much. Maybe some sort of transformation? Let's scramble the names.

In [68]:
def shuf(name):
    name_list = list(name)
    random.shuffle(name_list)
    return ''.join(name_list)

def gender_features(name):
    features = {}
    name = shuf(name)
    features["first1"] = name[:1].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.664


### Bad, as expected, but worth a shot. Maybe try switching a few letters.

In [69]:
def gender_features(name):
    features = {}
    name = name[-1:] + name[1:-1] + name[:1]
    features["first1"] = name[:1].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, dev_set)

0.736


### Seem's transformations are a bad idea as well. I'll stick with the method that included the first letter, last 2 letters, syllable count, and vowel count, as it was the best option of the bunch. Let's try it with the full test set.

In [70]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)

0.762816820276


### It performed a little worse than I had hoped it would, but it could be due to the smaller training group size. I am also testing changes on the dev group, not the test group. Let's see what 50/50 split in training/testing does.

In [76]:
def gender_features(name):
    features = {}
    features["first1"] = name[:1].lower()
    features["last1"] = name[-1:].lower()
    features["last2"] = name[-2:].lower()
    features["syl_count"] = syllable_count(name)
    features["vowel_count"] = len(re.findall(r'[aeiouy]', name))
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[0:4000], featuresets[4000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)

0.790821501014


### It seems that the larger training set helps to get a little better accuracy. In a better scenario, the data source for names would be much larger, which would mean a better training set, and more comprehensive testing. 