# Project 3 - Name Classification

Jimmy Ng & Corey Arnouts

Oct 23, 2020

## Set up

In [1]:
import nltk
nltk.download('names')
from nltk.corpus import names
import random

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\myvio\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


## Data prep

### Get name

In [2]:
names = ([(name, "male") for name in names.words("male.txt")] + \
         [(name, "female") for name in names.words("female.txt")])

In [3]:
random.seed(1234)
random.shuffle(names)

In [4]:
print('There are ' + str(len([g for (n, g) in names if g == 'female'])) + ' female names.')
print('There are ' + str(len([g for (n, g) in names if g == 'male'])) + ' male names.')

There are 5001 female names.
There are 2943 male names.


### Split data set into three

In [5]:
test, dev_test, training = names[:500], names[500:1000], names[1000:]

In [6]:
print('There are ' + str(len([g for (n, g) in training if g == 'female'])) + ' female names in the train set.')
print('There are ' + str(len([g for (n, g) in training if g == 'male'])) + ' male names in the train set.')
print("\n")
print('There are ' + str(len([g for (n, g) in dev_test if g == 'female'])) + ' female names in the dev test.')
print('There are ' + str(len([g for (n, g) in dev_test if g == 'male'])) + ' male names in the dev test.')
print("\n")
print('There are ' + str(len([g for (n, g) in test if g == 'female'])) + ' female names in the test set.')
print('There are ' + str(len([g for (n, g) in test if g == 'male'])) + ' male names in the test set.')

There are 4382 female names in the train set.
There are 2562 male names in the train set.


There are 319 female names in the dev test.
There are 181 male names in the dev test.


There are 300 female names in the test set.
There are 200 male names in the test set.


## Feature Engineering

### Error analysis

Here's a function modified from the example in the textbook. We can use it to print out the error/discrepancy of our prediction and true label.

In [7]:
def error_analysis(FEATURES):        
    errors = [] 
    for (name, tag) in dev_test:
        guess = classifier.classify(FEATURES(name))
        if guess != tag:
            errors.append((tag, guess, name))
    print("Number of Errors: ", len(errors))    
    for (tag, guess, name) in sorted(errors):
        print('correct = {:<8} guess = {:<8s} name = {:<30}'.format(tag, guess, name))

### Gender Feature 1
Here's a function from the textbook. We can use it to serve as a benchmark for improvement.

In [8]:
def features1(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [9]:
train_set = [(features1(n), gender) for (n, gender) in training]
classifier = nltk.NaiveBayesClassifier.train(train_set)

devtest_set = [(features1(n), gender) for (n, gender) in dev_test]
print(nltk.classify.accuracy(classifier, devtest_set))

0.758


In [10]:
error_analysis(features1)

Number of Errors:  121
correct = female   guess = male     name = Ardyth                        
correct = female   guess = male     name = Audry                         
correct = female   guess = male     name = Bird                          
correct = female   guess = male     name = Bridget                       
correct = female   guess = male     name = Brooke                        
correct = female   guess = male     name = Brynn                         
correct = female   guess = male     name = Charis                        
correct = female   guess = male     name = Chery                         
correct = female   guess = male     name = Christin                      
correct = female   guess = male     name = Corry                         
correct = female   guess = male     name = Darb                          
correct = female   guess = male     name = Dorolice                      
correct = female   guess = male     name = Fawne                         
correct = femal

### Gender Feature 2
For our second feature extractor, we will use the first one to three, and last one to three letters. 

In [11]:
def features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower() 
    features["prefix2"] = name[:2].lower()
    features["prefix3"] = name[:3].lower()
    features["suffix2"] = name[-2:].lower()
    features["suffix3"] = name[-3:].lower()    
    return features

In [12]:
train_set2 = [(features2(n), gender) for (n, gender) in training]
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

devtest_set2 = [(features2(n), gender) for (n, gender) in dev_test]
print(nltk.classify.accuracy(classifier2, devtest_set2))

0.832


In [13]:
error_analysis(features2)

Number of Errors:  181
correct = male     guess = female   name = Adolphe                       
correct = male     guess = female   name = Adrian                        
correct = male     guess = female   name = Agustin                       
correct = male     guess = female   name = Ahmad                         
correct = male     guess = female   name = Alaa                          
correct = male     guess = female   name = Alan                          
correct = male     guess = female   name = Aldrich                       
correct = male     guess = female   name = Alic                          
correct = male     guess = female   name = Alister                       
correct = male     guess = female   name = Allah                         
correct = male     guess = female   name = Andre                         
correct = male     guess = female   name = Archie                        
correct = male     guess = female   name = Armstrong                     
correct = male 

Although the accuracy seems to be way up by more than 7%, this feature extractor is actually worse than the benchmark because it classifies every name as female. 

### Gender Feature 3
Let's modify the above feature extractor by counting letters, vowels and checking whether the first and last letters are vowel.

In [14]:
def features3(name):
    features = {}              
    features["count_letters"] = len(name)
    features["count_vowel"] = len([v for v in name if v in 'aeiou'])
    features["firstletter_is_vowel"] = name[0].lower() in 'aeiou'
    features["lastletter_is_vowel"] = name[-1].lower() in 'aeiou'    
    return features

In [15]:
train_set3 = [(features3(n), gender) for (n, gender) in training]
classifier3 = nltk.NaiveBayesClassifier.train(train_set3)

devtest_set3 = [(features3(n), gender) for (n, gender) in dev_test]
print(nltk.classify.accuracy(classifier3, devtest_set3))

0.726


In [16]:
error_analysis(features3)

Number of Errors:  181
correct = male     guess = female   name = Adolphe                       
correct = male     guess = female   name = Adrian                        
correct = male     guess = female   name = Agustin                       
correct = male     guess = female   name = Ahmad                         
correct = male     guess = female   name = Alaa                          
correct = male     guess = female   name = Alan                          
correct = male     guess = female   name = Aldrich                       
correct = male     guess = female   name = Alic                          
correct = male     guess = female   name = Alister                       
correct = male     guess = female   name = Allah                         
correct = male     guess = female   name = Andre                         
correct = male     guess = female   name = Archie                        
correct = male     guess = female   name = Armstrong                     
correct = male 

The accuracy is even worse, and all names are once again classified as female. 

### Gender Feature 4
Instead of identifying vowels, let's combine useful features from the features1, 2 and 3 and see if we can do better.

In [17]:
def features4(name):
    features = {}              
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    features["count_letters"] = len(name)    
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())        
    features["prefix2"] = name[:2].lower()
    features["prefix3"] = name[:3].lower()
    features["suffix2"] = name[-2:].lower()
    features["suffix3"] = name[-3:].lower()        
    return features

In [18]:
train_set4 = [(features4(n), gender) for (n, gender) in training]
classifier4 = nltk.NaiveBayesClassifier.train(train_set4)

devtest_set4 = [(features4(n), gender) for (n, gender) in dev_test]
print(nltk.classify.accuracy(classifier4, devtest_set4))

0.822


In [19]:
error_analysis(features4)

Number of Errors:  121
correct = female   guess = male     name = Ardyth                        
correct = female   guess = male     name = Audry                         
correct = female   guess = male     name = Bird                          
correct = female   guess = male     name = Bridget                       
correct = female   guess = male     name = Brooke                        
correct = female   guess = male     name = Brynn                         
correct = female   guess = male     name = Charis                        
correct = female   guess = male     name = Chery                         
correct = female   guess = male     name = Christin                      
correct = female   guess = male     name = Corry                         
correct = female   guess = male     name = Darb                          
correct = female   guess = male     name = Dorolice                      
correct = female   guess = male     name = Fawne                         
correct = femal

The result is much better and has the best accuracy among the 4 feature extractors that we have built. Let's apply it on our test set.

## Test set

In [20]:
test_set = [(features4(n), gender) for (n, gender) in test]
print(nltk.classify.accuracy(classifier, test_set))

0.762
