# Project 3

Authors: Ari and Lucas

In the following excercise, we're going to analyze the Names corpus and build a gender classifier

In [11]:
import nltk
from nltk.corpus import names
import random
from collections import defaultdict
import soundex

soundex_instance = soundex.Soundex()
#Load and shuffle the names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

#Split the data
test_names = labeled_names[:500]
dev_test_names = labeled_names[500:1000]
train_names = labeled_names[1000:]

In [12]:
def gender_features_soundex_package(name):
    name_lower = name.lower()
    features = {
        'first_letter': name_lower[0],
        'last_letter': name_lower[-1],
        'length': len(name),
        'vowel_count': sum(1 for char in name_lower if char in 'aeiou'),
        'consonant_count': sum(1 for char in name_lower if char not in 'aeiou'),
        'suffix1': name_lower[-1:],
        'suffix2': name_lower[-2:],
        'suffix3': name_lower[-3:],
        'soundex': soundex_instance.soundex(name)
    }
    #Add character n-grams
    for i in range(len(name_lower) - 1):
        features[f'bigram_{name_lower[i:i+2]}'] = True
    for i in range(len(name_lower) - 2):
        features[f'trigram_{name_lower[i:i+3]}'] = True
        
    return features

#Prepare the feature sets with the new feature extractor
train_set_pkg = [(gender_features_soundex_package(name), gender) for (name, gender) in train_names]
dev_test_set_pkg = [(gender_features_soundex_package(name), gender) for (name, gender) in dev_test_names]
test_set_pkg = [(gender_features_soundex_package(name), gender) for (name, gender) in test_names]

#Train and evaluate the Naive Bayes classifier
classifier_nb_pkg = nltk.NaiveBayesClassifier.train(train_set_pkg)
accuracy_nb_pkg = nltk.classify.accuracy(classifier_nb_pkg, dev_test_set_pkg)
print(f"Naive Bayes accuracy with soundex package on dev-test set: {accuracy_nb_pkg:.4f}")

#Train and evaluate the Decision Tree classifier
classifier_dt_pkg = nltk.DecisionTreeClassifier.train(train_set_pkg)
accuracy_dt_pkg = nltk.classify.accuracy(classifier_dt_pkg, dev_test_set_pkg)
print(f"Decision Tree accuracy with soundex package on dev-test set: {accuracy_dt_pkg:.4f}")

Naive Bayes accuracy with soundex package on dev-test set: 0.8320
Decision Tree accuracy with soundex package on dev-test set: 0.7300


In [13]:
#Final evaluation on the test set
final_accuracy_test_pkg = nltk.classify.accuracy(classifier_nb_pkg, test_set_pkg)
print(f"\nFinal accuracy of the best classifier on the test set: {final_accuracy_test_pkg:.4f}")
print(f"Accuracy of the same classifier on the dev-test set: {accuracy_nb_pkg:.4f}")


Final accuracy of the best classifier on the test set: 0.8380
Accuracy of the same classifier on the dev-test set: 0.8320
