In [1]:
import nltk
from nltk.corpus import names
import random

# Make sure the NLTK 'names' corpus is available locally.
nltk.download('names')

# Step 1: Load and prepare the data

# The corpus provides two word lists, male.txt and female.txt.
# Tag every name with the gender of the list it came from and merge both
# lists (male entries first, then female) into one list of (name, label) pairs.
names_data = [
    (name, label)
    for label, filename in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(filename)
]

# Shuffle in place so the positional slices taken later each contain a mix
# of male and female names rather than one gender only.
# NOTE: no seed is set in this cell, so the split differs from run to run.
random.shuffle(names_data)

# Step 2: Split the data into training, dev-test, and test sets

# Carve the shuffled list into three non-overlapping slices:
#   indices 0-499     -> test set (500 names)
#   indices 500-999   -> dev-test set (500 names, used to tune the model)
#   indices 1000-end  -> training set (the remaining ~6,900 names)
test_names, devtest_names, train_names = (
    names_data[:500],
    names_data[500:1000],
    names_data[1000:],
)


# Step 3: Define a baseline feature extractor

# A feature extractor turns a name into a dictionary of features.
# The simplest possible feature is the last letter of the name,
# since names ending in 'a', 'e', or 'i' are often female.
def gender_features(word):
    """Return the baseline feature dict for *word*.

    The dict has a single key, ``'suffix1'``, holding the lower-cased last
    character of *word*. Slicing is used, so an empty *word* yields ``''``
    instead of raising.
    """
    last_letter = word[-1:]
    return dict(suffix1=last_letter.lower())

# Turn each (name, gender) pair of every split into a (features, gender) pair
# using the baseline last-letter extractor.
train_set, devtest_set, test_set = (
    [(gender_features(name), gender) for name, gender in split]
    for split in (train_names, devtest_names, test_names)
)

# Fit a Naive Bayes classifier on the training features only
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Report accuracy on the dev-test split
print(
    "Baseline accuracy (dev-test):",
    nltk.classify.accuracy(classifier, devtest_set),
)
# This one-letter model typically achieves around 75–77% accuracy.
# It’s fast but misses many patterns: for example, “Mary” and “Henry” both end in ‘y’
# but belong to different genders.

# Step 4: Add a two-letter suffix to capture more detail

# Certain two-letter endings like “yn”, “ie”, “ch”, or “an” can be gender-specific.
# Adding this second feature often boosts performance by several percentage points.
def gender_features2(word):
    """Return the one- and two-letter suffix features for *word*.

    Keys:
        ``'suffix1'``: lower-cased last character of *word*.
        ``'suffix2'``: lower-cased last two characters of *word*.

    Both values come from slices, so names shorter than the requested
    suffix (including the empty string) simply yield a shorter value.
    """
    last_one, last_two = word[-1:], word[-2:]
    return dict(suffix1=last_one.lower(), suffix2=last_two.lower())

# Re-featurize the training and dev-test splits with the two-suffix extractor.
# (test_set is not rebuilt here; it is not used in this step.)
train_set, devtest_set = (
    [(gender_features2(name), gender) for name, gender in split]
    for split in (train_names, devtest_names)
)

# Retrain on the new features and report dev-test accuracy
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(
    "Improved suffix accuracy (dev-test):",
    nltk.classify.accuracy(classifier, devtest_set),
)
# Including two-letter suffixes helps catch patterns like “-yn” (female)
# or “-ch” (male). This usually improves accuracy to about 78–79%.
# Still, the classifier only looks at name endings.

# Step 5: Add richer linguistic features

#  I expanded the feature extractor to include:
# - First and second letters (prefix information)
# - Number of vowels (female names often contain more vowels)
# - Whether the name ends with a vowel
# These additions give the classifier a fuller sense of a name’s shape.
def gender_features3(name):
    """Return the richer feature dict for *name*.

    The name is lower-cased first, then described by:

        ``'suffix1'``       last letter
        ``'suffix2'``       last two letters
        ``'prefix1'``       first letter
        ``'prefix2'``       first two letters
        ``'vowel_count'``   number of characters that are one of a/e/i/o/u
        ``'last_is_vowel'`` True when the name ends in one of a/e/i/o/u

    Every positional feature is taken with a slice or ``str.endswith`` rather
    than by index, so an empty *name* produces empty-string / 0 / False
    features instead of raising ``IndexError``. Results for non-empty names
    are the same as with direct indexing.
    """
    vowels = 'aeiou'
    name = name.lower()
    features = {
        'suffix1': name[-1:],                 # last letter
        'suffix2': name[-2:],                 # last two letters
        'prefix1': name[:1],                  # first letter ('' if name is empty)
        'prefix2': name[:2],                  # first two letters
        'vowel_count': sum(1 for c in name if c in vowels),  # how many vowels
        # endswith() with a tuple is False for '' (indexing name[-1] would raise)
        'last_is_vowel': name.endswith(tuple(vowels)),
    }
    return features

# Featurize all three splits with the richest extractor
train_set, devtest_set, test_set = (
    [(gender_features3(name), gender) for name, gender in split]
    for split in (train_names, devtest_names, test_names)
)

# Retrain on the training features only
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Score the same classifier on the dev-test split and on the held-out test split
dev_acc, test_acc = (
    nltk.classify.accuracy(classifier, split)
    for split in (devtest_set, test_set)
)

# Report both accuracies rounded to three decimals
print("\nFinal model performance:")
print("Dev-test accuracy:", round(dev_acc, 3))
print("Test accuracy:", round(test_acc, 3))

# The dev-test set helps tune the model. 
# I'll use the test set only once for final evaluation.
# Typically, dev-test accuracy might be slightly higher
# than the final test accuracy, which is expected
# because the test set is unseen data.

# Print the ten features the trained classifier found most informative
classifier.show_most_informative_features(n=10)

# The output lists the features most useful for distinguishing genders.
# For example, “suffix2 = ‘na’” strongly indicates female names,
# while “suffix2 = ‘rd’” or “suffix1 = ‘k’” often indicate male names.

# How does the performance on the test set compare to the performance on the dev-test set? Is this what you'd expect?:
# The performance on the test set is usually slightly lower than on the dev-test set,
# which is what I expected. The dev-test set is used during model tuning,
# so the feature choices may adapt to patterns specific to that data.
# The test set contains unseen examples and therefore provides
# a more reliable measure of how well the model generalizes.
# A small drop in accuracy—typically one or two percentage points—suggests that
# the model generalizes well without overfitting to the development data.



Baseline accuracy (dev-test): 0.772
Improved suffix accuracy (dev-test): 0.796

Final model performance:
Dev-test accuracy: 0.794
Test accuracy: 0.776
Most Informative Features
                 suffix2 = 'na'           female : male   =    157.7 : 1.0
                 suffix2 = 'la'           female : male   =     73.3 : 1.0
                 suffix1 = 'k'              male : female =     38.4 : 1.0
                 suffix2 = 'ia'           female : male   =     37.3 : 1.0
                 suffix2 = 'us'             male : female =     37.0 : 1.0
                 suffix2 = 'sa'           female : male   =     33.8 : 1.0
                 suffix1 = 'a'            female : male   =     33.5 : 1.0
                 suffix2 = 'ta'           female : male   =     30.4 : 1.0
                 suffix2 = 'rd'             male : female =     30.3 : 1.0
                 suffix2 = 'ld'             male : female =     24.4 : 1.0


[nltk_data] Downloading package names to
[nltk_data]     C:\Users\hotga\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
