In [2]:
import nltk
import random
from nltk.corpus import names

# Fetch the NLTK "names" corpus, which supplies the male.txt / female.txt word lists read via names.words(...)
nltk.download('names')


[nltk_data] Downloading package names to /home/codespace/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [3]:
# Build one list of (name, label) pairs: every entry of male.txt tagged
# 'male', followed by every entry of female.txt tagged 'female'.
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
               [(name, 'female') for name in names.words('female.txt')])

# Shuffle with a fixed seed so the train/test split taken from this list is
# the same on every Restart & Run All. A dedicated Random instance is used so
# the global `random` state is left untouched.
NAMES_SHUFFLE_SEED = 42
random.Random(NAMES_SHUFFLE_SEED).shuffle(namesgender)


In [4]:
# Hold out the first 500 shuffled names for testing; train on everything after them.
test_names, train_names = namesgender[:500], namesgender[500:]


In [5]:
def gender_features(word):
    """Return the feature dict for a name: its final character.

    Args:
        word: The name to featurize (a string).

    Returns:
        dict: ``{'last_letter': <last character of word>}``. The value is the
        empty string when ``word`` is empty.
    """
    # Slicing (instead of indexing with -1) yields '' for an empty string
    # rather than raising IndexError.
    return {'last_letter': word[-1:]}


In [6]:
def gender_features3(word):
    """Return the last and second-to-last characters of a name as features.

    Note that ``suffix2`` is the single second-to-last character on its own,
    not the two-character ending of the name.

    Args:
        word: The name to featurize (a string).

    Returns:
        dict: ``{'suffix1': <last char>, 'suffix2': <second-to-last char>}``.
        A position the name is too short to have is reported as ``''``.
    """
    # One-character slices give the same character as word[-1] / word[-2] when
    # it exists, and '' instead of an IndexError when it does not.
    return {'suffix1': word[-1:], 'suffix2': word[-2:-1]}


In [7]:
# Turn every (name, gender) pair into a (feature-dict, gender) pair, using the
# last-letter and second-to-last-letter features from gender_features3.
train_set = [(gender_features3(name), gender) for name, gender in train_names]
test_set = [(gender_features3(name), gender) for name, gender in test_names]


In [8]:
# Train a Naive Bayes classifier on the (features, label) pairs in train_set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Score the classifier against the held-out (features, label) pairs in test_set
print("Accuracy:", nltk.classify.accuracy(classifier, test_set))


Accuracy: 0.784


In [None]:
def get_errors(test, model=None, featurize=None):
    """Collect the names that a classifier labels incorrectly.

    Args:
        test: Iterable of ``(name, tag)`` pairs, where ``tag`` is the correct
            label for ``name``.
        model: Object exposing ``classify(features)``. Defaults to the
            notebook-level ``classifier``.
        featurize: Callable mapping a name to its feature dict. Defaults to
            ``gender_features3``.

    Returns:
        list: ``(tag, guess, name)`` tuples, one per misclassified name, in
        the order the names appear in ``test``.
    """
    # Defaults are resolved at call time, so a one-argument call behaves
    # exactly as before and always sees the current notebook globals.
    if model is None:
        model = classifier
    if featurize is None:
        featurize = gender_features3

    errors = []
    for name, tag in test:
        guess = model.classify(featurize(name))
        if guess != tag:
            errors.append((tag, guess, name))
    return errors

# Collect the misclassified (tag, guess, name) triples for the held-out names and report how many there are
errors = get_errors(test_names)
print("Number of errors:", len(errors))


Number of errors: 108


In [10]:
def print_errors(errors):
    """Print one aligned line per misclassification, in sorted order.

    Args:
        errors: Iterable of ``(tag, guess, name)`` string triples.

    Each line shows the correct label and the guess left-aligned to 8
    characters, and the name left-aligned to 30 characters.
    """
    ordered = sorted(errors)
    for record in ordered:
        tag, guess, name = record
        print(f'Correct={tag:<8s} Guess={guess:<8s} Name={name:<30s}')

# List every misclassified name from `errors`, one formatted line each
print_errors(errors)


Correct=female   Guess=male     Name=Adah                          
Correct=female   Guess=male     Name=Alanah                        
Correct=female   Guess=male     Name=Amabel                        
Correct=female   Guess=male     Name=Ambur                         
Correct=female   Guess=male     Name=Arabel                        
Correct=female   Guess=male     Name=Ashleigh                      
Correct=female   Guess=male     Name=Brier                         
Correct=female   Guess=male     Name=Cameo                         
Correct=female   Guess=male     Name=Casey                         
Correct=female   Guess=male     Name=Cat                           
Correct=female   Guess=male     Name=Cherey                        
Correct=female   Guess=male     Name=Chriss                        
Correct=female   Guess=male     Name=Clarey                        
Correct=female   Guess=male     Name=Coriss                        
Correct=female   Guess=male     Name=Deloris    

PART 1 Bonus

In [11]:
# Feature extractor using the last three characters of a name, one feature per position
def gender_features_bonus(word):
    """Return the last three characters of a name as separate features.

    Each feature is a single character, not a multi-character suffix.

    Args:
        word: The name to featurize (a string).

    Returns:
        dict: ``{'suffix1': <last char>, 'suffix2': <second-to-last char>,
        'suffix3': <third-to-last char>}``. Any position the name is too
        short to have is reported as ``''``.
    """
    # One-character slices return '' for a missing position. This matches the
    # original explicit length guard on suffix3 and extends the same
    # treatment to suffix1 and suffix2, which used to raise IndexError.
    return {
        'suffix1': word[-1:],
        'suffix2': word[-2:-1],
        'suffix3': word[-3:-2],
    }

# Re-featurize the same train/test names with the three-position extractor
train_set_bonus = [(gender_features_bonus(name), gender) for name, gender in train_names]
test_set_bonus = [(gender_features_bonus(name), gender) for name, gender in test_names]

# Fit a separate Naive Bayes model on the bonus features
classifier_bonus = nltk.NaiveBayesClassifier.train(train_set_bonus)

# Score it on the held-out names and report the result
accuracy_bonus = nltk.classify.accuracy(classifier_bonus, test_set_bonus)
print("Bonus Accuracy with three-letter suffix:", accuracy_bonus)

# Error collection for the bonus classifier
def get_errors_bonus(test):
    """Collect the names that ``classifier_bonus`` labels incorrectly.

    Args:
        test: Iterable of ``(name, tag)`` pairs, where ``tag`` is the correct
            label for ``name``.

    Returns:
        list: ``(tag, guess, name)`` tuples for every name whose predicted
        label differs from ``tag``, in the order the names appear in ``test``.
    """
    # Lazily pair each name with its prediction, then keep only the mismatches.
    predictions = (
        (tag, classifier_bonus.classify(gender_features_bonus(name)), name)
        for name, tag in test
    )
    return [record for record in predictions if record[1] != record[0]]

# Collect the bonus classifier's misclassified (tag, guess, name) triples for the held-out names and report the count
errors_bonus = get_errors_bonus(test_names)
print("Number of errors with three-letter suffix:", len(errors_bonus))

# Formatter for the bonus classifier's misclassifications
def print_errors_bonus(errors):
    """Print one aligned line per misclassification, in sorted order.

    Args:
        errors: Iterable of ``(tag, guess, name)`` string triples.

    The correct label and the guess are left-aligned to 8 characters and the
    name to 30 characters.
    """
    rows = list(errors)
    rows.sort()
    for row in rows:
        tag, guess, name = row
        print(f'Correct={tag:<8s} Guess={guess:<8s} Name={name:<30s}')

# List every name the bonus classifier got wrong, one formatted line each
print_errors_bonus(errors_bonus)

Bonus Accuracy with three-letter suffix: 0.788
Number of errors with three-letter suffix: 106
Correct=female   Guess=male     Name=Adah                          
Correct=female   Guess=male     Name=Alanah                        
Correct=female   Guess=male     Name=Amabel                        
Correct=female   Guess=male     Name=Ambur                         
Correct=female   Guess=male     Name=Arabel                        
Correct=female   Guess=male     Name=Ashleigh                      
Correct=female   Guess=male     Name=Brier                         
Correct=female   Guess=male     Name=Cameo                         
Correct=female   Guess=male     Name=Cat                           
Correct=female   Guess=male     Name=Cherey                        
Correct=female   Guess=male     Name=Clarey                        
Correct=female   Guess=male     Name=Cyb                           
Correct=female   Guess=male     Name=Deloris                       
Correct=female   Guess

PART 2


In [12]:
import nltk
from nltk.corpus import movie_reviews
import random

# Fetch the NLTK "movie_reviews" corpus read through movie_reviews.* in the cells that follow
nltk.download('movie_reviews')


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [13]:
# One (token list, category) pair per review file, iterating category by category
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


In [14]:
# NOTE(review): this cell repeats the previous cell (In [13]) verbatim. It
# rebuilds the same `documents` list, so it is redundant work on a top-to-bottom
# run. Consider deleting one of the two cells.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


In [15]:
# Shuffle with a fixed seed so the later positional train/test split of
# `documents` is the same on every Restart & Run All. A dedicated Random
# instance leaves the global `random` state untouched.
DOCS_SHUFFLE_SEED = 42
random.Random(DOCS_SHUFFLE_SEED).shuffle(documents)

In [16]:
# Count lower-cased tokens across the whole corpus, then keep the 2000 most frequent words.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())
word_features_2000 = [word for word, _count in all_words.most_common(2000)]


In [17]:
def document_features(document, word_features):
    """Build presence/absence features for one document.

    Args:
        document: Iterable of the document's tokens.
        word_features: Iterable of vocabulary words to test for.

    Returns:
        dict: Maps ``'V_<word>'`` to ``True`` if ``word`` occurs in
        ``document`` and ``False`` otherwise, with one entry per word in
        ``word_features`` (in that order).
    """
    # A set makes each membership test O(1) instead of scanning the token list.
    present = set(document)
    make_key = 'V_{}'.format
    return {make_key(term): (term in present) for term in word_features}


In [18]:
# Vocabulary: the 1000 most frequent corpus words
word_features_1000 = [term for term, _count in all_words.most_common(1000)]
featuresets_1000 = [
    (document_features(tokens, word_features_1000), label)
    for tokens, label in documents
]

# 80/20 split: the first 80% of the feature sets train the model, the rest test it
split_index = int(0.8 * len(featuresets_1000))
train_set = featuresets_1000[:split_index]
test_set = featuresets_1000[split_index:]

# Fit Naive Bayes on the training slice and report held-out accuracy
classifier_1000 = nltk.NaiveBayesClassifier.train(train_set)
print("Accuracy with 1000 words:", nltk.classify.accuracy(classifier_1000, test_set))


Accuracy with 1000 words: 0.75


In [19]:
# Vocabulary: the 3000 most frequent corpus words
word_features_3000 = [term for term, _count in all_words.most_common(3000)]
featuresets_3000 = [
    (document_features(tokens, word_features_3000), label)
    for tokens, label in documents
]

# Same positional split as the 1000-word run: `split_index` is reused from
# that cell rather than recomputed here.
train_set = featuresets_3000[:split_index]
test_set = featuresets_3000[split_index:]

# Fit Naive Bayes on the training slice and report held-out accuracy
classifier_3000 = nltk.NaiveBayesClassifier.train(train_set)
print("Accuracy with 3000 words:", nltk.classify.accuracy(classifier_3000, test_set))


Accuracy with 3000 words: 0.7975
