# Classification Tasks with NLTK

### Setup

In [1]:
import random
import string

import nltk
from nltk.corpus import names

In [2]:
# Fetch the NLTK "names" corpus that the cells below read via nltk.corpus.names.
nltk.download("names")

[nltk_data] Downloading package names to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

### Feature Extraction

In [3]:
def features(word):
    """Return the baseline feature dict for a name: its final character.

    Args:
        word: The name to featurise.

    Returns:
        A dict with the single key ``"last_letter"``. The value is the last
        character of ``word`` (case preserved), or ``""`` when ``word`` is
        empty.
    """
    # Slicing (instead of indexing with [-1]) yields "" for an empty string
    # rather than raising IndexError; non-empty input is unaffected.
    return {"last_letter": word[-1:]}

In [4]:
# Seed for the shuffle below. A fixed value keeps the train/test split, and
# therefore every accuracy reported later, identical across kernel restarts.
SEED = 42

# Label every name in the corpus with the gender of the file it came from.
tagset = [(name, "male") for name in names.words("male.txt")]
tagset += [(name, "female") for name in names.words("female.txt")]

# Shuffle so the positional split further down does not separate the classes.
# A dedicated Random instance is used so the global `random` state is untouched.
random.Random(SEED).shuffle(tagset)

In [5]:
tagset[:15]

[('Dana', 'female'),
 ('Terrence', 'male'),
 ('Briggs', 'male'),
 ('Godfry', 'male'),
 ('Gwyneth', 'female'),
 ('Viole', 'female'),
 ('Blakeley', 'female'),
 ('Jud', 'male'),
 ('Aila', 'female'),
 ('Latrena', 'female'),
 ('Katrina', 'female'),
 ('Annice', 'female'),
 ('Whit', 'male'),
 ('Cindelyn', 'female'),
 ('Ardath', 'female')]

In [6]:
len(tagset)

7944

In [7]:
# Turn each (name, gender) pair into a (feature dict, gender) pair.
fset = [(features(person), gender) for person, gender in tagset]

# The first 85% of the shuffled examples are used for training; the rest is
# held out for evaluation.
n_values = int(len(tagset) * 0.85)
train_set = fset[:n_values]
test_set = fset[n_values:]

### Train the Classifier

In [8]:
# Fit a Naive Bayes model on the last-letter features in train_set.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
# Spot-check the baseline classifier on a single hand-picked name.
name = "Antonio"
name_feats = features(name)
classifier.classify(name_feats)

'male'

In [10]:
# Score the last-letter classifier on the held-out test_set.
nltk.classify.accuracy(classifier, test_set)

0.7550335570469798

### Using More Features

In [24]:
def more_features(name):
    """Build a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to featurise.

    Returns:
        A dict containing:
          * ``"first_letters"``: the first two characters, lowercased.
          * ``"last_letters"``: the last two characters, lowercased.
          * ``"count(x)"``: occurrences of each ASCII letter ``x`` in the
            lowercased name.
          * ``"includes(x)"``: whether each ASCII letter ``x`` occurs in the
            lowercased name.
    """
    # Lowercase once up front; the loop below would otherwise redo this on
    # every iteration.
    lowered = name.lower()

    # The prefix/suffix are sliced *before* lowercasing, because lowercasing
    # can change the length of some non-ASCII strings.
    feats = {
        "first_letters": name[:2].lower(),
        "last_letters": name[-2:].lower(),
    }
    for letter in string.ascii_lowercase:
        feats[f"count({letter})"] = lowered.count(letter)
        feats[f"includes({letter})"] = letter in lowered

    return feats

In [25]:
# Re-featurise the same shuffled names with the richer feature set and reuse
# the same split point. Note: this rebinds fset, train_set and test_set, so
# the last-letter split from earlier is no longer available after this cell.
fset = [(more_features(person), gender) for person, gender in tagset]
train_set = fset[:n_values]
test_set = fset[n_values:]

In [26]:
# Fit a second Naive Bayes model on the richer features, then score it on the
# held-out test_set for comparison with the last-letter baseline.
classifier_more_feats = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier_more_feats, test_set)

0.7835570469798657