# <center>Book: Steven Bird, Ewan Klein, Edward Loper, 2009. **Natural Language Processing (NLP) with Python**, O'Reilly.</center> 

This notebook explores the solutions proposed by the GitHub user Sturzgefahr:
    https://github.com/Sturzgefahr

### Chapter 6 - Learning how to classify text

#### Identifying gender

In [1]:
def gender_features(word):
    """Extract the features used to guess the gender of a name.

    This first version looks only at the final character of ``word`` and
    returns it under the key ``'last_letter'``.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

In [2]:
import nltk, random
from nltk.corpus import names

# Pair every name with its gender label: all entries of 'male.txt' first,
# then all entries of 'female.txt'.
labeled_names = [
    (name, label)
    for label, fileid in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(fileid)
]

# Shuffle in place so that slicing off a test set later yields a mix of both
# labels. No seed is set in the visible cells, so the ordering (and every
# accuracy computed from it) differs from run to run.
random.shuffle(labeled_names)

In [3]:
# Convert each (name, label) pair into a (feature dict, label) pair.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
# The first 500 shuffled entries are held out for testing; the rest train the model.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
# Predict a label for a single name by running it through the same feature
# extractor that produced the training data.
classifier.classify(gender_features('Neo'))

'male'

In [5]:
# Same single-name prediction as before, for a different name.
classifier.classify(gender_features('Trinity'))

'female'

In [6]:
# Score the trained classifier against the held-out test set.
print(nltk.classify.accuracy(classifier, test_set))

0.738


In [7]:
# List the 5 features the classifier reports as most informative.
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.2 : 1.0
             last_letter = 'k'              male : female =     31.0 : 1.0
             last_letter = 'f'              male : female =     16.1 : 1.0
             last_letter = 'p'              male : female =     11.3 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0


In [8]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Returns a dict with:
      * ``'last_letter'`` - the final character of ``word``
      * ``'length'``      - the number of characters in ``word``
    """
    features = {}
    features['last_letter'] = word[-1]
    features['length'] = len(word)
    return features

# Rebuild the feature sets with the redefined extractor, then retrain and
# rescore on the same 500 / rest split of labeled_names.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.74


In [9]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Returns a dict holding the last character (``'last_letter'``), the
    character count (``'length'``) and the first character
    (``'first_letter'``) of ``word``.
    """
    return dict(
        last_letter=word[-1],
        length=len(word),
        first_letter=word[0],
    )

# Re-extract features with the current gender_features, keep the first 500
# entries for testing, train on the remainder and report accuracy.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.732


In [10]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Returns a dict with the last character, the length, the first character
    and ``'number_vowels'``: how many characters of ``word`` appear in
    ``'AEIOUaeiouy'`` (note that lowercase ``y`` is counted, uppercase ``Y``
    is not).
    """
    features = {}
    features['last_letter'] = word[-1]
    features['length'] = len(word)
    features['first_letter'] = word[0]
    # Each membership test is a bool; summing them counts the matches.
    features['number_vowels'] = sum(ch in 'AEIOUaeiouy' for ch in word)
    return features

# Same train/evaluate recipe as the previous cells, now using the extractor
# that also counts vowels.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.756


In [11]:
# Ask for up to 30 of the most informative features of the latest classifier.
classifier.show_most_informative_features(30)

Most Informative Features
             last_letter = 'a'            female : male   =     33.2 : 1.0
             last_letter = 'k'              male : female =     31.0 : 1.0
             last_letter = 'f'              male : female =     16.1 : 1.0
             last_letter = 'p'              male : female =     11.3 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0
             last_letter = 'd'              male : female =      9.8 : 1.0
             last_letter = 'm'              male : female =      8.4 : 1.0
             last_letter = 'o'              male : female =      8.2 : 1.0
             last_letter = 'r'              male : female =      7.0 : 1.0
             last_letter = 'g'              male : female =      5.1 : 1.0
            first_letter = 'W'              male : female =      4.8 : 1.0
             last_letter = 'u'              male : female =      4.7 : 1.0
             last_letter = 'w'              male : female =      4.5 : 1.0

In [12]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Returns a dict with the last character, the length, the first character
    and the final two characters (``'last_2letters'``) of ``word``.
    """
    last, size, first = word[-1], len(word), word[0]
    suffix = word[-2:]
    return {
        'last_letter': last,
        'length': size,
        'first_letter': first,
        'last_2letters': suffix,
    }

# Retrain and rescore with the extractor that adds the two-letter suffix.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.766


In [13]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Returns a dict with the last character, the length, the first character,
    the final two characters (``'last_2letters'``) and the leading two
    characters (``'first_2letters'``) of ``word``.
    """
    pairs = [
        ('last_letter', word[-1]),
        ('length', len(word)),
        ('first_letter', word[0]),
        ('last_2letters', word[-2:]),
        ('first_2letters', word[:2]),
    ]
    return dict(pairs)
            

# Retrain and rescore with the extractor that adds the two-letter prefix.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.774


In [14]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Args:
        word: the name to featurise (a non-empty string).

    Returns:
        A dict with these keys:
          * ``'last_letter'``    - final character
          * ``'length'``         - number of characters
          * ``'first_letter'``   - first character
          * ``'last_2letters'``  - final two characters
          * ``'first_2letters'`` - leading two characters
          * ``'first_vowel'``    - index of the first character found in
            ``'AEIOUaeiouy'``, or ``-1`` when the name contains none.

    Raises:
        IndexError: if ``word`` is empty.
    """
    vowels = 'AEIOUaeiouy'
    # next() stops at the first match and falls back to -1, so a name without
    # any vowel (e.g. 'Ng') yields a feature value instead of an IndexError.
    first_vowel = next((i for i, ch in enumerate(word) if ch in vowels), -1)
    return {'last_letter'  : word[-1],
            'length'       : len(word),
            'first_letter' : word[0],
            'last_2letters': word[-2:],
            'first_2letters': word[:2],
            'first_vowel'  : first_vowel}
            

# Retrain and rescore with the extractor that adds the first-vowel position.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.776


In [15]:
def gender_features(word):
    """Extract gender-classification features from a name.

    Args:
        word: the name to featurise (a non-empty string).

    Returns:
        A dict with these keys:
          * ``'last_letter'``    - final character
          * ``'length'``         - number of characters
          * ``'first_letter'``   - first character
          * ``'second_letter'``  - second character, or ``''`` for a
            one-letter name
          * ``'last_2letters'``  - final two characters
          * ``'first_2letters'`` - leading two characters
          * ``'first_vowel'``    - index of the first character found in
            ``'AEIOUaeiouy'``, or ``-1`` when the name contains none.

    Raises:
        IndexError: if ``word`` is empty.
    """
    vowels = 'AEIOUaeiouy'
    # next() stops at the first match and falls back to -1, so a name without
    # any vowel (e.g. 'Ng') yields a feature value instead of an IndexError.
    first_vowel = next((i for i, ch in enumerate(word) if ch in vowels), -1)
    return {'last_letter'  : word[-1],
            'length'       : len(word),
            'first_letter' : word[0],
            # Slicing returns the same character as word[1] when it exists,
            # but gives '' instead of raising for one-letter names.
            'second_letter': word[1:2],
            'last_2letters': word[-2:],
            'first_2letters': word[:2],
            'first_vowel'  : first_vowel}
            

# Retrain and rescore with the extractor that adds the second letter.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.768


In [16]:
from nltk.classify import apply_features
# Build the same 500 / rest split as before, but hand the raw (name, label)
# pairs to apply_features together with the extractor instead of building the
# feature list by hand.
# NOTE(review): apply_features is expected to compute feature sets lazily
# rather than materialise them all in memory - confirm against the NLTK docs.
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [17]:
# Train and score on the sets produced by apply_features; the split and the
# feature extractor are the same as in the preceding list-based cell.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.768
