In [1]:
import nltk

In [2]:
def gender_features(word):
     return {'last_letter': word[-1]}

# Text Classification
Features are extracted from input training data set, then feature with correspondong label are fed to ML algo to generated classifier model. From testing data, again extracts feature but this time model will predict label.


In [3]:
from nltk.corpus import names

In [6]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + 
                 [(name, 'female') for name in names.words('female.txt')])

In [5]:
import random

In [7]:
random.shuffle(labeled_names)

In [8]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [12]:
# featuresets
'''
[({'last_letter': 'n'}, 'male'),
 ({'last_letter': 't'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
'''

"\n[({'last_letter': 'n'}, 'male'),\n ({'last_letter': 't'}, 'female'),\n ({'last_letter': 'a'}, 'female'),\n"

In [13]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [14]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [15]:
classifier.classify(gender_features('Neo'))

'male'

In [16]:
classifier.classify(gender_features('Trinity'))

'female'

In [17]:
print(nltk.classify.accuracy(classifier, test_set))

0.752


In [18]:
 classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     35.4 : 1.0
             last_letter = 'k'              male : female =     30.3 : 1.0
             last_letter = 'f'              male : female =     16.6 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'v'              male : female =      9.8 : 1.0


This listing shows that the names in the training set that end in "a" are female 35 times more often than they are male, but names that end in "k" are male 30 times more often than they are female. These ratios are known as likelihood ratios, and can be useful for comparing different feature-outcome relationships.

In [20]:
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

# Building Right Feature

In [22]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

gender_features2('John') 

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [23]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

In [24]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [25]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [26]:
print(nltk.classify.accuracy(classifier, test_set))

0.776


In [28]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [29]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

In [30]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [31]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.783


In [32]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if(guess != tag):
        errors.append((name, tag, guess))


In [36]:
for(name, tag, guess) in sorted(errors):
    print("name={}, correct={}, guess={}".format(name, tag, guess))

name=Abagael, correct=female, guess=male
name=Abagail, correct=female, guess=male
name=Abbie, correct=male, guess=female
name=Abdullah, correct=male, guess=female
name=Alison, correct=female, guess=male
name=Alix, correct=male, guess=female
name=Anais, correct=female, guess=male
name=Anatole, correct=male, guess=female
name=Andrea, correct=male, guess=female
name=Annabell, correct=female, guess=male
name=Anne-Mar, correct=female, guess=male
name=Antony, correct=male, guess=female
name=Ardelis, correct=female, guess=male
name=Artie, correct=male, guess=female
name=Arvy, correct=male, guess=female
name=Aube, correct=male, guess=female
name=Avis, correct=female, guess=male
name=Barnaby, correct=male, guess=female
name=Barney, correct=male, guess=female
name=Barnie, correct=male, guess=female
name=Barth, correct=male, guess=female
name=Bary, correct=male, guess=female
name=Beau, correct=female, guess=male
name=Bennie, correct=male, guess=female
name=Bentley, correct=male, guess=female
name

# Document Classification

In [37]:
from nltk.corpus import movie_reviews

In [38]:
documents = [(list(movie_reviews.words(fileid)), category) 
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [49]:
random.shuffle(documents)
#documents[:1]

In [42]:
movie_reviews.categories()

['neg', 'pos']

In [48]:
#movie_reviews.fileids('neg')

In [50]:
# Get top 2000 frequent words used in corpus to restrict the dictionary
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [51]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [52]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(plot)': True, 'contains(:)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(,)': True, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(.)': True, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, "contains(')": True, 'contains(s)': True, 'contains(deal)': False, 'contains

In [53]:
featuresets = [(document_features(d), c) for (d,c) in documents]

In [54]:
train_set, test_set = featuresets[100:], featuresets[:100]

In [55]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [56]:
featuresets[:1]

[({'contains(plot)': True,
   'contains(:)': True,
   'contains(two)': False,
   'contains(teen)': False,
   'contains(couples)': False,
   'contains(go)': True,
   'contains(to)': True,
   'contains(a)': True,
   'contains(church)': False,
   'contains(party)': False,
   'contains(,)': True,
   'contains(drink)': False,
   'contains(and)': True,
   'contains(then)': True,
   'contains(drive)': False,
   'contains(.)': True,
   'contains(they)': False,
   'contains(get)': False,
   'contains(into)': True,
   'contains(an)': True,
   'contains(accident)': False,
   'contains(one)': True,
   'contains(of)': True,
   'contains(the)': True,
   'contains(guys)': False,
   'contains(dies)': False,
   'contains(but)': True,
   'contains(his)': True,
   'contains(girlfriend)': False,
   'contains(continues)': False,
   'contains(see)': False,
   'contains(him)': False,
   'contains(in)': True,
   'contains(her)': True,
   'contains(life)': False,
   'contains(has)': True,
   'contains(nightmar

In [57]:
print(nltk.classify.accuracy(classifier, test_set))

0.85
