In [8]:
import nltk
from nltk.corpus import movie_reviews

In [30]:
# Frequency distribution over every token in the corpus, case-folded so that
# "The" and "the" are counted as the same word. One pass is enough: building
# a second FreqDist whose result is neither stored nor displayed only doubles
# the runtime of this cell.
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())

In [53]:
# Feature vocabulary: the 2000 most frequent tokens in the corpus.
# most_common() states the frequency ordering explicitly. Slicing
# list(all_words) relies on the iteration order of FreqDist, which is not
# guaranteed to be by frequency (in NLTK 3 FreqDist is a collections.Counter
# subclass), so it can silently pick an arbitrary 2000 words instead.
word_features = [word for word, _count in all_words.most_common(2000)]

In [42]:
# One (token list, category) pair per review file, grouped by category in the
# order the corpus reader reports them.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

In [90]:
import random

# Seed the generator by *calling* random.seed(). Assigning to the name
# (random.seed = ...) replaces the function with an int, leaves the generator
# unseeded, and makes the shuffle -- and therefore the train/test split --
# different on every run.
# NOTE: if the assignment form was already executed in this kernel, restart
# the kernel first; random.seed is no longer callable until the module is
# reloaded.
random.seed(151515895)
random.shuffle(documents)

In [100]:
def document_features(document, vocabulary=None):
    """Build a bag-of-words presence feature dict for a single document.

    Args:
        document: iterable of hashable tokens (words) from one document.
        vocabulary: optional iterable of words to test for. When omitted, the
            module-level ``word_features`` list is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True`` or ``False`` for every
        vocabulary word, in vocabulary order.
    """
    if vocabulary is None:
        # Fall back to the notebook-wide vocabulary defined in an earlier cell.
        vocabulary = word_features
    # A set makes each membership test constant-time instead of scanning the
    # whole token list once per vocabulary word.
    document_words = set(document)
    return {
        'contains({})'.format(word): (word in document_words)
        for word in vocabulary
    }

{'contains(,)': True,
 'contains(the)': True,
 'contains(.)': True,
 'contains(a)': True,
 'contains(and)': True,
 'contains(of)': True,
 'contains(to)': True,
 "contains(')": True,
 'contains(is)': True,
 'contains(in)': True,
 'contains(s)': True,
 'contains(")': True,
 'contains(it)': True,
 'contains(that)': True,
 'contains(-)': True,
 'contains())': True,
 'contains(()': True,
 'contains(as)': True,
 'contains(with)': True,
 'contains(for)': True,
 'contains(his)': True,
 'contains(this)': True,
 'contains(film)': False,
 'contains(i)': False,
 'contains(he)': True,
 'contains(but)': True,
 'contains(on)': True,
 'contains(are)': True,
 'contains(t)': False,
 'contains(by)': True,
 'contains(be)': True,
 'contains(one)': True,
 'contains(movie)': True,
 'contains(an)': True,
 'contains(who)': True,
 'contains(not)': True,
 'contains(you)': True,
 'contains(from)': True,
 'contains(at)': False,
 'contains(was)': False,
 'contains(have)': True,
 'contains(they)': True,
 'contains(h

In [92]:
# Convert every (tokens, label) pair into a (feature dict, label) pair.
features_set = [
    (document_features(tokens), label)
    for tokens, label in documents
]
# Hold out the first 100 entries for testing; everything after them trains.
test_set = features_set[:100]
train_set = features_set[100:]
len(features_set)

2000

In [93]:
# Train a Naive Bayes classifier on the (feature dict, label) pairs in train_set.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [94]:
# Accuracy of the trained classifier on the held-out test_set (displayed as
# the cell's output).
nltk.classify.accuracy(classifier, test_set)

0.82

In [95]:
# Print the 5 features the classifier ranks as most informative, with the
# label ratio for each.
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
        contains(seagal) = True              neg : pos    =      8.4 : 1.0
         contains(mulan) = True              pos : neg    =      7.5 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.3 : 1.0
         contains(damon) = True              pos : neg    =      6.0 : 1.0


In [96]:
# Gold labels and predicted labels, aligned by position in test_set.
reference = [label for _, label in test_set]
test = [classifier.classify(features) for features, _ in test_set]
# Rows are the reference labels, columns the classifier's predictions.
print(nltk.ConfusionMatrix(reference=reference, test=test))

    |  n  p |
    |  e  o |
    |  g  s |
----+-------+
neg |<53> 9 |
pos |  9<29>|
----+-------+
(row = reference; col = test)



In [97]:
# Recall for the "pos" class: TP / (TP + FN), i.e. the share of truly
# positive reviews that the classifier also labels "pos".
# Each document is classified once and the (gold, predicted) pairs are reused
# for both counts.
labelled_predictions = [
    (label, classifier.classify(features)) for features, label in test_set
]
true_positive = sum(
    1 for label, predicted in labelled_predictions
    if label == "pos" and predicted == "pos"
)
false_negative = sum(
    1 for label, predicted in labelled_predictions
    if label == "pos" and predicted != "pos"
)
actual_positive = true_positive + false_negative
# Recall is undefined when the test set holds no "pos" documents; report NaN
# instead of raising ZeroDivisionError.
recall = true_positive / actual_positive if actual_positive else float("nan")
print("Recall = {}".format(recall))

Recall = 0.7631578947368421


In [99]:
# Precision for the "pos" class: TP / (TP + FP), i.e. the share of reviews
# labelled "pos" by the classifier that really are positive.
# Each document is classified once and the (gold, predicted) pairs are reused
# for both counts.
labelled_predictions = [
    (label, classifier.classify(features)) for features, label in test_set
]
true_positive = sum(
    1 for label, predicted in labelled_predictions
    if label == "pos" and predicted == "pos"
)
false_positive = sum(
    1 for label, predicted in labelled_predictions
    if label == "neg" and predicted == "pos"
)
predicted_positive = true_positive + false_positive
# Precision is undefined when nothing was predicted "pos"; report NaN instead
# of raising ZeroDivisionError.
precision = true_positive / predicted_positive if predicted_positive else float("nan")
print("Precision = {}".format(precision))

Precision = 0.7631578947368421
