In [2]:
# This is a sentiment classification project built in Python 3. Uses Naive Bayes

In [3]:
import nltk
from nltk.corpus import movie_reviews
import random

In [4]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [8]:
# Define a feature extractor for the created documents

# First, creating a list of the 2000 most frequently used words in the overall corpus

# Then, define a feature extractor checking if each of the 2000 words is present in a given document

In [9]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [10]:
# Training the Naive Bayes classifier

In [15]:
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [16]:
# Let's test out the classifier

print(nltk.classify.accuracy(classifier, test_set))

0.83


In [17]:
# Let's take a look at what Naive Bayes interprets as the most important features

classifier.show_most_informative_features(5)

Most Informative Features
        contains(turkey) = True              neg : pos    =      8.4 : 1.0
 contains(unimaginative) = True              neg : pos    =      8.3 : 1.0
        contains(temper) = True              pos : neg    =      7.7 : 1.0
    contains(schumacher) = True              neg : pos    =      7.3 : 1.0
        contains(suvari) = True              neg : pos    =      6.9 : 1.0


In [None]:
# Now we can see that according to Naive Bayes, if a document contains the word, "turkey," it is 8.4 times more likely to be negative than positive.