In [23]:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk import NaiveBayesClassifier
import nltk
import string
import random

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
print(len(movie_reviews.fileids("neg")))
print(len(movie_reviews.fileids("pos")))

1000
1000


In [4]:
len(movie_reviews.fileids())

2000

In [5]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
random.shuffle(documents)
documents[0:5]

[(['*', '*', '*', 'the', 'following', 'review', ...], 'neg'),
 (['the', 'caveman', "'", 's', 'valentine', 'starring', ...], 'pos'),
 (['seen', 'august', '8', ',', '1998', 'at', '6', 'p', ...], 'pos'),
 (['matthew', 'broderick', 'and', 'high', 'school', ...], 'pos'),
 (['a', 'big', 'surprise', 'to', 'me', '.', 'the', ...], 'pos')]

In [8]:
def get_simple_pos(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
stops = set(stopwords.words("english"))
punctuations = list(string.punctuation)
stops.update(punctuations)
print(stops)
print(punctuations)

{'other', '"', ']', 'these', 'him', '[', 'over', 'by', 'nor', 've', '<', 'but', ':', 'above', 'mustn', 'up', 'wouldn', ')', '_', 'before', "shouldn't", "wouldn't", 'very', 'shan', 'at', 'against', 'because', 'own', 'i', 'will', "couldn't", 'doing', 'below', '}', "haven't", 'yourself', 'she', '=', 'our', 'did', '|', 'aren', 'which', 'y', 'to', 'or', 'so', 'some', '^', 'during', 'with', 'not', 'they', 'haven', '*', '\\', 'themselves', 'm', 'yourselves', "mustn't", '/', 'this', 'same', '#', 'your', 'were', 'until', 'ma', 'through', 'between', 'down', 'what', "should've", "doesn't", 'was', '+', 'the', 'any', 'that', 'now', 'no', 'a', 'shouldn', 'from', "didn't", 'are', "you'll", 'only', 'we', 'those', 'and', 'under', 'here', 'don', "don't", "shan't", 'further', 'am', 'more', 'd', "needn't", '.', 'ain', 'than', 'on', ';', 'its', 'theirs', 'both', 'it', 'do', "isn't", ',', 'ours', 'there', 'having', 'doesn', 'himself', 'such', 'too', 's', 'isn', 'few', 'he', "it's", 'again', 't', 'for', 'if'

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [12]:
documents = [(clean_review(document),category) for document, category in documents]

In [13]:
print(documents[0])

(['follow', 'review', 'contains', 'spoiler', 'way', '---', 'rapist', 'matt', 'frewer', 'responds', 'supergirl', 'helen', 'slater', 'query', 'attack', 'example', 'mind', 'numbingly', 'bad', 'dialogue', 'supergirl', 'admit', 'love', 'superman', 'iii', 'know', 'hat', 'amongst', 'superman', 'faithful', 'dismiss', 'nothing', 'vehicle', 'richard', 'pryor', 'still', 'think', 'worthy', 'addition', 'man', 'steel', 'franchise', 'supergirl', 'spin', 'film', 'sort', 'series', 'producer', 'alexander', 'ilya', 'salkind', 'even', 'rival', 'superman', 'iii', 'term', 'quality', 'let', 'alone', 'superman', 'superman', 'ii', 'supergirl', 'lose', 'opening', 'scene', 'go', 'steadily', 'downhill', 'rest', 'two', 'hour', 'plus', 'run', 'time', 'film', 'begin', 'see', 'resident', 'argo', 'city', 'commune', 'like', 'place', 'consist', 'refugee', 'krypton', 'resides', 'inner', 'space', 'go', 'daily', 'life', 'ok', 'exile', 'krypton', 'blew', 'right', 'living', 'krypton', 'explode', 'get', 'inner', 'space', 'who

In [14]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [16]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [17]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [18]:
print(features)

['film', 'movie', 'one', 'make', 'like', 'character', 'get', 'see', 'go', 'time', 'well', 'scene', 'even', 'good', 'story', 'take', 'much', 'would', 'come', 'also', 'life', 'bad', 'give', 'two', 'look', 'know', '--', 'way', 'end', 'seem', 'first', 'work', 'year', 'thing', 'really', 'plot', 'say', 'play', 'little', 'show', 'man', 'people', 'star', 'could', 'try', 'never', 'director', 'love', 'great', 'performance', 'best', 'new', 'action', 'big', 'actor', 'find', 'want', 'many', 'watch', 'act', 'u', 'role', 'think', 'another', 'still', 'something', 'back', 'world', 'audience', 'turn', 'day', 'old', 'use', 'comedy', 'every', 'set', 'however', 'guy', 'enough', 'real', 'around', 'though', 'part', 'begin', 'feel', 'cast', 'point', 'write', 'funny', 'run', 'interest', 'last', 'may', 'script', 'young', 'woman', 'right', 'fact', 'name', 'lot', 'actually', 'nothing', 'long', 'place', 'effect', 'minute', 'screen', 'friend', 'ever', 'played', 'almost', 'although', 'moment', 'since', 'john', 'line

In [19]:
def get_feature_dict(words):
    current_feature = {}
    words_set = set(words)
    for w in features:
        current_feature[w] = w in words_set
    return current_feature

In [21]:
training_data = [(get_feature_dict(doc),category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc),category) for doc, category in testing_documents]

In [24]:
classifier = NaiveBayesClassifier.train(training_data)

In [29]:
nltk.classify.accuracy(classifier, testing_data)

0.818

In [30]:
classifier.show_most_informative_features(15)

Most Informative Features
                   inept = True              neg : pos    =     10.2 : 1.0
               stupidity = True              neg : pos    =     10.2 : 1.0
                   anger = True              pos : neg    =      9.8 : 1.0
             outstanding = True              pos : neg    =      9.7 : 1.0
                  seagal = True              neg : pos    =      9.3 : 1.0
               painfully = True              neg : pos    =      8.9 : 1.0
                   damon = True              pos : neg    =      8.3 : 1.0
               ludicrous = True              neg : pos    =      8.2 : 1.0
                 idiotic = True              neg : pos    =      7.9 : 1.0
             beautifully = True              pos : neg    =      7.6 : 1.0
                flawless = True              pos : neg    =      7.6 : 1.0
                 destine = True              pos : neg    =      7.3 : 1.0
                  martha = True              neg : pos    =      7.3 : 1.0