In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
import string
from sklearn.model_selection import train_test_split
import random

In [2]:
def get_simple_pos_tag(nltk_pos_tag):
    if nltk_pos_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_pos_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [3]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))    

In [4]:
stop_words = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop_words.update(punctuation)

lemmatizer = WordNetLemmatizer() 

def clean_review(word_list):
    final_words = []
    for word in word_list:
        if word.lower() not in stop_words:
            pos = pos_tag([word])
            clean_word = lemmatizer.lemmatize(word, pos = get_simple_pos_tag(pos[0][1]))
            final_words.append(clean_word.lower())
    return final_words

In [5]:
documents = [(clean_review(document_words), category) for document_words, category in documents] 

In [6]:
random.shuffle(documents)

In [7]:
documents_train = documents[0: 1500]
documents_test = documents[1500:]

In [8]:
all_words = []
for doc in documents_train:
    all_words += doc[0]

In [9]:
frequency = FreqDist(all_words)
features = [i[0] for i in frequency.most_common(3000)]

In [10]:
def get_feature_dictionary(document_words):
    document_features = {}
    document_words = set(document_words)
    for word in features:
        document_features[word] = (word in document_words)
    return document_features

In [11]:
training_data = [(get_feature_dictionary(document_words), category) for document_words, category in documents_train]
testing_data = [(get_feature_dictionary(document_words), category) for document_words, category in documents_test]

In [12]:
classifier = NaiveBayesClassifier.train(training_data)

In [13]:
nltk.classify.accuracy(classifier, testing_data)

0.798

In [14]:
classifier.show_most_informative_features(20)

Most Informative Features
             beautifully = True              pos : neg    =     12.7 : 1.0
                  seagal = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.3 : 1.0
             outstanding = True              pos : neg    =     10.8 : 1.0
                     era = True              pos : neg    =      9.5 : 1.0
                   inept = True              neg : pos    =      9.0 : 1.0
                   jolie = True              neg : pos    =      9.0 : 1.0
                   anger = True              pos : neg    =      8.8 : 1.0
                  turkey = True              neg : pos    =      6.7 : 1.0
             wonderfully = True              pos : neg    =      6.6 : 1.0
            breathtaking = True              pos : neg    =      6.4 : 1.0
                   awful = True              neg : pos    =      6.2 : 1.0
                 unusual = True              pos : neg    =      5.9 : 1.0