In [11]:
from nltk.corpus import movie_reviews
import nltk

In [12]:
movie_reviews.categories()

['neg', 'pos']

In [13]:
len(movie_reviews.fileids())

2000

In [14]:
len(movie_reviews.fileids('neg'))

1000

In [15]:
len(movie_reviews.fileids('pos'))

1000

In [16]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [17]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))        
documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [18]:
from nltk.corpus import stopwords #to get the stopwords
import string
from nltk import pos_tag #to tag the part of the sentence
import random #to randomly shuffle the dataset
from nltk.stem import WordNetLemmatizer #lemmatizer
from nltk.corpus import wordnet #

In [19]:
random.shuffle(documents)
documents[:5]

[(['"', 'spawn', '"', 'features', 'good', 'guys', ',', ...], 'neg'),
 (['are', 'you', 'tired', 'of', 'all', 'the', 'hot', ...], 'pos'),
 (['you', 'know', 'something', ',', 'christmas', 'is', ...], 'neg'),
 (['larry', 'flynt', 'is', 'a', 'self', 'proclaimed', ...], 'pos'),
 (['this', 'is', 'a', 'stagy', 'film', 'adapted', 'from', ...], 'pos')]

In [20]:
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english')) 
stops.update(list(string.punctuation))
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [21]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [22]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w]) ## pos_tag expects an array
            clean_words = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1])) 
            #by passing pos_tag[0][1] we enter tuple and access the part of the sentence it has denoted the word
            output_words.append(clean_words.lower())
    return output_words

In [25]:
documents = [(clean_review(document), category) for document,category in documents]

In [26]:
documents[0]

(['spawn',
  'feature',
  'good',
  'guy',
  'bad',
  'guy',
  'lot',
  'fight',
  'bloody',
  'violence',
  'leather',
  'clad',
  'machine',
  'gun',
  'chick',
  'gooey',
  'self',
  'heal',
  'bullet',
  'hole',
  'scatological',
  'humor',
  'man',
  'eat',
  'monster',
  'appear',
  'tailor',
  'make',
  'swarm',
  '12',
  '13',
  'year',
  'old',
  'boy',
  'appear',
  'make',
  'classic',
  'example',
  'tell',
  'show',
  'spawn',
  'open',
  'truckload',
  'mumbo',
  'jumbo',
  'force',
  'darkness',
  'force',
  'light',
  'men',
  'one',
  'create',
  'evil',
  'earth',
  'much',
  'message',
  'movie',
  'lurch',
  'forward',
  'plight',
  'al',
  'simmons',
  'michael',
  'jai',
  'white',
  'government',
  'assassin',
  'operative',
  'murder',
  'diabolical',
  'bos',
  'jason',
  'wynn',
  'martin',
  'sheen',
  'play',
  'scene',
  'like',
  'oscar',
  'clip',
  'top',
  'secret',
  'mission',
  'north',
  'korean',
  'biological',
  'weapon',
  'plant',
  'simmons',


In [27]:
training_documents = documents[:1500]
testing_documents = documents[1500:]

In [28]:
all_words = []
for doc in training_documents:
    all_words +=doc[0]

In [29]:
freq = nltk.FreqDist(all_words)

In [30]:
common = freq.most_common(3000)
common

[('film', 8319),
 ('movie', 5169),
 ('one', 4579),
 ('make', 3219),
 ('like', 2922),
 ('character', 2889),
 ('get', 2740),
 ('see', 2337),
 ('go', 2265),
 ('time', 2220),
 ('well', 2107),
 ('even', 1964),
 ('scene', 1932),
 ('good', 1772),
 ('story', 1748),
 ('take', 1631),
 ('would', 1537),
 ('much', 1523),
 ('come', 1484),
 ('bad', 1470),
 ('give', 1463),
 ('life', 1457),
 ('also', 1449),
 ('way', 1441),
 ('two', 1418),
 ('look', 1403),
 ('--', 1401),
 ('know', 1394),
 ('first', 1393),
 ('end', 1362),
 ('seem', 1358),
 ('year', 1304),
 ('work', 1263),
 ('thing', 1261),
 ('plot', 1182),
 ('say', 1169),
 ('little', 1147),
 ('really', 1142),
 ('play', 1124),
 ('could', 1073),
 ('show', 1069),
 ('people', 1065),
 ('star', 1045),
 ('director', 1033),
 ('try', 1033),
 ('love', 1031),
 ('man', 1029),
 ('best', 1005),
 ('never', 1003),
 ('big', 992),
 ('new', 972),
 ('great', 972),
 ('actor', 963),
 ('performance', 960),
 ('many', 955),
 ('u', 932),
 ('want', 923),
 ('find', 915),
 ('action'

In [31]:
features = [i[0] for i in common]

In [32]:
def features_dic(words):
    word_set = set(words)
    curr_features = {}
    for w in features:
        curr_features[w] = w in word_set
    return curr_features

In [33]:
training_data = [(features_dic(doc), category) for doc, category in training_documents]
testing_data = [(features_dic(doc), category) for doc, category in testing_documents]

In [34]:
from nltk import NaiveBayesClassifier

In [35]:
clf = NaiveBayesClassifier.train(training_data)

In [36]:
nltk.classify.accuracy(clf,testing_data)

0.81

In [37]:
clf.show_most_informative_features(15)

Most Informative Features
               stupidity = True              neg : pos    =     10.8 : 1.0
             outstanding = True              pos : neg    =      9.7 : 1.0
             wonderfully = True              pos : neg    =      9.5 : 1.0
                   mulan = True              pos : neg    =      8.8 : 1.0
                 idiotic = True              neg : pos    =      7.7 : 1.0
            breathtaking = True              pos : neg    =      7.7 : 1.0
                 destine = True              pos : neg    =      7.2 : 1.0
                    jedi = True              pos : neg    =      6.5 : 1.0
                  welles = True              neg : pos    =      6.5 : 1.0
                 balance = True              pos : neg    =      6.3 : 1.0
                   anger = True              pos : neg    =      6.2 : 1.0
                  poorly = True              neg : pos    =      6.2 : 1.0
                lifeless = True              neg : pos    =      6.0 : 1.0

In [38]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [39]:
svc = SVC()

In [40]:
classifier_sklearn = SklearnClassifier(svc)

In [41]:
classifier_sklearn.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))>

In [42]:
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.802

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)
classifier_sklearn1.train(training_data)
nltk.classify.accuracy(classifier_sklearn1,testing_data)



0.678