In [1]:
import nltk
from nltk.corpus import movie_reviews
import random
import pandas as pd

### 1.Load data

In [2]:
type(movie_reviews)

nltk.corpus.util.LazyCorpusLoader

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

In [5]:
random.shuffle(documents)

In [6]:
type(documents)

list

In [7]:
len(documents)

2000

In [8]:
print(documents[0])
# documents: list of 2000 tuple, each tuple contain data & target

(['while', 'watching', 'wes', 'anderson', "'", 's', 'rushmore', ',', 'it', 'may', 'be', 'surprising', 'to', 'think', 'that', 'the', 'role', 'of', 'max', 'fischer', ',', 'the', 'film', "'", 's', 'anti', '-', 'hero', ',', 'was', 'not', 'written', 'with', 'jason', 'schwartzman', 'in', 'mind', '.', 'the', 'young', 'actor', ',', 'making', 'his', 'film', 'debut', 'as', 'a', '15', 'year', 'old', 'student', 'at', 'the', 'exclusive', 'rushmore', 'academy', 'is', 'so', 'perfect', 'in', 'the', 'role', 'that', 'it', 'comes', 'as', 'a', 'shock', 'that', 'anderson', 'auditioned', 'some', 'two', 'thousand', 'other', 'prospective', 'stars', 'before', 'discovering', 'schwarztman', '.', 'schwartzman', 'plays', 'max', ',', 'a', 'student', 'who', 'is', 'more', 'interested', 'in', 'his', 'extra', '-', 'curricular', 'activities', '(', 'such', 'as', 'writing', 'edgy', 'plays', 'like', '"', 'serpico', '"', 'and', 'being', 'president', 'of', 'the', 'rushmore', 'beekeepers', ')', 'but', 'sees', 'his', 'grades',

In [9]:
pd.DataFrame(documents).head()

Unnamed: 0,0,1
0,"[while, watching, wes, anderson, ', s, rushmor...",pos
1,"[the, most, popular, trend, of, the, last, cou...",neg
2,"[what, were, they, thinking, ?, nostalgia, for...",neg
3,"[i, guess, that, if, a, very, wild, bachelor, ...",neg
4,"[for, his, directoral, debut, ,, gary, oldman,...",neg


### 2. Obtain a list of the 2000 most frequent words

**Dictionary of unique words with frequency**

In [10]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Dictionary: key : unique words, values: freq of each words

In [11]:
type(all_words)

nltk.probability.FreqDist

In [12]:
len(all_words)

39768

In [13]:
all_words

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

**List of 2000 most frequent words**

In [14]:
word_features = list(all_words)[:2000]
# obtain a list of the most 2000 most frequent words in the documents

In [15]:
print(word_features)

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life', 'off', 'too', 'any', 'does', 'really', 'had', 'while', 'films', 'how', 'plot', 'little', 'where', 'people', 'over', 'could', 'then', 'me', 'scene', 'man', 'bad', 'my', 'never', 'being', 'best', 'these', 'don', 'new', 'doesn', 'scenes', 'many', 'director', 'such', 'know', 'were', 'mo

### 3. Create features from Data:
- a list of 2000 tuple, each tuple is have 2 elements (a dictionary of features & target)

In [16]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [17]:
featuresets = [(document_features(d), c) for (d,c) in documents]

In [18]:
print(type(featuresets))
print(len(featuresets))
print(featuresets[0])

<class 'list'>
2000
({'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': True, 'contains(i)': True, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': False, 'contains(t)': True, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': False, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': False, 'contains(from)': True, 'contains(at)': True, 'contains(was)': True, 'contains(have)': False, 'contains(they)': False, 'contains(has)': True, 'contains

In [19]:
pd.DataFrame(featuresets)

Unnamed: 0,0,1
0,"{'contains(,)': True, 'contains(the)': True, '...",pos
1,"{'contains(,)': True, 'contains(the)': True, '...",neg
2,"{'contains(,)': True, 'contains(the)': True, '...",neg
3,"{'contains(,)': True, 'contains(the)': True, '...",neg
4,"{'contains(,)': True, 'contains(the)': True, '...",neg
...,...,...
1995,"{'contains(,)': True, 'contains(the)': True, '...",pos
1996,"{'contains(,)': True, 'contains(the)': True, '...",pos
1997,"{'contains(,)': True, 'contains(the)': True, '...",neg
1998,"{'contains(,)': True, 'contains(the)': True, '...",pos


### 4. Split and train model

In [20]:
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [21]:
nltk.classify.accuracy(classifier, test_set)

0.82

In [22]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.4 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
        contains(seagal) = True              neg : pos    =      8.2 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
        contains(wasted) = True              neg : pos    =      6.0 : 1.0
