In [1]:
#nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import numpy as np
from nltk.stem.porter import PorterStemmer

In [2]:
# load doc into memory
def load_doc(filename):
    file = open(filename, 'r') # open the file as read only
    text = file.read() # read all text
    file.close() # close the file
    return text

[Download Dataset](http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

In [3]:
# load the document
filename = 'imdb_review/pos/cv000_29590.txt'
text = load_doc(filename)
text

'films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes brothers to direct this seem

In [4]:
doc = text[:]
# split into tokens by white space
tokens = doc.split()
print(tokens)

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', "they're", 'about', 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(', 'casper', ')', 'or', 'the', 'arthouse', 'crowd', '(', 'ghost', 'world', ')', ',', 'but', "there's", 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '.', 'for', 'starters', ',', 'it', 'was', 'created', 'by', 'alan', 'moore', '(', 'and', 'eddie', 'campbell', ')', ',', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', "'80s", 'with', 'a', '12-part', 'series', 'called', 'the', 'watchmen', '.', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', '.', 'the', 'book', '(', 'or', '"', 'graphic', 'novel', ',', '"', 'if

In [5]:
# remove punctuation from each token
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]
print(tokens)

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', '', 'whether', 'theyre', 'about', 'superheroes', '', 'batman', '', 'superman', '', 'spawn', '', '', 'or', 'geared', 'toward', 'kids', '', 'casper', '', 'or', 'the', 'arthouse', 'crowd', '', 'ghost', 'world', '', '', 'but', 'theres', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '', 'for', 'starters', '', 'it', 'was', 'created', 'by', 'alan', 'moore', '', 'and', 'eddie', 'campbell', '', '', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', '80s', 'with', 'a', '12part', 'series', 'called', 'the', 'watchmen', '', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', '', 'the', 'book', '', 'or', '', 'graphic', 'novel', '', '', 'if', 'you', 'will', '', 'is'

In [6]:
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
print(tokens)

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', 'whether', 'theyre', 'about', 'superheroes', 'batman', 'superman', 'spawn', 'or', 'geared', 'toward', 'kids', 'casper', 'or', 'the', 'arthouse', 'crowd', 'ghost', 'world', 'but', 'theres', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', 'for', 'starters', 'it', 'was', 'created', 'by', 'alan', 'moore', 'and', 'eddie', 'campbell', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', 'with', 'a', 'series', 'called', 'the', 'watchmen', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', 'the', 'book', 'or', 'graphic', 'novel', 'if', 'you', 'will', 'is', 'over', 'pages', 'long', 'and', 'includes', 'nearly', 'more', 'that', 'consist', 'of', 'nothing', 'but', 'f

In [7]:
# filter out stop words
stop_words = set(stopwords.words('english'))
print('Stop Words :%s \n' % stop_words)
tokens = [w for w in tokens if not w in stop_words]
print(tokens)

Stop Words :{'ain', 'just', 'through', 'those', 'but', 'on', "that'll", 'ours', 'again', 'herself', 'our', 'other', 'won', 'be', 'how', 'mightn', 'once', 'who', 'why', 'no', 'only', 'hers', 'or', 'himself', 'over', 'up', 'too', "isn't", "you'd", "wouldn't", 'now', 'd', "don't", 'haven', 'has', 'here', 'theirs', 'off', "you're", 'after', 'whom', 'yours', 'there', 'the', 've', 'by', 'did', 'should', 'them', 'so', "it's", 'been', 'against', 'few', 'under', 'doing', 'they', 'not', 'into', 'because', 'have', 'me', 'themselves', 'of', 'we', "you'll", 'about', "should've", 'that', "mightn't", 'while', 'wasn', 'is', 'own', 'nor', 'are', 'you', 'am', 'weren', 'until', "she's", "wasn't", 'which', 'isn', 'an', 'its', 'ma', 'his', 'each', 'above', 'can', 'with', 'then', 'having', 'what', 'had', "you've", 'being', 'down', 'do', 'ourselves', 'during', 'most', 'further', 'these', "weren't", 'it', 'couldn', 'to', 'a', 'out', 'as', 'both', 'when', 'will', 'if', 'more', 'didn', 'wouldn', 'him', 'aren', 

In [8]:
ps = PorterStemmer()
tokens = [ps.stem(word) for word in tokens]
print(tokens)

['film', 'adapt', 'comic', 'book', 'plenti', 'success', 'whether', 'theyr', 'superhero', 'batman', 'superman', 'spawn', 'gear', 'toward', 'kid', 'casper', 'arthous', 'crowd', 'ghost', 'world', 'there', 'never', 'realli', 'comic', 'book', 'like', 'hell', 'starter', 'creat', 'alan', 'moor', 'eddi', 'campbel', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'seri', 'call', 'watchmen', 'say', 'moor', 'campbel', 'thoroughli', 'research', 'subject', 'jack', 'ripper', 'would', 'like', 'say', 'michael', 'jackson', 'start', 'look', 'littl', 'odd', 'book', 'graphic', 'novel', 'page', 'long', 'includ', 'nearli', 'consist', 'noth', 'footnot', 'word', 'dont', 'dismiss', 'film', 'sourc', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'anoth', 'stumbl', 'block', 'hell', 'director', 'albert', 'allen', 'hugh', 'get', 'hugh', 'brother', 'direct', 'seem', 'almost', 'ludicr', 'cast', 'carrot', 'top', 'well', 'anyth', 'riddl', 'better', 'direct', 'film', 'that', 'set', 'ghetto', 'f

In [9]:
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]
print(tokens)

['film', 'adapt', 'comic', 'book', 'plenti', 'success', 'whether', 'theyr', 'superhero', 'batman', 'superman', 'spawn', 'gear', 'toward', 'kid', 'casper', 'arthous', 'crowd', 'ghost', 'world', 'there', 'never', 'realli', 'comic', 'book', 'like', 'hell', 'starter', 'creat', 'alan', 'moor', 'eddi', 'campbel', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'seri', 'call', 'watchmen', 'say', 'moor', 'campbel', 'thoroughli', 'research', 'subject', 'jack', 'ripper', 'would', 'like', 'say', 'michael', 'jackson', 'start', 'look', 'littl', 'odd', 'book', 'graphic', 'novel', 'page', 'long', 'includ', 'nearli', 'consist', 'noth', 'footnot', 'word', 'dont', 'dismiss', 'film', 'sourc', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'anoth', 'stumbl', 'block', 'hell', 'director', 'albert', 'allen', 'hugh', 'get', 'hugh', 'brother', 'direct', 'seem', 'almost', 'ludicr', 'cast', 'carrot', 'top', 'well', 'anyth', 'riddl', 'better', 'direct', 'film', 'that', 'set', 'ghetto', 'f

In [10]:
# turn a review doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

### Define a Vocabulary

In [11]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

In [12]:
from collections import Counter
from os import listdir

In [13]:
print(listdir('imdb_review/pos'))

['cv000_29590.txt', 'cv001_18431.txt', 'cv002_15918.txt', 'cv003_11664.txt', 'cv004_11636.txt', 'cv005_29443.txt', 'cv006_15448.txt', 'cv007_4968.txt', 'cv008_29435.txt', 'cv009_29592.txt', 'cv010_29198.txt', 'cv011_12166.txt', 'cv012_29576.txt', 'cv013_10159.txt', 'cv014_13924.txt', 'cv015_29439.txt', 'cv016_4659.txt', 'cv017_22464.txt', 'cv018_20137.txt', 'cv019_14482.txt', 'cv020_8825.txt', 'cv021_15838.txt', 'cv022_12864.txt', 'cv023_12672.txt', 'cv024_6778.txt', 'cv025_3108.txt', 'cv026_29325.txt', 'cv027_25219.txt', 'cv028_26746.txt', 'cv029_18643.txt', 'cv030_21593.txt', 'cv031_18452.txt', 'cv032_22550.txt', 'cv033_24444.txt', 'cv034_29647.txt', 'cv035_3954.txt', 'cv036_16831.txt', 'cv037_18510.txt', 'cv038_9749.txt', 'cv039_6170.txt', 'cv040_8276.txt', 'cv041_21113.txt', 'cv042_10982.txt', 'cv043_15013.txt', 'cv044_16969.txt', 'cv045_23923.txt', 'cv046_10188.txt', 'cv047_1754.txt', 'cv048_16828.txt', 'cv049_20471.txt', 'cv050_11175.txt', 'cv051_10306.txt', 'cv052_29378.txt', 'c

In [14]:
# load all docs in a directory
def process_doc(directory, vocab):
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

In [15]:
# define vocab
vocab = Counter()
# add all docs to vocab
process_doc('imdb_review/pos', vocab)
process_doc('imdb_review/neg', vocab)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

29580
[('film', 9984), ('movi', 6064), ('one', 5156), ('like', 3597), ('charact', 3439), ('get', 2870), ('make', 2812), ('time', 2608), ('scene', 2376), ('even', 2306), ('play', 2155), ('good', 2141), ('stori', 2092), ('see', 1977), ('would', 1844), ('much', 1825), ('also', 1757), ('go', 1742), ('way', 1683), ('seem', 1662), ('two', 1643), ('end', 1635), ('take', 1625), ('look', 1617), ('first', 1589), ('come', 1588), ('well', 1572), ('work', 1522), ('thing', 1481), ('realli', 1407), ('know', 1402), ('year', 1390), ('plot', 1376), ('perform', 1363), ('littl', 1354), ('life', 1345), ('peopl', 1304), ('love', 1272), ('bad', 1256), ('could', 1248), ('man', 1212), ('show', 1205), ('never', 1201), ('tri', 1192), ('best', 1182), ('new', 1140), ('give', 1137), ('mani', 1130), ('star', 1121), ('doesnt', 1118)]


In [16]:
# keep tokens with a min occurrence
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
print(vocab)

17575


In [17]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [18]:
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

### Reviews to Lines of Tokens

In [19]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [20]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian=True):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

In [21]:
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [22]:
# load all training reviews
positive_lines = process_docs('imdb_review/pos', vocab)
negative_lines = process_docs('imdb_review/neg', vocab)
# summarize what we have
print(len(positive_lines), len(negative_lines))

900 900


### Reviews to Bag-of-Words Vectors

In [23]:
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [24]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
docs_train = positive_lines + negative_lines
tokenizer.fit_on_texts(docs_train)

In [25]:
# encode training data set
Xtrain = tokenizer.texts_to_matrix(docs_train, mode='binary')
print(Xtrain.shape)
print(Xtrain[:5])

(1800, 17576)
[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]


In [26]:
# load all test reviews
positive_lines_test = process_docs('imdb_review/pos', vocab, False)
negative_lines_test = process_docs('imdb_review/neg', vocab, False)
docs_test = positive_lines_test + negative_lines_test 
# encode training data set
Xtest = tokenizer.texts_to_matrix(docs_test, mode='binary')
print(Xtest.shape)

(200, 17576)


### Sentiment Analysis Model

In [27]:
ytrain = np.array([1 for _ in range(900)] + [0 for _ in range(900)])
ytest = np.array([1 for _ in range(100)] + [0 for _ in range(100)])

In [28]:
print(len(ytrain))
print(len(ytest))

1800
200


In [29]:
# Fitting Naive Bayes to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=40) #gaussian kernel
classifier.fit(Xtrain, ytrain)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=40, shrinking=True,
  tol=0.001, verbose=False)

In [30]:
# Predicting the Test set results
y_pred = classifier.predict(Xtest)

In [31]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
cm

array([[96,  4],
       [66, 34]], dtype=int64)

In [32]:
print('Accuracy: %.2f' % (np.sum(cm.diagonal())/np.sum(cm)))

Accuracy: 0.65


### Prediction for New Reviews

In [33]:
def predict_sentiment(review, vocab, tokenizer, model):
    # clean
    tokens = clean_doc(review)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    # convert to line
    line = ' '.join(tokens)
    # encode
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    # prediction
    yhat = classifier.predict(encoded)
    return yhat

In [34]:
# test negative text
text = 'This is a bad movie.'
print(predict_sentiment(text, vocab, tokenizer, classifier))

[0]
