In [1]:
import nltk
#nltk.download('stopwords')

In [2]:
from nltk.corpus import stopwords
import string
import numpy as np
from nltk.stem.porter import PorterStemmer

In [3]:
# load doc into memory
def load_doc(filename):
    "Read the text file and return it. Pass the file name as string argument"
    file = open(filename, 'r') # open the file as read only
    text = file.read() # read all text
    file.close() # close the file
    return text

[Download Dataset](http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

#### Read imdb review from the Text file

In [4]:
# load the document
filename = 'imdb_review/pos/cv000_29590.txt'
text = load_doc(filename)
text

'films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes brothers to direct this seem

** Tokenization : ** Tokenizing a sentence in a document set means splitting up a sentence into individual words using a delimiter. The delimiter specifies what character we will use to identify the beginning and the end of a word. We use a single space as the delimiter for identifying words in our data

In [5]:
doc = text[:]
# split into tokens by white space
tokens = doc.split()
print(tokens)

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', "they're", 'about', 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(', 'casper', ')', 'or', 'the', 'arthouse', 'crowd', '(', 'ghost', 'world', ')', ',', 'but', "there's", 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '.', 'for', 'starters', ',', 'it', 'was', 'created', 'by', 'alan', 'moore', '(', 'and', 'eddie', 'campbell', ')', ',', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', "'80s", 'with', 'a', '12-part', 'series', 'called', 'the', 'watchmen', '.', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', '.', 'the', 'book', '(', 'or', '"', 'graphic', 'novel', ',', '"', 'if

### Remove Punctuation

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# remove punctuation from each token
table = str.maketrans('', '', string.punctuation)

In [8]:
[w.translate(table) for w in ['remove@','<punctuations>', 'from..', 'tokens!']]

['remove', 'punctuations', 'from', 'tokens']

In [9]:
tokens = [w.translate(table) for w in tokens]
print(tokens)

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', '', 'whether', 'theyre', 'about', 'superheroes', '', 'batman', '', 'superman', '', 'spawn', '', '', 'or', 'geared', 'toward', 'kids', '', 'casper', '', 'or', 'the', 'arthouse', 'crowd', '', 'ghost', 'world', '', '', 'but', 'theres', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '', 'for', 'starters', '', 'it', 'was', 'created', 'by', 'alan', 'moore', '', 'and', 'eddie', 'campbell', '', '', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', '80s', 'with', 'a', '12part', 'series', 'called', 'the', 'watchmen', '', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', '', 'the', 'book', '', 'or', '', 'graphic', 'novel', '', '', 'if', 'you', 'will', '', 'is'

### Remove non alphabetic tokens

In [10]:
'123'.isalpha()

False

In [11]:
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
print(tokens)

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', 'whether', 'theyre', 'about', 'superheroes', 'batman', 'superman', 'spawn', 'or', 'geared', 'toward', 'kids', 'casper', 'or', 'the', 'arthouse', 'crowd', 'ghost', 'world', 'but', 'theres', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', 'for', 'starters', 'it', 'was', 'created', 'by', 'alan', 'moore', 'and', 'eddie', 'campbell', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', 'with', 'a', 'series', 'called', 'the', 'watchmen', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', 'the', 'book', 'or', 'graphic', 'novel', 'if', 'you', 'will', 'is', 'over', 'pages', 'long', 'and', 'includes', 'nearly', 'more', 'that', 'consist', 'of', 'nothing', 'but', 'f

### Stop-Word Removal

In [12]:
stop_words = set(stopwords.words('english'))
print('Stop Words :%s \n' % stop_words)

Stop Words :{'after', 'mustn', 'haven', 'her', 'd', 'weren', 'does', "wasn't", 'himself', 'myself', 'been', 'because', 'into', 'am', 'i', 'or', 'in', 'are', 'a', 'the', 'very', 'below', 'other', 'on', 'most', "hadn't", 'll', 'having', 'have', "you're", 'who', 'o', 'under', 'me', "that'll", 'aren', 'only', 'itself', 'with', 'if', 'when', 'all', 'is', 'from', 'through', 'about', 'had', 'until', 'our', "couldn't", 'own', 'ain', 'shan', 'where', 'than', 'wouldn', 'their', 'both', 'should', 'mightn', 'nor', "it's", 'will', 'once', 'did', 'yourself', 'at', 'there', 'off', 'any', 'your', 'be', 'we', 'and', 'didn', 'as', "you'll", 'whom', 'too', "mightn't", 'of', 'this', "doesn't", 'so', 'his', 'has', 'here', 'that', 'above', 'yourselves', 'over', 'few', 'being', 'same', 'just', 'shouldn', 'won', 't', "she's", 'again', 'its', 'now', 'but', 'these', 'yours', 'further', 'my', "shan't", 're', 'for', 'more', 'ours', "mustn't", 'to', 'wasn', 'him', "shouldn't", 'what', 'can', 'y', 'it', 'out', 'eac

In [13]:
# filter out stop words
tokens = [w for w in tokens if not w in stop_words]
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

### Stemming

The Porter stemming algorithm (or ‘Porter stemmer’) is a process for removing the common morphological and inflexional endings from words in English. Its main use is as part of a term normalisation process

It is often taken to be a crude error that a stemming algorithm does not leave a real word after removing the stem. But the purpose of stemming is to bring variant forms of a word together, not to map a word onto its ‘paradigm’ form.

In [14]:
ps = PorterStemmer()
tokens = [ps.stem(word) for word in tokens]
print(tokens)

['film', 'adapt', 'comic', 'book', 'plenti', 'success', 'whether', 'theyr', 'superhero', 'batman', 'superman', 'spawn', 'gear', 'toward', 'kid', 'casper', 'arthous', 'crowd', 'ghost', 'world', 'there', 'never', 'realli', 'comic', 'book', 'like', 'hell', 'starter', 'creat', 'alan', 'moor', 'eddi', 'campbel', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'seri', 'call', 'watchmen', 'say', 'moor', 'campbel', 'thoroughli', 'research', 'subject', 'jack', 'ripper', 'would', 'like', 'say', 'michael', 'jackson', 'start', 'look', 'littl', 'odd', 'book', 'graphic', 'novel', 'page', 'long', 'includ', 'nearli', 'consist', 'noth', 'footnot', 'word', 'dont', 'dismiss', 'film', 'sourc', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'anoth', 'stumbl', 'block', 'hell', 'director', 'albert', 'allen', 'hugh', 'get', 'hugh', 'brother', 'direct', 'seem', 'almost', 'ludicr', 'cast', 'carrot', 'top', 'well', 'anyth', 'riddl', 'better', 'direct', 'film', 'that', 'set', 'ghetto', 'f

### Filter out short tokens

In [15]:
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]
print(tokens)

['film', 'adapt', 'comic', 'book', 'plenti', 'success', 'whether', 'theyr', 'superhero', 'batman', 'superman', 'spawn', 'gear', 'toward', 'kid', 'casper', 'arthous', 'crowd', 'ghost', 'world', 'there', 'never', 'realli', 'comic', 'book', 'like', 'hell', 'starter', 'creat', 'alan', 'moor', 'eddi', 'campbel', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'seri', 'call', 'watchmen', 'say', 'moor', 'campbel', 'thoroughli', 'research', 'subject', 'jack', 'ripper', 'would', 'like', 'say', 'michael', 'jackson', 'start', 'look', 'littl', 'odd', 'book', 'graphic', 'novel', 'page', 'long', 'includ', 'nearli', 'consist', 'noth', 'footnot', 'word', 'dont', 'dismiss', 'film', 'sourc', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'anoth', 'stumbl', 'block', 'hell', 'director', 'albert', 'allen', 'hugh', 'get', 'hugh', 'brother', 'direct', 'seem', 'almost', 'ludicr', 'cast', 'carrot', 'top', 'well', 'anyth', 'riddl', 'better', 'direct', 'film', 'that', 'set', 'ghetto', 'f

In [16]:
# turn a review doc into clean tokens
def clean_doc(doc):
    "Pre-process the document. Pass document as a string argument"
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

### Define a Vocabulary

In [17]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

In [18]:
from collections import Counter
from os import listdir

In [19]:
print(listdir('imdb_review/pos'))

['cv000_29590.txt', 'cv001_18431.txt', 'cv002_15918.txt', 'cv003_11664.txt', 'cv004_11636.txt', 'cv005_29443.txt', 'cv006_15448.txt', 'cv007_4968.txt', 'cv008_29435.txt', 'cv009_29592.txt', 'cv010_29198.txt', 'cv011_12166.txt', 'cv012_29576.txt', 'cv013_10159.txt', 'cv014_13924.txt', 'cv015_29439.txt', 'cv016_4659.txt', 'cv017_22464.txt', 'cv018_20137.txt', 'cv019_14482.txt', 'cv020_8825.txt', 'cv021_15838.txt', 'cv022_12864.txt', 'cv023_12672.txt', 'cv024_6778.txt', 'cv025_3108.txt', 'cv026_29325.txt', 'cv027_25219.txt', 'cv028_26746.txt', 'cv029_18643.txt', 'cv030_21593.txt', 'cv031_18452.txt', 'cv032_22550.txt', 'cv033_24444.txt', 'cv034_29647.txt', 'cv035_3954.txt', 'cv036_16831.txt', 'cv037_18510.txt', 'cv038_9749.txt', 'cv039_6170.txt', 'cv040_8276.txt', 'cv041_21113.txt', 'cv042_10982.txt', 'cv043_15013.txt', 'cv044_16969.txt', 'cv045_23923.txt', 'cv046_10188.txt', 'cv047_1754.txt', 'cv048_16828.txt', 'cv049_20471.txt', 'cv050_11175.txt', 'cv051_10306.txt', 'cv052_29378.txt', 'c

In [20]:
# load all docs in a directory
def process_doc(directory, vocab):
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

In [21]:
# define vocab
vocab = Counter()
# add all docs to vocab
process_doc('imdb_review/pos', vocab)
process_doc('imdb_review/neg', vocab)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

29580
[('film', 9984), ('movi', 6064), ('one', 5156), ('like', 3597), ('charact', 3439), ('get', 2870), ('make', 2812), ('time', 2608), ('scene', 2376), ('even', 2306), ('play', 2155), ('good', 2141), ('stori', 2092), ('see', 1977), ('would', 1844), ('much', 1825), ('also', 1757), ('go', 1742), ('way', 1683), ('seem', 1662), ('two', 1643), ('end', 1635), ('take', 1625), ('look', 1617), ('first', 1589), ('come', 1588), ('well', 1572), ('work', 1522), ('thing', 1481), ('realli', 1407), ('know', 1402), ('year', 1390), ('plot', 1376), ('perform', 1363), ('littl', 1354), ('life', 1345), ('peopl', 1304), ('love', 1272), ('bad', 1256), ('could', 1248), ('man', 1212), ('show', 1205), ('never', 1201), ('tri', 1192), ('best', 1182), ('new', 1140), ('give', 1137), ('mani', 1130), ('star', 1121), ('doesnt', 1118)]


In [22]:
# keep tokens with a min occurrence
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
print(tokens)

17575
['film', 'adapt', 'comic', 'book', 'plenti', 'success', 'whether', 'theyr', 'superhero', 'batman', 'superman', 'spawn', 'gear', 'toward', 'kid', 'casper', 'arthous', 'crowd', 'ghost', 'world', 'there', 'never', 'realli', 'like', 'hell', 'starter', 'creat', 'alan', 'moor', 'eddi', 'campbel', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'seri', 'call', 'say', 'thoroughli', 'research', 'subject', 'jack', 'ripper', 'would', 'michael', 'jackson', 'start', 'look', 'littl', 'odd', 'graphic', 'novel', 'page', 'long', 'includ', 'nearli', 'consist', 'noth', 'footnot', 'word', 'dont', 'dismiss', 'sourc', 'get', 'past', 'thing', 'might', 'find', 'anoth', 'stumbl', 'block', 'director', 'albert', 'allen', 'hugh', 'brother', 'direct', 'seem', 'almost', 'ludicr', 'cast', 'carrot', 'top', 'well', 'anyth', 'riddl', 'better', 'that', 'set', 'ghetto', 'featur', 'violent', 'street', 'crime', 'mad', 'genius', 'behind', 'menac', 'ii', 'societi', 'question', 'cours', 'whitechapel', 'london', 'ea

In [23]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [24]:
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

### Reviews to Lines of Tokens

In [25]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [26]:
# load all docs in a directory
def process_docs(directory, vocab, is_train=True):
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

In [27]:
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [28]:
# load all training reviews
positive_lines = process_docs('imdb_review/pos', vocab)
negative_lines = process_docs('imdb_review/neg', vocab)
# summarize what we have
print(len(positive_lines), len(negative_lines))

900 900


### Reviews to Bag-of-Words Vectors

In [29]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [30]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
docs_train = positive_lines + negative_lines
tokenizer.fit_on_texts(docs_train)

In [31]:
#docs_train

In [32]:
# encode training data set
Xtrain = tokenizer.texts_to_matrix(docs_train, mode='binary')
print(Xtrain.shape)
print(Xtrain[:5])

(1800, 17576)
[[ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]]


In [33]:
# load all test reviews
positive_lines_test = process_docs('imdb_review/pos', vocab, False)
negative_lines_test = process_docs('imdb_review/neg', vocab, False)
docs_test = positive_lines_test + negative_lines_test 
# encode training data set
Xtest = tokenizer.texts_to_matrix(docs_test, mode='binary')
print(Xtest.shape)

(200, 17576)


### Sentiment Analysis Model

In [34]:
ytrain = np.array([1 for _ in range(900)] + [0 for _ in range(900)])
ytest = np.array([1 for _ in range(100)] + [0 for _ in range(100)])

In [35]:
print(len(ytrain))
print(len(ytest))

1800
200


#### Naive Bayers Model

In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(Xtrain, ytrain)

# Predicting the Test set results
y_pred = naive_bayes_classifier.predict(Xtest)

# Making the Confusion Matrix
cm = confusion_matrix(ytest, y_pred)
cm

array([[90, 10],
       [18, 82]], dtype=int64)

In [38]:
print('Test Accuracy: %.2f' % (np.sum(cm.diagonal())/np.sum(cm)))

Test Accuracy: 0.86


In [39]:
cm_train = confusion_matrix(ytrain, naive_bayes_classifier.predict(Xtrain))
print('Train Accuracy: %.2f' % (np.sum(cm.diagonal())/np.sum(cm)))

Train Accuracy: 0.86


### Prediction for New Reviews

In [40]:
def predict_sentiment(review, vocab, tokenizer, model):
    # clean
    tokens = clean_doc(review)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    # convert to line
    line = ' '.join(tokens)
    print(line)
    # encode
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    print(encoded)
    # prediction
    yhat = model.predict(encoded)
    return yhat

In [41]:
# test negative text
text = 'This is a bad movie.'
print(predict_sentiment(text, vocab, tokenizer, naive_bayes_classifier))

thi bad movi
[[ 0.  0.  1. ...,  0.  0.  0.]]
[0]


In [42]:
# test negative text
text = 'This is awesome and have great story line '
print(predict_sentiment(text, vocab, tokenizer, naive_bayes_classifier))

thi awesom great stori line
[[ 0.  0.  0. ...,  0.  0.  0.]]
[1]
