### Sentiment analysis of movie (IMDB) reviews using dataset provided by the ACL 2011 paper, see http://ai.stanford.edu/~amaas/data/sentiment/.

#### Dataset can be downloaded separately from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz, but wont be necessary as the download process has been embedded in the notebook and source file.

In [88]:
!pip install nltk
!pip install --upgrade gensim

import numpy as np
import os
import os.path

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import glob
from gensim.models import Word2Vec  

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [89]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
print('On the MacOSX, you will need to install wget, see https://www.mkyong.com/mac/wget-on-mac-os-x/')

if not os.path.isfile('aclImdb_v1.tar.gz'):
  !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

if not os.path.isfile('aclImdb'):  
  !tar -xf aclImdb_v1.tar.gz 


On the MacOSX, you will need to install wget, see https://www.mkyong.com/mac/wget-on-mac-os-x/


In [0]:
SAMPLE_SIZE=600
positive_sample_file_list = glob.glob(os.path.join('aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r', encoding='utf8')
	# read all text
	text = re.sub('<[^>]*>', ' ', file.read())
	# close the file
	file.close()
	return text


# New Section

In [91]:
positive_strings = [load_doc(x) for x in positive_sample_file_list]
print(positive_strings[:10])

negative_strings = [load_doc(x) for x in negative_sample_file_list]
print(negative_strings[:10])
    

["A beautiful piece of children's cinema buried in a world of archaic Celticism. Setting the story around the famous Book of Kels, believed to have been comprised by monks from the small island of Iona, off the western coast of Scotland.  Telling the tale of a young abbots apprentice who goes off into the forest in search of Crom-Cruic, the fierce headless horseman of pagan mythology. In hopes of recovering a lost artefact.  The films true beauty lies in its' animation. Cell shaded in a bright and inspirational style of deep complexity resulting in a look of seem less simplicity. Deriving much from the artistic style of the brilliant Cartoon Network series 'Samurai Jack' for its genius use of mark making and background depth, The Secret of Kels creates a consistently affective Celtic world living under the shadow of Viking invasion.  The history may be intensely inaccurate and the ways of life portrayed lacking realism but these facts are utterly irrelevant as the film sets itself in a

In [92]:
positive_labels = np.array(SAMPLE_SIZE * [1])
print(positive_labels)

[1 1 1 ... 1 1 1]


In [93]:
negative_labels = np.array(SAMPLE_SIZE * [0])
print(negative_labels)

[0 0 0 ... 0 0 0]


In [94]:
positive_tokenized = [word_tokenize(s) for s in positive_strings]
print(positive_tokenized[1])
print(positive_tokenized[2])

['Raising', 'Victor', 'Vargas', 'is', 'a', 'movie', 'you', 'definitely', 'need', 'to', 'see', '.', 'It', 'was', 'very', 'heart', 'felt', 'and', 'had', 'a', 'lot', 'of', 'humor', 'that', 'gets', 'you', 'sucked', 'right', 'in', '.', 'It', 'is', 'so', 'much', 'like', 'real', 'life', 'with', 'what', 'teenagers', 'have', 'to', 'go', 'through', '.', 'Victor', ',', 'a', 'cocky', 'teen', ',', 'but', 'with', 'a', 'good', 'heart', 'at', 'the', 'end', 'deals', 'with', 'love', 'in', 'all', 'the', 'right', 'places', 'dealing', 'with', 'girls', 'and', 'family', '.', 'At', 'the', 'end', 'Victor', 'learns', 'the', 'true', 'meaning', 'of', 'love', 'after', 'dealing', 'with', 'a', 'old', 'fashioned', 'grandmother', 'and', 'a', 'girl', 'who', 'he', 'wants', 'to', 'use', 'is', 'actually', 'using', 'him', 'too.I', 'recommend', 'people', 'to', 'watch', 'this', 'movie', 'because', 'it', 'will', 'be', 'like', 'you', 'are', 'watching', 'a', 'real', 'family', '.', 'Thats', 'how', 'much', 'feeling', 'this', 'mov

In [95]:
negative_tokenized = [word_tokenize(s) for s in negative_strings]
print(negative_tokenized[1])
print(negative_tokenized[2])

['OK', ',', 'I', 'knew', 'this', 'would', 'be', 'a', 'back', 'alley', 'F-film', '(', 'well', 'below', 'B-film', 'standards', ')', 'going', 'into', 'it', ',', 'so', 'I', 'thought', ',', '``', 'Man', ',', 'I', 'could', 'use', 'a', 'good', 'laugh', ',', 'so', 'let', "'s", 'see', 'some', 'nether-beings', 'kill', 'each', 'other', '.', "''", 'Well', ',', 'what', 'I', 'got', 'could', 'have', 'been', 'found', 'at', 'your', 'local', '``', 'love', 'toy', "''", 'store', '.', 'Random', 'lesbian', 'scenes', ',', 'very', 'little', 'fighting', ',', 'and', 'no', 'plot', '.', 'For', 'example', ',', 'one', 'scene', 'in', 'particular', 'I', 'remember', '(', 'for', 'its', 'sheer', 'stupidity', 'only', ';', 'I', "'ve", 'seen', 'better', 'porn', 'on', 'ABC', ')', 'is', 'where', 'the', 'two', 'main', 'characters', '(', 'I', 'ca', "n't", 'remember', 'their', 'names', 'offhand', '...', 'great', 'movie', ',', 'huh', '?', ')', 'are', 'driving', 'along', ',', 'as', 'they', 'mostly', 'did', ',', 'and', 'the', 'dri

In [96]:
# load doc into memory
with open('aclImdb/imdb.vocab') as f:
  content = f.readlines()
universe_vocabulary = [x.strip() for x in content]


print(sum([len(token) for token in positive_tokenized]))
stripped_positive_tokenized = []
for tokens in positive_tokenized:
  stripped_positive_tokenized.append([token.lower() for token in tokens if token.lower() in universe_vocabulary])

print(sum([len(token) for token in stripped_positive_tokenized]))

1347352


KeyboardInterrupt: ignored

In [57]:
print(positive_tokenized[0:5])
print(stripped_positive_tokenized[0:5])

[['A', 'beautiful', 'piece', 'of', 'children', "'s", 'cinema', 'buried', 'in', 'a', 'world', 'of', 'archaic', 'Celticism', '.', 'Setting', 'the', 'story', 'around', 'the', 'famous', 'Book', 'of', 'Kels', ',', 'believed', 'to', 'have', 'been', 'comprised', 'by', 'monks', 'from', 'the', 'small', 'island', 'of', 'Iona', ',', 'off', 'the', 'western', 'coast', 'of', 'Scotland.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'Telling', 'the', 'tale', 'of', 'a', 'young', 'abbots', 'apprentice', 'who', 'goes', 'off', 'into', 'the', 'forest', 'in', 'search', 'of', 'Crom-Cruic', ',', 'the', 'fierce', 'headless', 'horseman', 'of', 'pagan', 'mythology', '.', 'In', 'hopes', 'of', 'recovering', 'a', 'lost', 'artefact.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'The', 'films', 'true', 'beauty', 'lies', 'in', 'its', "'", 'animation', '.', 'Cell', 'shaded', 'in', 'a', 'bright', 'and', 'inspirational', 'style', 'of', 'deep', 'complexity', 'resulting', 'in', 'a', 'look', 'of', 'seem', 'less', 'simplicity', 

In [58]:
print(sum([len(token) for token in positive_tokenized]))
stripped_negative_tokenized = []
for tokens in negative_tokenized:
  stripped_negative_tokenized.append([token.lower() for token in tokens if token.lower() in universe_vocabulary])

print(sum([len(token) for token in stripped_negative_tokenized]))

166351
146040


In [59]:
print(negative_tokenized[0:5])
print(stripped_negative_tokenized[0:5])

[['How', 'viewers', 'react', 'to', 'this', 'new', '``', 'adaption', "''", 'of', 'Shirley', 'Jackson', "'s", 'book', ',', 'which', 'was', 'promoted', 'as', 'NOT', 'being', 'a', 'remake', 'of', 'the', 'original', '1963', 'movie', '(', 'true', 'enough', ')', ',', 'will', 'be', 'based', ',', 'I', 'suspect', ',', 'on', 'the', 'following', ':', 'those', 'who', 'were', 'big', 'fans', 'of', 'either', 'the', 'book', 'or', 'original', 'movie', 'are', 'not', 'going', 'to', 'think', 'much', 'of', 'this', 'one', '...', 'and', 'those', 'who', 'have', 'never', 'been', 'exposed', 'to', 'either', ',', 'and', 'who', 'are', 'big', 'fans', 'of', 'Hollywood', "'s", 'current', 'trend', 'towards', '``', 'special', 'effects', "''", 'being', 'the', 'first', 'and', 'last', 'word', 'in', 'how', '``', 'good', "''", 'a', 'film', 'is', ',', 'are', 'going', 'to', 'love', 'it.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'Things', 'I', 'did', 'not', 'like', 'about', 'this', 'adaption', ':', '<', 'br', '/', '>', '<', '

In [11]:
model_ted = Word2Vec(sentences=positive_tokenized, size=100, window=5, min_count=5, workers=1, sg=0, seed=42)
model_ted.wv.most_similar("brother")

print(np.linalg.norm(model_ted.wv['man'] - model_ted.wv['woman']))
print(np.linalg.norm(model_ted.wv['father'] - model_ted.wv['mother']))
print(np.linalg.norm(model_ted.wv['brother'] - model_ted.wv['sister']))
print(np.linalg.norm(model_ted.wv['house'] - model_ted.wv['road']))  ### boat or ship does not exist in the corpus so we get an error if we use them

print(np.linalg.norm(model_ted.wv['father'] - model_ted.wv['mother']))
print(np.linalg.norm(model_ted.wv['sister'] - model_ted.wv['mother']))

1.0623424
0.6094943
0.12904908
1.8528767
0.6094943
0.6138573


  if np.issubdtype(vec.dtype, np.int):


In [0]:
features = np.array(stripped_positive_tokenized + stripped_negative_tokenized)
labels = np.concatenate([positive_labels, negative_labels])
# print(features.shape)
# print(features)
# print(labels.shape)
# print(labels)

from keras.preprocessing import text


# GitHub reference: https://github.com/tensorflow/workshops/blob/master/extras/keras-bag-of-words/keras-bow-model.ipynb
# Blog: https://cloud.google.com/blog/products/gcp/intro-to-text-classification-with-keras-automatically-tagging-stack-overflow-posts

vocab_size = 1000
tokenize = text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(features)
tokenized_features = tokenize.texts_to_matrix(features)


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(tokenized_features, labels, test_size=0.25)

print(x_train[1])
# print(x_train.shape)
# print(x_test.shape)
# print(y_train.shape)
# print(y_test.shape)

**Simple models**

- Logistic
- Random Forst
- LSTM
- GRU
- CNN

**Vectorisation techniques**
- Bag of Words
- Word2Vec
- TFIDF (probability scores)
- FastText
- Glove

In [87]:
from sklearn.linear_model import LogisticRegression

# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression()

logisticRegr.fit(x_train, y_train)

score = logisticRegr.score(x_test, y_test)
print("Score: ", score)
y_test = logisticRegr.predict(x_test)

Score:  0.73
