### Sentiment analysis of movie (IMDB) reviews using the dataset provided with the ACL 2011 paper (see http://ai.stanford.edu/~amaas/data/sentiment/).

#### The dataset can be downloaded separately from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz, but this won't be necessary because the download step is embedded in the notebook and source file.

In [3]:
!pip install nltk
!pip install --upgrade gensim

import numpy as np
import os
import os.path

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import glob
from gensim.models import Word2Vec  

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
if not os.path.isfile('aclImdb_v1.tar.gz'):
  !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

if not os.path.isfile('aclImdb'):  
  !tar -xf aclImdb_v1.tar.gz 


--2018-10-10 21:14:16--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2018-10-10 21:14:17 (71.8 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
# Maximum number of review files taken from each sentiment class.
SAMPLE_SIZE=600


def list_review_files(directory, limit):
    """Return at most `limit` '*.txt' paths found directly in `directory`.

    glob.glob returns matches in an order that depends on the file system, so
    the paths are sorted before slicing; the same files are then selected on
    every machine and every run. A missing directory yields an empty list.
    """
    return sorted(glob.glob(os.path.join(directory, "*.txt")))[:limit]


positive_sample_file_list = list_review_files('aclImdb/train/pos', SAMPLE_SIZE)
negative_sample_file_list = list_review_files('aclImdb/train/neg', SAMPLE_SIZE)


# load doc into memory
def load_doc(filename, encoding='utf-8'):
    """Read a whole text file and return its content as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly so the
            result does not depend on the platform's default codec; UTF-8 is
            assumed for the review files.

    Returns:
        The complete decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in `encoding`.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()


In [0]:
# Read every sampled positive review into memory and preview the first ten.
pos_strings = list(map(load_doc, positive_sample_file_list))
print(pos_strings[0:10])

# Same for the sampled negative reviews.
neg_strings = list(map(load_doc, negative_sample_file_list))
print(neg_strings[0:10])
    

In [0]:
# Label matrix for the positive class: SAMPLE_SIZE rows, each equal to [1, 0].
pos_labels = np.array([[1, 0] for _ in range(SAMPLE_SIZE)])
pos_labels

In [0]:
# Label matrix for the negative class: SAMPLE_SIZE rows, each equal to [0, 1].
neg_labels = np.array([[0, 1] for _ in range(SAMPLE_SIZE)])
neg_labels

In [0]:
# Tokenize each positive review with NLTK's word_tokenize.
pos_tokenized = list(map(word_tokenize, pos_strings))
# Spot-check two tokenized reviews, one per output line.
print(pos_tokenized[1], pos_tokenized[2], sep='\n')

In [0]:
# Tokenize each negative review with NLTK's word_tokenize.
neg_tokenized = list(map(word_tokenize, neg_strings))
# Spot-check two tokenized reviews, one per output line.
print(neg_tokenized[1], neg_tokenized[2], sep='\n')

In [0]:
# Read the vocabulary file from the extracted dataset directory, one entry per line.
# The encoding is passed explicitly so the read does not depend on the platform's
# default codec (UTF-8 is assumed for this file).
with open('aclImdb/imdb.vocab', encoding='utf-8') as f:
  content = f.readlines()
# Drop the trailing newline and any surrounding whitespace from each entry.
universe_vocabulary = [x.strip() for x in content]

# Total entries vs. distinct entries; a difference would mean duplicates.
print(len(universe_vocabulary))
print(len(set(universe_vocabulary)))

In [0]:
import gensim

# gensim 4.0 renamed Word2Vec's `size` keyword to `vector_size`. The setup cell
# installs gensim with --upgrade and no version pin, so pick the keyword that the
# installed version accepts instead of failing with a TypeError.
_dim_keyword = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# 100-dimensional CBOW (sg=0) embeddings trained on the tokenized positive reviews.
# workers=1 with a fixed seed keeps training single-threaded and seeded.
model_ted = Word2Vec(sentences=pos_tokenized, window=5, min_count=5, workers=1, sg=0, seed=42, **{_dim_keyword: 100})

# Printed explicitly: this is not the last expression of the cell, so a bare
# expression here would be evaluated and its result silently discarded.
print(model_ted.wv.most_similar("brother"))

# Euclidean distances between embedding pairs.
# NOTE: words seen fewer than min_count times are not in the vocabulary, and
# indexing model_ted.wv with such a word raises KeyError.
print(np.linalg.norm(model_ted.wv['man'] - model_ted.wv['woman']))
print(np.linalg.norm(model_ted.wv['father'] - model_ted.wv['mother']))
print(np.linalg.norm(model_ted.wv['brother'] - model_ted.wv['sister']))
print(np.linalg.norm(model_ted.wv['house'] - model_ted.wv['ship']))

# father-mother is repeated here so it can be read next to sister-mother.
print(np.linalg.norm(model_ted.wv['father'] - model_ted.wv['mother']))
print(np.linalg.norm(model_ted.wv['sister'] - model_ted.wv['mother']))