Sentiment analysis of movie (IMDB) reviews using dataset provided by the ACL 2011 paper, 
see http://ai.stanford.edu/~amaas/data/sentiment/
This notebook uses neural net models

The plan is to try a variety of neural net based models:
* dense neural network with bag of words
* dense neural network with fixed size input and words mapped to integers
* LSTM
* CNN


In [1]:
import numpy as np
import os
import os.path
import glob
import time

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk

[nltk_data] Downloading package punkt to /home/dns/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import wget
import tarfile

# By checking if the directory exists first, we allow people to delete the tarfile without the notebook re-downloading it
if os.path.isdir('aclImdb'):
    print("Dataset directory exists, taking no action")
else:    
    if not os.path.isfile('aclImdb_v1.tar.gz'):
        print("Downloading dataset")
        #!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
        wget.download('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
    else:
        print("Dataset already downloaded")
    
    print("Unpacking dataset")
    #!tar -xf aclImdb_v1.tar.gz 
    tar = tarfile.open("aclImdb_v1.tar.gz")
    tar.extractall()
    tar.close()
    print("Dataset unpacked in aclImdb")

Dataset directory exists, taking no action


In [3]:
# configuration
SAMPLE_SIZE=1000

## Load data

In [4]:
time_beginning_of_notebook = time.time()
positive_file_list = glob.glob(os.path.join('aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_file_list[:SAMPLE_SIZE]

negative_file_list = glob.glob(os.path.join('aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text

In [5]:
positive_strings = [load_doc(x) for x in positive_sample_file_list]
negative_strings = [load_doc(x) for x in negative_sample_file_list]

positive_tokenized = [word_tokenize(s) for s in positive_strings]
negative_tokenized = [word_tokenize(s) for s in negative_strings]

In [6]:
from collections import Counter
import numpy as np

In [7]:
total_counts = Counter()
all_reviews = positive_tokenized + negative_tokenized
for r in all_reviews:
    for word in r:
        total_counts[word] += 1

In [8]:
vocab = set(total_counts.keys())

In [9]:
vocab_size = len(vocab)
print(vocab_size)

32437


In [10]:
# Create a dictionary of words in the vocabulary mapped to index positions
# (to be used in layer_0)
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

In [11]:
print("ID of 'movie' = {}".format(word2index['movie']))

ID of 'movie' = 12197


In [12]:
# display the map of words to indices
print("word indexes = {}".format(word2index))



In [16]:
import random

positive_labels = []
for i in range(len(positive_tokenized)):
    positive_labels.append('POSITIVE')
negative_labels = []
for i in range(len(negative_tokenized)):
    negative_labels.append('NEGATIVE')
    
reviews = positive_tokenized + negative_tokenized
labels = positive_labels + negative_labels
reviews_and_labels = list(zip(reviews, labels))
random.shuffle(reviews_and_labels)
reviews, labels = zip(*reviews_and_labels)