## Exploring the movie review data
This exploration is kept in a separate notebook so that the notebooks that run the models do not become too cluttered.

In [1]:
import os
import wget
import tarfile

# Check for the extracted directory first, so that the tarball can be deleted
# afterwards without this cell downloading it again.
if os.path.isdir('aclImdb'):
    print("Dataset directory exists, taking no action")
else:
    if not os.path.isfile('aclImdb_v1.tar.gz'):
        print("Downloading dataset")
        # Pure-Python replacement for:
        #   !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
        wget.download('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
    else:
        print("Dataset already downloaded")

    print("Unpacking dataset")
    # Pure-Python replacement for: !tar -xf aclImdb_v1.tar.gz
    # The context manager closes the archive even if extraction fails part-way,
    # which the previous open()/close() pair did not guarantee.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # consider passing an extraction filter on Python versions that support it.
    with tarfile.open("aclImdb_v1.tar.gz") as tar:
        tar.extractall()
    print("Dataset unpacked in aclImdb")

Dataset directory exists, taking no action


In [2]:
# Configuration: maximum number of review files per label that are kept in the
# *_sample_file_list slices.
SAMPLE_SIZE = 1000


In [3]:
import numpy as np
import os
import os.path
import glob
import time

# Timestamp taken when this cell runs (presumably used later to time the
# notebook -- confirm against the cells that read it).
time_beginning_of_notebook = time.time()


def _review_paths(directory):
    """Return the paths of all ``*.txt`` files that glob finds in ``directory``."""
    return glob.glob(os.path.join(directory, "*.txt"))


# Full lists of training review files, one list per label.
# NOTE(review): glob does not promise a particular ordering, so both the
# SAMPLE_SIZE slices below and any fixed index into these lists may select
# different files on a different machine.
positive_file_list = _review_paths('aclImdb/train/pos')
negative_file_list = _review_paths('aclImdb/train/neg')

# The first SAMPLE_SIZE entries of each list.
positive_sample_file_list = positive_file_list[:SAMPLE_SIZE]
negative_sample_file_list = negative_file_list[:SAMPLE_SIZE]

import re

def load_doc(filename):
    """Read a review file and return its text with markup tags replaced by spaces.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        The complete file contents as a string in which every ``<...>`` span
        (for example ``<br />``) has been replaced by a single space.
    """
    # The context manager closes the file even if reading or decoding raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r', encoding='utf8') as file:
        raw_text = file.read()
    # Each tag becomes a space (rather than being deleted) so the text on
    # either side of it is not joined together.
    return re.sub('<[^>]*>', ' ', raw_text)

In [4]:
# Read every training review into memory. This uses the full file lists, not
# the SAMPLE_SIZE-limited *_sample_file_list slices.
positive_reviews = list(map(load_doc, positive_file_list))
negative_reviews = list(map(load_doc, negative_file_list))

In [5]:
def pretty_print_positive_and_negative(i):
    """Print the opening of the i-th positive and i-th negative review on one line.

    Args:
        i: Index used for both ``positive_reviews`` and ``negative_reviews``.

    The first 30 characters of each review are shown, separated by a
    tab-colon-tab, and the line ends with "...".
    """
    positive_snippet = positive_reviews[i][:30]
    negative_snippet = negative_reviews[i][:30]
    print("{}\t:\t{}...".format(positive_snippet, negative_snippet))

In [6]:
# Total number of loaded reviews across both labels.
total_review_count = len(positive_reviews) + len(negative_reviews)
print(total_review_count)

# First 50 characters of one arbitrarily chosen review per label.
example_index = 2137
print('\n Positive reviews \n ', positive_reviews[example_index][:50])
print('\n Negative reviews \n ', negative_reviews[example_index][:50])

25000

 Positive reviews 
  This movie is a journey through the mind of a scre

 Negative reviews 
  While the original First Blood had its far-fetched


In [7]:
# Show a handful of positive/negative review openings side by side.
print("positive reviews \t : \t negative reviews\n")
for example_index in (2137, 12444, 6267, 5297, 4998):
    pretty_print_positive_and_negative(example_index)

positive reviews 	 : 	 negative reviews

This movie is a journey throug	:	While the original First Blood...
are highlights of this 1917 fe	:	OK I had higher hopes for this...
Here's the kind of love story 	:	What a disappointment!  This f...
A small pleasure in life is wa	:	Some nice scenery, but the sto...
This film was amazing. It had 	:	I remember when I first saw th...


In [8]:
from collections import Counter
import numpy as np

In [9]:
# Three independent, initially empty tallies: tokens seen in positive reviews,
# tokens seen in negative reviews, and tokens seen in either.
positive_counts, negative_counts, total_counts = Counter(), Counter(), Counter()

In [10]:
# Tally every space-separated token of every review into the counter for its
# label and into the combined counter.
# NOTE(review): splitting on a single space turns consecutive spaces into
# empty-string tokens, so '' is counted as a "word"; this cell also adds to the
# existing counters, so running it twice doubles every count.
for review in positive_reviews:
    tokens = review.split(" ")
    positive_counts.update(tokens)
    total_counts.update(tokens)

for review in negative_reviews:
    tokens = review.split(" ")
    negative_counts.update(tokens)
    total_counts.update(tokens)


In [11]:
# The 100 most frequent tokens in positive reviews, highest count first.
positive_counts.most_common(100)

[('the', 148466),
 ('and', 84295),
 ('a', 79438),
 ('of', 75349),
 ('to', 65216),
 ('is', 55366),
 ('in', 45802),
 ('I', 32622),
 ('that', 31948),
 ('', 27700),
 ('it', 26999),
 ('this', 26037),
 ('as', 23934),
 ('with', 22034),
 ('was', 21312),
 ('for', 20874),
 ('The', 20300),
 ('but', 16459),
 ('his', 16203),
 ('on', 15387),
 ('film', 14420),
 ('are', 14397),
 ('movie', 13375),
 ('not', 12493),
 ('you', 12416),
 ('have', 12270),
 ('he', 11771),
 ('be', 11696),
 ('by', 11462),
 ('an', 10794),
 ('one', 10686),
 ('at', 10231),
 ('who', 10152),
 ('from', 10134),
 ('all', 9159),
 ('has', 9032),
 ('her', 8999),
 ('like', 7981),
 ('about', 7829),
 ('very', 7796),
 ('they', 7714),
 ('This', 7437),
 ('so', 7383),
 ('or', 7013),
 ('more', 6825),
 ('out', 6692),
 ('some', 6664),
 ('just', 6533),
 ('It', 6238),
 ('when', 5987),
 ('what', 5903),
 ('their', 5893),
 ('good', 5797),
 ('which', 5645),
 ('she', 5402),
 ("it's", 5313),
 ('can', 5275),
 ('see', 5250),
 ('my', 5226),
 ('would', 5191),
 

In [12]:
# The 100 most frequent tokens in negative reviews, highest count first.
negative_counts.most_common(100)

[('the', 138707),
 ('a', 75682),
 ('and', 68417),
 ('of', 67636),
 ('to', 67364),
 ('is', 47882),
 ('in', 39790),
 ('I', 37007),
 ('that', 32619),
 ('this', 31208),
 ('', 29753),
 ('it', 27455),
 ('was', 25393),
 ('The', 20694),
 ('for', 20202),
 ('with', 19694),
 ('as', 18587),
 ('but', 17340),
 ('movie', 17140),
 ('on', 15383),
 ('have', 14863),
 ('are', 14106),
 ('be', 13818),
 ('not', 13775),
 ('film', 12994),
 ('you', 12714),
 ('his', 11492),
 ('at', 11071),
 ('like', 10158),
 ('they', 10131),
 ('one', 10010),
 ('by', 9969),
 ('he', 9914),
 ('an', 9833),
 ('just', 9802),
 ('or', 9211),
 ('from', 9112),
 ('so', 8966),
 ('all', 8907),
 ('who', 8691),
 ('about', 8463),
 ('out', 7679),
 ('some', 7553),
 ('has', 7445),
 ('This', 7054),
 ('her', 6833),
 ('would', 6732),
 ('even', 6509),
 ('no', 6412),
 ('only', 6274),
 ('if', 6175),
 ('more', 6128),
 ('had', 5914),
 ('were', 5837),
 ('what', 5788),
 ('It', 5661),
 ('really', 5657),
 ('good', 5647),
 ('up', 5622),
 ('when', 5509),
 ("it'

In [13]:
_all_counters = (positive_counts, negative_counts, total_counts)

# Number of distinct tokens per counter, measured via items() ...
for counter in _all_counters:
    print(len(counter.items()))

# ... and again via most_common(), which lists every token when called
# without an argument.
for counter in _all_counters:
    print(len(counter.most_common()))

169811
167430
265378
169811
167430
265378


In [14]:
pos_neg_ratios = Counter()

# Ratio of positive to negative uses for the common tokens only.
# A token counts as "common" when its total count is strictly greater than 100.
# The +1 in the denominator keeps the division defined for tokens that never
# occur in a negative review.
for term, count in total_counts.most_common():
    if count <= 100:
        continue
    pos_neg_ratio = positive_counts[term] / float(negative_counts[term] + 1)
    pos_neg_ratios[term] = pos_neg_ratio

In [15]:
# Spot-check the stored ratio for a neutral, a positive and a negative word.
for probe_word in ("the", "amazing", "terrible"):
    print("Pos-to-neg ratio for '{}' = {}".format(probe_word, pos_neg_ratios[probe_word]))

Pos-to-neg ratio for 'the' = 1.0703492228278109
Pos-to-neg ratio for 'amazing' = 3.77720207253886
Pos-to-neg ratio for 'terrible' = 0.23886138613861385


In [16]:
# Convert ratios to logs.
# Each log is recomputed from the raw counts (same formula as the cell that
# fills pos_neg_ratios) instead of from the value already stored, so running
# this cell more than once no longer takes the log of a log.
for word in pos_neg_ratios:
    raw_ratio = positive_counts[word] / float(negative_counts[word] + 1)
    pos_neg_ratios[word] = np.log(raw_ratio)

In [17]:
# Same spot-check as before the conversion. pos_neg_ratios now holds natural
# logs of the ratios, although the printed label still says "ratio".
for probe_word in ("the", "amazing", "terrible"):
    print("Pos-to-neg ratio for '{}' = {}".format(probe_word, pos_neg_ratios[probe_word]))

Pos-to-neg ratio for 'the' = 0.0679849716991887
Pos-to-neg ratio for 'amazing' = 1.3289835431037726
Pos-to-neg ratio for 'terrible' = -1.4318718696162098


In [18]:
# The 100 tokens with the largest log pos-to-neg ratio, i.e. the ones most
# skewed toward reviews with a "POSITIVE" label.
pos_neg_ratios.most_common(100)

[('7/10', 3.2733640101522705),
 ('8/10', 3.2255203675868693),
 ('Excellent', 3.1986731175506815),
 ('Highly', 2.929287174145838),
 ('9/10', 2.515678308454754),
 ('10/10', 2.4908413853078146),
 ('Matthau', 2.4849066497880004),
 ('Victoria', 2.332890442489375),
 ('perfect,', 2.312535423847214),
 ('superbly', 2.12389330425067),
 ('wonderfully', 2.120263536200091),
 ('amazing.', 2.094945728215801),
 ('superb.', 2.03688192726104),
 ('captures', 2.017566137961748),
 ('refreshing', 1.9387416595767009),
 ('wonderful.', 1.9379419794061366),
 ('Bourne', 1.9307583440347111),
 ('gripping', 1.9252908618525775),
 ('beautifully', 1.8536348729461425),
 ('breathtaking', 1.8495790401168812),
 ('perfect.', 1.8382794848629478),
 ('Powell', 1.807507826196194),
 ('excellent.', 1.8044984950054848),
 ('delightful', 1.7971214123694403),
 ('Nancy', 1.7439688053917064),
 ('brilliant.', 1.7376922479577792),
 ('finest', 1.7197859696029656),
 ('chilling', 1.7100814382137879),
 ('underrated', 1.692552819144607),
 ('

In [19]:
# Every distinct token seen in any review (iterating a Counter yields its keys).
vocab = set(total_counts)

In [20]:
# Number of distinct tokens in the vocabulary set.
vocab_size = len(vocab)
print(vocab_size)

265378
