In [2]:
# uncomment these for Google collab, will have already been installed in local environment 
# if 'pip install -r requirements.txt' has been run
#!pip install nltk
#!pip install --upgrade gensim

import numpy as np
import os
import os.path

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

[nltk_data] Downloading package punkt to /home/jeremie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('../aclImdb'):
    if not os.path.isfile('../aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('../aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

In [4]:
time_beginning_of_notebook = time.time()
SAMPLE_SIZE=1000
positive_sample_file_list = glob.glob(os.path.join('../aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('../aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [5]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(SAMPLE_SIZE)})
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(SAMPLE_SIZE)})

print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

Positive review(s): NYC model Alison Parker (Cristina Raines) rents a room in an old brownstone where she meets a few bi
Negative review(s): The reason the DVD releases of this film are in black and white is because nobody can get their hand


### LSTM with Keras (Sequential model)


Please note that the below code is executed on GPU instances on Colab, this wont work on your local machine, use the flag to enable/disable running in CPU or GPU mode, set `run_in_GPU_mode_on_colab=false` in order to be able to run in CPU mode.

In [6]:
import tensorflow as tf

def lstm_keras():
  from keras.models import Sequential
  from keras.layers import Dense, Activation, Embedding, LSTM
  from keras.preprocessing.text import Tokenizer
  from sklearn.preprocessing import LabelBinarizer


  X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

  vocab_size = 1000
  tokenize = Tokenizer(num_words=vocab_size)
  tokenize.fit_on_texts(X_train)

  encoded_X_train = tokenize.texts_to_matrix(X_train)
  encoded_X_test = tokenize.texts_to_matrix(X_test)

  encoder = LabelBinarizer()
  encoder.fit(y_train)
  encoded_y_train = encoder.transform(y_train)
  encoded_y_test = encoder.transform(y_test)

  max_features = 1000
  model = Sequential()
  model.add(Embedding(max_features, 128, dropout=0.2))
  model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  # try using a GRU instead, for fun
  model.add(Dense(1))
  model.add(Activation('sigmoid'))

  model.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

  batch_size=64
  epochs=1
  history = model.fit(encoded_X_train, encoded_y_train, 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  verbose=1, 
                  validation_split=0.1)

  score = model.evaluate(encoded_X_test, encoded_y_test, 
                         batch_size=batch_size, verbose=1)
  print('Test score:', score[0])
  print('Test accuracy:', score[1])


run_in_GPU_mode_on_colab=False

if run_in_GPU_mode_on_colab:  
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True

  with tf.device('/gpu:0'):
    session_gpu = tf.Session(config=config)
    session_gpu.run(tf.global_variables_initializer())
    session_gpu.run(tf.tables_initializer())
    start = time.time()
    session_gpu.run(lstm_keras())
    end = time.time()
    gpu_time = end - start
    print('Duration on the GPU: {} seconds'.format(gpu_time))
else:
  start = time.time()
  lstm_keras()
  end = time.time()
  cpu_time = end - start
  print('Duration on the CPU: {} seconds'.format(cpu_time))

Using TensorFlow backend.


Train on 1350 samples, validate on 150 samples
Epoch 1/1
Test score: 0.6975589475631714
Test accuracy: 0.44999999928474427
Duration on the CPU: 36.99505662918091 seconds
