In [61]:
# uncomment these for Google collab, will have already been installed in local environment 
# if 'pip install -r requirements.txt' has been run
#!pip install nltk
#!pip install --upgrade gensim

import numpy as np
import os
import os.path

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

[nltk_data] Downloading package punkt to /home/michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('../aclImdb'):
    if not os.path.isfile('../aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('../aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

In [62]:
time_beginning_of_notebook = time.time()
SAMPLE_SIZE=3000
# SAMPLE_SIZE=12500
positive_sample_file_list = glob.glob(os.path.join('../aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('../aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [56]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(SAMPLE_SIZE)})
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(SAMPLE_SIZE)})

print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

Positive review(s): There are few really hilarious films about science fiction but this one will knock your sox off. The
Negative review(s): I can find very little thats good to say about this film. I am sure the idea and script looked good 


In [63]:
len(X_train)

4500

### LSTM with Keras (Sequential model)


Please note that the below code is executed on GPU instances on Colab, this wont work on your local machine, use the flag to enable/disable running in CPU or GPU mode, set `run_in_GPU_mode_on_colab=false` in order to be able to run in CPU mode.

In [60]:
import tensorflow as tf

# def lstm_keras():
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing import sequence

# def lstm_keras():
X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

vocab_size = 1000

# Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',\
#           lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

tokenize = Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(X_train)

encoded_X_train = tokenize.texts_to_sequences(X_train)
encoded_X_test = tokenize.texts_to_sequences(X_test)

encoded_X_train = sequence.pad_sequences(encoded_X_train, maxlen=vocab_size)
encoded_X_test = sequence.pad_sequences(encoded_X_test, maxlen=vocab_size)


encoder = LabelBinarizer()
encoder.fit(y_train)
encoded_y_train = encoder.transform(y_train)
encoded_y_test = encoder.transform(y_test)

In [8]:
max_features = 1000
model = Sequential()
model.add(Embedding(max_features, 512,input_length=vocab_size))
model.add(LSTM(128))  # try using a GRU instead, for fun
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

model.compile(loss='binary_crossentropy',
           optimizer='adam',
           metrics=['accuracy'])

batch_size=64
epochs=3
history = model.fit(encoded_X_train, encoded_y_train, 
              batch_size=batch_size, 
              epochs=epochs, 
              verbose=1, 
              validation_split=0.1)

score = model.evaluate(encoded_X_test, encoded_y_test, 
                     batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 512)         512000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               328192    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 840,321
Trainable params: 840,321
Non-trainable params: 0
_________________________________________________________________
None
Train on 16875 samples, validate on 1875 samples
Epoch 1/1
Test score: 0.38922087396144867
Test accuracy: 0.83216


### run with GPU

In [64]:
def lstm_keras():
    X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

    vocab_size = 1000

    # Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',\
    #           lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

    tokenize = Tokenizer(num_words=vocab_size)
    tokenize.fit_on_texts(X_train)

    encoded_X_train = tokenize.texts_to_sequences(X_train)
    encoded_X_test = tokenize.texts_to_sequences(X_test)

    encoded_X_train = sequence.pad_sequences(encoded_X_train, maxlen=vocab_size)
    encoded_X_test = sequence.pad_sequences(encoded_X_test, maxlen=vocab_size)

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    encoded_y_train = encoder.transform(y_train)
    encoded_y_test = encoder.transform(y_test)

    max_features = 1000
    model = Sequential()
    model.add(Embedding(max_features, 512,input_length=vocab_size))
    model.add(LSTM(128))  # try using a GRU instead, for fun
    model.add(Dense(1, activation='sigmoid'))

    print(model.summary())

    model.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

    batch_size=64
    epochs=3

    history = model.fit(encoded_X_train, encoded_y_train, 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  verbose=1, 
                  validation_split=0.1)
    
    score = model.evaluate(encoded_X_test, encoded_y_test,batch_size=batch_size, verbose=1)
    
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    score_tensor  = tf.Variable(score, tf.float32, name="score_tensor")
    return score_tensor


In [73]:
global history

gpu_device_name = tf.test.gpu_device_name()
run_on_GPU = 'gpu' in gpu_device_name.lower() if gpu_device_name else False

if run_on_GPU:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.device('/gpu:0'):
        print('Running on GPU')
        result = lstm_keras()
        session_gpu = tf.Session(config=config)
        session_gpu.run(tf.tables_initializer())
        session_gpu.run(tf.global_variables_initializer())
        session_gpu.run(tf.local_variables_initializer())        
        start = time.time()
        session_gpu.run(result)
        end = time.time()
        gpu_time = end - start
        print('Duration on the GPU: {} seconds'.format(gpu_time))
else:
    start = time.time()
    lstm_keras()
    end = time.time()
    cpu_time = end - start
    print('Duration on the CPU: {} seconds'.format(cpu_time))

Running on GPU
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 1000, 512)         512000    
_________________________________________________________________
lstm_11 (LSTM)               (None, 128)               328192    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 129       
Total params: 840,321
Trainable params: 840,321
Non-trainable params: 0
_________________________________________________________________
None
Train on 4050 samples, validate on 450 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.44715462438265485
Test accuracy: 0.7813333338101704
Duration on the GPU: 0.09572267532348633 seconds


In [76]:
history_df = pd.DataFrame(history.history)
history_df = history_df[history_df['val_acc']==history_df.val_acc.max()]
history_df.reset_index(inplace=True)
history_df["title"]=["Keras LSTM NN"]
history_df["sample_size"]=[SAMPLE_SIZE]
history_df["nb_epochs"]=epochs
history_df.drop(labels="index",axis=1,inplace=True)
print(history_df)
history_df.to_csv(path_or_buf=history_df.iloc[0].title+".csv")

   val_loss   val_acc      loss       acc          title  sample_size  \
0  0.547166  0.802222  0.322978  0.860494  Keras LSTM NN         3000   

   nb_epochs  
0          3  
