In [0]:

# --- Data sampling -----------------------------------------------------------
# Total number of reviews used across both splits (use 1000 for a quick smoke run).
SAMPLE_SIZE = 50_000
# Share of SAMPLE_SIZE drawn from the "test" folder; the rest comes from "train"
# (0.75 is the other setting that has been tried).
TRAIN_TEST_RATIO = 0.50

# --- Training ----------------------------------------------------------------
BATCH_SIZE = 64
EPOCHS_NB = 3
# Passed to the tokenizer as num_words and re-used as the padded sequence length
# and embedding input size further down (1000 and 10000 have also been tried).
vocab_size = 5000

In [2]:
# Needed on Google Colab; these packages will already be installed in a local environment
# if 'pip install -r requirements.txt' has been run (in that case the two lines below can be commented out)
!pip install nltk wget
!pip install --upgrade gensim

import numpy as np
import os
import os.path
import pandas as pd


from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.7.1)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import wget
import tarfile

# Fetch and unpack the IMDB review archive once.
# By checking if the directory exists first, we allow people to delete the tarfile
# without the notebook re-downloading it.
if os.path.isdir('aclImdb'):
    print("Dataset directory exists, taking no action")
else:
    if not os.path.isfile('aclImdb_v1.tar.gz'):
        print("Downloading dataset")
        wget.download('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
    else:
        print("Dataset already downloaded")

    print("Unpacking dataset")
    # The context manager releases the archive handle even if extraction fails midway.
    # NOTE(review): extractall() trusts the member paths stored in the archive; this is
    # acceptable only because the tarball comes from a known source.
    with tarfile.open("aclImdb_v1.tar.gz") as tar:
        tar.extractall()
    print("Dataset unpacked in aclImdb")
      

import re

# load doc into memory
# regex to clean markup elements
def load_doc(filename):
    """Read a UTF-8 text file and return its content with markup tags blanked out.

    Every substring of the form ``<...>`` (for example ``<br />``) is replaced
    by a single space; all other text is returned unchanged.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a ``str`` with tags replaced by spaces.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # `with` closes the handle even when read() raises (e.g. on a decode error).
    with open(filename, 'r', encoding='utf8') as file:
        raw_text = file.read()
    return re.sub('<[^>]*>', ' ', raw_text)


Dataset directory exists, taking no action


In [0]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# notebook configuration
# TRAIN_TEST_RATIO is the share of SAMPLE_SIZE taken from the "test" folder;
# the remaining share is taken from the "train" folder.
TRAINING_SET_SIZE=int(SAMPLE_SIZE * (1 - TRAIN_TEST_RATIO))
VALIDATION_SET_SIZE=int(SAMPLE_SIZE * TRAIN_TEST_RATIO)

# Seed for the row shuffle so repeated runs produce the same ordering.
SHUFFLE_SEED=42


def _reviews_frame(file_list, label):
    """Load every file in `file_list` and pair it with the constant sentiment `label`.

    The label column is sized from the files actually found, so a folder holding
    fewer files than requested no longer triggers a length-mismatch error.
    """
    return pd.DataFrame({'reviews': [load_doc(x) for x in file_list],
                         'sentiment': np.full(len(file_list), label, dtype=float)})


dataframes={}
# The loop variable is `split` rather than `type` so the builtin type() is not
# shadowed for the rest of the notebook.
for split in ["train","test"]:
    # Half of each split's quota comes from the positive folder, half from the negative one.
    SLICE = int((TRAINING_SET_SIZE if split=="train" else VALIDATION_SET_SIZE) / 2)

    # glob returns paths in arbitrary order; sorting makes the selected subset reproducible.
    positive_file_list = sorted(glob.glob(os.path.join('aclImdb', split, 'pos', "*.txt")))
    positive_sample_file_list = positive_file_list[:SLICE]
    df_positive = _reviews_frame(positive_sample_file_list, 1.0)

    negative_file_list = sorted(glob.glob(os.path.join('aclImdb', split, 'neg', "*.txt")))
    negative_sample_file_list = negative_file_list[:SLICE]
    df_negative = _reviews_frame(negative_sample_file_list, 0.0)

    # Interleave positive and negative reviews in a repeatable order.
    dataframes[split]=shuffle(pd.concat([df_positive,df_negative]), random_state=SHUFFLE_SEED)

df=pd.concat([dataframes["train"],dataframes["test"]])

X_train=dataframes["train"]['reviews']
y_train=dataframes["train"]['sentiment']

X_test=dataframes["test"]['reviews']
y_test=dataframes["test"]['sentiment']

In [0]:
# positive_sample_file_list

In [6]:
# Sanity check: size of the training review Series (swap in X_train.head() to preview the text).
X_train.shape

(25000,)

### LSTM with Keras (Sequential model)


Please note that the code below was executed on GPU instances on Colab and won't work on your local machine as-is. Use the `run_in_GPU_mode_on_colab` flag to switch between CPU and GPU mode: set `run_in_GPU_mode_on_colab=False` to run in CPU mode.

In [7]:
import tensorflow as tf

# def lstm_keras():
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, EarlyStopping


# Build the word index from the training reviews only; num_words caps the
# vocabulary used when converting text to integer sequences.
tokenize = Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(X_train)

# Convert each split to integer sequences and bring every sequence to a common
# length of `vocab_size` entries.
# NOTE(review): the padded length re-uses `vocab_size`, i.e. the sequence length
# equals the vocabulary size — confirm this is intended; the model cell relies
# on the same value for its input length.
encoded_X_train, encoded_X_test = [
    sequence.pad_sequences(tokenize.texts_to_sequences(texts), maxlen=vocab_size)
    for texts in (X_train, X_test)
]

# Fit the label encoder on the training labels, then apply it to both splits.
encoder = LabelBinarizer()
encoded_y_train = encoder.fit(y_train).transform(y_train)
encoded_y_test = encoder.transform(y_test)

Using TensorFlow backend.


In [0]:

# Architecture: word embedding -> single LSTM layer -> one sigmoid unit for the
# binary sentiment label.
model = Sequential()
# input_length is read from the padded training matrix so the model always
# matches the sequences it is actually fed.
model.add(Embedding(vocab_size, 512, input_length=encoded_X_train.shape[1]))
model.add(LSTM(128))  # try using a GRU instead, for fun
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Keep the model with the lowest validation loss on disk.
model_checkpoint = ModelCheckpoint('best.weights', monitor='val_loss', verbose=1, save_best_only=True)
# With patience=10 this can only stop training early when EPOCHS_NB is larger than 10.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)

callbacks = [model_checkpoint, early_stopping]

# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection and early stopping both see the data that is scored below.
history = model.fit(encoded_X_train, encoded_y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS_NB,
                    verbose=1,
                    validation_data=(encoded_X_test, encoded_y_test),
                    callbacks=callbacks)

# Scores the weights as they are when training ends; the checkpoint written to
# 'best.weights' is not reloaded before evaluating.
score = model.evaluate(encoded_X_test, encoded_y_test,
                       batch_size=BATCH_SIZE, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])



Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5000, 512)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               328192    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,888,321
Trainable params: 2,888,321
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Train on 25000 samples, validate on 25000 samples
Epoch 1/3

In [0]:
# Summarise the epoch with the highest validation accuracy as a one-row table
# and save it as "<title>.csv".
history_df = pd.DataFrame(history.history)

# The validation-accuracy key is 'val_acc' in this Keras version; fall back to
# 'val_accuracy' in case the history uses the longer name.
val_acc_col = 'val_acc' if 'val_acc' in history_df.columns else 'val_accuracy'

# idxmax picks exactly one row even when several epochs tie for the best value;
# a boolean mask would keep all tied rows and break the single-value columns below.
best_epoch = history_df[val_acc_col].idxmax()
history_df = history_df.loc[[best_epoch]].reset_index(drop=True)

history_df["title"] = "Keras LSTM NN"
history_df["sample_size"] = SAMPLE_SIZE
history_df["nb_epochs"] = EPOCHS_NB
print(history_df)
history_df.to_csv(path_or_buf=history_df.loc[0, "title"] + ".csv")