# DataSet IMDB movie review
downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
datapath = '/data/imdb/labeledTrainData.tsv'

data = pd.read_csv(datapath, sep='\t')
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


# Tokenize and stem

In [3]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer


tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()

tokenized_corpus = []

for review in data['review']:
    tokens = [stemmer.stem(t) for t in tkr.tokenize(review) ]
    tokenized_corpus.append(tokens)
  

# Gensim Word2Vec model

In [4]:
from gensim.models.word2vec import Word2Vec

vector_size = 50 # for small ram
window_size = 10

# Create Word2Vec
word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size, 
                    window=window_size, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    )

X_vecs = word2vec.wv
del word2vec


# load data with Word2Vec Split to train and test

In [5]:
import keras.backend as K
from keras.utils import to_categorical

max_length = 100

X = np.zeros((len(data), max_length, vector_size), dtype=K.floatx())

for i in range(len(data)):
    for t, token in enumerate(tokenized_corpus[i]):
        if t < max_length and token in X_vecs:
            X[i,t,:] = X_vecs[token]
            
y = to_categorical(data.sentiment)
del data

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Convlutional Model 

In [7]:
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 100, 32)           4832      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 32)           3104      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 100, 32)           3104      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 100, 32)           3104      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 100, 32)           2080      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 100, 32)           2080      
__________

# Training

In [8]:
batch_size = 32
nb_epochs = 100

# Fit the model
model.fit(X_train, y_train,
          batch_size=batch_size,
          shuffle=True,
          epochs=nb_epochs,
          validation_data=(X_test, y_test),
          callbacks=[EarlyStopping(min_delta=0.00025, patience=2)])

Train on 16750 samples, validate on 8250 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


<keras.callbacks.History at 0x7fc8b199ed68>