In [76]:
import os
import glob
import numpy as np
import pandas as pd

import keras
from keras.models import Model,Sequential
from keras.layers import Input,Embedding,Dropout,Conv2D,BatchNormalization,Flatten,Dense
from keras.optimizers import Adam
from keras.utils import to_categorical
import sklearn
import pickle
from keras.preprocessing import image, sequence
from keras.layers import Conv1D,MaxPooling1D,LSTM

In [2]:
# Root folder of the IMDB data. The pickle-loading cell builds its file name by
# plain string concatenation, so the trailing slash matters.
path='../../data/aclImdb/'

from keras.datasets import imdb
# Word -> integer index mapping provided by the Keras IMDB dataset helper.
idx=imdb.get_word_index()

In [4]:
# Number of entries (distinct words) in the word index.
len(idx)

88584

In [8]:
# Vocabulary words ordered by ascending index value.
idx_arr = sorted(idx.keys(), key=lambda word: idx[word])

In [10]:
# Peek at the ten words with the smallest indices.
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [12]:
# Reverse lookup (integer index -> word), used to decode reviews back to text.
idx2word = {index: word for word, index in idx.items()}

In [22]:
# Load the train/test reviews (stored as sequences of word indices) and their
# labels. The context manager guarantees the file handle is closed even if
# unpickling fails; the original cell left it open.
# NOTE(security): pickle.load can execute arbitrary code, so only load a
# pickle file that comes from a trusted source.
with open(path+'imdb_full.pkl', 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [23]:
# Number of training reviews.
len(x_train)

25000

In [27]:
# Decode the first training review back into words as a sanity check.
' '.join(idx2word[token] for token in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

Reduce the vocabulary size to 5000: every word index at or above `vocab_size-1` is replaced by the single shared index `vocab_size-1`, which stands for all rarer words.

In [28]:
vocab_size = 5000

# Clamp every word index to at most vocab_size-1, so all rarer words share
# that single index. One integer array is produced per review.
trn = [np.minimum(np.array(s), vocab_size - 1) for s in x_train]
test = [np.minimum(np.array(s), vocab_size - 1) for s in x_test]

In [34]:
# Review lengths in tokens: longest, shortest and mean.
lens = [len(s) for s in trn]
(max(lens), min(lens), np.mean(lens))

(2493, 10, 237.71364)

Make every sequence the same length: choose a fixed maximum length, pad sequences shorter than it with zeros, and truncate sequences longer than it.

In [36]:
# Fixed sequence length fed to every model below.
seq_len = 500

# Bring every review to exactly seq_len tokens, using 0 as the fill value.
# Reviews longer than seq_len are cut down to seq_len.
# NOTE(review): under the Keras defaults padding and truncation both happen at
# the start of the sequence -- confirm for the Keras version in use.
trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [37]:
# Confirm the padded training data is a (n_reviews, seq_len) array.
trn.shape

(25000, 500)

### Training a simple neural network

In [51]:
# Baseline: a 32-dimensional embedding per token, flattened across the whole
# sequence and fed to one hidden dense layer with a sigmoid output.
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

In [52]:
# Print the layer output shapes and parameter counts of the baseline model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Binary sentiment target: sigmoid output trained with binary cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.001),
    metrics=['accuracy'],
)

In [54]:
# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are not an independent hold-out estimate if
# they are also used to pick models or hyper-parameters.
model.fit(
    trn,
    labels_train,
    validation_data=(test, labels_test),
    epochs=6,
    batch_size=64,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x14e892eb8>

### With CNN 

In [71]:
# Convolutional variant: embedding -> 1D convolution (64 filters, width 5)
# -> max pooling -> dense classifier, with dropout between the stages.
model_conv = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid'),
])

In [72]:
# Same loss and metric as the baseline; Adam with its default settings.
model_conv.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [73]:
# Print the layer output shapes and parameter counts of the convolutional model.
model_conv.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_16 (Dropout)         (None, 500, 32)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 500, 64)           10304     
_________________________________________________________________
dropout_17 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 100)               1600100   
__________

In [74]:
# Train the convolutional model for three epochs, validating on the test split.
model_conv.fit(
    trn,
    labels_train,
    validation_data=(test, labels_test),
    epochs=3,
    batch_size=64,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x151defc18>

### With LSTM

In [77]:
# Recurrent variant: embedding -> dropout -> a single 100-unit LSTM whose
# final output feeds the sigmoid classifier.
lstm_model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Dropout(0.2),
    LSTM(100, activation='tanh'),
    Dense(1, activation='sigmoid'),
])

In [78]:
# Print the layer output shapes and parameter counts of the LSTM model.
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_19 (Dropout)         (None, 500, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [79]:
# Same loss and metric as the other models; Adam with its default settings.
lstm_model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [80]:
# Train the LSTM model for three epochs, validating on the test split.
lstm_model.fit(
    trn,
    labels_train,
    validation_data=(test, labels_test),
    epochs=3,
    batch_size=64,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x159afe5f8>