# Dataset: IMDB movie reviews

Downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Location of the labelled training file (tab-separated values).
# NOTE(review): this is an absolute, machine-specific path — adjust it to
# wherever the Kaggle download was unpacked.
datapath = '/data/imdb/labeledTrainData.tsv'

# Read the file into a DataFrame and preview the first rows; the preview
# shows the columns `id`, `sentiment` and `review`.
data = pd.read_csv(datapath, sep='\t')
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
# Inspect one raw review: the text still contains HTML line breaks (<br />)
# and mixed case, so it needs string cleaning before tokenization.

sample = data.review[0]
sample

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [4]:
# Ordered (pattern, replacement) pairs applied by clean_str. Order matters:
# the specific contractions ("won't", "can't") must run before the generic
# "n't" rule, and the possessive "'s" rule must run last so that it only
# touches apostrophe-s forms that were not expanded earlier.
_CLEAN_STR_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"'s", " 's"),
)


def clean_str(text):
    """Normalize a raw review string before sentence/word tokenization.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML line breaks (``<br />``, ``<br/>``, ``<br>``) with a space;
      3. drop backslashes;
      4. expand common English contractions ("it's" -> "it is",
         "can't" -> "cannot", "didn't" -> "did not", ...);
      5. detach any remaining possessive/contracted "'s" from its word
         ("mj's" -> "mj 's").

    Args:
        text: the raw review as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    # Accept any amount of whitespace and an optional slash inside the tag.
    text = re.sub(r"<br\s*/?>", ' ', text)

    # Strip backslashes (e.g. the ones left in front of quotes in the data).
    text = re.sub(r"\\", '', text)

    for pattern, replacement in _CLEAN_STR_RULES:
        text = re.sub(pattern, replacement, text)

    return text
    
# Clean the sample review (this rebinds `sample` to the cleaned text) and
# display it to check the effect of clean_str.
sample = clean_str(sample)  
sample

"with all this stuff going down at the moment with mj i have started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again. maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. some of it has subtle messages about mj 's feeling towards the press and also the obvious message of drugs are bad m'kay.  visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring. some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.  the actual feature film bit when it finally starts is only on fo

In [5]:
# Split the cleaned sample into sentences with NLTK; the result is a list
# with one string per sentence.
from nltk import tokenize
tokenize.sent_tokenize(sample)

['with all this stuff going down at the moment with mj i have started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again.',
 'maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.',
 'moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.',
 "some of it has subtle messages about mj 's feeling towards the press and also the obvious message of drugs are bad m'kay.",
 'visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring.',
 'some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.',
 'the actual feature film bit when it final

In [6]:
# Clean every review and split it into sentences.
# `reviews` is a list with one entry per review; each entry is a list of
# sentence strings.
reviews = data['review'].apply(clean_str).apply(tokenize.sent_tokenize).tolist()
reviews[0]

['with all this stuff going down at the moment with mj i have started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again.',
 'maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.',
 'moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.',
 "some of it has subtle messages about mj 's feeling towards the press and also the obvious message of drugs are bad m'kay.",
 'visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring.',
 'some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.',
 'the actual feature film bit when it final

In [7]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# Small vocab, short sentences (values used for a fuller run are kept in
# the trailing comments).
max_sents = 10       #15      sentences kept per review
max_sent_length = 50 #100     words kept per sentence
max_vocab = 2000     #20000   only word indices below this are kept
embedding_dim = 100
train_valid_split = 0.2


# Fit the vocabulary on every sentence of every review. The flattening is
# done with a comprehension: sum(reviews, []) copies the growing list on
# every addition and is quadratic in the number of reviews.
all_sentences = [sentence for review in reviews for sentence in review]
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(all_sentences)

# X[i, j, k] is the word index of the k-th word of the j-th sentence of the
# i-th review. Positions that are never written stay 0 (padding / skipped
# words).
X = np.zeros((len(reviews), max_sents, max_sent_length), dtype="int32")

word_index = tokenizer.word_index
for i, review in enumerate(reviews):
    # Truncate up front instead of enumerating everything and testing bounds.
    for j, sentence in enumerate(review[:max_sents]):
        tokens = text_to_word_sequence(sentence)
        for k, token in enumerate(tokens[:max_sent_length]):
            # A token missing from the vocabulary, or one whose index is
            # outside the kept range, leaves a 0 at its position.
            idx = word_index.get(token)
            if idx is not None and idx < max_vocab:
                X[i, j, k] = idx

# One-hot encode the sentiment labels.
y = to_categorical(data['sentiment'].values)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Hold out a `train_valid_split` fraction of the reviews for evaluation.
# The split is stratified on the labels and uses a fixed random_state so it
# is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_valid_split, random_state=42, stratify=y)

In [9]:
X_train.shape, X_test.shape

((20000, 10, 50), (5000, 10, 50))

# Hierarchical BLSTM 

<img style="float: left;" src="img_files/hblstm.png">

In [10]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Conv1D, Embedding, Bidirectional, LSTM, TimeDistributed
from keras.layers.merge import Concatenate

# Word embedding shared by all sentences; trained from scratch (the
# pre-trained weights argument is left commented out).
embedding_layer = Embedding(max_vocab + 1,
                            embedding_dim,
#                             weights=[embedding_matrix],
                            input_length=max_sent_length,
                            trainable=True)

# Sentence encoder: one sentence of word indices -> embedding -> BiLSTM.
# Wrapped in its own Model so it can be applied to every sentence of a
# review below.
sentence_input = Input(shape=(max_sent_length,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

# Review encoder: apply the sentence encoder to each of the max_sents
# sentences (the summary shows an output of (None, 10, 200)), run a second
# BiLSTM over the sentence vectors, and classify into the 2 sentiment
# classes with a softmax layer.
review_input = Input(shape=(max_sents, max_sent_length), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
preds = Dense(2, activation='softmax')(l_lstm_sent)
model = Model(review_input, preds)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10, 50)            0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 10, 200)           360900    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 402       
Total params: 602,102
Trainable params: 602,102
Non-trainable params: 0
_________________________________________________________________


In [11]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 10

# Write this model's events to its own sub-directory. When every training
# run in the notebook logs straight into './logs', TensorBoard merges their
# event files into a single run and the curves cannot be told apart.
tensorboard = TensorBoard(log_dir='./logs/hblstm', histogram_freq=0,
                          write_graph=True, write_images=False)

# Train the hierarchical BLSTM; the held-out split is used for per-epoch
# validation metrics.
model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(X_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f74690c8710>

# Hierarchical BLSTM with attention

<img style="float: left;" src="img_files/hblstm-att.png">

In [12]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPool2D, Conv2D, Embedding, Bidirectional, LSTM,TimeDistributed, Reshape, merge, Permute, Multiply



# Word embedding shared by all sentences; trained from scratch (the
# pre-trained weights argument is left commented out).
embedding_layer = Embedding(max_vocab + 1,
                            embedding_dim,
#                             weights=[embedding_matrix],
                            input_length=max_sent_length,
                            trainable=True)

# Sentence encoder with word-level attention.
# The BiLSTM returns one output per word (return_sequences=True).
sentence_input = Input(shape=(max_sent_length,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100, return_sequences=True))(embedded_sequences)

# Swap the last two axes so the Dense layer below acts along the word axis.
a_sent = Permute((2,1))(l_lstm)

# Softmax weights of size max_sent_length, then swap the axes back so the
# weights line up element-wise with l_lstm.
# NOTE(review): presumably this yields one weight per (word, feature) pair
# rather than one scalar per word — confirm that this is intended.
a_sent = Dense(max_sent_length, activation='softmax')(a_sent)
a_sent_probs = Permute((2,1))(a_sent)

# Weight the BiLSTM outputs and flatten them into one vector per sentence
# (the summary shows 10000 features per sentence).
a_mul_sent = Multiply()([l_lstm, a_sent_probs])
a_mul_sent = Flatten()(a_mul_sent)

# l_lstm_output = Dense(6, activation="softmax")(a_mul_sent)

sentEncoder = Model(sentence_input, a_mul_sent)

# Review encoder with sentence-level attention: encode every sentence, run
# a BiLSTM over the sentence vectors (summary: (None, 10, 200)), then apply
# the same permute / softmax / multiply pattern over the sentences.
review_input = Input(shape=(max_sents, max_sent_length), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100, return_sequences=True))(review_encoder)
a_rev = Permute((2,1))(l_lstm_sent)

a_rev = Dense(max_sents, activation='softmax')(a_rev)
a_probs_rev = Permute((2,1))(a_rev)

a_mul_rev = Multiply()([l_lstm_sent, a_probs_rev])
a_mul_rev = Flatten()(a_mul_rev)

# model_output = Dense(6, activation="softmax")(a_mul)


# Classify the flattened, attention-weighted review representation into
# the 2 sentiment classes.
preds = Dense(2, activation='softmax')(a_mul_rev)
model = Model(review_input, preds)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy"])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 10, 50)        0                                            
____________________________________________________________________________________________________
time_distributed_2 (TimeDistribu (None, 10, 10000)     363450      input_4[0][0]                    
____________________________________________________________________________________________________
bidirectional_4 (Bidirectional)  (None, 10, 200)       8080800     time_distributed_2[0][0]         
____________________________________________________________________________________________________
permute_3 (Permute)              (None, 200, 10)       0           bidirectional_4[0][0]            
___________________________________________________________________________________________

<img style="float: left;" src="img_files/hblstm-att.png">

In [13]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 10

# Write this model's events to its own sub-directory. When every training
# run in the notebook logs straight into './logs', TensorBoard merges their
# event files into a single run and the curves cannot be told apart.
tensorboard = TensorBoard(log_dir='./logs/hblstm_att', histogram_freq=0,
                          write_graph=True, write_images=False)

# Train the attention variant with the same settings as the plain
# hierarchical BLSTM so the two runs are comparable.
model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(X_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f745dd7fbe0>