In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Embedding
from keras.layers import SimpleRNN
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

from keras.layers import LSTM
from keras.layers import Bidirectional, Dropout

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [13]:
import os
import numpy as np
import zipfile
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
# Both CSV files are read with '|' as the field separator.
train, test = (pd.read_csv(csv_path, sep='|') for csv_path in ("train.csv", "test.csv"))

In [5]:
# Class balance of the `overall` label column.
train['overall'].value_counts()

1    25000
0    15000
Name: overall, dtype: int64

In [6]:
# Preview the first five labelled reviews.
train.head(5)

Unnamed: 0,overall,reviewText
0,0,Entertaining enough for those who don't think ...
1,1,I bought it yesterday havent started watching ...
2,1,This movie tells the story of three kids who g...
3,1,You wanna know what its like for a Black perso...
4,1,Warner Archive has finally released an epic fi...


In [7]:
# Directory and file name of the labelled data, plus a header flag.
# NOTE(review): these three names are not referenced by any cell visible here;
# presumably HEADER says whether the CSV has a header row - confirm or remove.
TEXT_DATA_DIR, TEXT_DATA_FILE = './', 'train.csv'
HEADER = True

# Share of the labelled rows held out for validation, and the seed for that split.
VALIDATION_SPLIT, RANDOM_SEED = 0.1, 42

In [8]:
# Labels as a compact int8 vector; review texts as plain NumPy arrays.
labels = np.asarray(train['overall'], dtype=np.int8)
data, data_test = (np.asarray(frame['reviewText']) for frame in (train, test))

In [9]:
# Split the labelled data into training and validation sets.
# `stratify=labels` is passed so the split is stratified on the class labels.
data_train, data_val, labels_train, labels_val = train_test_split(
    data,
    labels,
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=labels,
)

In [11]:
# Shapes of the training texts and their labels, printed side by side.
train_shapes = (data_train.shape, labels_train.shape)
print(*train_shapes)

(36000,) (36000,)


In [12]:
# Shapes of the validation texts and their labels.
val_shapes = [data_val.shape, labels_val.shape]
print("Validation data shape: {}".format(val_shapes))

Validation data shape: [(4000,), (4000,)]


In [14]:
# Vocabulary size handed to the tokenizer and the fixed token length of every review.
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 40

print("Original sentence:\n", data_train[0])

# Build the word -> index dictionary from the training reviews only.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='#$%&()*+-/:;<=>@[\\]^{|}~\t\n,.!"')
tokenizer.fit_on_texts(data_train)

# Encode every review as a list of dictionary indexes (still variable length).
train_sequences = tokenizer.texts_to_sequences(data_train)
val_sequences = tokenizer.texts_to_sequences(data_val)

print("Sentence in indexes:\n", train_sequences[0])

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
X_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_val = pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print("Sentence fitted to max length:\n", X_train[0])

Original sentence:
 I came into this movie expecting the years greatest hit, like so many people told me.  You would need to pay me, pretty well, to see this movie again.  It's like they stole they plot from "Dude Where's My Car" and replaced the car with a guy.  The humor was extremely juvenile and repetitive.I understand it's supposed to be a funny movie, I enjoy most Will Ferrell movies for example, but this movie wasn't even funny.  I can't even say it had a few funny parts, it had maybe one, two tops.I kept waiting for the "movie of the year" type scenes and they never came.Simply awful.
Sentence in indexes:
 [8, 335, 79, 9, 16, 1115, 1, 147, 734, 629, 33, 30, 108, 97, 595, 71, 19, 56, 346, 5, 875, 71, 202, 64, 5, 70, 9, 16, 157, 46, 33, 29, 4314, 29, 142, 34, 3267, 6519, 48, 648, 2, 2533, 1, 648, 14, 3, 292, 1, 374, 12, 714, 5384, 2, 4406, 8, 420, 46, 542, 5, 26, 3, 175, 16, 8, 239, 88, 61, 5965, 85, 13, 612, 17, 9, 16, 313, 62, 175, 8, 191, 62, 139, 10, 65, 3, 163, 175, 513, 10,

In [15]:
# Encode the unlabelled reviews with the already-fitted tokenizer, then fix their length.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(data_test),
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [16]:
# (number of reviews, tokens per review) after padding.
np.shape(X_train)

(36000, 40)

In [17]:
# path to embeddings file
EMBEDDINGS_DIR = '../embeddings'
EMBEDDINGS_FILE = 'glove.6B.50d.txt'

EMBEDDING_DIM = 50

# Words whose index fits inside the embedding matrix. The bound is taken from
# tokenizer.num_words (not a literal 10000) because the matrix built from this
# dictionary has exactly tokenizer.num_words rows.
first_10000 = {word: index for word, index in tokenizer.word_index.items()
               if index < tokenizer.num_words}


def load_embeddings(zip_path, member):
    """Read word vectors from a text file stored inside a zip archive.

    Each non-empty line is split on whitespace; the first field is the word
    (decoded as UTF-8) and the remaining fields are its float32 coefficients.

    Parameters
    ----------
    zip_path : str
        Path to the zip archive.
    member : str
        Name of the text file inside the archive.

    Returns
    -------
    dict
        Mapping word -> 1-D float32 numpy array.
    """
    vectors = {}
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open(member) as f:
            for raw_line in f:
                fields = raw_line.split()
                if not fields:
                    # a blank line carries no word; skip it instead of failing on fields[0]
                    continue
                vectors[fields[0].decode('UTF-8')] = np.asarray(fields[1:], dtype='float32')
    return vectors


# upload embeddings
embeddings = load_embeddings(os.path.join(EMBEDDINGS_DIR, EMBEDDINGS_FILE + '.zip'),
                             EMBEDDINGS_FILE)
print("Number of words with vector representation:", len(embeddings))

Number of words with vector representation: 400000


In [18]:
# Embedding matrix: row i holds the pretrained vector of the word with index i.
# Rows for words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((tokenizer.num_words, EMBEDDING_DIM))
for token, row in first_10000.items():
    if token in embeddings:
        embedding_matrix[row] = embeddings[token]

# Classifiers

In [19]:
NAME = "bidirectional_lstm"

# Embedding lookup initialised from embedding_matrix and kept frozen
# (trainable=False); mask_zero=True is set for index 0.
embedding_layer = Embedding(
    input_dim=tokenizer.num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
    mask_zero=True,
)

# embedding -> dropout -> bidirectional LSTM -> dropout -> single sigmoid unit
model = Sequential([
    embedding_layer,
    Dropout(0.2),
    Bidirectional(LSTM(100, dropout=0.1, recurrent_dropout=0.1)),
    Dropout(0.2),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            500000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 40, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               120800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 621,001
Trainable params: 121,001
Non-trainable params: 500,000
________________________________________________________________

In [20]:
# The checkpoint callback writes into ../models; create the directory up front so
# a fresh run does not fail when the first checkpoint is saved.
os.makedirs("../models", exist_ok=True)

# stop training when validation accuracy has not improved for five epochs
callback_1 = EarlyStopping(monitor='val_acc', min_delta=0, patience=5, verbose=0, mode='auto')
# keep only the best model (by validation accuracy) on disk
callback_2 = ModelCheckpoint("../models/model_{}.hdf5".format(NAME), monitor='val_acc',
                             save_best_only=True, verbose=1)

In [21]:
# validation_data is passed as an (inputs, targets) tuple, the form the Keras
# fit() documentation specifies; a list is not accepted by every Keras version.
model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
          batch_size=1024, epochs=10, callbacks=[callback_1, callback_2])

Train on 36000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff2801cbd30>

In [22]:
# The network ends in a single sigmoid unit, so predict() already returns the
# positive-class probability. predict_proba() is deprecated in later Keras releases.
pred = model.predict(X_test)



In [57]:
# Peek at the first three predicted probabilities.
pred[0:3]

array([[ 0.71437711],
       [ 0.10330234],
       [ 0.31465235]], dtype=float32)