### Set the seed

In [1]:
import numpy as np

# Fix NumPy's global random seed so that any NumPy-based randomness in this
# notebook is repeatable after a kernel restart.
SEED = 42
np.random.seed(SEED)

### Load data
The data can be downloaded from Kaggle: https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [2]:
import pandas as pd

# Read the zipped, tab-separated training file from the working directory.
# The first line is the header; quoting=3 is csv.QUOTE_NONE, so double quotes
# inside the reviews are kept as ordinary characters.
df = pd.read_csv(
    'labeledTrainData.tsv.zip',
    sep="\t",
    header=0,
    quoting=3,
)

print(df.shape)

(25000, 3)


In [3]:
# Preview the first rows of the frame (columns: id, sentiment, review).
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


## Data Preprocessing

1. Split the data into training and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'],
    random_state=42,
    test_size=0.2,
)

2. Build a tokenizer to turn each review into a sequence of word indices

In [5]:
# X_train is still a pandas Series at this point, so [1] selects by index
# label: this displays the review whose original DataFrame row label is 1
# (it ended up in the training split), not the second training sample.
X_train[1]

'"\\"The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [6]:
# Import from the public tf.keras API; `tensorflow.python.*` is a private
# namespace that is not guaranteed to exist across TensorFlow releases.
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary size used when converting reviews to index sequences
top_words = 10000

# Learn the word -> index mapping from the training reviews only
t = Tokenizer(num_words=top_words)
t.fit_on_texts(X_train.tolist())

# Replace every review by its sequence of word indices.
# NOTE: X_train / X_test are overwritten here (pandas Series -> list of lists),
# so re-running this cell requires re-running the train/test split cell first.
X_train = t.texts_to_sequences(X_train.tolist())
X_test = t.texts_to_sequences(X_test.tolist())

In [7]:
#X_train[1]

In [8]:
# X_train is now a plain list of index sequences, so [100] is positional.
# Before padding, each sequence has its own length.
len(X_train[100])

386

3. Pad sequences so that every review has the same length

In [9]:
# Import from the public tf.keras API; `tensorflow.python.*` is a private
# namespace that is not guaranteed to exist across TensorFlow releases.
from tensorflow.keras.preprocessing import sequence

# Fixed length (in tokens) that every review is brought to
max_review_length = 300

# Shorter reviews are padded at the end ('post').
# NOTE(review): longer reviews are cut using Keras' default truncating='pre',
# i.e. tokens are dropped from the start of the review -- confirm this is intended.
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length, padding='post')

In [10]:
# After padding, every sequence has exactly max_review_length entries.
len(X_train[1])

300

## Build Embedding Matrix from Pre-Trained Word2Vec model

Load pre-trained Gensim Embeddings

In [11]:
# gensim must be available in the kernel's environment; install it with:
#!pip install gensim --quiet

# Load the pre-trained Word2Vec model file from the working directory.
# NOTE(review): gensim model files are pickle-based -- only load files that
# come from a trusted source.
import gensim
word2vec = gensim.models.Word2Vec.load('word2vec-movie-50')

# Embedding length = number of columns of the model's word-vector matrix
embedding_vector_length = word2vec.wv.vectors.shape[1]

print('Loaded word2vec model..')
print('Model shape: ', word2vec.wv.vectors.shape)

Loaded word2vec model..
Model shape:  (28322, 50)


In [12]:
# (number of words in the pre-trained model, embedding length)
word2vec.wv.vectors.shape

(28322, 50)

Build matrix for current data

In [13]:
# Embedding matrix with one row per word index (vocabulary size + 1 rows),
# initialised to all zeros. Rows for which no pre-trained vector is found
# simply stay zero.
embedding_matrix = np.zeros((top_words + 1, embedding_vector_length))

# Populate the matrix:
# 1. Look at every word of the tokenizer vocabulary whose index is within
#    the first `top_words` indices.
# 2. If the pre-trained word2vec model knows the word, copy its vector into
#    the row given by the word's tokenizer index.
#
# Membership is tested with `word in word2vec.wv`, which works on both
# gensim 3.x and 4.x (the `wv.vocab` attribute was removed in gensim 4).
for word, i in t.word_index.items():
    if i > top_words:
        # Index falls outside the embedding matrix
        continue
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

In [14]:
# Sanity check: show the embedding row stored for the word 'great'
embedding_matrix[t.word_index['great']]

array([-0.73459101, -0.34348151,  4.09425545, -0.76835114,  0.98946816,
        1.87112844, -1.21309519,  0.02420728,  0.24738404, -3.80001116,
        1.00591576, -0.00599149, -1.10720205, -0.64441431,  1.55634487,
       -0.67932558,  2.68729401,  3.22928667,  1.98245931,  2.35130262,
       -2.0172646 ,  2.68419147,  5.51142311, -1.83284128, -0.6304661 ,
        1.92983949, -1.51213527, -2.33458519,  1.14439762,  0.23554215,
       -3.18900323, -1.71647346, -2.8396821 ,  2.10814762,  1.59747708,
       -2.05685472,  0.02134195, -1.49737895,  0.7752192 , -0.22689784,
       -0.89800125, -1.75604141,  1.26355755, -2.64244699,  2.0314641 ,
        1.48112845,  0.27211079, -0.79587841,  1.56275535, -4.55970097])

## Build the Graph

In [15]:
import tensorflow as tf

In [16]:
# Reset Keras' global state (e.g. automatic layer-name counters) so that
# re-running the model cells starts from a clean slate.
tf.keras.backend.clear_session()

# Seed TensorFlow's global random generator so that weight initialisation and
# dropout masks are repeatable across runs (as far as TensorFlow allows).
tf.random.set_seed(42)

In [17]:
# Import from the public tf.keras API; `tensorflow.python.*` is a private
# namespace that is not guaranteed to exist across TensorFlow releases.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Embedding, Flatten

# Build a sequential model; layers are added in the cells below
model = Sequential()

Add Embedding layer

In [18]:
# Embedding layer initialised with the pre-trained word2vec vectors.
embedding_layer = Embedding(
    input_dim=top_words + 1,              # number of word indices to handle
    output_dim=embedding_vector_length,   # embedding size, i.e. 50 in this case
    input_length=max_review_length,       # tokens per review, i.e. 300 in this case
    weights=[embedding_matrix],           # pre-trained embedding
    trainable=False,                      # keep the embedding fixed during training
)
model.add(embedding_layer)

In [19]:
# Symbolic output tensor of the model built so far (Embedding layer only).
model.output

<tf.Tensor 'embedding/Identity:0' shape=(None, 300, 50) dtype=float32>

The output of the Embedding layer is 3-dimensional:
- batch_size x max_review_length x embedding_vector_length.

We need to flatten this output before feeding it to the Dense layers.

In [20]:
# Flatten the embedding output and stack the dense classifier on top of it.
# Layers are created in the order they are added, so Keras assigns the same
# automatic layer names as when adding them one by one.
classifier_layers = [
    Flatten(),
    Dense(200, activation='relu'),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(60, activation='relu'),
    Dropout(0.4),
    Dense(30, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary label
]
for layer in classifier_layers:
    model.add(layer)

# Binary classification: cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [21]:
# Print every layer with its output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           500050    
_________________________________________________________________
flatten (Flatten)            (None, 15000)             0         
_________________________________________________________________
dense (Dense)                (None, 200)               3000200   
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 60)                6060      
_________________________________________________________________
dropout_1 (Dropout)          (None, 60)                0

## Execute the graph

In [None]:
# Train for a single epoch in mini-batches of 128 reviews.
# NOTE: the held-out test split doubles as the validation data here.
model.fit(X_train,y_train,
          epochs=1,
          batch_size=128,          
          validation_data=(X_test, y_test))

In [None]:
# Sigmoid outputs for test reviews 100 and 101: one value in [0, 1] per
# review, the model's predicted probability that sentiment == 1.
model.predict(X_test[100:102])