# Import Movie Review Data

Set the seed

In [1]:
import numpy as np

In [2]:
np.random.seed(42)

Import the dataset as pandas dataframe

In [3]:
import pandas as pd

Data can be downloaded from Kaggle at the following URL

- https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [4]:
df = pd.read_csv('labeledTrainData.tsv.zip',header=0, delimiter="\t", quoting=3)

In [34]:
df

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
...,...,...,...
24995,"""3453_3""",0,"""It seems like more consideration has gone int..."
24996,"""5064_1""",0,"""I don't believe they made this film. Complete..."
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui..."
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the..."


Split Data into Training and Test Data

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2, 
    random_state=42
)

# Build the Tokenizer

In [7]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [8]:
top_words = 10000

In [9]:
t = Tokenizer(num_words=top_words) # num_words -> Vocablury size

In [10]:
t.fit_on_texts(X_train.tolist())

# Prepare Training and Test Data

Get the word index for each of the word in the review

In [11]:
X_train = t.texts_to_sequences(X_train.tolist())

In [12]:
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each review?

# Pad Sequences - Important

In [13]:
from tensorflow.python.keras.preprocessing import sequence

In [14]:
max_review_length = 300

In [15]:
X_train = sequence.pad_sequences(X_train,maxlen=max_review_length,padding='post')

In [16]:
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length, padding='post')

# Build Embedding Matrix from Pre-Trained Word2Vec

Load pre-trained Gensim Embeddings

In [17]:
import gensim

ModuleNotFoundError: No module named 'gensim'

In [18]:
word2vec = gensim.models.Word2Vec.load('word2vec-movie-50')

NameError: name 'gensim' is not defined

Embedding Size

In [19]:
embedding_vector_length = word2vec.wv.syn0.shape[1]

NameError: name 'word2vec' is not defined

In [20]:
embedding_vector_length

NameError: name 'embedding_vector_length' is not defined

Build matrix for current data

In [21]:
embedding_matrix = np.zeros((top_words + 1, embedding_vector_length))

NameError: name 'embedding_vector_length' is not defined

In [22]:
embedding_matrix.shape

NameError: name 'embedding_matrix' is not defined

In [23]:
for word, i in sorted(t.word_index.items(),key=lambda x:x[1]):
    if i > top_words:
        break
    if word in word2vec.wv.vocab:
        embedding_vector = word2vec.wv[word]
        embedding_matrix[i] = embedding_vector

NameError: name 'word2vec' is not defined

In [24]:
max_review_length

300

# Build the Graph

In [25]:
from tensorflow.python.keras.models import Sequential

In [26]:
from tensorflow.python.keras.layers import Dropout, Dense, Embedding, Flatten, LSTM

In [27]:
model = Sequential()

Add Embedding layer
 - Embedding Layer Input = Batch_Size * Length of each review

In [28]:
model.add(Embedding(top_words + 1,
                    embedding_vector_length,
                    input_length=max_review_length,
                   weights=[embedding_matrix],
                   trainable=False)
         )

NameError: name 'embedding_vector_length' is not defined

In [29]:
model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add Layer with 100 LSTM Memory Units

In [30]:
model.add(LSTM(100))

In [31]:
model.add(Dense(1,activation='sigmoid'))

In [32]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

# Execute the graph

In [33]:
model.fit(X_train,y_train,
          epochs=10,
          batch_size=128,          
          validation_data=(X_test, y_test),
         verbose=1)

Epoch 1/10


ValueError: in user code:

    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\keras\engine\training.py:754 train_step
        y_pred = self(x, training=True)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\Mahesh\anaconda3\envs\mkansakar\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:223 assert_input_compatibility
        str(tuple(shape)))

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 300)
