# **IMDB Rating review classification into Good or Bad**

In [0]:
%tensorflow_version 2.x

In [0]:
#As my dataset is in my drive, hence mounting my location to drive...
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#Importing the required packages...
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Activation, Embedding
from tensorflow.keras.models import Model

import nltk
nltk.download('punkt')
from nltk import word_tokenize

In [0]:
file= r'/content/imdb_labelled.txt'

In [0]:
imdb= pd.read_csv(file, sep='\t', header=None, names=['review','target'])

In [0]:
imdb.head()

Unnamed: 0,review,target
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


**Dowloading the pre-trained weights to use it for our model.**

In [0]:
##We can download Embedding directly from here using wget:
#Just copy the link for the Embedding that you want to use...
#https://nlp.stanford.edu/projects/glove/

In [0]:
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip

--2020-01-14 10:50:53--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2020-01-14 10:50:54--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2020-01-14 10:50:54--  http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]
Sav

In [0]:
!unzip /content/glove.42B.300d.zip 

Archive:  /content/glove.42B.300d.zip
  inflating: glove.42B.300d.txt      


In [0]:
embedding_index={}

f=open('glove.42B.300d.txt', encoding='utf-8')
for line in f:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:], dtype='float32')
  embedding_index[word]=coefs
f.close()

**Splitting our data into train and test**

In [0]:
from sklearn.model_selection import train_test_split
data_train, data_test= train_test_split(imdb, test_size=0.2)

In [0]:
x_train= data_train['review']
y_train= data_train['target']

x_test= data_test['review']
y_test= data_test['target']

**Data Preprocessing**

In [0]:
sent_lens= [len(word_tokenize(x)) for x in x_train]

In [0]:
np.percentile(sent_lens,95)

38.14999999999998

In [0]:
max_len=40

tk= Tokenizer(char_level=False, split=' ')
tk.fit_on_texts(x_train)

seq_train= tk.texts_to_sequences(x_train)
seq_test= tk.texts_to_sequences(x_test)

vocab_size= len(tk.word_index)

seq_train_matrix= sequence.pad_sequences(seq_train, maxlen=max_len)
seq_test_matrix= sequence.pad_sequences(seq_test, maxlen=max_len)

In [0]:
vocab_size

2718

In [0]:
##Creating our Embedding matrix to bring down the size to 300
#We will use 300D vector representation of the words from pretrained Embedding index that we downloaded..

In [0]:
embedding_matrix= np.zeros((vocab_size+1,300))

for word,i in tk.word_index.items():
  embed_vector= embedding_index.get(word)
  if embed_vector is not None:
    embedding_matrix[i]=embed_vector

**Network Architecture**

In [0]:
inputs=Input(shape=[max_len], name='text_input')
embed= Embedding(vocab_size+1,300, input_length=max_len, mask_zero=True, weights=[embedding_matrix], trainable=False)(inputs)
lstm_layer= LSTM(50)(embed)
dense1=Dense(10, activation='relu')(lstm_layer)
drop=Dropout(0.2)(dense1)
final_layer= Dense(1, activation='sigmoid')(drop)
model= Model(inputs=inputs, outputs=final_layer)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_input (InputLayer)      [(None, 40)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 300)           815700    
_________________________________________________________________
lstm (LSTM)                  (None, 50)                70200     
_________________________________________________________________
dense (Dense)                (None, 10)                510       
_________________________________________________________________
dropout (Dropout)            (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 886,421
Trainable params: 70,721
Non-trainable params: 815,700
__________________________________________________

In [0]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(seq_train_matrix, y_train.values, validation_data=[seq_test_matrix, y_test.values], epochs=40, class_weight={0:1, 1:12}, batch_size=50)

Train on 598 samples, validate on 150 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fbc6034feb8>

In [0]:
p=model.predict(seq_test_matrix)

In [0]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,p)

0.90875