In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [3]:
cd /content/drive/MyDrive/archive

/content/drive/MyDrive/archive


In [4]:
import pandas as pd
%matplotlib inline 
from matplotlib import pyplot as plt 
import numpy as np


In [5]:
data = pd.read_csv('IMDB Dataset.csv')

In [6]:
print(data.head(10))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive


 Let us first convert the sentiments into a form that could be understood by the computer. We can do it by assigning 1 value to positive sentiment and 0 value for negative sentiment 




In [7]:
def Sentiment_conversion(sentiment):
  if sentiment =='positive':
    return  1
  else : return 0 

In [8]:
data['sentiment'] = [Sentiment_conversion(sentiment) for sentiment in data['sentiment']]

In [9]:
print(data['sentiment'].head(10))

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    0
8    0
9    1
Name: sentiment, dtype: int64


In [10]:
print(pd.value_counts(data['sentiment']))


1    25000
0    25000
Name: sentiment, dtype: int64


We have equal numbers of sentiments distributed randomly through out our data set

# **Text- Preprocessing**

In [11]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import re
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
tokenizer = RegexpTokenizer("[\w']+")
for i in range(len(data['review'])):
  data['review'][i] = tokenizer.tokenize(data['review'][i])

In [13]:
print(len(data['review'][1]))

164


**Word Tokeniizing**(another method)

In [15]:
#data['review'] =  [word_tokenize(sentence) for sentence in data['review']]
#print(len(data['review'][1]))

162


**Removing Stopwords**

In [None]:
stop_words = set(stopwords.words('english'))
for i in range(len(data['review'] )):
  pd.set_option('mode.chained_assignment',None)
  data['review'][i] = [word.lower() for word in data['review'][i] if word.lower() not in stop_words]
print(len(data['review'][1]))
# We have stemmed a lot of words 
print(data['review'][1])

**Stemming**

In [15]:
ps = PorterStemmer()
for i in range(len(data['review'] )):
  pd.set_option('mode.chained_assignment',None)
  data['review'][i] = [ps.stem(word) for word in data['review'][i] ]

**Lemmetizing**

In [16]:
lm = WordNetLemmatizer()
for i in range(len(data['review'] )):
  pd.set_option('mode.chained_assignment',None)
  data['review'][i] = [lm.lemmatize(word.lower()) for word in data['review'][i]]

In [None]:
data['review'][1]

In [18]:
data.head(10)

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, h...",1
1,"[wonder, littl, product, br, br, film, techniq...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, there', famili, littl, boy, jake, thin...",0
4,"[petter, mattei', love, time, money, visual, s...",1
5,"[probabl, time, favorit, movi, stori, selfless...",1
6,"[sure, would, like, see, resurrect, date, seah...",1
7,"[show, amaz, fresh, innov, idea, 70', first, a...",0
8,"[encourag, posit, comment, film, look, forward...",0
9,"[like, origin, gut, wrench, laughter, like, mo...",1


AHH ! finally our cleaned data is here

In [19]:
data.to_csv('Cleaned_data.csv')

# **Making The dictionary**
we have used corpora module from gensim for this task 

In [None]:
from gensim import corpora
review_dict= corpora.Dictionary(data['review'])
print(review_dict)
print(review_dict.token2id)

# **Indexing the words from the dictionary** 

In [21]:
def indexing(sentence):
  return [review_dict.token2id[word] for word in sentence]
data['review_index'] = data['review'].apply(indexing)

In [None]:
data['review_index'][1]

We will be padding the sentences and shortening the long sentences so that the training in batches could be done more speedly 

In [23]:
def padding(sentence,seq_len):
  features = np.zeros((len(sentence),seq_len),dtype = int)
  for ii, review in enumerate(sentence):
    if len(review) != 0:
      features[ii,-len(review):] = np.array(review)[:seq_len]
  return features

# **Splitting the datatset**

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(data['review_index'],data['sentiment'],shuffle =True,test_size = 0.3,random_state = 15)
print(Y_train.value_counts())
print(Y_test.value_counts())


0    17512
1    17488
Name: sentiment, dtype: int64
1    7512
0    7488
Name: sentiment, dtype: int64


In [26]:
seq_len = 200
X_train_pad = padding(X_train,seq_len)
X_test_pad = padding(X_test,seq_len)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

# **MODEL**

In [28]:
import torch 
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader

train_data = TensorDataset(torch.from_numpy(X_train_pad),torch.from_numpy(Y_train))
test_data = TensorDataset(torch.from_numpy(X_test_pad),torch.from_numpy(Y_test))

train_loader = DataLoader(train_data,shuffle= True,batch_size = 500)
test_loader = DataLoader(test_data,shuffle = True,batch_size= 500)

In [29]:
class LSTM_net(nn.Module):
  def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob = 0.5):
    super(LSTM_net,self).__init__()
    
    self.output_size = output_size
    self.hidden_dim = hidden_dim
    
    self.n_layers = n_layers
    #embeddings
    self.embeddings = nn.Embedding(vocab_size,embedding_dim)
    #lstm
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,dropout = drop_prob,batch_first = True)
    # We have used two fully connected layers 
    self.dropout = nn.Dropout(drop_prob)
    self.fc1 = nn.Linear(hidden_dim,128)
    self.fc2 = nn.Linear(128,output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self,x,hidden):
    batch_size = x.size(0)
    x = x.long()
     #embedding and output of lstm
    embedding = self.embeddings(x)
    # the shape of the embedding tensor is  (x.shape,embedding_dim)
    # these embeddings will go as input to our lstm 
    lstm_output,hidden = self.lstm(embedding,hidden)

    lstm_output = lstm_output.contiguous().view(-1,self.hidden_dim)
     #dropout and fully connected layers
    lstm_output = self.dropout(lstm_output)
    output = self.fc1(lstm_output)
    output = self.dropout(output)
    output = self.fc2(output)
    #final output
    output = self.sigmoid(output)
    # reshaping to batch_size
    output = output.view(batch_size,-1)
    #getting the last batch of labels
    output = output[ : ,-1] 

    return output,hidden
    
  def initialize_hidden(self,batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.n_layers,batch_size,self.hidden_dim).zero_().to(device),weight.new(self.n_layers,batch_size,self.hidden_dim).zero_().to(device))
    return hidden

In [30]:
vocab_size = len(review_dict)+1
output_size = 1
embedding_dim = 200
n_layers = 2
hidden_dim = 256
batch_size = 500
model = LSTM_net(vocab_size,output_size,embedding_dim,hidden_dim,n_layers)
device = torch.device('cpu')
model.to(device)
print(model)

LSTM_net(
  (embeddings): Embedding(90078, 200)
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


# **Training our model**

In [31]:
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = lr)

In [35]:
print(str(1))

1


In [38]:
epochs = 3
clip = 5
batch_counter = 1
loss_at_epoch = []
for i in range(epochs):
  losses_train =[]
  train_acc = 0
  model.train()
  #initializing the hidden states
  h = model.initialize_hidden(batch_size)

  for inputs,labels in train_loader:
    h = tuple(e.data for e in h)
    inputs,labels = inputs.to(device),labels.to(device)
    #clearing all the accumulated gradients 
    model.zero_grad()
    # calculating the output
    output,h = model(inputs,h)
    #loss
    loss = criterion(output.squeeze(),labels.float())
    losses_train.append(loss.item())
    #accuracy
    prediction = torch.round(output.squeeze())
    train_acc += torch.sum(prediction == labels.squeeze()).item()
    # Back propogation
    loss.backward()
    #tackling the problem of vanishing gradient descent
    nn.utils.clip_grad_norm_(model.parameters(),clip)
    #updating the weights 
    optimizer.step()
  #calculating the total loss at the end of a epoch
  epoch_loss = np.mean(losses_train)
  loss_at_epoch.append(epoch_loss)
  #accuracy at the end of one epoch
  epoch_acc = train_acc/len(train_loader.dataset)
  print('Epoch :'+str(i+1))
  print('loss = '+str(epoch_loss))
  print("accuracy = "+str(epoch_acc))

Epoch :1
loss = 0.1835843203323228
accuracy = 0.9306857142857143
Epoch :2
loss = 0.14789513253739903
accuracy = 0.9467428571428571
Epoch :3
loss = 0.111133236810565
accuracy = 0.9604285714285714


# **Testing**

In [40]:
test_loss = []
test_acc = 0
model.eval() 
for inputs,labels in test_loader:
    h = tuple(e.data for e in h)
    inputs,labels = inputs.to(device),labels.to(device)
    #clearing all the accumulated gradients 
    model.zero_grad()
    # calculating the output
    output,h = model(inputs,h)
    #loss
    loss = criterion(output.squeeze(),labels.float())
    test_loss.append(loss.item())
    #accuracy
    prediction = torch.round(output.squeeze())
    test_acc += torch.sum(prediction == labels.squeeze()).item()
loss = np.mean(test_loss)
accuracy = test_acc/len(test_loader.dataset)
print('loss'+str(loss))
print('test accuracy'+str(accuracy))


loss0.33933846751848856
test accuracy0.8612666666666666
