In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df=pd.read_csv(r"C:\Users\USER\Downloads\code-main\DLNLP\Traindataset.csv")
testdf= pd.read_csv(r'C:\Users\USER\Downloads\code-main\DLNLP\E0334 Assignment2 Test Dataset.csv')

In [3]:
testdf.columns

Index(['review', 'sentiment'], dtype='object')

In [4]:
testdf.head()

Unnamed: 0,review,sentiment
0,Recap: Not entirely familiar with the Shakespe...,negative
1,"It has been 16 years since it's original run, ...",positive
2,I'm an animator myself and an all around buff ...,negative
3,The movie had no excitement and does not have ...,negative
4,i just got puzzled why damn FOX canceled the s...,positive


In [5]:
X,y = df['review'].values,df['sentiment'].values
x_train,x_val,y_train,y_val = train_test_split(X,y,stratify=y)
print(f'shape of train data is {x_train.shape}')
print(f'shape of validation data is {x_val.shape}')

shape of train data is (30000,)
shape of validation data is (10000,)


In [6]:
x_test,y_test = testdf['review'].values,testdf['sentiment'].values


In [7]:
df[df['sentiment']=='positive']

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
39983,"I loved it, having been a fan of the original ...",positive
39985,Imaginary Heroes is clearly the best film of t...,positive
39989,I got this one a few weeks ago and love it! It...,positive
39992,John Garfield plays a Marine who is blinded by...,positive


In [8]:
def preprocess_string(s):
    # Remove all non-word characters (everything except numbers and letters)
    s = re.sub(r"[^\w\s]", '', s)
    # Replace all runs of whitespaces with no space
    s = re.sub(r"\s+", '', s)
    # replace digits with no space
    s = re.sub(r"\d", '', s)

    return s

In [9]:
def tockenize(x_train,y_train,x_val,y_val,x_test,y_test):
    word_list = []

    stop_words = set(stopwords.words('english')) 
    for sent in x_train:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word not in stop_words and word != '':
                word_list.append(word)
  
    corpus = Counter(word_list)
    # sorting on the basis of most common words
    corpus_ = sorted(corpus,key=corpus.get,reverse=True)[:1000]
    # creating a dict
    onehot_dict = {w:i+1 for i,w in enumerate(corpus_)}
    
    # tockenize
    final_list_train,final_list_val ,final_list_test= [],[],[]
    for sent in x_train:
            final_list_train.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                     if preprocess_string(word) in onehot_dict.keys()])
    for sent in x_val:
            final_list_val.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                    if preprocess_string(word) in onehot_dict.keys()])
                                    
    for sent in x_test:
            final_list_test.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                    if preprocess_string(word) in onehot_dict.keys()])
    encoded_train = [1 if label =='positive' else 0 for label in y_train]  
    encoded_val = [1 if label =='positive' else 0 for label in y_val] 
    encoded_test = [1 if label =='positive' else 0 for label in y_test]
    return np.array(final_list_train), np.array(encoded_train),np.array(final_list_val), np.array(encoded_val),np.array(final_list_test), np.array(encoded_test),onehot_dict

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
Counter(['base','is','good'])

Counter({'base': 1, 'is': 1, 'good': 1})

In [12]:
x_train,y_train,x_val,y_val,x_test,y_test,vocab = tockenize(x_train,y_train,x_val,y_val,x_test,y_test)

  return np.array(final_list_train), np.array(encoded_train),np.array(final_list_val), np.array(encoded_val),np.array(final_list_test), np.array(encoded_test),onehot_dict


In [13]:
print(f'Length of vocabulary is {len(vocab)}')

Length of vocabulary is 1000


In [14]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [15]:
x_train_pad = padding_(x_train,500)
x_val_pad = padding_(x_val,500)
x_test_pad= padding_(x_test,500)

In [44]:
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_val_pad), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))


# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)



In [45]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([50, 500])
Sample input: 
 tensor([[  0,   0,   0,  ..., 618,  76,   1],
        [  0,   0,   0,  ..., 329, 191,   4],
        [  0,   0,   0,  ..., 499,  28,   9],
        ...,
        [  0,   0,   0,  ..., 706, 575, 421],
        [  0,   0,   0,  ...,  61, 293, 374],
        [  0,   0,   0,  ..., 621,  96, 453]], dtype=torch.int32)
Sample input: 
 tensor([0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 1], dtype=torch.int32)


In [46]:

from unicodedata import bidirectional


class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,num=1):
        super(SentimentRNN,self).__init__()
 
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num=num
 
        self.no_layers = no_layers
        self.vocab_size = vocab_size
    
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)
        
        self.bilstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True,bidirectional=True)
        # dropout layer
        self.dropout = nn.Dropout(0.3)
    
        # linear and sigmoid layer
        if self.num==1:
            self.fc = nn.Linear(self.hidden_dim, 128)
        else:
            self.fc = nn.Linear(self.hidden_dim*2, 128)
        self.fc2 =nn.Linear(128,self.output_dim)
        self.Relu=nn.ReLU()
        self.sig = nn.Sigmoid()
        
    def forward(self,x,hidden0):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        if self.num==1:
            lstm_out, hidden = self.lstm(embeds, hidden0)
        else:
            lstm_out, hidden = self.bilstm(embeds, hidden0)
        lstm_out = lstm_out[:,-1,:]
        # print("1.",lstm_out.shape)

        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim*self.num) 

        
        # print(bilstm_out.shape)
        # print("2.",bihidden[0].shape)
        # print("3.",hidden[1].shape)
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out =self.dropout(out)
        out= self.Relu(out)
        out=self.fc2(out)

        # sigmoid function
        sig_out = self.sig(out)
        # print("4.",sig_out.shape)
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        # print("5.",sig_out.shape)
        sig_out = sig_out[:, -1] # get last batch of labels
        # return last sigmoid output and hidden state
        return sig_out, hidden
        
        
        
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        if self.num==1:
            h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
            c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        else:
            h0 = torch.zeros((self.no_layers*2,batch_size,self.hidden_dim)).to(device)
            c0 = torch.zeros((self.no_layers*2,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden



In [47]:
device = torch.device("cuda")


In [48]:
no_layers = 2
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 300
output_dim = 1
hidden_dim = 256


model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,num=2)

#moving to gpu
model.to(device)

print(model)

SentimentRNN(
  (embedding): Embedding(1001, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True)
  (bilstm): LSTM(300, 256, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (Relu): ReLU()
  (sig): Sigmoid()
)


In [49]:
model.lstm

LSTM(300, 256, num_layers=2, batch_first=True)

In [50]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=lr,weight_decay=0.0001)

# function to predict accuracy
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [51]:
clip = 5
epochs = 30
valid_loss_min = np.Inf
# train for some number of epochs
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state 
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        
        inputs, labels = inputs.to(device), labels.to(device)   
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        
        model.zero_grad()
        output,h = model(inputs,h)
        # print(output.shape,h.shape)
        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        accuracy = acc(output,labels)
        train_acc += accuracy
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        # nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
 
    
        
    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_acc = 0.0
    model.eval()
    for inputs, labels in valid_loader:
            val_h = tuple([each.data for each in val_h])

            inputs, labels = inputs.to(device), labels.to(device)

            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.squeeze(), labels.float())

            val_losses.append(val_loss.item())
            
            accuracy = acc(output,labels)
            val_acc += accuracy
            
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), 'state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(25*'==')

Epoch 1
train_loss : 0.48243220522999763 val_loss : 0.3807396486401558
train_accuracy : 77.15333333333334 val_accuracy : 83.12
Validation loss decreased (inf --> 0.380740).  Saving model ...
Epoch 2
train_loss : 0.3677494246264299 val_loss : 0.34873192474246023
train_accuracy : 84.62333333333333 val_accuracy : 85.08
Validation loss decreased (0.380740 --> 0.348732).  Saving model ...
Epoch 3
train_loss : 0.3253137772033612 val_loss : 0.3472865402698517
train_accuracy : 86.49 val_accuracy : 84.56
Validation loss decreased (0.348732 --> 0.347287).  Saving model ...
Epoch 4
train_loss : 0.30400524199008944 val_loss : 0.3423831091821194
train_accuracy : 87.54333333333332 val_accuracy : 84.97
Validation loss decreased (0.347287 --> 0.342383).  Saving model ...
Epoch 5
train_loss : 0.28975273134807744 val_loss : 0.32721810169517995
train_accuracy : 87.89333333333333 val_accuracy : 85.84
Validation loss decreased (0.342383 --> 0.327218).  Saving model ...
Epoch 6
train_loss : 0.27725533230851

KeyboardInterrupt: 

In [None]:
val_h = model.init_hidden(batch_size)
val_acc=0
for inputs, labels in test_loader:
            val_h = tuple([each.data for each in val_h])

            inputs, labels = inputs.to(device), labels.to(device)

            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.squeeze(), labels.float())

            val_losses.append(val_loss.item())
            
            accuracy = acc(output,labels)
            val_acc += accuracy

In [None]:
val_acc/len(test_loader.dataset)*100

83.89999999999999