In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manansinghmehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Dataset Generation 

## Read Data

In [2]:
df = pd.read_table('/Users/manansinghmehta/Downloads/amazon_reviews_us_Beauty_v1_00.tsv',on_bad_lines='skip')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Keep Reviews and Ratings

In [3]:
df = df[['review_body','star_rating']]
df.head()

Unnamed: 0,review_body,star_rating
0,"Love this, excellent sun block!!",5
1,The great thing about this cream is that it do...,5
2,"Great Product, I'm 65 years old and this is al...",5
3,I use them as shower caps & conditioning caps....,5
4,This is my go-to daily sunblock. It leaves no ...,5


In [4]:
#some start_ratings have dates removing those rows
df = df[ df['star_rating'].apply(lambda x: (isinstance(x, str) and len(x) == 1) or isinstance(x, int))]

In [5]:
# dropping nan values from df
df.dropna(axis = 0, inplace = True)

In [6]:
mappings = {'1':1,
            '2':1,
            '3':2,
            '4':3,
            '5':3,
            1:1,
            2:1,
            3:2,
            4:3,
            5:3
           }

df['star_rating'] = df['star_rating'].apply(lambda x : mappings[x])
df.head()

Unnamed: 0,review_body,star_rating
0,"Love this, excellent sun block!!",3
1,The great thing about this cream is that it do...,3
2,"Great Product, I'm 65 years old and this is al...",3
3,I use them as shower caps & conditioning caps....,3
4,This is my go-to daily sunblock. It leaves no ...,3


In [7]:
class_1 = df[ df['star_rating'] == 1 ].sample(n = 20000)
class_2 = df[ df['star_rating'] == 2 ].sample(n = 20000)
class_3 = df[ df['star_rating'] == 3 ].sample(n = 20000)

In [8]:
df = pd.concat([class_1, class_2,class_3], ignore_index=True)
df = df.sample(frac = 1)
df = df.reset_index(drop= True)
df.head()

Unnamed: 0,review_body,star_rating
0,Does anti-cellulite cream even exist?,1
1,It is not water resistant like it said it was....,2
2,Service good. Product to pricy to purchase. Ma...,1
3,so far i dont know if it is working so we shal...,2
4,I had given this product a great rating on ano...,2


# 2. Word Embedding (25 points)

## (a) Load the pretrained Word2Vec model and check semantic similarities

In [9]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [10]:
for index, word in enumerate(wv.index_to_key):
    if index == 5:
        break
    print(f"{index} — {word}")

0 — </s>
1 — in
2 — for
3 — that
4 — is


### Sample Word Embedding Extraction

In [11]:
wv['play']

array([ 0.01226807,  0.06225586,  0.10693359,  0.05810547,  0.23828125,
        0.03686523,  0.05151367, -0.20703125,  0.01989746,  0.10058594,
       -0.03759766, -0.1015625 , -0.15820312, -0.08105469, -0.0390625 ,
       -0.05053711,  0.16015625,  0.2578125 ,  0.10058594, -0.25976562,
        0.078125  ,  0.16113281,  0.12890625, -0.00318909,  0.00744629,
        0.10693359, -0.01696777, -0.22070312,  0.01239014,  0.08154297,
       -0.12158203,  0.24707031, -0.08105469, -0.21191406,  0.12695312,
        0.07568359,  0.12792969,  0.11035156,  0.09863281,  0.24316406,
        0.21875   ,  0.09716797,  0.13671875,  0.04272461, -0.05151367,
       -0.01257324, -0.11083984, -0.17089844,  0.07470703,  0.15625   ,
        0.04101562,  0.26953125,  0.01037598, -0.09814453,  0.10595703,
       -0.3203125 ,  0.10888672, -0.09228516,  0.05517578, -0.00756836,
       -0.03344727,  0.12207031, -0.07324219,  0.04467773,  0.046875  ,
        0.05957031,  0.06225586,  0.11035156,  0.24023438,  0.29

In [12]:
# calculating distance between two vectors
def distance(p1,p2):
    return np.sum((p1-p2)**2)


# K-nearest-neighbours algorithm to find the top K neighbours to a word
def knn(vector, trained = "pretrained", k = 3):
    d = []
    if trained == "pretrained":
        for key in wv.index_to_key:

            dist = distance(vector,wv[key])
            d.append( (dist,key) )

        d = np.array(sorted(d))[:,1]
        d = d[:k]
    else:
        for key in emb_model.wv.index_to_key:

            dist = distance(vector,emb_model.wv[key])
            d.append( (dist,key) )

        d = np.array(sorted(d))[:,1]
        d = d[:k]

    return d

###  checking semantic similarities
1. big − bigger + warmer = warmer, **warm**, chilly

In [13]:
#  big − bigger + warmer
semantic_meaning_1 = wv['big'] - wv['bigger'] + wv['warmer']
knn(semantic_meaning_1)

array(['warmer', 'warm', 'chilly'], dtype='<U98')

2. awesome = **amazing**, **fantastic**

In [14]:
# awesome
semantic_meaning_2 = wv['awesome']
knn(semantic_meaning_2)

array(['awesome', 'amazing', 'fantastic'], dtype='<U98')

3. book = **books**, **Booklocker.com**

In [15]:
# book  
semantic_meaning_3 = wv['book']
knn(semantic_meaning_3)

array(['book', 'books', 'Booklocker.com'], dtype='<U98')

## (b) Train a Word2Vec model using your own dataset

In [16]:
# prepare data for training word2vec model —> list of lists of tokens
sentences = []
for sent in df["review_body"]:
    sentences.append(sent.split(" "))

In [17]:
import gensim.models
# defining model
emb_model = gensim.models.Word2Vec(sentences=sentences, vector_size = 300, min_count = 9, window = 13)

### Semantic similarities using my model
1.  big − bigger + warmer = big, huge

In [18]:
semantic_meaning_1_mymodel = emb_model.wv['big'] - emb_model.wv['bigger'] + emb_model.wv['warmer']
knn(semantic_meaning_1_mymodel, trained="trained")

array(['big', 'huge', 'sometimes'], dtype='<U32')

2. awesome = **fabulous** , **fantastic**

In [19]:
semantic_meaning_2_mymodel = emb_model.wv['awesome']
knn(semantic_meaning_2_mymodel, trained = "trained")

array(['awesome', 'fabulous', 'fantastic'], dtype='<U32')

3. book = posts , detail

In [20]:
semantic_meaning_3_mymodel = emb_model.wv['book']
knn(semantic_meaning_3_mymodel, trained = "trained")

array(['book', 'posts', 'detail'], dtype='<U32')

### `Q — What do you conclude from comparing vectors generated by yourself and the pretrained model? Which of the Word2Vec models seems to encode semantic similarities between words better?`

**Ans —** From my observation I conclude that vectors from the pretrained model performed better than vectors trained by me. 

- Pretrained model gave better relevant results for the queries 1 and 3:— 

|    | Query                    | Pre-trained model | my model          |
|----|--------------------------|-------------------|-------------------|
| 1. | big - bigger + warmer =  | warm              | big/huge          |
| 2. | awesome =                | amazing/fantastic | fabulous/fantastic |
| 3. | book =                   | books, Booklocker.com  | posts/detail            |

- The pretrained model performed better on equations because I think it has a better understanding of word meanings. as it was trained on much larger corpora
- For simple words like awesome, both models performed well

# 3. Simple models (20 points)

In [21]:
# Creating Train/Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['review_body'], df['star_rating'], test_size=0.20, random_state=20)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

#### TF-IDF Feature Extraction

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer    
# using sk-learns TfidfVectorizer to generate tf-idf vectors
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = 5,max_features = 10000)

X_tfidf_train = vectorizer.fit_transform(X_train)
X_tfidf_test = vectorizer.transform(X_test)

#### Word2vec Feature extraction

In [23]:
X_word2v_train = []
X_word2v_test = []

# converting each training review to a word2vec average vector
for review in X_train:
    n = len(review)
    words = review.split(" ")
    
    avg_vector = np.zeros(300,)
    
    for word in words:
        try:
            avg_vector += wv[word]
        except:
            pass
    
    avg_vector /= n
    X_word2v_train.append(avg_vector)
    
# converting each test review to a word2vec average vector
for review in X_test:

    n = len(review)
    words = review.split(" ")
    
    avg_vector = np.zeros(300,)
    
    for word in words:
        try:
            avg_vector += wv[word]
        except:
            pass
    
    avg_vector /= n
    X_word2v_test.append(avg_vector)



## Perceptron

In [26]:
#importing perceptron and classification_report
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
#class names for 
target_names = ['class 1', 'class 2', 'class 3']

### Perceptron TF-IDF

In [27]:
perceptron = Perceptron(
    penalty= 'l1',      # regularization term
    alpha=0.000003,     # constant to attached to regularization term
    shuffle=True,    
    tol=1e-4,           # stopping criteria
)
perceptron.fit(X_tfidf_train , y_train)

y_test_pred_perceptron = perceptron.predict(X_tfidf_test)


print( "Results for Perceptron model TF_IDF")
print()
print("Accuracy = ", classification_report(y_test, y_test_pred_perceptron, output_dict = True,target_names=target_names)["accuracy"])

Results for Perceptron model TF_IDF

Accuracy =  0.6678333333333333


### Perceptron Word2Vec

In [28]:
perceptron = Perceptron(
    penalty= 'l1',      # regularization term
    alpha=0.000003,     # constant to attached to regularization term
    shuffle=True,    
    tol=1e-4,           # stopping criteria
)
perceptron.fit(X_word2v_train , y_train)

y_test_pred_perceptron = perceptron.predict(X_word2v_test)

print( "Results for Perceptron model Word2Vec")
print()
print("Accuracy = ", classification_report(y_test, y_test_pred_perceptron, output_dict = True,target_names=target_names)["accuracy"])

Results for Perceptron model Word2Vec

Accuracy =  0.4375


### Perceptron
- TF-IDF Accuracy = 0.667833
- Word2vec Accuracy = 0.4375

# SVM

In [29]:
from sklearn.svm import LinearSVC

In [30]:
svm_classifier = LinearSVC(
    penalty='l1',                  # Using L2-Norm
    max_iter=2500,
    loss='squared_hinge',          # Loss function
    dual=False,                    # since n_samples > n_features, preferred to set dual = False
    tol=1e-4,              
    C=0.5,                        # Regularization parameter
)

svm_classifier.fit(X_tfidf_train , y_train)
y_test_pred_svm = svm_classifier.predict(X_tfidf_test)

print( "Results for SVM model TF-IDF")
print()
print("Accuracy = ", classification_report(y_test, y_test_pred_svm, output_dict=True, target_names=target_names)["accuracy"])

Results for SVM model TF-IDF

Accuracy =  0.73725


In [31]:
# SVM
from sklearn.svm import LinearSVC

svm_classifier = LinearSVC(
    penalty='l1',                  # Using L2-Norm
    max_iter=2500,
    loss='squared_hinge',          # Loss function
    dual=False,                    # since n_samples > n_features, preferred to set dual = False
    tol=1e-4,              
    C=0.5,                        # Regularization parameter
)

svm_classifier.fit(X_word2v_train , y_train)
y_test_pred_svm = svm_classifier.predict(X_word2v_test)

print( "Results for SVM model Word2Vec")
print()
print("Accuracy = ", classification_report(y_test, y_test_pred_svm, output_dict=True, target_names=target_names)["accuracy"])

Results for SVM model Word2Vec

Accuracy =  0.6260833333333333


### SVM
- TF-IDF Accuracy = 0.73725
- word2vec Accuracy = 0.62608333

### `Q - What do you conclude from comparing performances for the models trained using the two different feature types`

**Ans —**

- I conclude that SVM performs better than than the perceptron model, for both tf-idf and word2vec features. The reason could be that SVM's can handle high-dimensional data and are less sensitive to noisy data.
- Tf-idf features performed better than word2vec for both models (SVM & perceptron). The reason could be that TF-IDF  works better than word2vec, when the classification task relies more heavily on the frequency of individual words rather than their semantic meaning. Which might be true since ours is a text classification task, presence of some words might help classify a review.
- There was a huge jump in accuracy for the word2vec features when trained on SVM model

# 4. Feedforward Neural Networks (25 points)

In [32]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import OneHotEncoder

In [33]:
# converting y target labels to One hot encoding for training
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

onehotencoder = OneHotEncoder(sparse = False).fit(y_train)

y_train_ohe = onehotencoder.transform(y_train)
y_test_ohe = onehotencoder.transform(y_test)

In [34]:
train_data = []
test_data = []

# creating training data of the format [avg_word2vec, one_hot_label]
for i in range(len(y_train)):
    train_data.append([X_word2v_train[i],y_train_ohe[i]])

# creating test data of the format [avg_word2vec, one_hot_label]
for i in range(len(y_test)):
    test_data.append([X_word2v_test[i],y_test_ohe[i]])

In [35]:
#creating train and test data_loaders for pyTorch training
train_loader = torch.utils.data.DataLoader( train_data, batch_size = 10000)
test_loader = torch.utils.data.DataLoader( test_data, batch_size = 1 )

In [36]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist

import torch.nn.functional as F

# define the NN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        # linear layer (300 -> hidden_1)
        self.fc1 = nn.Linear(300, hidden_1)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (n_hidden -> 3)
        self.fc3 = nn.Linear(hidden_2, 3)


    def forward(self, x):
        # add hidden layer, with relu activation function
        x = x.to(torch.float32)
        x = F.relu(self.fc1(x))
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add output layer
        x = F.softmax(self.fc3(x),dim=1)
        return x

# initialize the NN
mlp = Net()
  
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(mlp.parameters(), lr=0.007, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)

In [37]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
# number of epochs to train the model
n_epochs = 250


valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    train_loss = 0.0
    
    mlp.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = mlp(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    
    train_loss = train_loss/len(train_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        ))

Epoch: 1 	Training Loss: 1.099500
Epoch: 2 	Training Loss: 1.096505
Epoch: 3 	Training Loss: 1.090415
Epoch: 4 	Training Loss: 1.078495
Epoch: 5 	Training Loss: 1.057826
Epoch: 6 	Training Loss: 1.029468
Epoch: 7 	Training Loss: 1.001280
Epoch: 8 	Training Loss: 0.977913
Epoch: 9 	Training Loss: 0.959913
Epoch: 10 	Training Loss: 0.946585
Epoch: 11 	Training Loss: 0.937092
Epoch: 12 	Training Loss: 0.930173
Epoch: 13 	Training Loss: 0.924785
Epoch: 14 	Training Loss: 0.920523
Epoch: 15 	Training Loss: 0.917049
Epoch: 16 	Training Loss: 0.914253
Epoch: 17 	Training Loss: 0.912000
Epoch: 18 	Training Loss: 0.910093
Epoch: 19 	Training Loss: 0.908430
Epoch: 20 	Training Loss: 0.906939
Epoch: 21 	Training Loss: 0.905790
Epoch: 22 	Training Loss: 0.905088
Epoch: 23 	Training Loss: 0.904785
Epoch: 24 	Training Loss: 0.902876
Epoch: 25 	Training Loss: 0.900215
Epoch: 26 	Training Loss: 0.898803
Epoch: 27 	Training Loss: 0.898073
Epoch: 28 	Training Loss: 0.897722
Epoch: 29 	Training Loss: 0.8

Epoch: 232 	Training Loss: 0.864950
Epoch: 233 	Training Loss: 0.864811
Epoch: 234 	Training Loss: 0.864700
Epoch: 235 	Training Loss: 0.864493
Epoch: 236 	Training Loss: 0.864334
Epoch: 237 	Training Loss: 0.864050
Epoch: 238 	Training Loss: 0.863730
Epoch: 239 	Training Loss: 0.863348
Epoch: 240 	Training Loss: 0.863025
Epoch: 241 	Training Loss: 0.862740
Epoch: 242 	Training Loss: 0.862498
Epoch: 243 	Training Loss: 0.862336
Epoch: 244 	Training Loss: 0.862250
Epoch: 245 	Training Loss: 0.862221
Epoch: 246 	Training Loss: 0.862233
Epoch: 247 	Training Loss: 0.862249
Epoch: 248 	Training Loss: 0.862269
Epoch: 249 	Training Loss: 0.862320
Epoch: 250 	Training Loss: 0.862449


## 4 (a) accuracy on the testing split of MLP using `average Word2Vec vectors` = 0.6375833

In [38]:
def predict(model, dataloader):
    prediction_list = []
    acc = 0
    for batch, target in dataloader:
        outputs = model(batch)
        _, predicted = torch.max(outputs.data, 1) 
        _, actual = torch.max(target.data,1)

        prediction_list.append(predicted.cpu())
        acc += predicted.numpy()[0] ==  actual.numpy()[0]
    
    acc /= len(dataloader)
    return acc

accuracy_on_test = predict(mlp,test_loader)
print("Accuracy of MLP on average word2Vec vectors = ", accuracy_on_test)

Accuracy of MLP on average word2Vec vectors =  0.6375833333333333


## 4 (b) concatenating first 10 Word2Vec vectors

In [39]:
# getting vocab of all the words in pretrained glove vectors
vocabulary = set(wv.index_to_key)

### Building dataset of concatenated word embeddings

In [40]:
X_concatenated_train = []
X_concatenated_test = []

# Creating concatenated word vector features for training data 
for i in range(len(X_train)):

    concatenated_vect = np.array([])
    sent_length = len(X_train[i].split(" "))
    zero_vectors_to_add = 0
    
    if sent_length < 10:
        zero_vectors_to_add = 10 - sent_length
    

    # if sent_len >= 10 then just add all words in concatenated vectors
    if zero_vectors_to_add == 0:
        for idx,word in enumerate(X_train[i].split(" ")):
            if idx == 10:
                break
                
            if word in vocabulary:
                concatenated_vect = np.concatenate((concatenated_vect,wv[word]))
            else:
                concatenated_vect = np.concatenate((concatenated_vect,np.zeros(300,)))

    # add as many words as in the review then append by zeros
    else:
        for word in X_train[i].split(" "):
            if word in vocabulary:
                concatenated_vect = np.concatenate((concatenated_vect,wv[word]))
            else:
                concatenated_vect = np.concatenate((concatenated_vect,np.zeros(300,)))
        
        concatenated_vect = np.concatenate(( concatenated_vect, np.zeros( zero_vectors_to_add*300, ) ))
    
    X_concatenated_train.append(concatenated_vect)
    

# Creating concatenated word vector features for test data
for i in range(len(X_test)):

    concatenated_vect = np.array([])
    sent_length = len(X_test[i].split(" "))
    zero_vectors_to_add = 0
    
    if sent_length < 10:
        zero_vectors_to_add = 10 - sent_length
    
    # if sent_len >= 10 then just add all words in concatenated vectors
    if zero_vectors_to_add == 0:
        for idx,word in enumerate(X_test[i].split(" ")):
            if idx == 10:
                break
            if word in vocabulary:
                concatenated_vect = np.concatenate((concatenated_vect,wv[word]))
            else:
                concatenated_vect = np.concatenate((concatenated_vect,np.zeros(300,)))
    
    # add as many words as in the review then append by zeros
    else:
        for word in X_test[i].split(" "):
            if word in vocabulary:
                concatenated_vect = np.concatenate((concatenated_vect,wv[word]))
            else:
                concatenated_vect = np.concatenate((concatenated_vect,np.zeros(300,)))
        
        concatenated_vect = np.concatenate(( concatenated_vect, np.zeros( zero_vectors_to_add*300, ) ))
    
    X_concatenated_test.append(concatenated_vect)
    
    
    

In [41]:
train_data = []
test_data = []

# creating training data of the format [concatenated_word2vec, one_hot_label]
for i in range(len(y_train)):
    train_data.append([X_concatenated_train[i],y_train_ohe[i]])
    

# creating testing data of the format [concatenated_word2vec, one_hot_label]
for i in range(len(y_test)):
    test_data.append([X_concatenated_test[i],y_test_ohe[i]])

In [42]:
# creating train and test data loaders for PyTorch model training
train_loader = torch.utils.data.DataLoader( train_data, batch_size = 10000)
test_loader = torch.utils.data.DataLoader( test_data, batch_size = 1 )

In [43]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
import torch.nn.functional as F

# define the NN architecture
class Net(nn.Module):
    def __init__(self, input_dim):
        super(Net, self).__init__()        
        self.input_dim = input_dim
        hidden_1 = 100
        hidden_2 = 10
        # linear layer (300 -> hidden_1)
        self.fc1 = nn.Linear(input_dim, hidden_1)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (n_hidden -> 3)
        self.fc3 = nn.Linear(hidden_2, 3)


    def forward(self, x):
        # add hidden layer, with relu activation function
        x = x.to(torch.float32)
        x = F.relu(self.fc1(x))
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add output layer
        x = F.softmax(self.fc3(x),dim=1)
        return x

# initialize the NN
mlp2 = Net(3000)
  
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(mlp2.parameters(), lr=0.007, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)

In [44]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
n_epochs = 200

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):

    train_loss = 0.0
    
    mlp2.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = mlp2(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    
    train_loss = train_loss/len(train_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        ))

Epoch: 1 	Training Loss: 1.087985
Epoch: 2 	Training Loss: 1.030642
Epoch: 3 	Training Loss: 0.980365
Epoch: 4 	Training Loss: 0.960201
Epoch: 5 	Training Loss: 0.944930
Epoch: 6 	Training Loss: 0.934093
Epoch: 7 	Training Loss: 0.923911
Epoch: 8 	Training Loss: 0.915669
Epoch: 9 	Training Loss: 0.911624
Epoch: 10 	Training Loss: 0.901316
Epoch: 11 	Training Loss: 0.909968
Epoch: 12 	Training Loss: 0.917659
Epoch: 13 	Training Loss: 0.903379
Epoch: 14 	Training Loss: 0.884749
Epoch: 15 	Training Loss: 0.892234
Epoch: 16 	Training Loss: 0.873976
Epoch: 17 	Training Loss: 0.870357
Epoch: 18 	Training Loss: 0.877344
Epoch: 19 	Training Loss: 0.901325
Epoch: 20 	Training Loss: 0.892001
Epoch: 21 	Training Loss: 0.864076
Epoch: 22 	Training Loss: 0.860680
Epoch: 23 	Training Loss: 0.857964
Epoch: 24 	Training Loss: 0.844885
Epoch: 25 	Training Loss: 0.830329
Epoch: 26 	Training Loss: 0.823035
Epoch: 27 	Training Loss: 0.820873
Epoch: 28 	Training Loss: 0.820785
Epoch: 29 	Training Loss: 0.8

## 4 (b) accuracy on the testing split of MLP using `concatenated Word2Vec features` =  0.538

In [45]:
accuracy_on_test = predict(mlp2,test_loader)
print("Accuracy of MLP on concatenated word2vec features = ",accuracy_on_test)

Accuracy of MLP on concatenated word2vec features =  0.538


### `What do you conclude by comparing accuracy values you obtain with those obtained in the “’Simple Models” section.`
**Ans —**
- The simple models trained using TF-IDF features performed better than MLP models trained on word2vec
- The MLP models performed better than simple models when only using word2vec features

# 5. Recurrent Neural Networks (30 points)

In [46]:
X_rnn_train = []
X_rnn_test = []

# preparing data for rnn-training
# of the form [ [emb_w1] , [emb_w2] , [emb_w3] ... ]
# shape = (1, 20, 300)
for i in range(len(X_train)):
    
    list_of_embs = []
    sent = X_train[i]
    words = sent.split(" ")
    
    len_of_rev = len(words)
    
    if len_of_rev >= 20:

        for idx,word in enumerate(words):
            
            if idx == 20:
                break

            if word in vocabulary:
                list_of_embs.append(wv[word])
            else:
                list_of_embs.append(np.zeros(300,))
    
    else:
        zeros_to_add = 20 - len_of_rev
        
        for word in words:
    
            if word in vocabulary:
                list_of_embs.append(wv[word])
            else:
                list_of_embs.append(np.zeros(300,))
        
        while zeros_to_add != 0:
            list_of_embs.append(np.zeros(300,))
            zeros_to_add -= 1
    
    
    X_rnn_train.append(torch.tensor(list_of_embs))
            

# preparing data for rnn test data
# of the form [ [emb_w1] , [emb_w2] , [emb_w3] ... ]
# shape = (1, 20, 300)

for i in range(len(X_test)):
    
    list_of_embs = []
    sent = X_test[i]
    words = sent.split(" ")
    
    len_of_rev = len(words)
    
    if len_of_rev >= 20:

        for idx,word in enumerate(words):
            
            if idx == 20:
                break

            if word in vocabulary:
                list_of_embs.append(wv[word])
            else:
                list_of_embs.append(np.zeros(300,))
    
    else:
        zeros_to_add = 20 - len_of_rev
        
        for word in words:
    
            if word in vocabulary:
                list_of_embs.append(wv[word])
            else:
                list_of_embs.append(np.zeros(300,))
        
        while zeros_to_add != 0:
            list_of_embs.append(np.zeros(300,))
            zeros_to_add -= 1
    
    
    X_rnn_test.append(torch.tensor(list_of_embs))

  X_rnn_train.append(torch.tensor(list_of_embs))


In [47]:
train_data = []
test_data = []

# creating training data of the format [ list of 20 embeddings , one_hot_label]
for i in range(len(y_train)):
    train_data.append([X_rnn_train[i],y_train_ohe[i]])

# creating testing data of the format [ list of 20 embeddings , one_hot_label]
for i in range(len(y_test)):
    test_data.append([X_rnn_test[i],y_test_ohe[i]])

train_loader = torch.utils.data.DataLoader( train_data, batch_size = 50)
test_loader = torch.utils.data.DataLoader( test_data, batch_size = 1 )

In [48]:
# took inspiration from — https://www.cs.toronto.edu/~lczhang/360/lec/w06/rnn.html
import torch.nn.functional as F
from torch.autograd import Variable

# Create RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNNModel, self).__init__()
        
        # Number of hidden dimensions
        self.hidden_dim = 20
        
        # Number of hidden layers
        self.layer_dim = 1
        
        # RNN
        self.rnn = nn.RNN(input_size = input_dim, hidden_size = hidden_dim, num_layers = layer_dim, batch_first=True, nonlinearity='relu')
        
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = x.to(torch.float32)

        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
            
        # One time step
        out, hn = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        
        return out


# initialize the NN
rnn_model = RNNModel(300, 20, 1, 3)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=0.007, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)
# optimizer = torch.optim.SGD(rnn_model.parameters(), lr = 0.01)

In [49]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
n_epochs = 200

valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    
    rnn_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = rnn_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    
    train_loss = train_loss/len(train_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        ))

Epoch: 1 	Training Loss: 1.073449
Epoch: 2 	Training Loss: 0.982466
Epoch: 3 	Training Loss: 0.924517
Epoch: 4 	Training Loss: 0.904837
Epoch: 5 	Training Loss: 0.886622
Epoch: 6 	Training Loss: 0.904191
Epoch: 7 	Training Loss: 0.876907
Epoch: 8 	Training Loss: 0.864370
Epoch: 9 	Training Loss: 0.857110
Epoch: 10 	Training Loss: 0.859337
Epoch: 11 	Training Loss: 0.859743
Epoch: 12 	Training Loss: 0.856843
Epoch: 13 	Training Loss: 0.846146
Epoch: 14 	Training Loss: 0.847738
Epoch: 15 	Training Loss: 0.843024
Epoch: 16 	Training Loss: 0.838097
Epoch: 17 	Training Loss: 0.837586
Epoch: 18 	Training Loss: 0.840363
Epoch: 19 	Training Loss: 0.829435
Epoch: 20 	Training Loss: 0.838169
Epoch: 21 	Training Loss: 0.834091
Epoch: 22 	Training Loss: 0.828550
Epoch: 23 	Training Loss: 0.846776
Epoch: 24 	Training Loss: 0.835508
Epoch: 25 	Training Loss: 0.835366
Epoch: 26 	Training Loss: 0.830530
Epoch: 27 	Training Loss: 0.827662
Epoch: 28 	Training Loss: 0.831931
Epoch: 29 	Training Loss: 0.8

## Accuracy of RNN mdoel in test set =  0.5745833

In [50]:
accuracy_on_test = predict(rnn_model,test_loader)
print("Accuracy of RNN mdoel in test set = ",accuracy_on_test)

Accuracy of RNN mdoel in test set =  0.5745833333333333


### `Q — What do you conclude by comparing accuracy values you obtain with those obtained with feedforward neural network models`

**Ans —**
- I do not observe much improvement using an RNN instead of simple Feed Forward Network. RNNs generally perform better than FNNs because they are specifically built for NLP sequence tasks. But since, a text classification problem relies heavily on presence of few words, I think the sequence understanding/order of words did not mater in this case and FNNs performed equally well.

- The FNNs got an accuracy of *0.637583333* on avg_word2vec and *0.538* on concatenated_word2vec. While the RNN got an accuracy of 0.5745

- The accuracy is better than concatenated vectors but not better than avg_word2vec

## 5 (b) gated recurrent unit cell

In [51]:
# took inspiration from — https://www.cs.toronto.edu/~lczhang/360/lec/w06/rnn.html
class GRUModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(GRUModel, self).__init__()
        
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        
        # Number of hidden layers
        self.layer_dim = layer_dim
        
        # RNN
        self.gru = nn.GRU(input_size = input_dim, hidden_size = hidden_dim, num_layers = layer_dim, batch_first=True)
        
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = x.to(torch.float32)

        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
            
        # One time step
        out, hn = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        
        return out


# initialize the NN
gru_model = GRUModel(300, 20, 1, 3)

criterion = nn.CrossEntropyLoss()

# optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)
optimizer = torch.optim.SGD(gru_model.parameters(), lr = 0.01)

In [52]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist

n_epochs = 200

valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    train_loss = 0.0
    
    gru_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = gru_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    
    train_loss = train_loss/len(train_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        ))

Epoch: 1 	Training Loss: 1.100832
Epoch: 2 	Training Loss: 1.097882
Epoch: 3 	Training Loss: 1.095954
Epoch: 4 	Training Loss: 1.094364
Epoch: 5 	Training Loss: 1.092962
Epoch: 6 	Training Loss: 1.091654
Epoch: 7 	Training Loss: 1.090370
Epoch: 8 	Training Loss: 1.089049
Epoch: 9 	Training Loss: 1.087631
Epoch: 10 	Training Loss: 1.086047
Epoch: 11 	Training Loss: 1.084203
Epoch: 12 	Training Loss: 1.081935
Epoch: 13 	Training Loss: 1.078899
Epoch: 14 	Training Loss: 1.074129
Epoch: 15 	Training Loss: 1.062670
Epoch: 16 	Training Loss: 1.007904
Epoch: 17 	Training Loss: 0.967296
Epoch: 18 	Training Loss: 0.953046
Epoch: 19 	Training Loss: 0.944091
Epoch: 20 	Training Loss: 0.937424
Epoch: 21 	Training Loss: 0.931976
Epoch: 22 	Training Loss: 0.927312
Epoch: 23 	Training Loss: 0.923218
Epoch: 24 	Training Loss: 0.919553
Epoch: 25 	Training Loss: 0.916211
Epoch: 26 	Training Loss: 0.913102
Epoch: 27 	Training Loss: 0.910153
Epoch: 28 	Training Loss: 0.907296
Epoch: 29 	Training Loss: 0.9

## Accuracy for GRU model is = 0.6311666

In [53]:
accuracy_on_test = predict(gru_model,test_loader)
print("Accuracy for GRU model is = ",accuracy_on_test)

Accuracy for GRU model is =  0.6311666666666667


## 5 (c) LSTM model

In [54]:
# took inspiration from — https://www.cs.toronto.edu/~lczhang/360/lec/w06/rnn.html
class LSTMmodel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMmodel, self).__init__()
        
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        
        # Number of hidden layers
        self.layer_dim = layer_dim
        
        # RNN
        self.lstm = nn.LSTM(input_size = input_dim, hidden_size = hidden_dim, num_layers = layer_dim, batch_first=True)
        
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = x.to(torch.float32)

        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
            
        # One time step
        out, (hn,cn) = self.lstm(x, (h0.detach(),c0.detach()) )   
        out = self.fc(out[:, -1, :])
        
        return out


# initialize the NN
lstm_model = LSTMmodel(300, 20, 1, 3)

criterion = nn.CrossEntropyLoss()

# optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=0.007, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)
optimizer = torch.optim.SGD(lstm_model.parameters(), lr = 0.01)

In [55]:
# took inspiration from the notebook : https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
# number of epochs to train the model
n_epochs = 200

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    lstm_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = lstm_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    
    train_loss = train_loss/len(train_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        ))

Epoch: 1 	Training Loss: 1.100098
Epoch: 2 	Training Loss: 1.099015
Epoch: 3 	Training Loss: 1.098353
Epoch: 4 	Training Loss: 1.097724
Epoch: 5 	Training Loss: 1.097109
Epoch: 6 	Training Loss: 1.096490
Epoch: 7 	Training Loss: 1.095854
Epoch: 8 	Training Loss: 1.095191
Epoch: 9 	Training Loss: 1.094492
Epoch: 10 	Training Loss: 1.093750
Epoch: 11 	Training Loss: 1.092954
Epoch: 12 	Training Loss: 1.092094
Epoch: 13 	Training Loss: 1.091153
Epoch: 14 	Training Loss: 1.090102
Epoch: 15 	Training Loss: 1.088891
Epoch: 16 	Training Loss: 1.087414
Epoch: 17 	Training Loss: 1.085436
Epoch: 18 	Training Loss: 1.082238
Epoch: 19 	Training Loss: 1.073470
Epoch: 20 	Training Loss: 1.018304
Epoch: 21 	Training Loss: 0.978978
Epoch: 22 	Training Loss: 0.963936
Epoch: 23 	Training Loss: 0.953896
Epoch: 24 	Training Loss: 0.945958
Epoch: 25 	Training Loss: 0.939814
Epoch: 26 	Training Loss: 0.935063
Epoch: 27 	Training Loss: 0.931097
Epoch: 28 	Training Loss: 0.927576
Epoch: 29 	Training Loss: 0.9

## Accuracy for LSTM model on test set is = 0.62975

In [56]:
accuracy_on_test = predict(lstm_model,test_loader)
print("Accuracy for LSTM model is = ",accuracy_on_test)

Accuracy for LSTM model is =  0.62975


### `Q — What do you conclude by comparing accuracy values you obtain by GRU, LSTM, and simple RNN.`

**Ans —**
- I conclude that GRU (63.11%) and LSTM (62.975%) performed better than RNN (57.45%). They performed better because they solve the long-term dependancy/vanishing gradient problem of RNNs and can understand text much better.
- All models were trained for 200 epochs
- GRU performed slightly better than LSTM by getting  63.11% compared to 62.975%