# 1. Initialisation

In [1]:
!pip install sentence_transformers > /dev/null
from sentence_transformers import SentenceTransformer

In [2]:
import nltk
import numpy as np
from string import punctuation
import re
from keras.preprocessing import text
import pandas as pd
nltk.download('stopwords')
from keras.preprocessing.sequence import skipgrams
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
from torch.optim import Adam, RMSprop
nltk.download('punkt')
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials
import re
from sklearn.utils import shuffle
import time
from datetime import datetime
import os
from tqdm.auto import tqdm
import json
from google.colab import drive
from torch.utils.data import DataLoader,TensorDataset

# Use the GPU when the runtime provides one and fall back to the CPU otherwise, so the
# `.to(device)` calls further down also work on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# 2. Getting data from question bank

In [3]:
# Authenticate the Colab user and open the workbook that holds both sheets.
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())
wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1RHA3WUqdXuJruEIxS1spgjRhUnmTujVml83D1wMt_HI')


def _read_sheet(workbook, title):
    """Return worksheet `title` as a DataFrame whose first row became the column labels."""
    frame = pd.DataFrame(workbook.worksheet(title).get_all_values())
    frame.columns = frame.iloc[0, :]
    # The header row itself is not data: keep everything after it.
    return frame.iloc[1:, :]


test_data = _read_sheet(wb, 'Test_data')
question_bank = _read_sheet(wb, 'Question_Bank')

# Every non-empty reference answer, column by column (Answer, then Answer_2, then Answer_3).
question_bank_answers = [
    answer
    for column in ('Answer', 'Answer_2', 'Answer_3')
    for answer in question_bank[column].values
    if len(answer) > 0
]

In [15]:
# Keep only entries that are not the `np.nan` singleton. This is an identity test, so None
# or a NaN object other than `np.nan` would pass through.
# NOTE(review): the cell that builds this list already calls len() on every element, so no
# NaN can be present in that flow and this looks like a no-op — confirm before removing.
question_bank_answers = [x for x in question_bank_answers if x is not np.nan]

# 3. BERT

In [4]:
# Bring the alternative reference answers (Answer_2, Answer_3) from the question bank next
# to every test row. With how='left', test rows whose Question has no match keep NaN there.
# Only the listed columns are kept, so no separate drop of the other bank columns is needed.
df = pd.merge(left=test_data, right=question_bank,
              left_on='Question', right_on='Question', how='left')[
    ['Unique_ID', 'Topic', 'Question', 'Actual_answer',
     'Answer_2', 'Answer_3', 'Answer_variation', 'Expected_score']
]

# Scores computed from `df` are written back to `test_data` row by row further down, so
# the join must not have multiplied any test row (it does when a Question occurs more
# than once in the question bank).
assert len(df) == len(test_data), (
    f'merge changed the row count: {len(test_data)} test rows -> {len(df)} merged rows'
)

In [5]:
# Missing values per column. In the output below only the merged-in Answer_2 / Answer_3
# contain NaN (presumably test questions without a match in the question bank — confirm).
df.isnull().sum()

0
Unique_ID            0
Topic                0
Question             0
Actual_answer        0
Answer_2            23
Answer_3            23
Answer_variation     0
Expected_score       0
dtype: int64

In [6]:
# Replace NaN by the empty string; `df` is rebound to the result instead of being mutated.
df = df.replace(np.nan, '')

In [7]:
# Re-check after the replacement: every count is expected to be 0 now.
df.isnull().sum()

0
Unique_ID           0
Topic               0
Question            0
Actual_answer       0
Answer_2            0
Answer_3            0
Answer_variation    0
Expected_score      0
dtype: int64

In [8]:
# Sentence-embedding model used for the BERT similarity feature. The progress bars in the
# output show that the model files are downloaded when this runs.
bert = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [9]:
def _embed_lowercased(texts):
    """Encode every entry of `texts` with the sentence model after str() and lower-casing."""
    return bert.encode([str(entry).lower() for entry in texts])


# One embedding matrix per reference answer column, plus one for the student variations.
bert_answer1 = _embed_lowercased(df.Actual_answer)
bert_answer2 = _embed_lowercased(df.Answer_2)
bert_answer3 = _embed_lowercased(df.Answer_3)
bert_variations = _embed_lowercased(df.Answer_variation)

# Row counts of the four matrices (shown as the cell output); they have to be equal.
len(bert_answer1),len(bert_answer2),len(bert_answer3),len(bert_variations)

(209, 209, 209, 209)

In [10]:
def _bert_scores(reference_vecs, variation_vecs):
    """Return one similarity score per (reference, variation) embedding pair.

    Each score is the cosine similarity of the pair rounded to 4 decimals; similarities
    below 0.1 are reported as 0.
    """
    scores = []
    for reference, variation in zip(reference_vecs, variation_vecs):
        # cosine_similarity returns a 1x1 matrix for two single-row inputs.
        similarity = float(cosine_similarity(reference.reshape(1, -1), variation.reshape(1, -1))[0][0])
        scores.append(0 if similarity < 0.1 else round(similarity, 4))
    return scores


# One score column per reference answer (Actual_answer, Answer_2, Answer_3).
test_data['BERT-1'] = _bert_scores(bert_answer1, bert_variations)
test_data['BERT-2'] = _bert_scores(bert_answer2, bert_variations)
test_data['BERT-3'] = _bert_scores(bert_answer3, bert_variations)

# Row-wise summaries of the three scores.
bert_columns = ['BERT-1', 'BERT-2', 'BERT-3']
test_data['BERT-mean'] = test_data[bert_columns].mean(axis=1)
test_data['BERT-max'] = test_data[bert_columns].max(axis=1)
test_data['BERT-min'] = test_data[bert_columns].min(axis=1)
test_data['greater'] = (test_data['BERT-mean'] > 0.5).astype('int')

# Two candidate rules for the final BERT feature:
#   * current rule: the max of the three BERT scores;
#   * newer, gated rule: if the average BERT score is greater than 0.5 take the max of the
#     BERT scores, else the min.
# Author's note: there is hardly any difference in the MSE on this test set, but the gated
# rule may give appreciable results overall. We should discuss this.
# The flag makes the choice explicit; False keeps the rule that was effectively in use
# (the gated value used to be computed and then overwritten by the max).
USE_MEAN_GATED_BERT = False
if USE_MEAN_GATED_BERT:
    test_data['BERT'] = (test_data['greater'] * test_data['BERT-max']
                         + (1 - test_data['greater']) * test_data['BERT-min'])
else:
    test_data['BERT'] = test_data['BERT-max']

In [11]:
# Remove the intermediate BERT columns and the other score columns that came with the
# sheet, leaving `BERT` as the only similarity feature so far.
# Rebinding with errors='ignore' keeps the cell re-runnable: with inplace=True a second
# run raised KeyError because the columns were already gone.
test_data = test_data.drop(
    columns=['LSA_TFIDF','LSA_Count','Doc2Vec','Minilml12v4','Minilml6v1',
             'BERT-1','BERT-2','BERT-3','BERT-mean','BERT-min','BERT-max','greater'],
    errors='ignore',
)

In [12]:
# Inspect the frame after the clean-up: `BERT` is the only score column left.
test_data

Unnamed: 0,Unique_ID,Topic,Question,Actual_answer,Answer_variation,Expected_score,BERT
1,1,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,The decision boundary that helps to segregate ...,1,0.7607
2,2,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,It is the decision boundary in a multidimensio...,1,1.0000
3,3,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,decision boundary in higher dimensions to sepa...,0.9,0.8465
4,4,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,a plane that separates different classes,0.7,0.5157
5,5,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,plane supported by support vectors,0.6,0.9520
...,...,...,...,...,...,...,...
205,205,Decision Tree,Why is decision tree called as Greedy algorithm?,Because it only looks in the forward direction...,Optimal decision is made at each step and is n...,0.7,0.6610
206,206,Decision Tree,Why is decision tree called as Greedy algorithm?,Because it only looks in the forward direction...,No changes made in the tree structure later on,0.6,0.4993
207,207,Decision Tree,Why is decision tree called as Greedy algorithm?,Because it only looks in the forward direction...,A decision once made is final,0.4,0.6571
208,208,Decision Tree,Why is decision tree called as Greedy algorithm?,Because it only looks in the forward direction...,greedy is set as a hyperparameter for the algo...,0,0.1927


In [13]:
# MSE of the BERT feature against the expected score. The score column comes straight from
# the sheet, so it is cast to float explicitly here, as the later metric cells already do.
mean_squared_error(test_data['Expected_score'].astype(np.float64), test_data['BERT'])

0.10453045172248805

# 4. Loading tester model and skipgram class

In [19]:
class tester(nn.Module):
    """Siamese GRU scorer for a pair of token-id sequences.

    Both inputs go through the same embedding layer, then each through its own GRU
    (`gru1` for the first input, `gru2` for the second). The score is the cosine
    similarity between the two GRU outputs taken at the last time step.
    """

    def __init__(self, hidden_size, num_layers, e_layer, embedding_dim, bidir=False):
        """Build the two GRU branches around an existing embedding layer.

        Args:
            hidden_size: Hidden size of each GRU.
            num_layers: Number of stacked layers in each GRU.
            e_layer: Embedding module applied to both inputs; its output size must
                equal `embedding_dim`.
            embedding_dim: Input feature size of each GRU.
            bidir: Make both GRUs bidirectional when True.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.bidir = bidir
        self.num_layers = num_layers

        # Use the layer handed in by the caller. This used to read the notebook-global
        # `emb_layer`, which silently ignored the `e_layer` argument.
        self.embed = e_layer

        # Same configuration for both branches; each GRU has its own weights.
        gru_options = dict(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=self.bidir,
            dropout=0.0,
        )
        self.gru1 = nn.GRU(**gru_options)
        self.gru2 = nn.GRU(**gru_options)

    def forward(self, x, y, hx, hy):
        """Score every pair (x[i], y[i]).

        Args:
            x, y: Token-id tensors, batch first (the GRUs use batch_first=True); they
                are cast to long before the embedding lookup.
            hx, hy: Initial hidden states for `gru1` and `gru2` (see `init_hidden`).

        Returns:
            (out, hx, hy): `out` holds one cosine similarity per pair, `hx` / `hy` are
            the final hidden states returned by the two GRUs.
        """
        x = self.embed(x.long())
        y = self.embed(y.long())

        x, hx = self.gru1(x, hx)
        y, hy = self.gru2(y, hy)

        # Keep only the output at the last time step of each sequence.
        # NOTE(review): `coder` pads on the right with zeros, so for short inputs the last
        # step is a padding position; left unchanged because the saved weights were
        # presumably trained this way — confirm.
        x = F.leaky_relu(x[:, -1])
        y = F.leaky_relu(y[:, -1])

        out = F.cosine_similarity(x, y)
        return out, hx, hy

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for one GRU.

        The shape is (num_layers * num_directions, batch_size, hidden_size), where
        num_directions is 2 for a bidirectional model and 1 otherwise. The tensor is
        created with the dtype and on the device of the model's first parameter, so it
        does not depend on a global `device`.
        """
        weight = next(self.parameters())
        num_directions = 2 if self.bidir else 1
        return weight.new_zeros(self.num_layers * num_directions, batch_size, self.hidden_size)



class skipgram(nn.Module):
    """Holds the word-embedding table; both words of a pair are looked up in it."""

    def __init__(self, vocab_size, embedding_dim=64):
        """Create a dense `vocab_size` x `embedding_dim` embedding table."""
        super().__init__()
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=False)

    def forward(self, u_pos, v_pos):
        """Return the embeddings of `u_pos` and of `v_pos`, both taken from `u_embeddings`.

        The indices are cast to long before the lookup.
        """
        lookup = self.u_embeddings
        return lookup(u_pos.long()), lookup(v_pos.long())

In [None]:
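# Optional downloads from Google Drive, kept commented out.
# NOTE(review): presumably these fetch the files loaded in the cells below (word2id.json,
# embedding_network.pt, final_tester.pt) — confirm which id is which before re-enabling.
# !gdown https://drive.google.com/uc?id=1o88RIAaQlCrx2VCT9lhDb6cqPLlzqBbv
# !gdown https://drive.google.com/uc?id=1XWEpPo7OignfY2MOUNlTeDpRUzd6d6Dl
# !gdown https://drive.google.com/uc?id=1PJPWaKAx2nSkccRhCTp-getgfWPBxoxB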

In [20]:
"""
Creating an instance of the skipgram class and loading the saved state_dict

"""



######### LOADING WORD2ID FROM JSON ################
# word -> id lookup used by coder().
# Path need to be changed later
with open('word2id.json','r') as json_file:
    word2id = json.load(json_file)

# Print the type of data variable
print("Type:", type(word2id))

####################################################

embedding_dim = 64
# NOTE(review): vocab_size is hard-coded rather than derived from word2id; it has to match
# the shape stored in the checkpoint loaded below — confirm the two stay in sync.
vocab_size = 7709
embeddings = skipgram(vocab_size, embedding_dim).to(device)

# Switch for loading the saved embedding weights from disk (it was a literal `if True:`).
LOAD_SAVED_EMBEDDINGS = True
if LOAD_SAVED_EMBEDDINGS:
    # Alternative location on Drive:
    # '/content/drive/MyDrive/Colab Notebooks/CAPSTONETERM3/FinalModels/embedding_network.pt'
    # map_location remaps the stored tensors onto `device`, so a checkpoint written on a
    # GPU runtime can still be read when `device` is the CPU.
    embeddings.load_state_dict(torch.load('/content/embedding_network.pt', map_location=device)) # path need to be changed
    print('model loaded')

# The embeddings are only used as a fixed lookup from here on: freeze every parameter.
for para in embeddings.parameters():
    para.requires_grad = False

emb_layer = embeddings.u_embeddings

Type: <class 'dict'>
model loaded


In [21]:
"""
Creating an instance of the tester class (its saved state_dict is loaded in the next cell)

"""

# The scorer network is built around the embedding layer of the skip-gram model, which
# is handed over through `e_layer`.
nnModel = tester(
    hidden_size=50,
    num_layers=2,
    e_layer=emb_layer,
    embedding_dim=64,
)

In [22]:
path = 'final_tester.pt'

# Switch for loading the saved scorer weights from disk (it was a literal `if True:` with a
# comment copied from the embedding cell).
LOAD_SAVED_TESTER = True
if LOAD_SAVED_TESTER:
    # map_location remaps the stored tensors onto `device`, so a checkpoint written on a
    # GPU runtime can still be read when `device` is the CPU.
    nnModel.load_state_dict(torch.load(path, map_location=device)) # path need to be changed
    print('model loaded')

# The inputs are moved to `device` before scoring, so the model has to live there whether
# or not weights were loaded.
nnModel = nnModel.to(device)

# The scorer is only used for prediction below: freeze every parameter.
for para in nnModel.parameters():
    para.requires_grad = False

model loaded


## PREDICTION USING NN MODEL

In [23]:
# Both patterns are compiled once instead of on every call.
# Symbols that are turned into a space before tokenising. "_" has to be listed here
# because the second pattern treats it as a word character and would keep it.
_BAD_SYMBOLS_RE = re.compile(r"""[/$%{}^'"#\\|@,;\[\]\-`~_]""")
# Anything that is neither a word character nor "+" is also turned into a space.
_NON_WORD_RE = re.compile(r'[^\w+]')


def coder(sentence, max_len, vocab=None):
    """Encode `sentence` as a fixed-length vector of vocabulary ids.

    The text is cleaned ("=" becomes "equals", symbols become spaces), lower-cased and
    split on whitespace. Words that are in the vocabulary are replaced by their id, all
    other words are dropped. The id list is then cut to `max_len` entries and padded
    on the right with zeros.

    Args:
        sentence: Text to encode. Anything that is not a `str` (for example NaN or None
            from an empty cell) is treated as an empty text.
        max_len: Length of the returned vector.
        vocab: Optional word -> id mapping. Defaults to the notebook-level `word2id`.

    Returns:
        A 1-D float32 tensor of length `max_len`.
    """
    if vocab is None:
        vocab = word2id

    ids = []
    if isinstance(sentence, str):
        cleaned = sentence.replace('=', 'equals')
        cleaned = _BAD_SYMBOLS_RE.sub(' ', cleaned)
        cleaned = _NON_WORD_RE.sub(' ', cleaned)
        # Out-of-vocabulary words are skipped on purpose.
        ids = [vocab[word] for word in cleaned.lower().split() if word in vocab]

    # Truncate before padding: the previous version computed a negative pad size for
    # inputs with more than `max_len` known words and failed before it could truncate.
    ids = ids[:max_len]
    padded = np.zeros(max_len, dtype=np.float32)
    padded[:len(ids)] = ids
    return torch.from_numpy(padded)

In [24]:
# Smoke test on a sample text with max_len=50: in the output below the ids come first and
# the rest of the vector is zero padding.
coder('It is the decision boundary in a multidimensional space to separate different classes The error term for misclassifications and the constraint term for the misclassifications It means taking small, random portion of entire population to represent entire data set, where each member has an equal probability of being chosen ',50)

tensor([5.3000e+01, 1.5760e+03, 3.7460e+03, 4.3100e+02, 8.7100e+02, 1.2500e+02,
        3.8500e+02, 1.8000e+01, 3.0400e+02, 4.5040e+03, 3.6190e+03, 3.0400e+02,
        4.5040e+03, 1.5100e+02, 8.5300e+02, 1.4800e+02, 2.7800e+02, 2.7890e+03,
        5.5700e+02, 7.8000e+01, 1.5900e+02, 5.5700e+02, 1.0000e+00, 3.0000e+00,
        2.1590e+03, 2.0300e+02, 3.3000e+01, 3.8000e+02, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00])

In [25]:
# Quick look at the first two rows, including the BERT feature.
test_data.head(2)

Unnamed: 0,Unique_ID,Topic,Question,Actual_answer,Answer_variation,Expected_score,BERT
1,1,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,The decision boundary that helps to segregate ...,1,0.7607
2,2,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,It is the decision boundary in a multidimensio...,1,1.0


In [27]:
max_len = 50  # length of every encoded answer


def _encode_answers(answers):
    """Run coder() on every answer and stack the resulting id vectors into one 2-D array."""
    return np.array([coder(answer, max_len).numpy() for answer in answers])


# Reference answers and the student variations they are compared with, in row order.
a1 = test_data.Actual_answer.tolist()
a2 = test_data.Answer_variation.tolist()

actuals_coded = _encode_answers(a1)
variations_coded = _encode_answers(a2)

In [28]:
# Peek at the encoded reference answers: one row per answer, trailing zeros are padding.
actuals_coded

array([[  53., 1576., 3746., ...,    0.,    0.,    0.],
       [  53., 1576., 3746., ...,    0.,    0.,    0.],
       [  53., 1576., 3746., ...,    0.,    0.,    0.],
       ...,
       [1367.,  763.,  882., ...,    0.,    0.,    0.],
       [1367.,  763.,  882., ...,    0.,    0.,    0.],
       [1367.,  763.,  882., ...,    0.,    0.,    0.]], dtype=float32)

In [29]:
# Same conversion for both id matrices: NumPy array -> tensor on `device`.
actuals_coded_t, variations_coded_t = (
    torch.from_numpy(coded).to(device)
    for coded in (actuals_coded, variations_coded)
)

In [33]:
# One zero initial hidden state per GRU branch, sized so that all rows are scored in a
# single batch.
batch_size = len(actuals_coded_t)
fthx = nnModel.init_hidden(batch_size=batch_size)
fthy = nnModel.init_hidden(batch_size=batch_size)

# Inference only: no_grad keeps autograd from recording the forward pass even when the
# parameters were not frozen beforehand.
with torch.no_grad():
    nnPred,thx,thy = nnModel(actuals_coded_t, variations_coded_t,fthx,fthy)
# Similarity scores as a NumPy array, one per row.
nnPred = nnPred.detach().cpu().numpy()

In [34]:
# Attach the network's similarity scores as a new feature column.
test_data = test_data.assign(NN_scores=nnPred)

In [35]:
# First ten rows with both similarity features (BERT, NN_scores) next to the expected score.
test_data.head(10)

Unnamed: 0,Unique_ID,Topic,Question,Actual_answer,Answer_variation,Expected_score,BERT,NN_scores
1,1,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,The decision boundary that helps to segregate ...,1.0,0.7607,0.941322
2,2,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,It is the decision boundary in a multidimensio...,1.0,1.0,0.93238
3,3,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,decision boundary in higher dimensions to sepa...,0.9,0.8465,0.941478
4,4,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,a plane that separates different classes,0.7,0.5157,0.690361
5,5,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,plane supported by support vectors,0.6,0.952,0.847451
6,6,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,plane for classification,0.3,0.6185,0.291937
7,7,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,plane in multidimension space,0.2,0.4821,0.598775
8,8,SVM,What is a hyperplane in SVM,It is the decision boundary in a multidimensio...,support vector plane,0.0,0.8588,0.054217
9,9,SVM,What is C in SVM?,A model hyperparameter which is the regularisa...,It is a hyperparameter to control the strength...,1.0,0.7948,0.924951
10,10,SVM,What is C in SVM?,A model hyperparameter which is the regularisa...,A model hyperparameter which is the regularisa...,1.0,1.0,0.919996


In [36]:
# MSE of the raw network score against the expected score (the latter cast to float first).
mean_squared_error(test_data['Expected_score'].astype(np.float64),test_data['NN_scores'])

0.04316154935188397

# 5. Machine learning model

In [37]:
from sklearn.model_selection import train_test_split
# Features are the two similarity scores; the target is the expected score as a float.
feature_columns = ['BERT', 'NN_scores']
X = test_data[feature_columns]
y = test_data['Expected_score'].astype('float')

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [65]:
from sklearn.ensemble import  RandomForestRegressor
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.ensemble import  AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm

# Hold-out comparison: every candidate is fitted on the training split and scored by its
# MSE on the test split. The fitted estimators stay available in `holdout_models`.
holdout_models = {
    'Random Forest': RandomForestRegressor(random_state=1234,max_depth=12,max_features=1),
    'GB Regressor': GradientBoostingRegressor(random_state=12),
    'AdaBoost': AdaBoostRegressor(random_state=12),
    'XGB Regressor': XGBRegressor(random_state=12),
    'SVR': SVR(kernel='linear'),
    # NOTE(review): recent scikit-learn releases name this criterion 'squared_error';
    # 'mse' is kept for the version this notebook was run with — confirm on upgrade.
    'Decision tree': DecisionTreeRegressor(criterion='mse',random_state=1234),
}
for label, model in holdout_models.items():
    model.fit(X_train, y_train)
    print(label, mean_squared_error(y_test, model.predict(X_test)))

# The OLS fit needs the constant column added explicitly. It is added on separate frames:
# rebinding X_train / X_test here used to leak the extra `const` column into the following
# cell and into every re-run of this one.
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

ols_model = sm.OLS(y_train, X_train_const).fit()
print('Linear regression', mean_squared_error(y_test, ols_model.predict(X_test_const)))

Random Forest 0.037476300817980805
GB Regressor 0.05111779868885451
AdaBoost 0.04630706861508446
XGB Regressor 0.0479912624790758
SVR 0.03962937945835915
Decision tree 0.07214285714285715
Linear regression 0.03978637152651519


In [62]:
# Linear SVR with hand-picked C / epsilon, scored on the hold-out split.
# NOTE(review): gamma is a coefficient of the rbf/poly/sigmoid kernels, so with
# kernel='linear' it should have no effect — confirm and drop it if so.
s = SVR(kernel='linear', C=0.7, gamma=0.3, epsilon=0.07).fit(X_train, y_train)
yp = s.predict(X_test)
print('SVR', mean_squared_error(y_test, yp))

SVR 0.04039813360034585


In [66]:
# Refit the random forest on every row.
X = test_data[['BERT','NN_scores']]
y = test_data['Expected_score'].astype('float')

r = RandomForestRegressor(random_state=1234, max_depth=12, max_features=1).fit(X, y)
yp = r.predict(X)
# This is the error on the rows the model was fitted on, so it is not comparable with the
# hold-out MSEs printed earlier.
print('Random Forest', mean_squared_error(y, yp))

Random Forest 0.0058561164270328125


In [67]:
# Linear SVR fitted and scored on every row (error on the fitting data, as for the forest).
# NOTE(review): gamma is a coefficient of the rbf/poly/sigmoid kernels, so with
# kernel='linear' it should have no effect — confirm and drop it if so.
s = SVR(kernel='linear', C=0.5, gamma=0.2, epsilon=0.06).fit(X, y)
yp = s.predict(X)
print('SVR', mean_squared_error(y, yp))

SVR 0.036386090660539556


In [68]:
# Final model: `r` is the random forest most recently assigned above (in file order, the
# one fitted on all rows).
ml_model= r

In [69]:
# Store the final model's prediction for every row next to its input features.
test_data = test_data.assign(ml_model=ml_model.predict(X))

In [70]:
# Compare the two input features, the model output and the target on the last 50 rows.
test_data[['BERT','NN_scores','ml_model','Expected_score']].tail(50)

Unnamed: 0,BERT,NN_scores,ml_model,Expected_score
160,0.7457,0.150122,0.118,0.0
161,0.2757,0.62828,0.815,1.0
162,1.0,0.939055,0.9975,1.0
163,0.931,0.170901,0.667,0.9
164,0.8551,0.836176,0.732148,0.7
165,0.7347,0.100222,0.3915,0.5
166,0.786,0.11188,0.39,0.3
167,0.3676,0.101925,0.054773,0.0
168,0.3587,0.859899,0.842,1.0
169,1.0,0.958712,0.99545,1.0


In [73]:
# Spot checks on hand-made inputs. Each row is (BERT_score, NN_score): the model was fitted
# on exactly these two columns, there is no intercept entry.
ml_model.predict([[0.21,0.22]]), ml_model.predict([[0.9,0.7]])

(array([0.08727273]), array([0.81571429]))

# SAVING ML MODEL (RANDOM FOREST)
Hyperparameters of the saved random forest (the model fitted on all rows above):

random_state=1234, max_depth=12, max_features=1

In [74]:
import pickle

In [75]:
# Persist the chosen model so it can be reloaded without refitting.
with open('rf.pkl', 'wb') as model_file:
    pickle.dump(ml_model, model_file)

In [76]:
# !gdown https://drive.google.com/uc?id=1YADTDb21RDbsLTQbwqNFl3_CpSwb4bg2
# NOTE: unpickling runs code stored in the file, so only load an rf.pkl from a source you
# trust (the file written by this notebook, or a download you have verified).
with open('rf.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

In [77]:
# Sanity check: prediction of the reloaded model for the first row.
rf_model.predict(X)[0]

0.9715

In [78]:
# MSE of the reloaded model on the same rows; it should equal the value printed for the
# model before it was pickled.
print('Random Forest',mean_squared_error(y, rf_model.predict(X)))

Random Forest 0.0058561164270328125
