# **Installing Packages**

Note: Use Neptune Packages to Log Results

In [1]:
!pip install transformers
# !pip install neptune
# !pip install neptune-contrib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


Note: Restart Runtime After Successfull Installation

# **Importing Packages**

In [2]:
import os
import random
import warnings
import pandas as pd
import numpy as np
import multiprocessing
import collections
import nltk
import re
import pickle
import tensorflow as tf
from nltk.corpus import stopwords
import transformers as ppb
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import tensorflow as tf
from tensorflow import keras

from keras.layers import Embedding, Input, LSTM, Dense, Dropout, Lambda, Flatten, Bidirectional, Conv2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Sequential,Model, load_model, model_from_config
import keras.backend as K

import matplotlib.pyplot as plt
%matplotlib notebook

# import neptune
# from neptunecontrib.monitoring.keras import NeptuneMonitor

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# **Neptune AI**

In [None]:
run = neptune.init_run(
    project="YOUR PROJECT NAME",
    api_token="YOUR API TOKEN"
)  

# **Model Visualisations**

In [3]:
def plot_accuracy_curve(history):
  plt.plot(history.history['loss'])
  plt.plot(history.history['mae'])
  plt.title('Model loss')
  plt.ylabel('Loss')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Test'], loc='upper left')
  plt.show()

def plot_acrchitecture(filename, model):
  from keras.utils import plot_model
  plot_model(model, to_file=str(filename) + '.png')

# **Mounting Google Drive**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
dataset_path = "/content/drive/MyDrive/IntelliTech-DataSet/training_set_rel3.tsv"
data = pd.read_csv(dataset_path, sep="\t", encoding="ISO-8859-1")
data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


# **Data Preprocessing**

In [6]:
cap = ['@CAPS'+str(i) for i in range(100)]
loc = ['@LOCATION'+str(i) for i in range(100)]
org =['@ORGANIZATION'+str(i) for i in range(100)]
per = ['@PERSON'+str(i) for i in range(100)]
date = ['@DATE'+str(i) for i in range(100)]
time = ['@TIME'+str(i) for i in range(100)]
money = ['@MONEY'+str(i) for i in range(100)]
ner =  cap + loc + org + per + date + time + money

In [7]:
top10 = collections.defaultdict(int)
def essay_to_wordlist(Essay):
    """
      Removes Named Entity Recognition (NER), Special Characters, and Stop Words.
      Also word tokenizes the essay.

      Args:
        Essay: Essay of each student 
      
      Returns: 
        Set<String>

    """
    Essay = re.sub("[^a-zA-Z]", " ", Essay)
    words = Essay.lower().split()
    
    stops = stopwords.words("english")
    stops.extend(ner)
    for word in words:
      if word not in stops:
        top10[word]+=1
    words = [w for w in words if not w in stops]
    return (words)

# **Feature Vector Creation**

In [8]:
def makeFeatureVec(words, model, num_features):
    """
      Make Feature Vector from the words list of an Essay.

      Args:
        words: Words of each essay
        model: Trained word2vec model
        params['num_features']: Number of features to be extracted 
      
      Returns: 
        numpy.array

    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            if word in model.wv:
                featureVec = np.add(featureVec, model.wv[word])
    if num_words > 0:
        featureVec = np.divide(featureVec, num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """
      Main function to generate the word vectors for word2vec model.

      Args:
        essays: Essay of each student
        model: Trained word2vec model
        params['num_features']: Number of features to be extracted 
      
      Returns: 
        numpy.array

    """
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

# **Bi-LSTM and LSTM Model Architecture**

In [9]:
def get_model(Hidden_dimension1=400, Hidden_dimension2=128, return_sequences = True, dropout=0.5, recurrent_dropout=0.4, input_size=768, activation='relu', bidirectional = False):
    """
      Defines the architecture for LSTM and Bi-LSTM Model

      Args:
        Hidden_dim1: 
        Hidden_dim2: 
        return_sequences:
        dropout: 
        recurrent_dropout: 
        input_size:
        activation: 
        bidirectional:
      
      Returns: 
        keras.model

    """
    model = Sequential()
    if bidirectional:
        model.add(Bidirectional(LSTM(Hidden_dimension1,return_sequences=return_sequences , dropout=0.4, recurrent_dropout=recurrent_dropout), input_shape=[1, input_size]))
        model.add(Bidirectional(LSTM(Hidden_dimension1, recurrent_dropout=recurrent_dropout)))
    else:
        model.add(LSTM(Hidden_dimension1, dropout=0.4, recurrent_dropout=recurrent_dropout, input_shape=[1, input_size], return_sequences=return_sequences))
        model.add(LSTM(Hidden_dimension2, recurrent_dropout=recurrent_dropout))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation=activation))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

# **Word2Vec Model Architecture**

In [10]:
def build_word2vec(train_sentences, num_workers, num_features, min_word_count, context,epochs):
    """
      Defines the architecture for Word2vec Model

      Args:
        train_sentences: 
        num_workers: 
        params['num_features']:
        min_word_count: 
        context: 
      
      Returns: 
        word2vec model
        collections.dictionary

    """
    model = Word2Vec(workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)
    cores = multiprocessing.cpu_count()
    model.build_vocab(train_sentences, progress_per=10000)
    model.train(train_sentences, total_examples=model.corpus_count, epochs=epochs, report_delay=1)
    model.init_sims(replace=True)
    sorted_dic = sorted(top10.items(), key=lambda k: k[1], reverse=True)
    return model,sorted_dic

In [11]:
X = data
y = data['domain1_score']

Hyperparameters for word2vec


In [12]:
params_word2vec= {  
                    "num_features": 400,
                    "min_word_count": 40,
                    "num_workers": 4,
                    "context": 10,
                    "epochs":30
                  }

Hyperpaprameters for LSTM

In [13]:
params_lstm = { 
                "Hidden_dim1":300,
                "Hidden_dim2": 100,
                "return_sequences": True,
                "dropout":0.5,
                "recurrent_dropout":0.4,
                "input_size":400,
                "activation":'relu',
                "bidirectional":  True,
                "batch_size" : 64,
                "epoch": 70 
              }

# **Neptune Logger**

In [None]:
run["parameters_lstm"] = params_lstm
run["parameters_word2vec"] = params_word2vec
neptune_callback = NeptuneMonitor()  

class NeptuneLogger(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        for log_name, log_value in logs.items():
            run['logs/{}'.format(log_name)].log(log_value)

# **Model Training**

In [14]:
warnings.filterwarnings('ignore')

tf.keras.backend.clear_session()
cv = KFold(n_splits=2, shuffle=True)
cv_data = cv.split(X)
results = []
fold_count = 1

print(X.shape)
print(y.shape)

for traincv, testcv in cv_data:
    print("\n--------Fold {}--------\n".format(fold_count))
    # get the train and test from the dataset.
    X_train, X_test, y_train, y_test = X.iloc[traincv], X.iloc[testcv], y.iloc[traincv], y.iloc[testcv]
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    train_sentences = []
    for essay in train_essays:
        train_sentences.append(essay_to_wordlist(essay))

    print("Converting sentences to word2vec model")
    model,_ = build_word2vec(train_sentences, params_word2vec['num_workers'], params_word2vec['num_features'], params_word2vec['min_word_count'], params_word2vec['context'], params_word2vec['epochs'])
    top10 = collections.defaultdict(int)

    trainDataVecs = np.array(getAvgFeatureVecs(train_sentences, model, params_word2vec['num_features']))
    test_sentences = []
    
    for essay_v in test_essays:
        test_sentences.append(essay_to_wordlist(essay_v))

    testDataVecs = np.array(getAvgFeatureVecs(test_sentences, model, params_word2vec['num_features']))

    trainDataVectors = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    
    testDataVectors = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model(Hidden_dimension1=params_lstm['Hidden_dim1'], Hidden_dimension2=params_lstm['Hidden_dim2'], return_sequences=params_lstm['return_sequences'],
                            dropout=params_lstm['dropout'], recurrent_dropout = params_lstm['recurrent_dropout'], input_size=params_lstm['input_size'],
                            activation=params_lstm['activation'], bidirectional=True )
    
    # Use this if you want to log your results on Neptune. 
    # history = lstm_model.fit(trainDataVectors, y_train, batch_size=params_lstm['batch_size'], epochs=params_lstm['epoch'], callbacks= [NeptuneLogger()])
    
    history = lstm_model.fit(trainDataVectors, y_train, batch_size=params_lstm['batch_size'], epochs=params_lstm['epoch'])

    y_pred = lstm_model.predict(testDataVectors)
    y_pred = np.around(y_pred)
    np.nan_to_num(y_pred)
    result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
    
    print("Kappa Score: {}".format(result))
    results.append(result)
    fold_count += 1

print("Average kappa score value is : {}".format(np.mean(np.asarray(results))))

(12976, 28)
(12976,)

--------Fold 1--------

Converting sentences to word2vec model




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 1, 600)           1682400   
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 600)              2162400   
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 600)               0         
                                                                 
 dense (Dense)               (None, 1)                 601       
                                                                 
Total params: 3,845,401
Trainable params: 3,845,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Ep



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (None, 1, 600)           1682400   
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 600)              2162400   
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 600)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 601       
                                                                 
Total params: 3,845,401
Trainable params: 3,845,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70


In [15]:
if 'mae' in lstm_model.metrics_names:
    print("MAE metric is included in the model.")
else:
    print("MAE metric is NOT included in the model.")

MAE metric is included in the model.


In [None]:
run['BiLSTM_train/mae'] = history.history['mae'][-1]
run['BiLSTM_train/loss'] = history.history['loss'][-1]
run['BiLSTM_train/cohen_kappa'] = result

## **Saving Models**

# **Testing Model**

In [18]:
df_test = pd.read_excel("/content/drive/MyDrive/IntelliTech-DataSet/Testing_Material/Testing_Dataset.xlsx")

In [19]:
df_test.head()

Unnamed: 0,essay_id,essay_set,essay,predicted_score,grade,essay_type
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",7,8,persuasive / narrative / expository
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,8,8,persuasive / narrative / expository
2,1790,1,"Dear Local newspaper, Have you been spending a...",9,8,persuasive / narrative / expository
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",9,8,persuasive / narrative / expository
4,1792,1,"Dear newspaper, I strongly believe that comput...",9,8,persuasive / narrative / expository


In [20]:
test_essays = df_test['essay']
y_test = df_test['predicted_score']
test_essay_sentences = []
for each_essay in test_essays:
  test_essay_sentences.append(essay_to_wordlist(each_essay))

test_features = np.array(getAvgFeatureVecs(test_essay_sentences, model, params_word2vec['num_features']))
test_features = np.reshape(test_features, (test_features.shape[0], 1, test_features.shape[1]))
y_predicted = lstm_model.predict(test_features)
y_predicted = np.around(y_predicted)
np.nan_to_num(y_predicted)
y_predicted_df = pd.DataFrame(y_predicted)



In [21]:
test_loss, test_mae = lstm_model.evaluate(test_features, y_test)



In [22]:
output_df = pd.DataFrame()
output_df['y_test'] = df_test.predicted_score
output_df["y_pred"] = y_predicted_df
result = cohen_kappa_score(df_test.predicted_score, y_predicted, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.9648363724072312


In [None]:
run['test/loss'] = test_loss
run['test/mae'] = test_mae
run['test/cohen_kappa_score'] = result

In [23]:
output_df['essay'] = df_test['essay']
output_df

Unnamed: 0,y_test,y_pred,essay
0,7,9.0,"Dear @ORGANIZATION1, @CAPS1 more and more peop..."
1,8,8.0,Dear @LOCATION1 Time @CAPS1 me tell you what I...
2,9,8.0,"Dear Local newspaper, Have you been spending a..."
3,9,9.0,"Dear Readers, @CAPS1 you imagine how life woul..."
4,9,7.0,"Dear newspaper, I strongly believe that comput..."
...,...,...,...
4213,33,39.0,Have you ever noticed that if two little kids...
4214,35,40.0,Laughter @CAPS1 I ...
4215,38,37.0,Laughter in @CAPS1 A laugh is not just an act...
4216,32,37.0,LAUGHTER @CAPS1 i was younger my friend live...


In [24]:
output_df.to_csv('test_results.csv')

## **Saving Models**

In [35]:
import pickle
with open('BiLstm_model.pkl', 'wb') as writer:
  pickle.dump(lstm_model, writer)

with open('Word2Vec_model.pkl', 'wb') as writer:
  pickle.dump(model, writer)

# **Query Testing**

In [37]:
num_features = 400

def Load_Model():
    lstm_model = pickle.load(open('/content/BiLstm_model.pkl', 'rb'))
    word2vec_model = pickle.load(open('/content/Word2Vec_model.pkl', 'rb'))
    return lstm_model, word2vec_model;

def predict_score(query_essay):
  lstm_model,word2vec_model = Load_Model()
  query_essay_sentences = []
  query_essay_sentences.append(essay_to_wordlist(query_essay))
  query_data_Vecs = np.array(getAvgFeatureVecs(query_essay_sentences, word2vec_model, num_features))
  query_data_Vectors = np.reshape(query_data_Vecs, (query_data_Vecs.shape[0], 1, query_data_Vecs.shape[1]))
  y_predicted = lstm_model.predict(query_data_Vectors)
  y_predicted = np.around(y_predicted)
  np.nan_to_num(y_predicted)
  return y_predicted[0][0]

In [38]:
essay = "My name is sara"
predict_score(essay)



3.0