# **Installing Packages**

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


# **Importing Packaages**

In [2]:
import os
import random
import warnings
import pandas as pd
import numpy as np
import multiprocessing
import collections
import nltk
import re
import tensorflow as tf
from nltk.corpus import stopwords
import transformers as ppb
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from keras.layers import Embedding, Input, LSTM, Dense, Dropout, Lambda, Flatten, Bidirectional, Conv2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Sequential,Model, load_model, model_from_config
import keras.backend as K

import matplotlib.pyplot as plt
%matplotlib notebook

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# **Model Visualisations**

In [3]:
# Declaring some visualization methods to plot accuracy and model diagram
def plot_accuracy_curve(history):
  plt.plot(history.history['loss'])
  plt.plot(history.history['mae'])
  plt.title('Model loss')
  plt.ylabel('Loss')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Test'], loc='upper left')
  plt.show()

def plot_acrchitecture(filename, model):
  from keras.utils import plot_model
  plot_model(model, to_file=str(filename) + '.png')

# **Mounting Google Drive**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
dataset_path = "/content/drive/MyDrive/IntelliTech-DataSet/training_set_rel3.tsv"
data = pd.read_csv(dataset_path, sep="\t", encoding="ISO-8859-1")
data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


# **Data Preprocessing**

In [13]:
cap = ['@CAPS'+str(i) for i in range(100)]
loc = ['@LOCATION'+str(i) for i in range(100)]
org =['@ORGANIZATION'+str(i) for i in range(100)]
per = ['@PERSON'+str(i) for i in range(100)]
date = ['@DATE'+str(i) for i in range(100)]
time = ['@TIME'+str(i) for i in range(100)]
money = ['@MONEY'+str(i) for i in range(100)]
ner =  cap + loc + org + per + date + time + money

In [14]:
top10 = collections.defaultdict(int)
def essay_to_wordlist(Essay):
    """
      Removes Named Entity Recognition (NER), Special Characters, and Stop Words.
      Also word tokenizes the essay.

      Args:
        Essay: Essay of each student 
      
      Returns: 
        Set<String>

    """
    Essay = re.sub("[^a-zA-Z]", " ", Essay)
    words = Essay.lower().split()
    
    stops = stopwords.words("english")
    stops.extend(ner)
    for word in words:
      if word not in stops:
        top10[word]+=1
    words = [w for w in words if not w in stops]
    return (words)

# **Feature Vector Creation**

In [15]:
def makeFeatureVec(words, model, num_features):
    """
      Make Feature Vector from the words list of an Essay.

      Args:
        words: Words of each essay
        model: Trained word2vec model
        num_features: Number of features to be extracted 
      
      Returns: 
        numpy.array

    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            if word in model.wv:
                featureVec = np.add(featureVec, model.wv[word])
    if num_words > 0:
        featureVec = np.divide(featureVec, num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """
      Main function to generate the word vectors for word2vec model.

      Args:
        essays: Essay of each student
        model: Trained word2vec model
        num_features: Number of features to be extracted 
      
      Returns: 
        numpy.array

    """
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

# **Bi-LSTM and LSTM Model Architecture**

In [16]:
def get_model(Hidden_dimension1=400, Hidden_dimension2=128, return_sequences = True, dropout=0.5, recurrent_dropout=0.4, input_size=768, activation='relu', bidirectional = False):
    """
      Defines the architecture for LSTM and Bi-LSTM Model

      Args:
        Hidden_dim1: 
        Hidden_dim2: 
        return_sequences:
        dropout: 
        recurrent_dropout: 
        input_size:
        activation: 
        bidirectional:
      
      Returns: 
        keras.model

    """
    model = Sequential()
    if bidirectional:
        model.add(Bidirectional(LSTM(Hidden_dim1,return_sequences=return_sequences , dropout=0.4, recurrent_dropout=recurrent_dropout), input_shape=[1, input_size]))
        model.add(Bidirectional(LSTM(Hidden_dim2, recurrent_dropout=recurrent_dropout)))
    else:
        model.add(LSTM(Hidden_dim1, dropout=0.4, recurrent_dropout=recurrent_dropout, input_shape=[1, input_size], return_sequences=return_sequences))
        model.add(LSTM(Hidden_dim2, recurrent_dropout=recurrent_dropout))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation=activation))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

# **Word2Vec Model Architecture**

In [17]:
def build_word2vec(train_sentences, num_workers, num_features, min_word_count, context):
    """
      Defines the architecture for Word2vec Model

      Args:
        train_sentences: 
        num_workers: 
        num_features:
        min_word_count: 
        context: 
      
      Returns: 
        word2vec model
        collections.dictionary

    """
    model = Word2Vec(workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)
    # saving the word2vec model
    # model.wv.save_word2vec_format('word2vec_'+ str(fold_count) +'.bin', binary=True)
    cores = multiprocessing.cpu_count()
    model.build_vocab(train_sentences, progress_per=10000)
    model.train(train_sentences, total_examples=model.corpus_count, epochs=epochs, report_delay=1)
    model.init_sims(replace=True)
    sorted_dic = sorted(top10.items(), key=lambda k: k[1], reverse=True)
    return model,sorted_dic

# **Model Training**

In [18]:
X = data
y = data['domain1_score']

Hyperparameters for word2vec


In [19]:
num_features = 400
min_word_count = 40
num_workers = 4
context = 10
epochs = 30

Hyperpaprameters for LSTM

In [20]:
Hidden_dim1=300
Hidden_dim2=100
return_sequences = True
dropout=0.5
recurrent_dropout=0.4
input_size=400
activation='relu'
bidirectional = True
batch_size = 64
epoch = 70

In [22]:
warnings.filterwarnings('ignore')

tf.keras.backend.clear_session()
cv = KFold(n_splits=2, shuffle=True)
cv_data = cv.split(X)
results = []
fold_count = 1

print(X.shape)
print(y.shape)

for traincv, testcv in cv_data:
    print("\n--------Fold {}--------\n".format(fold_count))
    # get the train and test from the dataset.
    X_train, X_test, y_train, y_test = X.iloc[traincv], X.iloc[testcv], y.iloc[traincv], y.iloc[testcv]
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    train_sentences = []
    for essay in train_essays:
        # get all the sentences from the essay
        train_sentences.append(essay_to_wordlist(essay))

    print("Converting sentences to word2vec model")
    model,_ = build_word2vec(train_sentences, num_workers, num_features, min_word_count, context)
    top10 = collections.defaultdict(int)

    trainDataVecs = np.array(getAvgFeatureVecs(train_sentences, model, num_features))
    test_sentences = []
    for essay_v in test_essays:
        test_sentences.append(essay_to_wordlist(essay_v))

    testDataVecs = np.array(getAvgFeatureVecs(test_sentences, model, num_features))
    trainDataVectors = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVectors = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    lstm_model = get_model(Hidden_dimension1=Hidden_dim1, Hidden_dimension2=Hidden_dim2, return_sequences=return_sequences,
                            dropout=dropout, recurrent_dropout=recurrent_dropout, input_size=input_size,
                            activation=activation, bidirectional=False)
    history = lstm_model.fit(trainDataVectors, y_train, batch_size=batch_size, epochs=epoch)
    y_pred = lstm_model.predict(testDataVectors)
    y_pred = np.around(y_pred)
    np.nan_to_num(y_pred)
    result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)
    fold_count += 1

print("Average kappa score value is : {}".format(np.mean(np.asarray(results))))

(12976, 28)
(12976,)

--------Fold 1--------

Converting sentences to word2vec model




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 300)            841200    
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 1,001,701
Trainable params: 1,001,701
Non-trainable params: 0
_________________________________________________________________
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 1



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 1, 300)            841200    
                                                                 
 lstm_3 (LSTM)               (None, 100)               160400    
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,001,701
Trainable params: 1,001,701
Non-trainable params: 0
_________________________________________________________________
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch

# **Testing Model**

In [33]:
df_test = pd.read_excel("Testing_Dataset.xlsx")

In [34]:
df_test.head()

Unnamed: 0,essay_id,essay_set,essay,predicted_score,grade,essay_type
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",7,8,persuasive / narrative / expository
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,8,8,persuasive / narrative / expository
2,1790,1,"Dear Local newspaper, Have you been spending a...",9,8,persuasive / narrative / expository
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",9,8,persuasive / narrative / expository
4,1792,1,"Dear newspaper, I strongly believe that comput...",9,8,persuasive / narrative / expository


In [35]:
test_essays = df_test['essay']
test_essay_sentences = []
for each_essay in test_essays:
  test_essay_sentences.append(essay_to_wordlist(each_essay))

test_features = np.array(getAvgFeatureVecs(test_essay_sentences, model, num_features))
test_features = np.reshape(test_features, (test_features.shape[0], 1, test_features.shape[1]))
y_predicted = lstm_model.predict(test_features)
y_predicted = np.around(y_predicted)
np.nan_to_num(y_predicted)
y_predicted_df = pd.DataFrame(y_predicted)



In [36]:
output_df = pd.DataFrame()
output_df['y_test'] = df_test.predicted_score
output_df["y_pred"] = y_predicted_df
result = cohen_kappa_score(df_test.predicted_score, y_predicted, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.960484862009027


In [37]:
output_df['essay'] = df_test['essay']
output_df

Unnamed: 0,y_test,y_pred,essay
0,7,8.0,"Dear @ORGANIZATION1, @CAPS1 more and more peop..."
1,8,7.0,Dear @LOCATION1 Time @CAPS1 me tell you what I...
2,9,9.0,"Dear Local newspaper, Have you been spending a..."
3,9,9.0,"Dear Readers, @CAPS1 you imagine how life woul..."
4,9,8.0,"Dear newspaper, I strongly believe that comput..."
...,...,...,...
4213,33,34.0,Have you ever noticed that if two little kids...
4214,35,34.0,Laughter @CAPS1 I ...
4215,38,34.0,Laughter in @CAPS1 A laugh is not just an act...
4216,32,34.0,LAUGHTER @CAPS1 i was younger my friend live...


In [27]:
output_df.to_csv('test_results.csv')