In [1]:
# Imports
import json, os, re, shutil, sys, time
import seaborn as sns
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from collections import defaultdict
import xmltodict
import untangle
import xml.etree.ElementTree as ET
# NLTK for NLP utils and corpora
import nltk
from nltk.corpus import treebank
from nltk.text import Text
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

# Import spacy
import spacy

import pickle

import sklearn
from sklearn.model_selection import train_test_split

# NumPy, Pandas and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf

from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

from numpy.random import seed
from pandas import read_csv, DataFrame
from sklearn.preprocessing import minmax_scale

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Dropout, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.layers.embeddings import Embedding
import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Activation, Dropout, Conv1D, MaxPooling1D, Bidirectional, Flatten, TimeDistributed

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score
import keras.backend as K
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from datetime import datetime


Using TensorFlow backend.


## AEG Long Essay Sentence Level Classification

In [2]:
# Read the data
aeg_long = pd.read_csv("../data-DNC/AEG/training_set_rel3.tsv",sep='\t',encoding = "latin1")
#aeg_long.head(5)

In [3]:
# Split the data into train and test. We need to do this first to ensure that when we split to 
# sentence level, we have sentences of a given essay in either training or test but not on both.

train_comb,test_comb = train_test_split(aeg_long,test_size=0.2, random_state=42)

In [4]:
# Create train and test data frames with relevant fields

x_train_df = train_comb.filter(['essay_id','essay_set','essay','domain1_score'], axis=1)
x_test_df = test_comb.filter(['essay_id','essay_set','essay','domain1_score'], axis=1)

x_train_df = x_train_df.reset_index()
x_test_df = x_test_df.reset_index()
print(x_train_df.shape)
print(x_test_df.shape)
print(x_train_df.essay_id[:5].values)
print(x_test_df.essay_id[:5].values)

(10380, 5)
(2596, 5)
[ 3567  4233  6518 15174 18855]
[ 9908  9872   305 12771  6839]


#### Split essay into sentences

In [7]:
# We will use spacy to split the essay into sentences. 
# Load spacy large english module
nlp = spacy.load('en_core_web_lg')

In [171]:
# Define a function to split essay into sentences
def create_sentences_df(df):
    """ Function to split essay data into individual sentences. Returns a dataframe"""
    start=datetime.now()
    aeg_long_sentence = pd.DataFrame(columns=['essay_id','essay_set','sentence','domain1_score'])
    for i in range(len(df)):
        if i%1000 == 0:
            print("At iteration :",i)
            print("Duration: ",datetime.now()-start)
        sentence = nlp(df.essay[i])
        for s in sentence.sents:
            aeg_long_sentence = aeg_long_sentence.append({'essay_id' : df.essay_id[i],
                                                          'essay_set' : df.essay_set[i],'sentence' : s.text, 
                                                          'domain1_score' : df.domain1_score[i]},
                                                         ignore_index=True)
    return aeg_long_sentence

In [172]:
# Split train data into sentences
x_train_sentence_df = create_sentences_df(x_train_df)

At iteration : 0
Duration:  0:00:00.007296
At iteration : 1000
Duration:  0:01:23.314513
At iteration : 2000
Duration:  0:03:12.908393
At iteration : 3000
Duration:  0:05:40.725304
At iteration : 4000
Duration:  0:09:21.342668
At iteration : 5000
Duration:  0:13:23.004936
At iteration : 6000
Duration:  0:18:27.577065
At iteration : 7000
Duration:  0:23:38.768215
At iteration : 8000
Duration:  0:29:45.614656
At iteration : 9000
Duration:  0:36:41.560594
At iteration : 10000
Duration:  0:44:11.900203


In [173]:
# Split test data into sentences
x_test_sentence_df = create_sentences_df(x_test_df)

At iteration : 0
Duration:  0:00:00.005520
At iteration : 1000
Duration:  0:01:16.552397
At iteration : 2000
Duration:  0:02:52.433178


#### Normalize the scores

In [174]:
# Each essay set has a different scoring range. We need to normalize the scores to a standard scale for training.
def normalize_score(essay):
    """ Normalizes the domain score based on percentage"""
    score = 0
    score = float(essay[3])
    essay_set = essay[1]
    if essay_set == 1:
        div = 12
    elif essay_set == 2:
        div = 5
    elif essay_set == 3:
        div = 3
    elif essay_set == 4:
        div = 3
    elif essay_set == 5:
        div = 4
    elif essay_set == 6:
        div = 4
    elif essay_set == 7:
        div = 25
    elif essay_set == 8:
        div = 50
    return score/div

In [175]:
x_train_sentence_df['Norm_Score'] = x_train_sentence_df.apply(normalize_score,axis=1)
x_test_sentence_df['Norm_Score'] = x_test_sentence_df.apply(normalize_score,axis=1)

In [176]:
x_train_sentence_df.head(5)

Unnamed: 0,essay_id,essay_set,sentence,domain1_score,Norm_Score
0,3567,2,There are many types of reading materials for ...,4,0.8
1,3567,2,"You can find things on cars, trucks, sports, a...",4,0.8
2,3567,2,There are some materials in a library though t...,4,0.8
3,3567,2,But should those materials be removed from the...,4,0.8
4,3567,2,Some think that they should and others think t...,4,0.8


In [177]:
# Store this dataset in pickle format so that we don't have to redo the above steps.
x_train_sentence_df.to_pickle("./x_train_sentence_df.pkl")
x_test_sentence_df.to_pickle("./x_test_sentence_df.pkl")

In [5]:
# Open the pickled version
x_train_sentence_df = pd.read_pickle("./x_train_sentence_df.pkl")
x_test_sentence_df = pd.read_pickle("./x_test_sentence_df.pkl")
print(x_train_sentence_df.shape)
print(x_test_sentence_df.shape)

(147123, 5)
(36778, 5)


In [9]:
#def append_sentence(sid,sentence):
#    sentence = sid_dict[sid] + sentence
#    return sentence

In [17]:
#sid_dict = {1:"One. ",2:"Two. ",3:"Three. ",4:"Four. ",5:"Five ",6:"Six. ",7:"Seven. ",8:"Eight. "}

In [18]:
#x_train_sentence_df['mod_sentence'] = np.vectorize(append_sentence)(x_train_sentence_df['essay_set'],
#                                                                    x_train_sentence_df['sentence'])

#x_test_sentence_df['mod_sentence'] = np.vectorize(append_sentence)(x_test_sentence_df['essay_set'],
#                                                                    x_test_sentence_df['sentence'])

In [6]:
#x_train_sentence_df.head(5)

In [7]:
# Create train and test text and labels

x_train = x_train_sentence_df['sentence'].values
y_train = x_train_sentence_df['Norm_Score'].values
x_test = x_test_sentence_df['sentence'].values
y_test = x_test_sentence_df['Norm_Score'].values
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(147123,)
(147123,)
(36778,)
(36778,)


In [8]:
vocabulary_size = 50000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(x_train)
train_seq = tokenizer.texts_to_sequences(x_train)
train_data = pad_sequences(train_seq)

In [9]:
max_len_class = train_data.shape[1]
max_words_class = vocabulary_size

In [10]:
test_seq = tokenizer.texts_to_sequences(x_test)
test_data = pad_sequences(test_seq, maxlen=max_len_class)
test_data.shape

(36778, 175)

In [11]:
embeddings_index = dict()
f = open('/Users/kurapati/W266/data/glove.42B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1917494 word vectors.


In [12]:
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [19]:
# Common Functions:

# Define function to get the multiplication factor. We can get it back from the essay set, but
# it is better to do it from the score, because there are scores with value 0, and its MF should be 0

def find_mult_factor(x):
    """ Function to find the multiplication factor for denormalizing"""
    if x[1] == 0:
        return 0
    else:
        return np.around(x[0]/x[1])
    
def denormalize(x):
    """ Function to Denormalize the score"""
    return np.around(x[2] * x[3])

def find_max_min_mean_score(df):
    """ Function to find the max, min and rounded mean of sentence scores"""
    new_df = pd.DataFrame(columns=['essay_id','essay_set','Orig_Score','Max_Score',
                                   'Min_Score','Mean_Score'])
    essay_ids = np.unique(df.essay_id)
    for e_id in essay_ids:
        df_temp = df[df.essay_id == e_id]
        max_score = np.max(df_temp.pred_score)
        min_score = np.min(df_temp.pred_score)
        # we need to round the mean so that kappa score doesnt complain
        mean_score = np.around(np.mean(df_temp.pred_score))
        new_df = new_df.append({'essay_id':e_id,'essay_set':int(np.unique(df_temp.essay_set)),
                                'Orig_Score':int(np.unique(df_temp.orig_score)),
                                'Max_Score':max_score,'Min_Score':min_score,
                                'Mean_Score':mean_score},ignore_index=True)
    return new_df

def essay_set_metrics(df):
    """ Calculate per essay set metrics"""
    set_df = pd.DataFrame(columns=['essay_set','RMSE','Kappa','Kappa_Q','Accuracy'])
    e_sets = np.unique(df.essay_set)
    for e_s in e_sets:
        df_s = df[df.essay_set == e_s]
        original_score = df_s.Orig_Score.values.astype(int)
        predicted_score = df_s.Mean_Score.values.astype(int)
        rmse = RMSE(original_score,predicted_score)
        kappa = cohen_kappa_score(original_score,predicted_score)
        kappa_q = cohen_kappa_score(original_score,predicted_score,weights='quadratic')
        accuracy = accuracy_score(original_score,predicted_score)
        set_df = set_df.append({'essay_set':e_s,'RMSE':rmse,'Kappa':kappa,'Kappa_Q':kappa_q,
                                'Accuracy':accuracy},ignore_index=True)
    return set_df


def sentence_count(df):
    """ Returns the number of sentences in an essay """
    essay_count = df.groupby('essay_id').count()
    essay_count = essay_count.drop(['sentence','domain1_score','Norm_Score'],axis=1)
    essay_count.columns = ['Number_of_Sentences']
    return essay_count

In [14]:
def RMSE(actual, predict):
    diff = actual - predict
    diff = sum(diff**2) / len(actual)
    return np.sqrt(diff)

In [34]:
#model definitions

def FF_NN():
    """ Simple feed forward NN"""
    model_ff = Sequential()
    model_ff.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    model_ff.add(tf.keras.layers.Flatten())
    model_ff.add(tf.keras.layers.Dense(50,activation='tanh'))
    model_ff.add(tf.keras.layers.Dropout(0.1))
    model_ff.add(tf.keras.layers.Dense(1,activation='sigmoid'))
    model_ff.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_ff

def GRU():
    """ Gated Recurrent Unit"""
    model_gru = Sequential()
    model_gru.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    model_gru.add(tf.keras.layers.GRU(32,activation='tanh'))
    model_gru.add(tf.keras.layers.Dropout(0.1))
    model_gru.add(tf.keras.layers.Dense(1,name='out_layer'))
    model_gru.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_gru

def CNN_FF():
    """ CNN with Feed Forward NN """
    model_conv = Sequential()
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    model_conv.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
    model_conv.add(tf.keras.layers.MaxPooling1D(pool_size=4))
    model_conv.add(tf.keras.layers.Flatten())
    model_conv.add(tf.keras.layers.Dense(100))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    return model_conv

def CNN_lstm():
    """ CNN with single layer LSTM & Feed Forward NN"""
    model_conv = Sequential()
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    #model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
    model_conv.add(tf.keras.layers.MaxPooling1D(pool_size=4))
    model_conv.add(tf.keras.layers.LSTM(100))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.Dense(100))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    return model_conv

def stack_lstm():
    """ Three layered stacked LSTM."""
    model_conv = Sequential()
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    model_conv.add(tf.keras.layers.LSTM(32,return_sequences=True))
    model_conv.add(tf.keras.layers.LSTM(32, return_sequences=True))
    model_conv.add(tf.keras.layers.Dropout(0.2))
    model_conv.add(tf.keras.layers.LSTM(32))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    return model_conv

def stateful_stacked_lstm():
    # In stateful, total samples needs to be divisible by batch size
    # we have 147026 samples, so selecting 6683 (6683*22=147026)
    # The test sample need to be a multiple of 6683 as well
    batch_size=2
    model_conv = Sequential()
    # In stateful, we have to pass batch_input_shape to the first layer
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], 
                                             trainable=False,batch_input_shape=(batch_size,max_len_class)))
    model_conv.add(tf.keras.layers.LSTM(32,stateful=True,return_sequences=True))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.LSTM(32))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.Dense(100))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    #sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    #model_conv.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_conv


#### Simple Feed Forward NN

In [16]:
# Train the model
estimator_nn = KerasRegressor(build_fn=FF_NN, epochs=20, batch_size=500)
estimator_nn.fit(train_data, y_train)

# Predict for test data
prediction_nn=estimator_nn.predict(test_data)
rmse_val = RMSE(y_test,prediction_nn)
# Find the overall RMSE value. This is not very relevant considering different scale for each set.
print("RMSE: ",rmse_val)

# Construct pandas data frame of scores of each sentences
nn_df = pd.DataFrame([x_test_sentence_df['domain1_score'].astype(np.double), y_test.astype(np.double),
                      prediction_nn.astype(np.double)]).transpose()
nn_df.columns = ['Orig_Score','Norm_Score','Pred_Score']
# Find Multiplication factor using the function we defined before
nn_df['Mult_Factor'] = nn_df.apply(find_mult_factor,axis=1)
# Find the denormalized predicted score
nn_df['Denorm_Pred_Score'] = nn_df.apply(denormalize,axis=1)

# Extract the scores of each sentences
orig_score = nn_df.Orig_Score.as_matrix(columns=None)
orig_score = orig_score.astype(np.int)
pred_score = nn_df.Denorm_Pred_Score.as_matrix(columns=None)
pred_score = pred_score.astype(np.int)

# Provide the metrics at sentence level
rmse_nn = RMSE(orig_score,pred_score)
cohen_kappa = cohen_kappa_score(orig_score,pred_score,weights='quadratic')
accuracy = accuracy_score(orig_score,pred_score)
print("**** METRICS BASED ON DENORMALIZED SCORE FOR FEED FORWARD NN****")
print("**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****")
print("RMSE: ",rmse_nn)
print("Kappa Quadratic Weighting: ",cohen_kappa)
print("Accuracy: ",accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
RMSE:  0.17843482393896942
**** METRICS BASED ON DENORMALIZED SCORE FOR FEED FORWARD NN****
**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****
RMSE:  3.0158011916323653
Kappa Quadratic Weighting:  0.9710287577862428
Accuracy:  0.3076295611506879


In [20]:
# We will combine the sentences back to essay in this section

result_df = pd.DataFrame([x_test_sentence_df.essay_id.values,x_test_sentence_df.essay_set.values,
                         x_test_sentence_df.sentence.values,nn_df.Orig_Score.values,
                          nn_df.Denorm_Pred_Score.values]).transpose()
result_df.columns = ['essay_id','essay_set','sentence','orig_score','pred_score']
nn_test_df = find_max_min_mean_score(result_df)

#Convert the dataframe parameters into integers as they are returned as floats
nn_test_df.essay_id = nn_test_df.essay_id.astype(int)
nn_test_df.essay_set = nn_test_df.essay_set.astype(int)
nn_test_df.Orig_Score = nn_test_df.Orig_Score.astype(int)
nn_test_df.Max_Score = nn_test_df.Max_Score.astype(int)
nn_test_df.Min_Score = nn_test_df.Min_Score.astype(int)
nn_test_df.Mean_Score = nn_test_df.Mean_Score.astype(int)

# Extract the scores
orig_score = nn_test_df.Orig_Score.values
max_pred_score = nn_test_df.Max_Score.values
min_pred_score = nn_test_df.Min_Score.values
mean_pred_score = nn_test_df.Mean_Score.values

# Compare the metrics if Max score of all sentences are taken
rmse_max_nn = RMSE(orig_score,max_pred_score)
cohen_kappa_max_nn = cohen_kappa_score(orig_score,max_pred_score)
accuracy_max_nn = accuracy_score(orig_score,max_pred_score)

# Compare the metrics if Min score of all sentences are taken
rmse_min_nn = RMSE(orig_score,min_pred_score)
cohen_kappa_min_nn = cohen_kappa_score(orig_score,min_pred_score)
accuracy_min_nn = accuracy_score(orig_score,min_pred_score)

# Compare the metrics if Mean score of all sentences are taken
rmse_mean_nn = RMSE(orig_score,mean_pred_score)
cohen_kappa_mean_nn = cohen_kappa_score(orig_score,mean_pred_score)
cohen_kappa_mean_q_nn = cohen_kappa_score(orig_score,mean_pred_score,weights='quadratic')
accuracy_mean_nn = accuracy_score(orig_score,mean_pred_score)

print("### Simple FF NN : Essay level Metrics : After taking MAX score all Sentences ###")
print("RMSE: ",rmse_max_nn)
print("Cohen Kappa: ",cohen_kappa_max_nn)
print("Accuracy: ",accuracy_max_nn)

print("### Simple FF NN : Essay level Metrics : After taking Min Score of all Sentences ###")
print("RMSE: ",rmse_min_nn)
print("Cohen Kappa: ",cohen_kappa_min_nn)
print("Accuracy: ",accuracy_min_nn)

print("### Simple FF NN : Essay level Metrics : After taking Mean Score of all Sentences ###")
print("RMSE: ",rmse_mean_nn)
print("Cohen Kappa: ",cohen_kappa_mean_nn)
print("Cohen Kappa Quadratic Weight: ",cohen_kappa_mean_q_nn)
print("Accuracy: ",accuracy_mean_nn)

# Provide per-essay-set metrics based on mean score
essay_set_results = essay_set_metrics(nn_test_df)
essay_set_results

### Simple FF NN : Essay level Metrics : After taking MAX score all Sentences ###
RMSE:  3.1247187854979463
Cohen Kappa:  0.19024697334778928
Accuracy:  0.28197226502311246
### Simple FF NN : Essay level Metrics : After taking Min Score of all Sentences ###
RMSE:  3.4116066731746675
Cohen Kappa:  0.2087755187967566
Accuracy:  0.31124807395993837
### Simple FF NN : Essay level Metrics : After taking Mean Score of all Sentences ###
RMSE:  2.0490893915006625
Cohen Kappa:  0.28876797313603686
Cohen Kappa Quadratic Weight:  0.9727217498960006
Accuracy:  0.3817411402157165


Unnamed: 0,essay_set,RMSE,Kappa,Kappa_Q,Accuracy
0,1.0,1.372047,0.061841,0.253271,0.284153
1,2.0,0.852803,0.143785,0.256081,0.489305
2,3.0,0.777683,0.055493,0.231869,0.39521
3,4.0,0.647645,0.395261,0.728173,0.580556
4,5.0,0.923133,0.117823,0.317429,0.405797
5,6.0,0.849837,0.14391,0.466218,0.502646
6,7.0,4.238346,0.013377,0.243856,0.089404
7,8.0,5.121872,-0.010131,0.354521,0.043796


#### RNN GRU

In [24]:
# Train the model
estimator = KerasRegressor(build_fn=GRU, epochs=20, batch_size=500)
estimator.fit(train_data, y_train)

# Predict for test data
prediction_rnn=estimator.predict(test_data)
rmse_val = RMSE(y_test,prediction_rnn)
# Find the overall RMSE value. This is not very relevant considering different scale for each set.
print("RMSE: ",rmse_val)

# Construct pandas data frame of scores of each sentences
rnn_df = pd.DataFrame([x_test_sentence_df['domain1_score'].astype(np.double), y_test.astype(np.double),
                      prediction_rnn.astype(np.double)]).transpose()
rnn_df.columns = ['Orig_Score','Norm_Score','Pred_Score']
# Find Multiplication factor using the function we defined before
rnn_df['Mult_Factor'] = rnn_df.apply(find_mult_factor,axis=1)
# Find the denormalized predicted score
rnn_df['Denorm_Pred_Score'] = rnn_df.apply(denormalize,axis=1)

# Extract the scores of each sentences
orig_score = rnn_df.Orig_Score.as_matrix(columns=None)
orig_score = orig_score.astype(np.int)
pred_score = rnn_df.Denorm_Pred_Score.as_matrix(columns=None)
pred_score = pred_score.astype(np.int)

# Provide the metrics at sentence level
rmse_rnn = RMSE(orig_score,pred_score)
cohen_kappa = cohen_kappa_score(orig_score,pred_score,weights='quadratic')
accuracy = accuracy_score(orig_score,pred_score)

print("**** METRICS BASED ON DENORMALIZED SCORE FOR SIMPLE RNN****")
print("**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****")
print("RMSE: ",rmse_rnn)
print("Kappa Quadratic Weighting: ",cohen_kappa)
print("Accuracy: ",accuracy)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
RMSE:  0.16376953718129628
**** METRICS BASED ON DENORMALIZED SCORE FOR SIMPLE RNN****
**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****
RMSE:  2.684139715898777
Kappa Quadratic Weighting:  0.9764312881541207
Accuracy:  0.32976235793137204


In [25]:
# We will combine the sentences back to essay in this section

result_df = pd.DataFrame([x_test_sentence_df.essay_id.values,x_test_sentence_df.essay_set.values,
                         x_test_sentence_df.sentence.values,rnn_df.Orig_Score.values,
                          rnn_df.Denorm_Pred_Score.values]).transpose()
result_df.columns = ['essay_id','essay_set','sentence','orig_score','pred_score']
rnn_test_df = find_max_min_mean_score(result_df)

#Convert the dataframe parameters into integers as they are returned as floats
rnn_test_df.essay_id = rnn_test_df.essay_id.astype(int)
rnn_test_df.essay_set = rnn_test_df.essay_set.astype(int)
rnn_test_df.Orig_Score = rnn_test_df.Orig_Score.astype(int)
rnn_test_df.Max_Score = rnn_test_df.Max_Score.astype(int)
rnn_test_df.Min_Score = rnn_test_df.Min_Score.astype(int)
rnn_test_df.Mean_Score = rnn_test_df.Mean_Score.astype(int)

# Extract the scores
orig_score = rnn_test_df.Orig_Score.values
max_pred_score = rnn_test_df.Max_Score.values
min_pred_score = rnn_test_df.Min_Score.values
mean_pred_score = rnn_test_df.Mean_Score.values

# Compare the metrics if Max score of all sentences are taken
rmse_max_rnn = RMSE(orig_score,max_pred_score)
cohen_kappa_max_rnn = cohen_kappa_score(orig_score,max_pred_score)
accuracy_max_rnn = accuracy_score(orig_score,max_pred_score)

# Compare the metrics if Min score of all sentences are taken
rmse_min_rnn = RMSE(orig_score,min_pred_score)
cohen_kappa_min_rnn = cohen_kappa_score(orig_score,min_pred_score)
accuracy_min_rnn = accuracy_score(orig_score,min_pred_score)

# Compare the metrics if Mean score of all sentences are taken
rmse_mean_rnn = RMSE(orig_score,mean_pred_score)
cohen_kappa_mean_rnn = cohen_kappa_score(orig_score,mean_pred_score)
cohen_kappa_mean_q_rnn = cohen_kappa_score(orig_score,mean_pred_score,weights='quadratic')
accuracy_mean_rnn = accuracy_score(orig_score,mean_pred_score)

print("### RNN GRU Results : MAX Score of all Sentences ###")
print("RMSE: ",rmse_max_rnn)
print("Cohen Kappa: ",cohen_kappa_max_rnn)
print("Accuracy: ",accuracy_max_rnn)

print("### RNN GRU Results : Min Score of all Sentences ###")
print("RMSE: ",rmse_min_rnn)
print("Cohen Kappa: ",cohen_kappa_min_rnn)
print("Accuracy: ",accuracy_min_rnn)

print("### RNN GRU Results : Mean Score of all Sentences ###")
print("RMSE: ",rmse_mean_rnn)
print("Cohen Kappa: ",cohen_kappa_mean_rnn)
print("Cohen Kappa Quadratic: ",cohen_kappa_mean_q_rnn)
print("Accuracy: ",accuracy_mean_rnn)

# Provide per-essay-set metrics based on mean score
essay_set_results = essay_set_metrics(rnn_test_df)
essay_set_results

### RNN GRU Results : MAX Score of all Sentences ###
RMSE:  2.6294673630321324
Cohen Kappa:  0.23117369442253644
Accuracy:  0.3231895223420647
### RNN GRU Results : Min Score of all Sentences ###
RMSE:  2.5619444416368395
Cohen Kappa:  0.30009785322978433
Accuracy:  0.39599383667180277
### RNN GRU Results : Mean Score of all Sentences ###
RMSE:  2.038629256385843
Cohen Kappa:  0.2946304356259103
Cohen Kappa Quadratic:  0.972956212599746
Accuracy:  0.38751926040061635


Unnamed: 0,essay_set,RMSE,Kappa,Kappa_Q,Accuracy
0,1.0,1.423841,0.085902,0.21156,0.300546
1,2.0,0.797724,0.20253,0.340192,0.526738
2,3.0,0.775755,0.0608,0.239757,0.398204
3,4.0,0.634648,0.421658,0.743862,0.597222
4,5.0,0.915249,0.102742,0.316925,0.397101
5,6.0,0.859125,0.088058,0.448045,0.47619
6,7.0,4.141536,0.00317,0.261132,0.082781
7,8.0,5.207378,0.014167,0.320681,0.065693


#### CNN FF

In [35]:
# Train the model
estimator = KerasRegressor(build_fn=CNN_FF, epochs=20, batch_size=500)
estimator.fit(train_data, y_train)

# Precit for test data
prediction_cnn_glove=estimator.predict(test_data)
rmse_val = RMSE(y_test,prediction_cnn_glove)
print("RMSE BEFORE DENORMALIZING: ",rmse_val)

# Construct data frame for denormalizing the score
cnn_df = pd.DataFrame([x_test_sentence_df['domain1_score'].astype(np.double), y_test.astype(np.double),
                      prediction_cnn_glove.astype(np.double)]).transpose()
cnn_df.columns = ['Orig_Score','Norm_Score','Pred_Score']
cnn_df['Mult_Factor'] = cnn_df.apply(find_mult_factor,axis=1)
cnn_df['Denorm_Pred_Score'] = cnn_df.apply(denormalize,axis=1)

# Extract the scores
orig_score = cnn_df.Orig_Score.as_matrix(columns=None)
orig_score = orig_score.astype(np.int)
pred_score = cnn_df.Denorm_Pred_Score.as_matrix(columns=None)
pred_score = pred_score.astype(np.int)
rmse_cnn = RMSE(orig_score,pred_score)
cohen_kappa = cohen_kappa_score(orig_score,pred_score,weights='quadratic')
accuracy = accuracy_score(orig_score,pred_score)

print("**** METRICS BASED ON DENORMALIZED SCORE FOR SIMPLE CNN****")
print("**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****")
print("RMSE: ",rmse_cnn)
print("Kappa Quadratic Weighting: ",cohen_kappa)
print("Accuracy: ",accuracy)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
RMSE BEFORE DENORMALIZING:  0.17770103259475886
**** METRICS BASED ON DENORMALIZED SCORE FOR SIMPLE CNN****
**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****
RMSE:  2.9199863154915833
Kappa Quadratic Weighting:  0.9718479854168396
Accuracy:  0.2994725107401164


In [36]:
# We will combine the sentences back to essay in this section

result_df = pd.DataFrame([x_test_sentence_df.essay_id.values,x_test_sentence_df.essay_set.values,
                         x_test_sentence_df.sentence.values,cnn_df.Orig_Score.values,
                          cnn_df.Denorm_Pred_Score.values]).transpose()
result_df.columns = ['essay_id','essay_set','sentence','orig_score','pred_score']
cnn_test_df = find_max_min_mean_score(result_df)

#Convert the dataframe parameters into integers as they are returned as floats
cnn_test_df.essay_id = cnn_test_df.essay_id.astype(int)
cnn_test_df.essay_set = cnn_test_df.essay_set.astype(int)
cnn_test_df.Orig_Score = cnn_test_df.Orig_Score.astype(int)
cnn_test_df.Max_Score = cnn_test_df.Max_Score.astype(int)
cnn_test_df.Min_Score = cnn_test_df.Min_Score.astype(int)
cnn_test_df.Mean_Score = cnn_test_df.Mean_Score.astype(int)

# Extract the scores
orig_score = cnn_test_df.Orig_Score.values
max_pred_score = cnn_test_df.Max_Score.values
min_pred_score = cnn_test_df.Min_Score.values
mean_pred_score = cnn_test_df.Mean_Score.values

# Compare the metrics if Max score of all sentences are taken
rmse_max_cnn = RMSE(orig_score,max_pred_score)
cohen_kappa_max_cnn = cohen_kappa_score(orig_score,max_pred_score)
accuracy_max_cnn = accuracy_score(orig_score,max_pred_score)

# Compare the metrics if Min score of all sentences are taken
rmse_min_cnn = RMSE(orig_score,min_pred_score)
cohen_kappa_min_cnn = cohen_kappa_score(orig_score,min_pred_score)
accuracy_min_cnn = accuracy_score(orig_score,min_pred_score)

# Compare the metrics if Mean score of all sentences are taken
rmse_mean_cnn = RMSE(orig_score,mean_pred_score)
cohen_kappa_mean_cnn = cohen_kappa_score(orig_score,mean_pred_score)
cohen_kappa_mean_q_cnn = cohen_kappa_score(orig_score,mean_pred_score,weights='quadratic')
accuracy_mean_cnn = accuracy_score(orig_score,mean_pred_score)

print("### CNN FF Results : MAX Score of all Sentences ###")
print("RMSE: ",rmse_max_cnn)
print("Cohen Kappa: ",cohen_kappa_max_cnn)
print("Accuracy: ",accuracy_max_cnn)

print("### CNN FF Results : Min Score of all Sentences ###")
print("RMSE: ",rmse_min_cnn)
print("Cohen Kappa: ",cohen_kappa_min_cnn)
print("Accuracy: ",accuracy_min_cnn)

print("### CNN FF Results : Mean Score of all Sentences ###")
print("RMSE: ",rmse_mean_cnn)
print("Cohen Kappa: ",cohen_kappa_mean_cnn)
print("Cohen Kappa Quadratic: ",cohen_kappa_mean_q_cnn)
print("Accuracy: ",accuracy_mean_cnn)

# Provide per-essay-set metrics based on mean score
essay_set_results = essay_set_metrics(cnn_test_df)
essay_set_results

### CNN FF Results : MAX Score of all Sentences ###
RMSE:  3.174860321108886
Cohen Kappa:  0.21270831634041953
Accuracy:  0.3039291217257319
### CNN FF Results : Min Score of all Sentences ###
RMSE:  2.8558053219678627
Cohen Kappa:  0.255875235499437
Accuracy:  0.35554699537750384
### CNN FF Results : Mean Score of all Sentences ###
RMSE:  2.304447364973929
Cohen Kappa:  0.250191281372947
Cohen Kappa Quadratic:  0.9654299265751505
Accuracy:  0.3447611710323575


Unnamed: 0,essay_set,RMSE,Kappa,Kappa_Q,Accuracy
0,1.0,1.542654,0.009126,0.035933,0.196721
1,2.0,1.013281,-0.000246,0.007121,0.390374
2,3.0,0.789148,0.056893,0.213993,0.39521
3,4.0,0.703167,0.380702,0.706576,0.569444
4,5.0,0.963087,0.076782,0.252134,0.376812
5,6.0,0.85604,0.095241,0.448504,0.481481
6,7.0,4.792724,0.003852,0.05653,0.072848
7,8.0,5.80146,-0.009166,0.103354,0.043796


#### CNN_LSTM

In [37]:
# Train the model
estimator = KerasRegressor(build_fn=CNN_lstm, epochs=20, batch_size=500)
estimator.fit(train_data, y_train)

# Precit for test data
prediction_cnn_lstm=estimator.predict(test_data)
rmse_val = RMSE(y_test,prediction_cnn_lstm)
print("RMSE BEFORE DENORMALIZING: ",rmse_val)

# Construct data frame for denormalizing the score
cnn_df = pd.DataFrame([x_test_sentence_df['domain1_score'].astype(np.double), y_test.astype(np.double),
                      prediction_cnn_lstm.astype(np.double)]).transpose()
cnn_df.columns = ['Orig_Score','Norm_Score','Pred_Score']
cnn_df['Mult_Factor'] = cnn_df.apply(find_mult_factor,axis=1)
cnn_df['Denorm_Pred_Score'] = cnn_df.apply(denormalize,axis=1)

# Extract the scores
orig_score = cnn_df.Orig_Score.as_matrix(columns=None)
orig_score = orig_score.astype(np.int)
pred_score = cnn_df.Denorm_Pred_Score.as_matrix(columns=None)
pred_score = pred_score.astype(np.int)
rmse_cnn = RMSE(orig_score,pred_score)
cohen_kappa = cohen_kappa_score(orig_score,pred_score,weights='quadratic')
accuracy = accuracy_score(orig_score,pred_score)

print("**** METRICS BASED ON DENORMALIZED SCORE FOR CNN+LSTM****")
print("**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****")
print("RMSE: ",rmse_cnn)
print("Kappa Quadratic Weighting: ",cohen_kappa)
print("Accuracy: ",accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
RMSE BEFORE DENORMALIZING:  0.1802107681054047
**** METRICS BASED ON DENORMALIZED SCORE FOR CNN+LSTM****
**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****
RMSE:  3.031107414698982
Kappa Quadratic Weighting:  0.9707114569611448
Accuracy:  0.3067594757735603


In [38]:
# We will combine the sentences back to essay in this section

result_df = pd.DataFrame([x_test_sentence_df.essay_id.values,x_test_sentence_df.essay_set.values,
                         x_test_sentence_df.sentence.values,cnn_df.Orig_Score.values,
                          cnn_df.Denorm_Pred_Score.values]).transpose()
result_df.columns = ['essay_id','essay_set','sentence','orig_score','pred_score']
cnn_test_df = find_max_min_mean_score(result_df)

#Convert the dataframe parameters into integers as they are returned as floats
cnn_test_df.essay_id = cnn_test_df.essay_id.astype(int)
cnn_test_df.essay_set = cnn_test_df.essay_set.astype(int)
cnn_test_df.Orig_Score = cnn_test_df.Orig_Score.astype(int)
cnn_test_df.Max_Score = cnn_test_df.Max_Score.astype(int)
cnn_test_df.Min_Score = cnn_test_df.Min_Score.astype(int)
cnn_test_df.Mean_Score = cnn_test_df.Mean_Score.astype(int)

# Extract the scores
orig_score = cnn_test_df.Orig_Score.values
max_pred_score = cnn_test_df.Max_Score.values
min_pred_score = cnn_test_df.Min_Score.values
mean_pred_score = cnn_test_df.Mean_Score.values

# Compare the metrics if Max score of all sentences are taken
rmse_max_cnn = RMSE(orig_score,max_pred_score)
cohen_kappa_max_cnn = cohen_kappa_score(orig_score,max_pred_score)
accuracy_max_cnn = accuracy_score(orig_score,max_pred_score)

# Compare the metrics if Min score of all sentences are taken
rmse_min_cnn = RMSE(orig_score,min_pred_score)
cohen_kappa_min_cnn = cohen_kappa_score(orig_score,min_pred_score)
accuracy_min_cnn = accuracy_score(orig_score,min_pred_score)

# Compare the metrics if Mean score of all sentences are taken
rmse_mean_cnn = RMSE(orig_score,mean_pred_score)
cohen_kappa_mean_cnn = cohen_kappa_score(orig_score,mean_pred_score)
cohen_kappa_mean_q_cnn = cohen_kappa_score(orig_score,mean_pred_score,weights='quadratic')
accuracy_mean_cnn = accuracy_score(orig_score,mean_pred_score)

print("### CNN LSTM Results : MAX Score of all Sentences ###")
print("RMSE: ",rmse_max_cnn)
print("Cohen Kappa: ",cohen_kappa_max_cnn)
print("Accuracy: ",accuracy_max_cnn)

print("### CNN LSTM Results : Min Score of all Sentences ###")
print("RMSE: ",rmse_min_cnn)
print("Cohen Kappa: ",cohen_kappa_min_cnn)
print("Accuracy: ",accuracy_min_cnn)

print("### CNN LSTM Results : Mean Score of all Sentences ###")
print("RMSE: ",rmse_mean_cnn)
print("Cohen Kappa: ",cohen_kappa_mean_cnn)
print("Cohen Kappa Quadratic: ",cohen_kappa_mean_q_cnn)
print("Accuracy: ",accuracy_mean_cnn)

# Provide per-essay-set metrics based on mean score
essay_set_results = essay_set_metrics(cnn_test_df)
essay_set_results

### CNN LSTM Results : MAX Score of all Sentences ###
RMSE:  3.4449221244840396
Cohen Kappa:  0.16762993097773926
Accuracy:  0.2569337442218798
### CNN LSTM Results : Min Score of all Sentences ###
RMSE:  3.275369161376681
Cohen Kappa:  0.22088848804757633
Accuracy:  0.32126348228043144
### CNN LSTM Results : Mean Score of all Sentences ###
RMSE:  2.1855386871090934
Cohen Kappa:  0.27847628160086124
Cohen Kappa Quadratic:  0.9696044936380251
Accuracy:  0.3697996918335901


Unnamed: 0,essay_set,RMSE,Kappa,Kappa_Q,Accuracy
0,1.0,1.43626,0.056193,0.211959,0.254098
1,2.0,0.922099,0.092158,0.157407,0.451872
2,3.0,0.787249,0.062492,0.234437,0.398204
3,4.0,0.643342,0.419355,0.740524,0.594444
4,5.0,0.921561,0.126952,0.328317,0.411594
5,6.0,0.875897,0.122077,0.450229,0.486772
6,7.0,4.521106,-0.014737,0.187675,0.056291
7,8.0,5.546749,0.005235,0.236956,0.058394


#### Stacked LSTM

In [31]:
# Train the model
estimator_lstm = KerasRegressor(build_fn=stack_lstm, epochs=20, batch_size=500)
estimator_lstm.fit(train_data, y_train)

# Predict for test data
prediction_lstm=estimator_lstm.predict(test_data)
rmse_val = RMSE(y_test,prediction_lstm)
# Find the overall RMSE value. This is not very relevant considering different scale for each set.
print("RMSE: ",rmse_val)

# Construct pandas data frame of scores of each sentences
lstm_df = pd.DataFrame([x_test_sentence_df['domain1_score'].astype(np.double), y_test.astype(np.double),
                      prediction_lstm.astype(np.double)]).transpose()
lstm_df.columns = ['Orig_Score','Norm_Score','Pred_Score']
# Find Multiplication factor using the function we defined before
lstm_df['Mult_Factor'] = lstm_df.apply(find_mult_factor,axis=1)
# Find the denormalized predicted score
lstm_df['Denorm_Pred_Score'] = lstm_df.apply(denormalize,axis=1)

# Extract the scores of each sentences
orig_score = lstm_df.Orig_Score.as_matrix(columns=None)
orig_score = orig_score.astype(np.int)
pred_score = lstm_df.Denorm_Pred_Score.as_matrix(columns=None)
pred_score = pred_score.astype(np.int)

# Provide the metrics at sentence level
rmse_lstm = RMSE(orig_score,pred_score)
cohen_kappa = cohen_kappa_score(orig_score,pred_score,weights='quadratic')
accuracy = accuracy_score(orig_score,pred_score)

print("**** METRICS BASED ON DENORMALIZED SCORE FOR SIMPLE CNN****")
print("**** NOTE: THIS IS STILL AT SENTENCE LEVEL. NEXT SECTION WILL MERGE SENTENCE SCORES****")
print("RMSE: ",rmse_lstm)
print("Kappa Quadratic Weighting: ",cohen_kappa)
print("Accuracy: ",accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
RMSE:  0.16435884045638574
RMSE Sentence Level:  2.682513368191454
Kappa Sentence Level:  0.26781235443159734
Accuracy Sentence Level:  0.33120343683723963


In [32]:
# We will combine the sentences back to essay in this section

result_df = pd.DataFrame([x_test_sentence_df.essay_id.values,x_test_sentence_df.essay_set.values,
                         x_test_sentence_df.sentence.values,lstm_df.Orig_Score.values,
                          lstm_df.Denorm_Pred_Score.values]).transpose()
result_df.columns = ['essay_id','essay_set','sentence','orig_score','pred_score']
lstm_test_df = find_max_min_mean_score(result_df)

#Convert the dataframe parameters into integers as they are returned as floats
lstm_test_df.essay_id = lstm_test_df.essay_id.astype(int)
lstm_test_df.essay_set = lstm_test_df.essay_set.astype(int)
lstm_test_df.Orig_Score = lstm_test_df.Orig_Score.astype(int)
lstm_test_df.Max_Score = lstm_test_df.Max_Score.astype(int)
lstm_test_df.Min_Score = lstm_test_df.Min_Score.astype(int)
lstm_test_df.Mean_Score = lstm_test_df.Mean_Score.astype(int)

# Extract the scores
orig_score = lstm_test_df.Orig_Score.values
max_pred_score = lstm_test_df.Max_Score.values
min_pred_score = lstm_test_df.Min_Score.values
mean_pred_score = lstm_test_df.Mean_Score.values

# Compare the metrics if Max score of all sentences are taken
rmse_max_lstm = RMSE(orig_score,max_pred_score)
cohen_kappa_max_lstm = cohen_kappa_score(orig_score,max_pred_score)
accuracy_max_lstm = accuracy_score(orig_score,max_pred_score)

# Compare the metrics if Min score of all sentences are taken
rmse_min_lstm = RMSE(orig_score,min_pred_score)
cohen_kappa_min_lstm = cohen_kappa_score(orig_score,min_pred_score)
accuracy_min_lstm = accuracy_score(orig_score,min_pred_score)

# Compare the metrics if Mean score of all sentences are taken
rmse_mean_lstm = RMSE(orig_score,mean_pred_score)
cohen_kappa_mean_lstm = cohen_kappa_score(orig_score,mean_pred_score)
accuracy_mean_lstm = accuracy_score(orig_score,mean_pred_score)

print("### Stacked LSTM Results : MAX Score of all Sentences ###")
print("RMSE: ",rmse_max_lstm)
print("Cohen Kappa: ",cohen_kappa_max_lstm)
print("Accuracy: ",accuracy_max_lstm)

print("### Stacked LSTM Results : Min Score of all Sentences ###")
print("RMSE: ",rmse_min_lstm)
print("Cohen Kappa: ",cohen_kappa_min_lstm)
print("Accuracy: ",accuracy_min_lstm)

print("### Stacked LSTM Results : Mean Score of all Sentences ###")
print("RMSE: ",rmse_mean_lstm)
print("Cohen Kappa: ",cohen_kappa_mean_lstm)
print("Accuracy: ",accuracy_mean_lstm)

# Provide per-essay-set metrics based on mean score
essay_set_results = essay_set_metrics(lstm_test_df)
essay_set_results

### Stacked LSTM Results : MAX Score of all Sentences ###
RMSE:  2.6410881634680607
Cohen Kappa:  0.23329561791239495
Accuracy:  0.3274268104776579
### Stacked LSTM Results : Min Score of all Sentences ###
RMSE:  2.71892018355833
Cohen Kappa:  0.29402029329686785
Accuracy:  0.38906009244992296
### Stacked LSTM Results : Mean Score of all Sentences ###
RMSE:  2.0400459223206706
Cohen Kappa:  0.2959414229512113
Accuracy:  0.387904468412943


Unnamed: 0,essay_set,RMSE,Kappa,Accuracy
0,1.0,1.389854,0.076284,0.297814
1,2.0,0.822478,0.172556,0.505348
2,3.0,0.775755,0.064008,0.398204
3,4.0,0.628048,0.433737,0.605556
4,5.0,0.892805,0.109588,0.402899
5,6.0,0.81973,0.125403,0.494709
6,7.0,4.165055,0.003667,0.07947
7,8.0,5.219978,0.010692,0.058394
