In [60]:
# Imports
import json, os, re, shutil, sys, time
import seaborn as sns
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from collections import defaultdict
import xmltodict
import untangle
import xml.etree.ElementTree as ET
# NLTK for NLP utils and corpora
import nltk
from nltk.corpus import treebank
from nltk.text import Text
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

# Import spacy
import spacy

import pickle

import sklearn
from sklearn.model_selection import train_test_split

# NumPy, Pandas and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf

from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

from numpy.random import seed
from pandas import read_csv, DataFrame
from sklearn.preprocessing import minmax_scale

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Dropout, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.layers.embeddings import Embedding
import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Activation, Dropout, Conv1D, MaxPooling1D, Bidirectional, Flatten, TimeDistributed

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score
import keras.backend as K
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from datetime import datetime


## AEG Long Essay Sentence Level Classification

In [2]:
# Read the data
aeg_long = pd.read_csv("../data-DNC/AEG/training_set_rel3.tsv",sep='\t',encoding = "latin1")
aeg_long.head(5)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [5]:
# Split the data into train and test. We need to do this first to ensure that when we split to 
# sentence level, we have sentences of a given essay in either training or test but not on both.

tr_essay,ts_essay,tr_domain_score,ts_domain_score,tr_essay_id,ts_essay_id,tr_essay_set,ts_essay_set=train_test_split(
    np.asarray(aeg_long.essay),np.asarray(aeg_long.domain1_score),
    np.asarray(aeg_long.essay_id),np.asarray(aeg_long.essay_set),test_size=0.2)

In [6]:
# Check the shapes of training and test datasets
print(tr_essay.shape)
print(ts_essay.shape)

(10380,)
(2596,)


In [10]:
# Create train and test data frames with relevant fields

x_train_df = pd.DataFrame([tr_essay_id,tr_essay_set,tr_essay,tr_domain_score.astype(np.double)]).transpose()
x_train_df.columns = ['essay_id','essay_set','essay','domain1_score']
x_test_df = pd.DataFrame([ts_essay_id,ts_essay_set,ts_essay,ts_domain_score.astype(np.double)]).transpose()
x_test_df.columns = ['essay_id','essay_set','essay','domain1_score']
#print(x_train_df.head(5))
#print(x_test_df.head(5))

#### Split essay into sentences

In [7]:
# We will use spacy to split the essay into sentences. 
# Load spacy large english module
nlp = spacy.load('en_core_web_lg')

In [14]:
# Define a function to split essay into sentences
def create_sentences_df(df):
    """ Function to split essay data into individual sentences. Returns a dataframe"""
    start=datetime.now()
    aeg_long_sentence = pd.DataFrame(columns=['essay_id','essay_set','sentence','domain1_score'])
    for i in range(len(df)):
        if i%1000 == 0:
            print("At iteration :",i)
            print("Duration: ",datetime.now()-start)
        sentence = nlp(df.essay[i])
        for s in sentence.sents:
            aeg_long_sentence = aeg_long_sentence.append({'essay_id' : df.essay_id[i],
                                                          'essay_set' : df.essay_set[i],'sentence' : s.text, 
                                                          'domain1_score' : df.domain1_score[i]},
                                                         ignore_index=True)
    return aeg_long_sentence

In [None]:
# We also need to convert the spacy format text into regular text
def spacy_to_text(essay):
    return essay[2].text

In [15]:
# Split train data into sentences
x_train_sentence_df = create_sentences_df(x_train_df)

At iteration : 0
Duration:  0:00:00.001094
At iteration : 1000
Duration:  0:01:13.910230
At iteration : 2000
Duration:  0:02:41.471731
At iteration : 3000
Duration:  0:04:57.690647
At iteration : 4000
Duration:  0:07:51.580886
At iteration : 5000
Duration:  0:11:39.480142
At iteration : 6000
Duration:  0:16:11.116163
At iteration : 7000
Duration:  0:21:46.361166
At iteration : 8000
Duration:  0:27:57.529110
At iteration : 9000
Duration:  0:34:42.670452
At iteration : 10000
Duration:  0:42:12.653912


In [17]:
# Split test data into sentences
x_test_sentence_df = create_sentences_df(x_test_df)

At iteration : 0
Duration:  0:00:00.001098
At iteration : 1000
Duration:  0:01:15.384433
At iteration : 2000
Duration:  0:02:53.557499


#### Normalize the scores

In [19]:
# Each essay set has a different scoring range. We need to normalize the scores to a standard scale for training.
def normalize_score(essay):
    """ Normalizes the domain score based on percentage"""
    score = 0
    score = float(essay[3])
    essay_set = essay[1]
    if essay_set == 1:
        div = 12
    elif essay_set == 2:
        div = 5
    elif essay_set == 3:
        div = 3
    elif essay_set == 4:
        div = 3
    elif essay_set == 5:
        div = 4
    elif essay_set == 6:
        div = 4
    elif essay_set == 7:
        div = 25
    elif essay_set == 8:
        div = 50
    return score/div

In [20]:
x_train_sentence_df['Norm_Score'] = x_train_sentence_df.apply(normalize_score,axis=1)
x_test_sentence_df['Norm_Score'] = x_test_sentence_df.apply(normalize_score,axis=1)

In [25]:
x_train_sentence_df.head(5)

Unnamed: 0,essay_id,essay_set,sentence,domain1_score,Norm_Score
0,4075,2,Did your child ever bring home a book?,3.0,0.6
1,4075,2,"A piece of music, movie or magazine?",3.0,0.6
2,4075,2,Did you ever stop to think that the piece of i...,3.0,0.6
3,4075,2,There are points in time where there isn't an ...,3.0,0.6
4,4075,2,The driving question is do you want to remove ...,3.0,0.6


In [61]:
# Store this dataset in pickle format so that we don't have to redo the above steps.
x_train_sentence_df.to_pickle("./x_train_sentence_df.pkl")
x_test_sentence_df.to_pickle("./x_test_sentence_df.pkl")

In [30]:
# Create train and test text and labels

x_train = x_train_sentence_df['sentence'].values
y_train = x_train_sentence_df['Norm_Score'].values
x_test = x_test_sentence_df['sentence'].values
y_test = x_test_sentence_df['Norm_Score'].values
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(147647,)
(147647,)
(36254,)
(36254,)


In [31]:
vocabulary_size = 50000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(x_train)
train_seq = tokenizer.texts_to_sequences(x_train)
train_data = pad_sequences(train_seq)

In [32]:
max_len_class = train_data.shape[1]
max_words_class = vocabulary_size

In [33]:
test_seq = tokenizer.texts_to_sequences(x_test)
test_data = pad_sequences(test_seq, maxlen=max_len_class)
test_data.shape

(36254, 175)

In [34]:
embeddings_index = dict()
f = open('/Users/kurapati/W266/data/glove.42B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1917494 word vectors.


In [35]:
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [36]:
def RMSE(actual, predict):
    diff = actual - predict
    diff = sum(diff**2) / len(actual)
    return np.sqrt(diff)

In [124]:
#Cohen Kappa score as defined by the kaggle challenge/wikipedia
def CohenKappa(actual, predict):
    CohenDF = pd.DataFrame([actual.astype(np.double).round(), np.around(predict.astype(np.double))]).transpose()
    count = len(CohenDF)
    CohenDF.columns = ['actual','predict']
    correct = len(CohenDF[CohenDF.actual==CohenDF.predict])
    acc = correct / count
    pe = 0
    for value in CohenDF.actual.unique():
        pe += len(CohenDF[CohenDF.actual == value]) * len(CohenDF[CohenDF.predict == value])
    pe = pe / np.square(count)
    print("Count: ",count)
    print("pe:" ,pe)
    return(1 - (1-acc)/(1-pe))

In [38]:
import skll.metrics as metric

# Calculates the average quadratic kappa for the entire essay set
def get_average_kappa(targets, predictions):
        num = len(targets)
        total_kappa = 0

        for i in range(0, num):
                total_kappa += metric.kappa([targets], [predictions], 'quadratic')

        avg_kappa  = float(total_kappa) / float(num)
        return avg_kappa

In [39]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

In [41]:
tfidf_all = TfidfVectorizer()
tfidf_all.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [42]:
train_transform = tfidf_all.transform(x_train)
test_transform = tfidf_all.transform(x_test)

In [23]:
#gnb = GaussianNB()
#lr = LinearRegression(fit_intercept=True)
#rf = RandomForestClassifier(n_estimators = 100)

In [None]:
#gnb.fit(train_transform.toarray(), train_y_sentence)
#lr.fit(train_transform.toarray(), train_y_sentence)
#rf.fit(train_transform.toarray(), train_y_sentence)

In [43]:
train_transform.shape

(147647, 33809)

In [44]:
max_len_class = train_data.shape[1]
max_words_class = vocabulary_size

In [62]:
#model definitions

def CNN_lstm():
    """ CNN with single layer LSTM"""
    model_conv = Sequential()
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
    model_conv.add(tf.keras.layers.MaxPooling1D(pool_size=4))
    model_conv.add(tf.keras.layers.LSTM(100))
    model_conv.add(tf.keras.layers.Dense(100))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    #sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    #model_conv.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_conv

def stack_lstm():
    """ Stacked LSTM"""
    model_conv = Sequential()
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], trainable=False))
    #model_conv.add(tf.keras.layers.Dropout(0.1))
    #model_conv.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
    #model_conv.add(tf.keras.layers.MaxPooling1D(pool_size=4))
    #model_conv.add(tf.keras.layers.Dropout(0.2))
    model_conv.add(tf.keras.layers.LSTM(32,return_sequences=True))
    model_conv.add(tf.keras.layers.LSTM(32, return_sequences=True))
    model_conv.add(tf.keras.layers.Dropout(0.2))
    model_conv.add(tf.keras.layers.LSTM(32))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    #sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    #model_conv.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_conv

def stateful_stacked_lstm():
    # In stateful, total samples needs to be divisible by batch size
    # we have 147026 samples, so selecting 6683 (6683*22=147026)
    batch_size=6683
    model_conv = Sequential()
    # In stateful, we have to pass batch_input_shape to the first layer
    model_conv.add(tf.keras.layers.Embedding(max_words_class, 300, input_length=max_len_class, weights=[embedding_matrix], 
                                             trainable=False,batch_input_shape=(batch_size,max_len_class)))
    model_conv.add(tf.keras.layers.LSTM(32,stateful=True,return_sequences=True))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.LSTM(32))
    model_conv.add(tf.keras.layers.Dropout(0.1))
    model_conv.add(tf.keras.layers.Dense(100))
    model_conv.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    #sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
    model_conv.compile(loss = 'mse', optimizer = tf.train.AdamOptimizer(), metrics = ['accuracy'])
    #model_conv.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_conv

#bidirectional RNN is screwed up atm
def Bidirectional_RNN():
    inputs = Input(name='inputs',shape=[max_len_class])
    layer = Embedding(max_words,200,input_length=max_len_class)(inputs)
    layer = Bidirectional(LSTM(10))(layer)
    layer = Dense(32,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

def feedforward_NN():
    model_ff = Sequential()
    model_ff.add(tf.keras.layers.Embedding(max_words_class,200,input_length=max_len_class))
    model_ff.add(tf.keras.layers.Flatten())
    model_ff.add(tf.keras.layers.Dense(50,activation='tanh'))
    model_ff.add(tf.keras.layers.Dense(1,activation='sigmoid'))
    model_ff.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_ff

def RNN():
    model_rnn = Sequential()
    model_rnn.add(tf.keras.layers.Embedding(max_words_class,300,input_length=max_len_class))
    model_rnn.add(tf.keras.layers.LSTM(10))
    model_rnn.add(tf.keras.layers.Dense(1,name='out_layer'))
    model_rnn.compile(optimizer=tf.train.AdamOptimizer(),loss='mse',metrics=['accuracy'])
    return model_rnn

#### CNN

In [46]:
estimator = KerasRegressor(build_fn=CNN_glove_1, epochs=20, batch_size=500)
#kfold = KFold(n_splits=5, random_state=43)
#results = np.sqrt(-1*cross_val_score(estimator, train_data, train_y_sentence,scoring= "neg_mean_squared_error", cv=kfold))
#print("Training RMSE mean and std from CV: {} {}".format(results.mean(),results.std()))

In [47]:
estimator.fit(train_data, y_train)
prediction_cnn_glove=estimator.predict(test_data)
rmse_val = RMSE(y_test,prediction_cnn_glove)
print("RMSE: ",rmse_val)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
RMSE:  0.16874617569397307


In [48]:
# Define function to get the multiplication factor. We can get it back from the essay set, but
# it is better to do it from the score, because there are scores with value 0, and its MF should be 0

def find_mult_factor(x):
    """ Function to find the multiplication factor for denormalizing"""
    if x[1] == 0:
        return 0
    else:
        return np.around(x[0]/x[1])

In [51]:
cnn_df = pd.DataFrame([x_test_sentence_df['domain1_score'].astype(np.double), y_test.astype(np.double),
                      prediction_cnn_glove.astype(np.double)]).transpose()
cnn_df.columns = ['Orig_Score','Norm_Score','Pred_Score']
cnn_df.head(10)

Unnamed: 0,Orig_Score,Norm_Score,Pred_Score
0,4.0,0.8,0.778362
1,4.0,0.8,0.766187
2,4.0,0.8,0.675432
3,4.0,0.8,0.782038
4,4.0,0.8,0.754179
5,4.0,0.8,0.739532
6,4.0,0.8,0.722847
7,4.0,0.8,0.726954
8,4.0,0.8,0.709436
9,4.0,0.8,0.801359


In [52]:
cnn_df['Mult_Factor'] = cnn_df.apply(find_mult_factor,axis=1)
cnn_df.head(10)

Unnamed: 0,Orig_Score,Norm_Score,Pred_Score,Mult_Factor
0,4.0,0.8,0.778362,5.0
1,4.0,0.8,0.766187,5.0
2,4.0,0.8,0.675432,5.0
3,4.0,0.8,0.782038,5.0
4,4.0,0.8,0.754179,5.0
5,4.0,0.8,0.739532,5.0
6,4.0,0.8,0.722847,5.0
7,4.0,0.8,0.726954,5.0
8,4.0,0.8,0.709436,5.0
9,4.0,0.8,0.801359,5.0


In [53]:
def denormalize(x):
    return np.around(x[2] * x[3])
cnn_df['Denorm_Pred_Score'] = cnn_df.apply(denormalize,axis=1)
cnn_df.head(10)

Unnamed: 0,Orig_Score,Norm_Score,Pred_Score,Mult_Factor,Denorm_Pred_Score
0,4.0,0.8,0.778362,5.0,4.0
1,4.0,0.8,0.766187,5.0,4.0
2,4.0,0.8,0.675432,5.0,3.0
3,4.0,0.8,0.782038,5.0,4.0
4,4.0,0.8,0.754179,5.0,4.0
5,4.0,0.8,0.739532,5.0,4.0
6,4.0,0.8,0.722847,5.0,4.0
7,4.0,0.8,0.726954,5.0,4.0
8,4.0,0.8,0.709436,5.0,4.0
9,4.0,0.8,0.801359,5.0,4.0


In [59]:
cnn_df.shape

(36254, 5)

In [56]:
orig_score = cnn_df.Orig_Score.as_matrix(columns=None)
orig_score = orig_score.astype(np.int)
pred_score = cnn_df.Denorm_Pred_Score.as_matrix(columns=None)
pred_score = pred_score.astype(np.int)
rmse_cnn = RMSE(orig_score,pred_score)
print("RMSE: ",rmse_cnn)

RMSE:  2.6334048274563844


In [57]:
cohen_kappa = cohen_kappa_score(orig_score,pred_score)
print("Cohen Kappa: ",cohen_kappa)

Cohen Kappa:  0.2625413498538345


In [58]:
accuracy = accuracy_score(orig_score,pred_score)
print("Accuracy: ",accuracy)

Accuracy:  0.3285430573178132


#### Calculating per essay score based on sentence prediction

In [96]:
result_df = pd.DataFrame([x_test_sentence_df.essay_id.values,x_test_sentence_df.essay_set.values,
                         x_test_sentence_df.sentence.values,cnn_df.Orig_Score.values,
                          cnn_df.Denorm_Pred_Score.values]).transpose()
result_df.columns = ['essay_id','essay_set','sentence','orig_score','pred_score']
result_df.head(50)

Unnamed: 0,essay_id,essay_set,sentence,orig_score,pred_score
0,4479,2,'All of us can think of a book that we hope no...,4,4
1,4479,2,But if I have the right to remove that book fr...,4,4
2,4479,2,and so does everyone else.,4,3
3,4479,2,And then we have no books left on the shelf fo...,4,4
4,4479,2,wrote by Katherine Paterson.,4,4
5,4479,2,If you think about it when you go into a libra...,4,4
6,4479,2,But how many do you think is appropiate for ev...,4,4
7,4479,2,"That is why they have sections for the books, ...",4,4
8,4479,2,How many times have you been in a library and ...,4,4
9,4479,2,"Many of times right, that can be accidental or...",4,4


In [99]:
def find_max_min_mean_score(df):
    new_df = pd.DataFrame(columns=['essay_id','essay_set','Orig_Score','Max_Score','Min_Score','Mean_Score'])
    essay_ids = np.unique(df.essay_id)
    for e_id in essay_ids:
        df_temp = df[df.essay_id == e_id]
        max_score = np.max(df_temp.pred_score)
        min_score = np.min(df_temp.pred_score)
        # we need to round the mean so that kappa score doesnt complain
        mean_score = np.around(np.mean(df_temp.pred_score))
        new_df = new_df.append({'essay_id':e_id,'essay_set':int(np.unique(df_temp.essay_set)),
                                'Orig_Score':int(np.unique(df_temp.orig_score)),'Max_Score':max_score,
                                'Min_Score':min_score,'Mean_Score':mean_score},ignore_index=True)
    return new_df

In [101]:
cnn_test_df = find_max_min_mean_score(result_df)
cnn_test_df.essay_id = cnn_test_df.essay_id.astype(int)
cnn_test_df.essay_set = cnn_test_df.essay_set.astype(int)
cnn_test_df.Orig_Score = cnn_test_df.Orig_Score.astype(int)
cnn_test_df.Max_Score = cnn_test_df.Max_Score.astype(int)
cnn_test_df.Min_Score = cnn_test_df.Min_Score.astype(int)
cnn_test_df.Mean_Score = cnn_test_df.Mean_Score.astype(int)

print(cnn_test_df.shape)
cnn_test_df.head(10)

(2596, 6)


Unnamed: 0,essay_id,essay_set,Orig_Score,Max_Score,Min_Score,Mean_Score
0,2,1,9,10,7,9
1,4,1,10,10,8,9
2,5,1,8,11,8,9
3,9,1,9,11,8,9
4,16,1,12,10,8,9
5,18,1,8,10,7,9
6,30,1,8,10,8,9
7,72,1,10,11,9,10
8,74,1,7,10,8,9
9,77,1,10,11,8,9


In [103]:
orig_score = cnn_test_df.Orig_Score.values
max_pred_score = cnn_test_df.Max_Score.values
min_pred_score = cnn_test_df.Min_Score.values
mean_pred_score = cnn_test_df.Mean_Score.values

rmse_max_cnn = RMSE(orig_score,max_pred_score)
cohen_kappa_max_cnn = cohen_kappa_score(orig_score,max_pred_score)
accuracy_max_cnn = accuracy_score(orig_score,max_pred_score)

rmse_min_cnn = RMSE(orig_score,min_pred_score)
cohen_kappa_min_cnn = cohen_kappa_score(orig_score,min_pred_score)
accuracy_min_cnn = accuracy_score(orig_score,min_pred_score)

rmse_mean_cnn = RMSE(orig_score,mean_pred_score)
cohen_kappa_mean_cnn = cohen_kappa_score(orig_score,mean_pred_score)
accuracy_mean_cnn = accuracy_score(orig_score,mean_pred_score)

print("### CNN_LSTM Results : MAX Score of all Sentences ###")
print("RMSE: ",rmse_max_cnn)
print("Cohen Kappa: ",cohen_kappa_max_cnn)
print("Accuracy: ",accuracy_max_cnn)

print("### CNN_LSTM Results : Min Score of all Sentences ###")
print("RMSE: ",rmse_min_cnn)
print("Cohen Kappa: ",cohen_kappa_min_cnn)
print("Accuracy: ",accuracy_min_cnn)

print("### CNN_LSTM Results : Mean Score of all Sentences ###")
print("RMSE: ",rmse_mean_cnn)
print("Cohen Kappa: ",cohen_kappa_mean_cnn)
print("Accuracy: ",accuracy_mean_cnn)


### CNN_LSTM Results : MAX Score of all Sentences ###
RMSE:  2.986551778460191
Cohen Kappa:  0.20126766063038737
Accuracy:  0.2942989214175655
### CNN_LSTM Results : Min Score of all Sentences ###
RMSE:  2.6227198623027017
Cohen Kappa:  0.2627931565262216
Accuracy:  0.3640215716486903
### CNN_LSTM Results : Mean Score of all Sentences ###
RMSE:  2.1025335073336424
Cohen Kappa:  0.270506422257236
Accuracy:  0.3647919876733436


In [127]:
# Since Mean Score is giving us best result, let us consider this as the main parameter 
# and calculate the per essay set scores

def essay_set_metrics(df):
    set_df = pd.DataFrame(columns=['essay_set','RMSE','Kappa','Accuracy'])
    e_sets = np.unique(df.essay_set)
    for e_s in e_sets:
        df_s = df[df.essay_set == e_s]
        original_score = df_s.Orig_Score.values.astype(int)
        predicted_score = df_s.Mean_Score.values.astype(int)
        rmse = RMSE(original_score,predicted_score)
        kappa = CohenKappa(original_score,predicted_score)
        accuracy = accuracy_score(original_score,predicted_score)
        set_df = set_df.append({'essay_set':e_s,'RMSE':rmse,'Kappa':kappa,'Accuracy':accuracy},
                              ignore_index=True)
    return set_df

In [128]:
essay_set_results = essay_set_metrics(cnn_test_df)
essay_set_results

Count:  366
pe: 0.2119501926005554
Count:  365
pe: 0.43579658472508914
Count:  341
pe: 0.391087107954008
Count:  350
pe: 0.31678367346938774
Count:  375
pe: 0.2891448888888889
Count:  371
pe: 0.43628715281057245
Count:  285
pe: 0.0718990458602647
Count:  143
pe: 0.04552789867475182


Unnamed: 0,essay_set,RMSE,Kappa,Accuracy
0,1.0,1.435308,0.053484,0.254098
1,2.0,0.889821,0.067664,0.473973
2,3.0,0.754265,0.080136,0.439883
3,4.0,0.645866,0.389442,0.582857
4,5.0,1.006645,0.047157,0.322667
5,6.0,0.878003,0.077162,0.479784
6,7.0,4.643804,-0.009418,0.063158
7,8.0,4.789995,0.025566,0.06993


#### LSTM

In [None]:
estimator_lstm = KerasRegressor(build_fn=stack_lstm, epochs=20, batch_size=50)
estimator_lstm.fit(train_data, train_y_sentence)
prediction_lstm=estimator_lstm.predict(test_data)
rmse_val_lstm = RMSE(test_y_sentence,prediction_lstm)
print("RMSE: ",rmse_val_lstm)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [113]:
cnn_df = pd.DataFrame([prediction.astype(np.double), prediction_lstm.astype(np.double), test_y_sentence.astype(np.double)]).transpose()
cnn_df.columns = ['CNN_Glove','Stacked LSTM','Actual']
cnn_df.head(10)

Unnamed: 0,CNN_Glove,Stacked LSTM,Actual
0,0.692931,0.723046,0.6
1,0.702375,0.678391,0.6
2,0.798613,0.734653,0.78
3,0.703292,0.677914,0.8
4,0.671944,0.669563,0.52
5,0.747734,0.820812,0.5
6,0.67734,0.650709,0.75
7,0.616736,0.506052,0.75
8,0.732711,0.70689,0.75
9,0.828496,0.792971,1.0


In [111]:
c_kappa = CohenKappa(test_y_sentence,cnn_glove_predict)
print("Cohen Kappa: ",c_kappa)

Cohen Kappa:  0.15593670766702683


In [114]:
c_kappa = CohenKappa(test_y_sentence,prediction_lstm)
print("Cohen Kappa LSTM: ",c_kappa)

Cohen Kappa LSTM:  0.14598100325900654
