In [1]:
import keras
import sklearn
import pandas as pd
import numpy as np

from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_squared_error
from math import sqrt

Using TensorFlow backend.


In [2]:
from tqdm import tqdm_notebook

from keras.models import Model, Sequential
from keras.layers import Input, LSTM, Dense, Embedding, Activation, Dropout, Conv1D, MaxPooling1D, Bidirectional, Flatten, TimeDistributed

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from sklearn.model_selection import train_test_split

In [3]:
all_data = pd.read_csv(r'training_set_rel3.tsv',sep='\t', encoding='latin1')

In [4]:
SentenceEnders=r'!.?'
SentenceContinuation=r',:;-'

def EssayLength(Essay):
    return len(Essay.split())

def CountSentences(Essay):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(Essay, set(SentenceEnders))

def CountContinuation(Essay):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(Essay, set(SentenceContinuation))

In [5]:
classification_data = all_data
DivSeries = pd.DataFrame({'div': [12,5,3,3,4,4,25,50],'dataset':[1,2,3,4,5,6,7,8]})

# score normalization
for i in all_data.essay_set.unique():
    if(i==1):
        classification_data.loc[classification_data.essay_set == i, 'adjusted_domain1_score'] = classification_data.loc[classification_data.essay_set == i, 'domain1_score'] / 12
    elif (i==2):
        classification_data.loc[classification_data.essay_set == i, 'adjusted_domain1_score'] = classification_data.loc[classification_data.essay_set == i, 'domain1_score'] / 5
    elif (i in (3,4)):
        classification_data.loc[classification_data.essay_set == i, 'adjusted_domain1_score'] = classification_data.loc[classification_data.essay_set == i, 'domain1_score'] / 3
    elif (i in (5,6)):
        classification_data.loc[classification_data.essay_set == i, 'adjusted_domain1_score'] = classification_data.loc[classification_data.essay_set == i, 'domain1_score'] / 4
    elif (i == 7):
        classification_data.loc[classification_data.essay_set == i, 'adjusted_domain1_score'] = classification_data.loc[classification_data.essay_set == i, 'domain1_score'] / 25
    else:
        classification_data.loc[classification_data.essay_set == i, 'adjusted_domain1_score'] = classification_data.loc[classification_data.essay_set == i, 'domain1_score'] / 50


train_sa_x_class,test_sa_x_class,train_sa_y_class,test_sa_y_class = train_test_split(np.asarray(classification_data.essay), classification_data[['adjusted_domain1_score','essay_set']],test_size=0.2, random_state=42)

max_len_class = all_data.essay.apply(EssayLength).sort_values(ascending=True).iloc[int(np.floor(len(all_data)*.95))]

tok_class = Tokenizer()
tok_class.fit_on_texts(pd.Series(train_sa_x_class))
sequences_class = tok_class.texts_to_sequences(train_sa_x_class)
sequences_matrix_class = sequence.pad_sequences(sequences_class,maxlen=max_len_class)

sequences_test_class = tok_class.texts_to_sequences(test_sa_x_class)
sequences_test_matrix_class = sequence.pad_sequences(sequences_test_class,maxlen=max_len_class)

In [6]:
#looking to pull glove embeddings so no embedding training required.
import csv
gloves = pd.read_table(r"glove.42B.300d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((len(tok_class.word_index)+1, 300))
for word, i in tqdm_notebook(tok_class.word_index.items()):
    if word in gloves.index:
        embedding_matrix[i] = np.asarray(gloves.loc[word])
    else:
        embedding_matrix[i] = np.zeros(300)

HBox(children=(IntProgress(value=0, max=36434), HTML(value='')))




# Feed-forward networks:

In [9]:
ff_1 = Sequential()
ff_1.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
ff_1.add(Dense(100,name='deep1'))
ff_1.add(Flatten())
#Regression
ff_1.add(Dense(1,activation='sigmoid',name='out_layer'))
ff_1.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
ff_1.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#Classification
#ff_1.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),activation='sigmoid',name='out_layer'))
#ff_1.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#ff_1.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f86169c2550>

In [10]:
ff_2 = Sequential()
ff_2.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
ff_2.add(Dense(100,name='deep1'))
ff_2.add(Dense(50,name='deep2'))
ff_2.add(Flatten())
#Regression
ff_2.add(Dense(1,activation='sigmoid',name='out_layer'))
ff_2.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
ff_2.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#Classification
#ff_2.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),activation='sigmoid',name='out_layer'))
#ff_2.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#ff_2.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f8617f1fb38>

In [11]:
ff_3 = Sequential()
ff_3.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
ff_3.add(Dense(100,name='deep1'))
ff_3.add(Dense(50,name='deep2'))
ff_3.add(Dense(25,name='deep3'))
ff_3.add(Flatten())
#Regression
ff_3.add(Dense(1,activation='sigmoid',name='out_layer'))
ff_3.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
ff_3.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#Classification
#ff_3.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),activation='sigmoid',name='out_layer'))
#ff_3.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#ff_3.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f86161d3f60>

In [None]:
#classification only
#required to convert from cat_CE output back to single prediction
NormalizeSeries = pd.Series(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns)
def RetrieveNormalize(IndexVal):
    return NormalizeSeries.loc[IndexVal]

In [12]:
#Classification
#scoringFrame_FF = pd.DataFrame(columns=['actual','dataset','feedforward','ff1','ff2', 'ff3'],index=range(0,len(test_sa_y_class)))
#for i in tqdm_notebook(scoringFrame_FF.index):
    #scoringFrame_FF.loc[i, 'actual'] = test_sa_y_class.adjusted_domain1_score.iloc[i]
    #scoringFrame_FF.loc[i, 'dataset'] = test_sa_y_class.essay_set.iloc[i]
#scoringFrame_FF.dataset = scoringFrame_FF.dataset.astype(int)

#ff_pred1 = ff_1.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class)).flatten()
#ff_pred2 = ff_2.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class)).flatten()
#ff_pred3 = ff_3.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class)).flatten()

#scoringFrame_FF['ff1'] = np.argmax(ff_pred1,axis=1)
#scoringFrame_FF['ff2'] = np.argmax(ff_pred2,axis=1)
#scoringFrame_FF['ff3'] = np.argmax(ff_pred3,axis=1)

#NOTE - this way of scoring is much slower than the batch scoring done for CNN/RNN. Including it for completion's sake
#regression
scoringFrame_FF = pd.DataFrame(columns=['actual','dataset','feedforward','ff1','ff2', 'ff3'],index=range(0,len(test_sa_y_class)))
for i in tqdm_notebook(scoringFrame_FF.index):
    scoringFrame_FF.loc[i, 'actual'] = test_sa_y_class.adjusted_domain1_score.iloc[i]
    scoringFrame_FF.loc[i, 'dataset'] = test_sa_y_class.essay_set.iloc[i]
    scoringFrame_FF.loc[i, 'ff1'] = ff_1.predict(sequences_test_matrix_class[i].reshape(1,max_len_class))[0][0]
    scoringFrame_FF.loc[i, 'ff2'] = ff_2.predict(sequences_test_matrix_class[i].reshape(1,max_len_class))[0][0]
    scoringFrame_FF.loc[i, 'ff3'] = ff_3.predict(sequences_test_matrix_class[i].reshape(1,max_len_class))[0][0]
scoringFrame_FF.dataset = scoringFrame_FF.dataset.astype(int)

scoringFrame_FF = scoringFrame_FF.merge(DivSeries, on='dataset')

for colName in ['actual','ff1','ff2','ff3']:
    scoringFrame_FF[colName] = scoringFrame_FF[colName] * scoringFrame_FF['div']
    scoringFrame_FF[colName] = scoringFrame_FF[colName].apply(round)

scoringFrame_FF.actual = scoringFrame_FF.actual.astype(int)

QuadKappaCalculation = pd.DataFrame(columns = ['ff1', 'ff2', 'ff3'],index = np.unique(scoringFrame_FF.dataset))
for essaySetValue in np.unique(scoringFrame_FF.dataset):
    temp_ff_ES = scoringFrame_FF[scoringFrame_FF.dataset == essaySetValue]
    QuadKappaCalculation.loc[essaySetValue, 'ff1'] = cohen_kappa_score(temp_ff_ES.actual, temp_ff_ES.ff1.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'ff2'] = cohen_kappa_score(temp_ff_ES.actual, temp_ff_ES.ff2.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'ff3'] = cohen_kappa_score(temp_ff_ES.actual, temp_ff_ES.ff3.apply(round),weights='quadratic')
print("Mean Weighted Quadrating Kappa scores: ")
print(QuadKappaCalculation.mean())

HBox(children=(IntProgress(value=0, max=2596), HTML(value='')))


Mean Weighted Quadrating Kappa scores: 
ff1    0.525088
ff2    0.503471
ff3    0.483585
dtype: float64


In [13]:
print("ff1 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(scoringFrame_FF.actual, scoringFrame_FF.ff1.apply(round))),
                                       cohen_kappa_score(scoringFrame_FF.actual, scoringFrame_FF.ff1.apply(round)),
                                        cohen_kappa_score(scoringFrame_FF.actual, scoringFrame_FF.ff1.apply(round),weights='quadratic'),
                                                 accuracy_score(scoringFrame_FF.actual, scoringFrame_FF.ff1.apply(round))))

print("ff2 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(scoringFrame_FF.actual, scoringFrame_FF.ff2.apply(round))),
                                       cohen_kappa_score(scoringFrame_FF.actual, scoringFrame_FF.ff2.apply(round)),
                                        cohen_kappa_score(scoringFrame_FF.actual, scoringFrame_FF.ff2.apply(round),weights='quadratic'),             
                                      accuracy_score(scoringFrame_FF.actual, scoringFrame_FF.ff2.apply(round))))

print("ff3 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(scoringFrame_FF.actual, scoringFrame_FF.ff3.apply(round))),
                                       cohen_kappa_score(scoringFrame_FF.actual, scoringFrame_FF.ff3.apply(round)),
                                       cohen_kappa_score(scoringFrame_FF.actual, scoringFrame_FF.ff3.apply(round),weights='quadratic'),               
                                      accuracy_score(scoringFrame_FF.actual, scoringFrame_FF.ff3.apply(round))))


ff1 RMSE, Cohen, Quad Cohen, accuracy: 3.0328199614114753, 0.30789610591583094, 0.94380742738271, 0.3898305084745763
ff2 RMSE, Cohen, Quad Cohen, accuracy: 3.2238039910287957, 0.3134868672604054, 0.9358966126520158, 0.39522342064714944
ff3 RMSE, Cohen, Quad Cohen, accuracy: 3.1528229823587255, 0.2899852661633757, 0.9397881395544337, 0.37403697996918334


# Convolutional Architectures

In [20]:
cnn_1 = Sequential()
cnn_1.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
cnn_1.add(Conv1D(64, 5, activation='relu'))
cnn_1.add(MaxPooling1D(pool_size=4))
cnn_1.add(Flatten())
#regression
cnn_1.add(Dense(1,name='out_layer'))
cnn_1.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
cnn_1.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#classification
#cnn_1.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),name='out_layer'))
#cnn_1.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#cnn_1.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7fd0cc62bc50>

In [21]:
cnn_2 = Sequential()
cnn_2.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
cnn_2.add(Conv1D(64, 5, activation='relu'))
cnn_2.add(MaxPooling1D(pool_size=4))
cnn_2.add(Conv1D(20, 5, activation='relu'))
cnn_2.add(MaxPooling1D(pool_size=2))
cnn_2.add(Flatten())
#regression
cnn_2.add(Dense(1,name='out_layer'))
cnn_2.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
cnn_2.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#classification
#cnn_2.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),name='out_layer'))
#cnn_2.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#cnn_2.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7fd0eeea3518>

In [22]:
cnn_3 = Sequential()
cnn_3.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
cnn_3.add(Conv1D(64, 5, activation='relu'))
cnn_3.add(MaxPooling1D(pool_size=4))
cnn_3.add(Conv1D(20, 5, activation='relu'))
cnn_3.add(MaxPooling1D(pool_size=2))
cnn_3.add(Conv1D(20, 5, activation='relu'))
cnn_3.add(MaxPooling1D(pool_size=2))
cnn_3.add(Flatten())
#regression
cnn_3.add(Dense(1,name='out_layer'))
cnn_3.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
cnn_3.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#classification
#cnn_3.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),name='out_layer'))
#cnn_3.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#cnn_3.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7fd0ac415d30>

In [24]:
cnn_pred1 = cnn_1.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class)).flatten()
cnn_pred2 = cnn_2.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class)).flatten()
cnn_pred3 = cnn_3.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class)).flatten()

#Uncomment in classification case
#cnn_pred1 = np.argmax(cnn_pred1,axis=1)
#cnn_pred2 = np.argmax(cnn_pred2,axis=1)
#cnn_pred3 = np.argmax(cnn_pred3,axis=1)

CNN_pred_frame = pd.DataFrame({'actual':np.asarray(test_sa_y_class.adjusted_domain1_score),
                           'dataset': np.asarray(test_sa_y_class.essay_set),
                           'CNN1':cnn_pred1,
                          'CNN2':cnn_pred2,
                          'CNN3':cnn_pred3})

cnn_scoring = CNN_pred_frame.merge(DivSeries, on='dataset')

for colName in ['actual', 'CNN1', 'CNN2', 'CNN3']:
    #Uncomment in classification case
    #if(colName != 'actual'):
    #    cnn_scoring[colName] = cnn_scoring[colName].apply(RetrieveNormalize)
    cnn_scoring[colName] = cnn_scoring[colName] * cnn_scoring['div']
    cnn_scoring[colName] = cnn_scoring[colName].apply(round)

cnn_scoring.actual = cnn_scoring.actual.astype(int)

QuadKappaCalculation = pd.DataFrame(columns = ['CNN1', 'CNN2', 'CNN3'],index = np.unique(cnn_scoring.dataset))
for essaySetValue in np.unique(cnn_scoring.dataset):
    temp_CNN_ES = cnn_scoring[cnn_scoring.dataset == essaySetValue]
    QuadKappaCalculation.loc[essaySetValue, 'CNN1'] = cohen_kappa_score(temp_CNN_ES.actual, temp_CNN_ES.CNN1.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'CNN2'] = cohen_kappa_score(temp_CNN_ES.actual, temp_CNN_ES.CNN2.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'CNN3'] = cohen_kappa_score(temp_CNN_ES.actual, temp_CNN_ES.CNN3.apply(round),weights='quadratic')
QuadKappaCalculation.mean()

CNN1    0.571500
CNN2    0.578590
CNN3    0.569282
dtype: float64

In [21]:
print("CNN1 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(cnn_scoring.actual, cnn_scoring.CNN1.apply(round))),
                                       cohen_kappa_score(cnn_scoring.actual, cnn_scoring.CNN1.apply(round)),
                                        cohen_kappa_score(cnn_scoring.actual, cnn_scoring.CNN1.apply(round),weights='quadratic'),
                                                 accuracy_score(cnn_scoring.actual, cnn_scoring.CNN1.apply(round))))

print("CNN2 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(cnn_scoring.actual, cnn_scoring.CNN2.apply(round))),
                                       cohen_kappa_score(cnn_scoring.actual, cnn_scoring.CNN2.apply(round)),
                                        cohen_kappa_score(cnn_scoring.actual, cnn_scoring.CNN2.apply(round),weights='quadratic'),
                                                 accuracy_score(cnn_scoring.actual, cnn_scoring.CNN2.apply(round))))

print("CNN3 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(cnn_scoring.actual, cnn_scoring.CNN3.apply(round))),
                                       cohen_kappa_score(cnn_scoring.actual, cnn_scoring.CNN3.apply(round)),
                                        cohen_kappa_score(cnn_scoring.actual, cnn_scoring.CNN3.apply(round),weights='quadratic'),             
                                      accuracy_score(cnn_scoring.actual, cnn_scoring.CNN3.apply(round))))


CNN1 RMSE, Cohen, Quad Cohen, accuracy: 2.7530624494441263, 0.3313692333735996, 0.9551708905519521, 0.4133281972265023
CNN2 RMSE, Cohen, Quad Cohen, accuracy: 2.3984779457160506, 0.3652710641678031, 0.963926811955723, 0.44414483821263484
CNN3 RMSE, Cohen, Quad Cohen, accuracy: 2.563522711093615, 0.3429602729180997, 0.9607755874545442, 0.4241140215716487


# Recurrent Neural Network

In [17]:
rnn_1 = Sequential()
rnn_1.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
rnn_1.add(LSTM(20))
#Regression
rnn_1.add(Dense(1,name='out_layer'))
rnn_1.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
rnn_1.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#Classification
#rnn_1.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),name='out_layer'))
#rnn_1.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#rnn_1.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f8617c62588>

In [18]:
rnn_2 = Sequential()
rnn_2.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
rnn_2.add(LSTM(20, return_sequences=True))
rnn_2.add(LSTM(20))
#regression
rnn_2.add(Dense(1,name='out_layer'))
rnn_2.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
rnn_2.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#Classification
#rnn_2.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),name='out_layer'))
#rnn_2.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#rnn_2.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f858d3cec18>

In [19]:
rnn_3 = Sequential()
rnn_3.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],input_length=max_len_class,trainable=False))
rnn_3.add(LSTM(20, return_sequences=True))
rnn_3.add(LSTM(20, return_sequences=True))
rnn_3.add(LSTM(20))
#Regression
rnn_3.add(Dense(1,name='out_layer'))
rnn_3.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
rnn_3.fit(sequences_matrix_class,train_sa_y_class.adjusted_domain1_score, batch_size = 500, epochs=35, validation_split=0.15)
#Classification
#rnn_3.add(Dense(len(pd.get_dummies(train_sa_y_class.adjusted_domain1_score).columns),name='out_layer'))
#rnn_3.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#rnn_3.fit(sequences_matrix_class,np.asarray(pd.get_dummies(train_sa_y_class.adjusted_domain1_score)), batch_size = 500, epochs=35, validation_split=0.15)

Train on 8823 samples, validate on 1557 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f8581b78780>

In [22]:
rnn_pred1 = rnn_1.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class))
rnn_pred2 = rnn_2.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class))
rnn_pred3 = rnn_3.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class))

#Uncomment for classification case with cat_ce loss
#rnn_pred1 = np.argmax(rnn_pred1,axis=1)
#rnn_pred2 = np.argmax(rnn_pred2,axis=1)
#rnn_pred3 = np.argmax(rnn_pred3,axis=1)

RNN_pred_frame = pd.DataFrame({'actual':np.asarray(test_sa_y_class.adjusted_domain1_score),
                           'dataset': np.asarray(test_sa_y_class.essay_set),
                           'RNN1':rnn_pred1.flatten(),
                          'RNN2':rnn_pred2.flatten(),
                          'RNN3':rnn_pred3.flatten()})

rnn_scoring = RNN_pred_frame.merge(DivSeries, on='dataset')

for colName in ['actual', 'RNN1', 'RNN2', 'RNN3']:
    #Uncomment for classification case with cat_ce loss
    #if(colName != 'actual'):
     #   rnn_scoring[colName] = rnn_scoring[colName].apply(RetrieveNormalize)
    rnn_scoring[colName] = rnn_scoring[colName] * rnn_scoring['div']
    rnn_scoring[colName] = rnn_scoring[colName].apply(round)

rnn_scoring.actual = rnn_scoring.actual.astype(int)

QuadKappaCalculation = pd.DataFrame(columns = ['RNN1', 'RNN2', 'RNN3'],index = np.unique(rnn_scoring.dataset))
for essaySetValue in np.unique(rnn_scoring.dataset):
    temp_RNN_ES = rnn_scoring[rnn_scoring.dataset == essaySetValue]
    QuadKappaCalculation.loc[essaySetValue, 'RNN1'] = cohen_kappa_score(temp_RNN_ES.actual, temp_RNN_ES.RNN1.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'RNN2'] = cohen_kappa_score(temp_RNN_ES.actual, temp_RNN_ES.RNN2.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'RNN3'] = cohen_kappa_score(temp_RNN_ES.actual, temp_RNN_ES.RNN3.apply(round),weights='quadratic')
QuadKappaCalculation.mean()

RNN1    0.652702
RNN2    0.656994
RNN3    0.680003
dtype: float64

In [23]:
print("RNN RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(rnn_scoring.actual, rnn_scoring.RNN1.apply(round))),
                                       cohen_kappa_score(rnn_scoring.actual, rnn_scoring.RNN1.apply(round)),
                                        cohen_kappa_score(rnn_scoring.actual, rnn_scoring.RNN1.apply(round),weights='quadratic'),
                                                 accuracy_score(rnn_scoring.actual, rnn_scoring.RNN1.apply(round))))

print("RNN2 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(rnn_scoring.actual, rnn_scoring.RNN2.apply(round))),
                                       cohen_kappa_score(rnn_scoring.actual, rnn_scoring.RNN2.apply(round)),
                                        cohen_kappa_score(rnn_scoring.actual, rnn_scoring.RNN2.apply(round),weights='quadratic'),
                                                 accuracy_score(rnn_scoring.actual, rnn_scoring.RNN2.apply(round))))

print("RNN3 RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(rnn_scoring.actual, rnn_scoring.RNN3.apply(round))),
                                       cohen_kappa_score(rnn_scoring.actual, rnn_scoring.RNN3.apply(round)),
                                        cohen_kappa_score(rnn_scoring.actual, rnn_scoring.RNN3.apply(round),weights='quadratic'),             
                                      accuracy_score(rnn_scoring.actual, rnn_scoring.RNN3.apply(round))))


RNN RMSE, Cohen, Quad Cohen, accuracy: 1.9465112404620202, 0.39328857697456265, 0.9761170303301328, 0.46802773497688754
RNN2 RMSE, Cohen, Quad Cohen, accuracy: 1.7253659088032314, 0.4429170385173772, 0.9807697388568016, 0.512326656394453
RNN3 RMSE, Cohen, Quad Cohen, accuracy: 1.6757584580903435, 0.4519293159349017, 0.9823337336873228, 0.522342064714946


# Evaluation split by dataset - top model

In [35]:
preds_to_analyze = rnn_3.predict(sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class))

byset_analysis = pd.DataFrame({'actual':np.asarray(test_sa_y_class.adjusted_domain1_score),
                           'dataset': np.asarray(test_sa_y_class.essay_set),
                          'RNN3':preds_to_analyze.flatten()})

preds_to_analyze = byset_analysis.merge(DivSeries, on='dataset')

for colName in ['actual', 'RNN3']:
    newColName = colName + '_adjusted'
    preds_to_analyze[newColName] = preds_to_analyze[colName] * preds_to_analyze['div']
    preds_to_analyze[newColName] = preds_to_analyze[colName].apply(round)

preds_to_analyze['right'] = (preds_to_analyze.RNN3_adjusted == preds_to_analyze.actual_adjusted)
preds_to_analyze['delta'] = (preds_to_analyze.RNN3_adjusted - preds_to_analyze.actual_adjusted)**2

aggregates = pd.concat([preds_to_analyze.groupby('dataset')['right'].sum(),
           preds_to_analyze.groupby('dataset')['actual'].count(),
                        preds_to_analyze.groupby('dataset')['delta'].mean()],axis=1)
aggregates.columns = ['correct', 'test_samples','RMSE']

aggregates = pd.DataFrame(index=range(1,9),columns = ['UnweightedKappa', 'QuadKappa', 'Accuracy'])

for i in aggregates.index:
    temp = preds_to_analyze[preds_to_analyze.dataset == i]
    aggregates.loc[i,'UnweightedKappa'] = cohen_kappa_score(temp.actual_adjusted, temp.RNN3_adjusted.apply(round))
    aggregates.loc[i,'QuadKappa'] = cohen_kappa_score(temp.actual_adjusted, temp.RNN3_adjusted.apply(round),weights='quadratic')
    aggregates.loc[i,'Accuracy'] = accuracy_score(temp.actual_adjusted, temp.RNN3_adjusted.apply(round))
    aggregates.loc[i,'UnweightedRMSE'] = mean_squared_error(temp.actual, temp.RNN3)
aggregates.UnweightedRMSE = aggregates.UnweightedRMSE.apply(sqrt)

print(aggregates)

  UnweightedKappa QuadKappa  Accuracy  UnweightedRMSE
1         0.45316   0.45316  0.945355        0.079591
2        0.491156  0.491156  0.919786        0.116014
3        0.611015  0.611015  0.814371        0.192485
4         0.72878   0.72878  0.866667        0.181767
5        0.573857  0.573857  0.782609        0.141832
6        0.638259  0.638259  0.849206        0.136209
7        0.631245  0.631245  0.877483        0.122134
8               0         0  0.963504        0.098858


# Combining Manual Features with Network Outputs

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
#from nltk.tokenize import sent_tokenize, word_tokenize 

import spacy
from spacy.attrs import ORTH
#import textacy
import pickle
from collections import defaultdict

[nltk_data] Downloading package punkt to /home/nishray/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
nlp=spacy.load('en_core_web_lg',disable=['ner'])

In [9]:
train_x_obj = pd.Series(train_sa_x_class).apply(lambda essay: nlp(essay.lower()))
test_x_obj = pd.Series(test_sa_x_class).apply(lambda essay: nlp(essay.lower()))

In [10]:
def tree_height(root):
    """
    Find the maximum depth (height) of the dependency parse of a spacy sentence by starting with its root
    Code adapted from https://stackoverflow.com/questions/35920826/how-to-find-height-for-non-binary-tree
    :param root: spacy.tokens.token.Token
    :return: int, maximum height of sentence's dependency parse tree
    """
    if not list(root.children):
        return 1
    else:
        return 1 + max(tree_height(x) for x in root.children)
    
def get_average_heights(paragraph):
    """
    Computes average height of parse trees for each sentence in paragraph.
    :param paragraph: spacy doc object or str
    :return: float
    """
    if type(paragraph) == str:
        doc = nlp(paragraph)
    else:
        doc = paragraph
    roots = [sent.root for sent in doc.sents]
    return np.mean([tree_height(root) for root in roots])

def get_variance_heights(paragraph):
    """
    Computes average height of parse trees for each sentence in paragraph.
    :param paragraph: spacy doc object or str
    :return: float
    """
    if type(paragraph) == str:
        doc = nlp(paragraph)
    else:
        doc = paragraph
    roots = [sent.root for sent in doc.sents]
    return np.std([tree_height(root) for root in roots])

def get_tree_heights(paragraph):
    """
    Computes average height of parse trees for each sentence in paragraph.
    :param paragraph: spacy doc object or str
    :return: float
    """
    if type(paragraph) == str:
        doc = nlp(paragraph)
    else:
        doc = paragraph
    roots = [sent.root for sent in doc.sents]
    return [tree_height(root) for root in roots]

def get_sentences(doc):
    sents = list(doc.sents)
    return sents

def get_sentence_lengths(sentences):
    return float(len(sentences))

def get_word_counts(doc):
    return doc.count_by(ORTH)

def get_connectives(doc):
    text = doc.text.lower()
    connectives = [
    'after',
    'earlier',
    'before',
    'during',
    'while',
    'later',
    'because',
    'consequently',
    'thus',
    'both',
    'additionally',
    'furthermore',
    'moreover',
    'actually',
    'as a result',
    'due to',
    'but',
    'yet',
    'however',
    'although',
    'nevertheless'
    ]
    total = 0
    for connector in connectives:
        total += text.count(connector)
    return float((total/len(doc)))

def get_pos(doc):
    return [token.pos_ for token in doc]


def get_posngrams(poslist,n):
    posngrams = []
    for item in range(len(poslist) - n + 1):
        posngrams.append(tuple([poslist[item+i] for i in range(n)]))
    return posngrams

def get_posgrams_counts(list_grams):
    posgrams_counts = defaultdict(int)
    for gram in list_grams:
        posgrams_counts[gram] += 1
    return posgrams_counts

def get_TF(list_dicts):
    TF_dict = defaultdict(int)
    for dictionary in list_dicts:
        for gram in dictionary:
            TF_dict[gram] += dictionary[gram]
    return TF_dict

def get_mean_tfTF(posgram_counts,TF):
    tfTF_ratios = list()
    for key, value in posgram_counts.items():
        tfTF_ratios.append(value/TF[key])
    return np.mean(tfTF_ratios)

def get_posngram_ratio(posngrams):
    if len(posngrams) > 0:
        return float(len(set(posngrams))/len(posngrams))
    else:
        return 0

def get_reading_scores(doc):
    scores = textacy.TextStats(doc).readability_stats
    del scores['smog_index']
    return scores

def get_word_lengths(doc):
    lengths = list()
    for word in doc:
        if word.is_alpha:
            lengths.append(float(len(word)))
    return lengths

def get_words_of_length(lengths, n, p):
    count = 0
    for length in lengths:
        if length > n and length < p:
            count += 1
    return float(count)

def get_type_token_ratio(doc):
    unique_words = set(word for word in doc if word.is_alpha)
    total_words = [word for word in doc if word.is_alpha]
    return float(len(unique_words)/len(total_words))

def get_similarity_scores(doc):
    sents = [sent for sent in doc.sents]
    similarity_scores = list()
    for i in range(1,len(sents)):
        sent1 = sents[i-1]
        sent2 = sents[i]
        similarity_scores.append(sent1.similarity(sent2))
    return np.mean(similarity_scores)

def nth_root(x,n):
    return x ** (1/float(n))

def get_yules_k(word_counts):
    m1 =  sum(word_counts.values())
    m2 = sum([freq ** 2 for freq in word_counts.values()])
    if m1 == m2:
        k = 0 
    else:
        i = (m1*m1) / (m2-m1)
        k = 1/i * 10000
        return float(k)

In [11]:
test_x_obj = pd.DataFrame(test_x_obj)
test_x_obj['essay'] = test_sa_x_class
test_x_obj.columns = ['doc', 'essay']

train_x_obj = pd.DataFrame(train_x_obj)
train_x_obj['essay'] = train_sa_x_class
train_x_obj.columns = ['doc', 'essay']

In [12]:
# Preengineering
train_x_obj['sentences'] = train_x_obj.doc.apply(get_sentences)
train_x_obj['word_counts'] = train_x_obj.doc.apply(get_word_counts)
train_x_obj['word_lengths'] = train_x_obj.doc.apply(get_word_lengths)
train_x_obj['pos'] = train_x_obj.doc.apply(get_pos)
train_x_obj['pos_trigrams'] = train_x_obj.pos.apply(lambda pos: get_posngrams(pos, n=3))
train_x_obj['pos_4grams'] = train_x_obj.pos.apply(lambda pos: get_posngrams(pos, n=4))
train_x_obj['pos_trigram_counts'] = train_x_obj.pos_trigrams.apply(get_posgrams_counts)
pos_TF = get_TF(train_x_obj.pos_trigram_counts)
train_x_obj['tree_heights'] = train_x_obj.doc.apply(lambda doc: get_tree_heights(doc))

# Lexical Features
train_x_obj['words_length_4'] = train_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 4,6))
train_x_obj['words_length_6'] = train_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 6,8))
train_x_obj['words_length_8'] = train_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 8,10))
train_x_obj['words_length_10'] = train_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 10,12))
train_x_obj['words_length_12'] = train_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 12,100))
train_x_obj['mean_word_length'] = train_x_obj.word_lengths.apply(np.mean)
train_x_obj['variance_word_length'] = train_x_obj.word_lengths.apply(np.std)
train_x_obj['type_token_ratio'] = train_x_obj.doc.apply(get_type_token_ratio)


# Length Features
train_x_obj['essay_length'] = train_x_obj.doc.apply(len)
train_x_obj['num_words'] = train_x_obj.doc.apply(lambda doc: float(len([word for word in doc if word.is_alpha])))
train_x_obj['num_sentences'] = train_x_obj.sentences.apply(get_sentence_lengths)
train_x_obj['mean_sentence_length'] = train_x_obj.num_words/train_x_obj.num_sentences
train_x_obj['num_characters'] = train_x_obj.essay.apply(len)
train_x_obj['fourth_root_num_characters'] = train_x_obj.num_characters.apply(nth_root, n=4)

# # Occurrence Features
train_x_obj['num_commas'] = train_x_obj.essay.apply(lambda essay: float(essay.count(',')))
train_x_obj['num_periods'] = train_x_obj.essay.apply(lambda essay: float(essay.count('.')))
train_x_obj['num_exclaim'] = train_x_obj.essay.apply(lambda essay: float(essay.count('!')))
train_x_obj['num_question'] = train_x_obj.essay.apply(lambda essay: float(essay.count('?')))
train_x_obj['num_semicolon'] = train_x_obj.essay.apply(lambda essay: float(essay.count(';')))
train_x_obj['num_colon'] = train_x_obj.essay.apply(lambda essay: float(essay.count(':')))

# # Style Features
# FIX train_x_obj['vocabulary'] = train_x_obj.word_tokens.apply(lambda word_tokens: set(word.lower() for word in word_tokens if word.isalpha()))
train_x_obj['vocab_size'] = train_x_obj.word_counts.apply(len)
# train_x_obj['yules_k'] = train_x_obj.word_counts.apply(get_yules_k)

# # Syntactical Features
# # the number for these lengths comes from Chen and He 2013
train_x_obj['sentence_lengths'] = train_x_obj.sentences.apply(lambda sentences: [len(sent) for sent in sentences])
train_x_obj['very_short_sentences'] = train_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length <= 10 for length in sentence_lengths])))
train_x_obj['short_sentences'] = train_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 10 and length <18 for length in sentence_lengths])))
train_x_obj['medium_sentences'] = train_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 18 and length <25 for length in sentence_lengths])))
train_x_obj['long_sentences'] = train_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 25 for length in sentence_lengths])))
train_x_obj['variance_sentence_length'] = train_x_obj.sentence_lengths.apply(lambda sentence_lengths: np.std(sentence_lengths))

train_x_obj['max_height'] = train_x_obj.tree_heights.apply(lambda heights: float(max(heights)))
train_x_obj['sum_heights'] = train_x_obj.tree_heights.apply(sum)
train_x_obj['mean_heights'] = train_x_obj.tree_heights.apply(np.mean)

# train_x_obj['mean_sentence_similarity'] = train_x_obj.doc.apply(get_similarity_scores)

# # POS Ngrams
train_x_obj['pos_trigram_ratio'] = train_x_obj.pos_trigrams.apply(get_posngram_ratio)
train_x_obj['pos_fourgram_ratio'] = train_x_obj.pos_4grams.apply(get_posngram_ratio)
train_x_obj['mean_trigram_tfTF'] = train_x_obj.pos_trigram_counts.apply(lambda pos_trigram_counts: get_mean_tfTF(pos_trigram_counts, TF=pos_TF))

# # Cohesion Features
train_x_obj['connectives'] = train_x_obj.doc.apply(get_connectives)

# Readability Features
#train_x_obj['reading_scores'] = train_x_obj.doc.apply(get_reading_scores)



  out=out, **kwargs)


In [13]:
# Preengineering
test_x_obj['sentences'] = test_x_obj.doc.apply(get_sentences)
test_x_obj['word_counts'] = test_x_obj.doc.apply(get_word_counts)
test_x_obj['word_lengths'] = test_x_obj.doc.apply(get_word_lengths)
test_x_obj['pos'] = test_x_obj.doc.apply(get_pos)
test_x_obj['pos_trigrams'] = test_x_obj.pos.apply(lambda pos: get_posngrams(pos, n=3))
test_x_obj['pos_4grams'] = test_x_obj.pos.apply(lambda pos: get_posngrams(pos, n=4))
test_x_obj['pos_trigram_counts'] = test_x_obj.pos_trigrams.apply(get_posgrams_counts)
pos_TF = get_TF(test_x_obj.pos_trigram_counts)
test_x_obj['tree_heights'] = test_x_obj.doc.apply(lambda doc: get_tree_heights(doc))

# Lexical Features
test_x_obj['words_length_4'] = test_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 4,6))
test_x_obj['words_length_6'] = test_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 6,8))
test_x_obj['words_length_8'] = test_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 8,10))
test_x_obj['words_length_10'] = test_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 10,12))
test_x_obj['words_length_12'] = test_x_obj.word_lengths.apply(lambda lengths: get_words_of_length(lengths, 12,100))
test_x_obj['mean_word_length'] = test_x_obj.word_lengths.apply(np.mean)
test_x_obj['variance_word_length'] = test_x_obj.word_lengths.apply(np.std)
test_x_obj['type_token_ratio'] = test_x_obj.doc.apply(get_type_token_ratio)


# Length Features
test_x_obj['essay_length'] = test_x_obj.doc.apply(len)
test_x_obj['num_words'] = test_x_obj.doc.apply(lambda doc: float(len([word for word in doc if word.is_alpha])))
test_x_obj['num_sentences'] = test_x_obj.sentences.apply(get_sentence_lengths)
test_x_obj['mean_sentence_length'] = test_x_obj.num_words/test_x_obj.num_sentences
test_x_obj['num_characters'] = test_x_obj.essay.apply(len)
test_x_obj['fourth_root_num_characters'] = test_x_obj.num_characters.apply(nth_root, n=4)

# # Occurrence Features
test_x_obj['num_commas'] = test_x_obj.essay.apply(lambda essay: float(essay.count(',')))
test_x_obj['num_periods'] = test_x_obj.essay.apply(lambda essay: float(essay.count('.')))
test_x_obj['num_exclaim'] = test_x_obj.essay.apply(lambda essay: float(essay.count('!')))
test_x_obj['num_question'] = test_x_obj.essay.apply(lambda essay: float(essay.count('?')))
test_x_obj['num_semicolon'] = test_x_obj.essay.apply(lambda essay: float(essay.count(';')))
test_x_obj['num_colon'] = test_x_obj.essay.apply(lambda essay: float(essay.count(':')))

# # Style Features
# FIX test_x_obj['vocabulary'] = test_x_obj.word_tokens.apply(lambda word_tokens: set(word.lower() for word in word_tokens if word.isalpha()))
test_x_obj['vocab_size'] = test_x_obj.word_counts.apply(len)
# test_x_obj['yules_k'] = test_x_obj.word_counts.apply(get_yules_k)

# # Syntactical Features
# # the number for these lengths comes from Chen and He 2013
test_x_obj['sentence_lengths'] = test_x_obj.sentences.apply(lambda sentences: [len(sent) for sent in sentences])
test_x_obj['very_short_sentences'] = test_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length <= 10 for length in sentence_lengths])))
test_x_obj['short_sentences'] = test_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 10 and length <18 for length in sentence_lengths])))
test_x_obj['medium_sentences'] = test_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 18 and length <25 for length in sentence_lengths])))
test_x_obj['long_sentences'] = test_x_obj.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 25 for length in sentence_lengths])))
test_x_obj['variance_sentence_length'] = test_x_obj.sentence_lengths.apply(lambda sentence_lengths: np.std(sentence_lengths))

test_x_obj['max_height'] = test_x_obj.tree_heights.apply(lambda heights: float(max(heights)))
test_x_obj['sum_heights'] = test_x_obj.tree_heights.apply(sum)
test_x_obj['mean_heights'] = test_x_obj.tree_heights.apply(np.mean)

# test_x_obj['mean_sentence_similarity'] = test_x_obj.doc.apply(get_similarity_scores)

# # POS Ngrams
test_x_obj['pos_trigram_ratio'] = test_x_obj.pos_trigrams.apply(get_posngram_ratio)
test_x_obj['pos_fourgram_ratio'] = test_x_obj.pos_4grams.apply(get_posngram_ratio)
test_x_obj['mean_trigram_tfTF'] = test_x_obj.pos_trigram_counts.apply(lambda pos_trigram_counts: get_mean_tfTF(pos_trigram_counts, TF=pos_TF))

# # Cohesion Features
test_x_obj['connectives'] = test_x_obj.doc.apply(get_connectives)

# Readability Features
#test_x_obj['reading_scores'] = test_x_obj.doc.apply(get_reading_scores)



In [14]:
colList = ['words_length_4', 'words_length_6', 'words_length_8', 'words_length_10',
       'words_length_12', 'mean_word_length', 'variance_word_length',
       'type_token_ratio', 'essay_length', 'num_words', 'num_sentences',
       'mean_sentence_length', 'num_characters', 'fourth_root_num_characters',
       'num_commas', 'num_periods', 'num_exclaim', 'num_question',
       'num_semicolon', 'num_colon', 'vocab_size', 'very_short_sentences',
       'short_sentences', 'medium_sentences', 'long_sentences',
       'variance_sentence_length', 'max_height', 'sum_heights', 'mean_heights',
       'pos_trigram_ratio', 'pos_fourgram_ratio', 'mean_trigram_tfTF',
       'connectives']


In [15]:
from keras.layers import Concatenate
RNN_comb_best = Sequential()
RNN_comb_best.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],
                       input_length=max_len_class,trainable=False,name='embed'))
RNN_comb_best.add(LSTM(20, return_sequences=True,name='LSTM1'))
RNN_comb_best.add(LSTM(20, return_sequences=True,name='LSTM3'))
RNN_comb_best.add(LSTM(20,name='LSTM2'))

Manual_Features_comb = Sequential()
Manual_Features_comb.add(Dense(25,input_shape=(len(colList),),name='ManualFeatureConnected'))

mergedOut = Concatenate()([RNN_comb_best.output, Manual_Features_comb.output])
mergedOut = Dense(1, name='out_layer')(mergedOut)

rnn_combined_final = Model([RNN_comb_best.input, Manual_Features_comb.input], mergedOut)
rnn_combined_final.compile(optimizer='adagrad',loss='mse', metrics=['accuracy'])
rnn_combined_final.summary()


rnn_combined_final.fit([sequences_matrix_class, np.array(train_x_obj[colList].fillna(0))],train_sa_y_class.adjusted_domain1_score, batch_size = 100, epochs=35, validation_split=0.15)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embed_input (InputLayer)        (None, 588)          0                                            
__________________________________________________________________________________________________
embed (Embedding)               (None, 588, 300)     10930500    embed_input[0][0]                
__________________________________________________________________________________________________
LSTM1 (LSTM)                    (None, 588, 20)      25680       embed[0][0]                      
__________________________________________________________________________________________________
LSTM3 (LSTM)                    (None, 588, 20)      3280        LSTM1[0][0]                      
__________________________________________________________________________________________________
ManualFeat

<keras.callbacks.History at 0x7fd1106a5048>

In [16]:
from keras.layers import *

CNN_comb = Sequential()
CNN_comb.add(Embedding(len(tok_class.word_index)+1,300, weights=[embedding_matrix],
                       input_length=max_len_class,trainable=False,name='embed'))
CNN_comb.add(Conv1D(64, 5, activation = 'relu'))
CNN_comb.add(MaxPooling1D(2))
CNN_comb.add(Flatten())

Manual_Features_comb_cnn = Sequential()
Manual_Features_comb_cnn.add(Dense(25,input_shape=(len(colList),),name='ManualFeatureConnected'))

mergedOut_cnn = Concatenate()([CNN_comb.output, Manual_Features_comb_cnn.output])
mergedOut_cnn = Dense(1, name='out_layer')(mergedOut_cnn)

cnn_combined_final = Model([CNN_comb.input, Manual_Features_comb_cnn.input], mergedOut_cnn)
cnn_combined_final.compile(optimizer='adam',loss='mse', metrics=['accuracy'])
cnn_combined_final.summary()

#final_model.summary()
#sahil_model_comb.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
cnn_combined_final.fit([sequences_matrix_class, np.array(train_x_obj[colList].fillna(0))],train_sa_y_class.adjusted_domain1_score, batch_size = 100, epochs=35, validation_split=0.15)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embed_input (InputLayer)        (None, 588)          0                                            
__________________________________________________________________________________________________
embed (Embedding)               (None, 588, 300)     10930500    embed_input[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 584, 64)      96064       embed[0][0]                      
__________________________________________________________________________________________________
max_pooling1d_1 (MaxPooling1D)  (None, 292, 64)      0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
ManualFeat

<keras.callbacks.History at 0x7fd0ebdad3c8>

In [17]:
rnn_comb_preds = rnn_combined_final.predict([sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class), np.asarray(test_x_obj[colList].fillna(0)).reshape(sequences_test_matrix_class.shape[0],len(colList))])
cnn_comb_preds = cnn_combined_final.predict([sequences_test_matrix_class.reshape(sequences_test_matrix_class.shape[0],max_len_class), np.asarray(test_x_obj[colList].fillna(0)).reshape(sequences_test_matrix_class.shape[0],len(colList))])

comb_pred_frame = pd.DataFrame({'RNN':rnn_comb_preds.flatten(),
                           'CNN':cnn_comb_preds.flatten(),
                           'actual':np.asarray(test_sa_y_class.adjusted_domain1_score), 'dataset': np.asarray(test_sa_y_class.essay_set)})
comb_pred_frame = comb_pred_frame.merge(DivSeries, on='dataset')

for colName in ['CNN', 'RNN', 'actual']:
    comb_pred_frame[colName] = comb_pred_frame[colName] * comb_pred_frame['div']
    comb_pred_frame[colName] = comb_pred_frame[colName].apply(round)

comb_pred_frame.actual = comb_pred_frame.actual.astype(int)

In [18]:
QuadKappaCalculation = pd.DataFrame(columns = ['CNN', 'RNN'],index = np.unique(comb_pred_frame.dataset))
for essaySetValue in np.unique(comb_pred_frame.dataset):
    temp_comb_ES = comb_pred_frame[comb_pred_frame.dataset == essaySetValue]
    QuadKappaCalculation.loc[essaySetValue, 'RNN'] = cohen_kappa_score(temp_comb_ES.actual, temp_comb_ES.RNN.apply(round),weights='quadratic')
    QuadKappaCalculation.loc[essaySetValue, 'CNN'] = cohen_kappa_score(temp_comb_ES.actual, temp_comb_ES.CNN.apply(round),weights='quadratic')
QuadKappaCalculation.mean()

CNN    0.065429
RNN    0.128090
dtype: float64

In [19]:
print("CNN combined RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(comb_pred_frame.actual, comb_pred_frame.CNN.apply(round))),
                                       cohen_kappa_score(comb_pred_frame.actual, comb_pred_frame.CNN.apply(round)),
                                        cohen_kappa_score(comb_pred_frame.actual, comb_pred_frame.CNN.apply(round),weights='quadratic'),
                                                 accuracy_score(comb_pred_frame.actual, comb_pred_frame.CNN.apply(round))))

print("RNN combined RMSE, Cohen, Quad Cohen, accuracy: {0}, {1}, {2}, {3}".format(sqrt(mean_squared_error(comb_pred_frame.actual, comb_pred_frame.RNN.apply(round))),
                                       cohen_kappa_score(comb_pred_frame.actual, comb_pred_frame.RNN.apply(round)),
                                        cohen_kappa_score(comb_pred_frame.actual, comb_pred_frame.RNN.apply(round),weights='quadratic'),             
                                      accuracy_score(comb_pred_frame.actual, comb_pred_frame.RNN.apply(round))))


CNN combined RMSE, Cohen, Quad Cohen, accuracy: 16.995728186204698, 0.04440576138623553, -0.42204815395281625, 0.12172573189522343
RNN combined RMSE, Cohen, Quad Cohen, accuracy: 29.489244859841666, 0.04736246919148801, 0.47937816314859816, 0.12249614791987673
