### Setup

In [97]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Data

In [5]:
kaggle_data = pd.read_csv("https://raw.githubusercontent.com/lodi-m/u-integrity/main/data/normalized_scores/normalized_kaggle_essay_set.csv")
kaggle_data.head(5)

Unnamed: 0,Essay_id,Essay_set_id,Essay,rater1_domain1,rater2_domain1,domain1_score,normalized_score
0,1,1,dear local newspap think effect comput peopl g...,4.0,4.0,8.0,0.6
1,2,1,dear believ use comput benefit us mani way lik...,5.0,4.0,9.0,0.7
2,3,1,dear peopl use comput everyon agre benefit soc...,4.0,3.0,7.0,0.5
3,4,1,dear local newspap found mani expert say compu...,5.0,5.0,10.0,0.8
4,5,1,dear know comput posit effect people. comput c...,4.0,4.0,8.0,0.6


### LSTM model

In [120]:
def get_lstm():
    model = Sequential()
    model.add(LSTM(200, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 200], return_sequences=True))
    # model.add(LSTM(100, recurrent_dropout=0.4, input_shape=[1, 300]))
    model.add(LSTM(100, recurrent_dropout=0.2))
    model.add(Dropout(0.75))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

### Word2Vec

In [73]:
def get_w2v(df, min_word_count, num_features, num_workers, max_distance, downsample):
  w2v = Word2Vec(workers=num_workers, vector_size=num_features, min_count=min_word_count, window=max_distance, sample=downsample)

  w2v.build_vocab(df, progress_per=progress_val)
  w2v.train(df, total_examples=w2v.corpus_count, epochs=w2v.epochs)

  return w2v

### Word embeddings

In [131]:
def make_feature_vectors(words, model, num_features):
    feature_vector = np.zeros((num_features, ))
    index_keys = set(model.wv.index_to_key)
    
    for word in words:
      if word in index_keys:
        feature_vector = np.add(feature_vector, model.wv[word])
    return feature_vector

In [130]:
def avg_feature_vectors(essays, model, num_features):
    final_essay_vector = np.zeros((len(essays), num_features))
    
    for i in range(len(essays)):
        final_essay_vector[i] = make_feature_vectors(essays[i], model, num_features)
    return final_essay_vector

### Parameters

In [7]:
X = kaggle_data["Essay"]
y = kaggle_data["normalized_score"]

In [121]:
num_fold = 2
num_features = 200

min_word_count = 40
num_workers = 4
max_distance = 10
downsampling = 1e-3
progress_val = 2000

### Training LSTM

In [105]:
cv = KFold(n_splits=num_fold, shuffle=True)
mae_results = []
mse_results = []

In [132]:
for traincv, testcv in cv.split(X):
  X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
  
  w2v = get_w2v(X_train, min_word_count, num_features, num_workers, max_distance, downsampling)

  clean_train_essays = X_train.to_list()
  clean_test_essays = X_test.to_list()

  train_vectors = avg_feature_vectors(clean_train_essays, w2v, num_features)
  test_vectors = avg_feature_vectors(clean_test_essays, w2v, num_features)

  train_vectors = np.array(train_vectors)
  test_vectors = np.array(test_vectors)
  
  train_vectors = np.reshape(train_vectors, (train_vectors.shape[0], 1, train_vectors.shape[1]))
  test_vectors = np.reshape(test_vectors, (test_vectors.shape[0], 1, test_vectors.shape[1]))
  
  lstm_model = get_lstm()
  lstm_model.fit(train_vectors, y_train, batch_size=32, epochs=50)

  y_pred = lstm_model.predict(test_vectors)

  y_pred = np.around(y_pred)

  mse = mean_squared_error(y_test, y_pred)
  mae = mean_absolute_error(y_test, y_pred)

  mse_results.append(mse)
  mae_results.append(mae)



Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_51 (LSTM)              (None, 1, 200)            320800    
                                                                 
 lstm_52 (LSTM)              (None, 100)               120400    
                                                                 
 dropout_21 (Dropout)        (None, 100)               0         
                                                                 
 dense_20 (Dense)            (None, 1)                 101       
                                                                 
Total params: 441,301
Trainable params: 441,301
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16



Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_53 (LSTM)              (None, 1, 200)            320800    
                                                                 
 lstm_54 (LSTM)              (None, 100)               120400    
                                                                 
 dropout_22 (Dropout)        (None, 100)               0         
                                                                 
 dense_21 (Dense)            (None, 1)                 101       
                                                                 
Total params: 441,301
Trainable params: 441,301
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16

In [133]:
np.around(np.array(mse_results).mean(), decimals=4)

0.1761

In [134]:
np.around(np.array(mae_results).mean(), decimals=4)

0.3812