In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from keras.models import Sequential # type: ignore
from keras.layers import Embedding, LSTM, Dense, Dropout # type: ignore
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
# Load the raw essay table, discard every column that contains a missing
# value, and remove the original score columns.
df = pd.read_csv("training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')
df = df.dropna(axis=1)
df = df.drop(columns=['domain1_score', 'rater1_domain1', 'rater2_domain1'])

# The replacement score comes from a separately prepared CSV; its leftover
# "Unnamed: 0" column is dropped.
temp = pd.read_csv("Processed_data.csv")
temp = temp.drop(columns="Unnamed: 0")

# Column assignment aligns the two frames on their index labels.
df['domain1_score'] = temp['final_score']

In [6]:
# Separate the regression target from the remaining columns.
X = df.drop(columns='domain1_score')
y = df['domain1_score']

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Raw essay texts of each split as plain Python lists.
train_e, test_e = (part['essay'].tolist() for part in (X_train, X_test))

In [8]:
# Preprocessing using spaCy
# Stop-word collection taken from the language defaults of the loaded
# pipeline `nlp`; the tokenising helpers test membership against it.
stop_words = nlp.Defaults.stop_words

In [9]:
def sent2word(text):
    """Turn one piece of text into a list of lowercase content tokens.

    Every character outside A-Z / a-z is first replaced by a space, the
    result is lower-cased and tokenised with the module-level spaCy
    pipeline ``nlp``, and tokens that are stop words or not purely
    alphabetic are discarded.

    Args:
        text: the string to tokenise.

    Returns:
        list[str]: the surviving token texts, in order of appearance.
    """
    letters_only = re.sub("[^A-Za-z]", " ", text)
    kept = []
    for token in nlp(letters_only.lower()):
        if token.is_alpha and token.text not in stop_words:
            kept.append(token.text)
    return kept

def essay2word(essay):
    """Split an essay into sentences and tokenise each one.

    The essay is stripped of surrounding whitespace and segmented into
    sentences by the spaCy pipeline ``nlp``; sentences that are empty
    after stripping are skipped and the rest go through ``sent2word``.

    Args:
        essay: the full essay text.

    Returns:
        list[list[str]]: one token list per non-empty sentence.
    """
    tokenised_sentences = []
    for sent in nlp(essay.strip()).sents:
        if sent.text.strip():
            tokenised_sentences.append(sent2word(sent.text))
    return tokenised_sentences

In [10]:
# Tokenize all essays
def preprocess_essays(essays):
    """Tokenise a batch of essays into lowercase content words.

    The essays are streamed through ``nlp.pipe`` with the "ner" and
    "parser" components disabled. A token is kept when it is purely
    alphabetic and its lower-cased text is not in ``stop_words``.

    Args:
        essays: iterable of essay strings.

    Returns:
        list[list[str]]: for each essay, its kept tokens in lower case.
    """
    return [
        [
            token.text.lower()
            for token in doc
            if token.is_alpha and token.text.lower() not in stop_words
        ]
        for doc in nlp.pipe(essays, disable=["ner", "parser"])
    ]

# Usage: apply the same cleaning to the training essays, then the test essays.
clean_train, clean_test = map(preprocess_essays, (train_e, test_e))

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

# `clean_train` holds one token list per essay (from spaCy); join each list
# back into a single space-separated string for the Keras Tokenizer.
train_sentences = list(map(' '.join, clean_train))

# Build the word index from the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

# Map every training essay to its sequence of integer word ids.
X_train_seq = tokenizer.texts_to_sequences(train_sentences)

# The longest training sequence sets the common padded length.
maxlen = max(map(len, X_train_seq))

# Pad shorter sequences at the end ('post') up to `maxlen`.
X_train_pad = pad_sequences(X_train_seq, padding='post', maxlen=maxlen)

In [12]:
# Word2Vec settings; the trailing note names the keyword each one feeds.
num_features, min_word_count = 300, 40   # -> vector_size, min_count
num_workers, context = 4, 10             # -> workers, window
downsampling = 1e-3                      # -> sample

# Train the embedding on the tokenised training essays only.
w2v_model = Word2Vec(
    sentences=clean_train,
    vector_size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Export the learned word vectors in binary word2vec format.
w2v_model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)


In [13]:
def makeVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one essay.

    Args:
        words: iterable of token strings.
        model: object whose ``wv`` attribute offers a ``key_to_index``
            mapping of known words and ``wv[word]`` vector lookup
            (the gensim 4 keyed-vectors interface).
        num_features: length of the vector to return.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all words found in the vocabulary (repeated words count
        each time). If no word is known, the float32 zero vector is
        returned.
    """
    vec = np.zeros(num_features, dtype="float32")
    noOfWords = 0
    # Membership is tested against the existing key->index dict. The earlier
    # version built a fresh set of the entire vocabulary on every call, i.e.
    # once per essay, which is pure overhead.
    vocab = model.wv.key_to_index
    for word in words:
        if word in vocab:
            noOfWords += 1
            vec = np.add(vec, model.wv[word])
    # Guard against division by zero when nothing matched.
    if noOfWords > 0:
        vec = np.divide(vec, noOfWords)
    return vec

def getVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors for a list of essays.

    Args:
        essays: sequence of tokenised essays (each an iterable of words).
        model: word-vector model forwarded unchanged to ``makeVec``.
        num_features: number of columns of the result.

    Returns:
        float32 array of shape ``(len(essays), num_features)``; row ``i``
        is ``makeVec(essays[i], model, num_features)``.
    """
    matrix = np.zeros((len(essays), num_features), dtype="float32")
    # Iterating a 2-D array yields row views, so writing into `row`
    # fills `matrix` in place.
    for row, tokens in zip(matrix, essays):
        row[:] = makeVec(tokens, model, num_features)
    return matrix

# Embed both splits with the same trained Word2Vec model.
training_vectors, testing_vectors = (
    getVecs(tokens, w2v_model, num_features)
    for tokens in (clean_train, clean_test)
)

In [14]:
# Insert a length-1 middle axis: (n_essays, n_features) -> (n_essays, 1, n_features),
# matching the (1, 300) per-sample input shape declared for the LSTM.
# The feature size is read from shape[-1] rather than shape[1] so the cell is
# idempotent: re-running it on an already reshaped array leaves it unchanged
# instead of raising a ValueError.
training_vectors = training_vectors.reshape((training_vectors.shape[0], 1, training_vectors.shape[-1]))
testing_vectors = testing_vectors.reshape((testing_vectors.shape[0], 1, testing_vectors.shape[-1]))


In [15]:
def get_model(input_dim=300):
    """Build, compile and summarise the two-layer LSTM score regressor.

    Args:
        input_dim: length of each input feature vector. Defaults to 300,
            the value that used to be hard-coded, so ``get_model()`` builds
            the same network as before.

    Returns:
        A compiled ``Sequential`` model that takes inputs of shape
        ``(batch, 1, input_dim)`` and emits one value per sample. It is
        compiled with mean-squared-error loss, the RMSprop optimizer and
        MAE as an additional metric. The model summary is printed as a
        side effect.
    """
    # Local import: `Input` is not among the notebook's top-level keras imports.
    from keras.layers import Input  # type: ignore

    model = Sequential()
    # Declare the input shape with an explicit Input object. Passing
    # `input_shape=` to the first layer is deprecated in Keras 3 and
    # triggers a UserWarning.
    model.add(Input(shape=(1, input_dim)))
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    # ReLU on the single output unit keeps predictions non-negative.
    model.add(Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

In [16]:
# Build a fresh network and train it on the averaged essay vectors.
lstm_model = get_model()
lstm_model.fit(x=training_vectors, y=y_train, epochs=150, batch_size=64)

# Keep a copy of the trained network in an HDF5 (.h5) file.
lstm_model.save('final_lstm.h5')

  super().__init__(**kwargs)


Epoch 1/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - loss: 13.5906 - mae: 2.9054
Epoch 2/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 5.0405 - mae: 1.7891
Epoch 3/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 4.9000 - mae: 1.7513
Epoch 4/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 4.6969 - mae: 1.6995
Epoch 5/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 4.6340 - mae: 1.6819
Epoch 6/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 4.3957 - mae: 1.6463
Epoch 7/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 4.4045 - mae: 1.6455
Epoch 8/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 4.1259 - mae: 1.5897
Epoch 9/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



In [17]:
# Save the trained model again, this time under the `.keras` file name.
lstm_model.save('final_lstm.keras')

In [18]:
import pickle
# Persist the fitted tokenizer so the same word index can be reloaded later.
# NOTE: pickle files must only be loaded from trusted sources.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [19]:
import json
# Record the padded sequence length computed from the training sequences.
with open('config.json', 'w') as f:
    json.dump({'maxlen': maxlen}, f)


In [20]:
# Predict scores for the held-out essays and round them to whole numbers.
y_pred = np.around(lstm_model.predict(testing_vectors))

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step


In [21]:
# evaluate() returns the compiled loss (MSE) followed by the MAE metric.
loss, mae = lstm_model.evaluate(x=testing_vectors, y=y_test, verbose=1)
print(f"\nTest Loss (MSE): {loss:.4f}", f"Test MAE: {mae:.4f}", sep="\n")

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 3.3064 - mae: 1.3555

Test Loss (MSE): 3.0747
Test MAE: 1.3185


In [22]:
from sklearn.metrics import cohen_kappa_score

# Cast the true scores and the predictions to integer labels before
# computing the agreement statistic.
y_test_int = np.asarray(y_test).astype(int)
y_pred_rounded = np.round(y_pred).astype(int)

# Cohen's kappa between true and predicted scores, with quadratic weights.
kappa = cohen_kappa_score(y_test_int, y_pred_rounded, weights='quadratic')

print(f"Quadratic Weighted Kappa (QWK): {kappa:.4f}")

Quadratic Weighted Kappa (QWK): 0.6769


In [23]:
# Recompute the rounded test predictions and display them.
y_pred = lstm_model.predict(testing_vectors).round()
y_pred


[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


array([[3.],
       [6.],
       [7.],
       ...,
       [8.],
       [8.],
       [9.]], dtype=float32)

In [24]:
# Collapse the (n, 1) column of predictions into a 1-D array.
y_pred = y_pred.flatten()


In [25]:
# Preview the first rows of the held-out feature frame.
X_test.head()

Unnamed: 0,essay_id,essay_set,essay
6351,9908,4,The author concludes the story w/this paragrap...
6315,9872,4,I believe that the author concludes the story ...
304,305,1,"Computers, a very much talked about subject. D..."
8023,12771,5,I think in my opion is that the author was ver...
4442,6839,3,The setting that affect the cyclist is the con...
