In [4]:
import numpy as np
!pip install Keras-Preprocessing
!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.optimizers import Adam
from nltk.corpus import stopwords
import string
import nltk

# Make sure the NLTK 'stopwords' corpus is available locally; preprocess_text
# reads the English list from it via stopwords.words('english').
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MANASVI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Read the essay dataset: a tab-separated file decoded as ISO-8859-1.
data = pd.read_csv(
    "dataset.tsv",
    sep='\t',
    encoding='ISO-8859-1',
)

In [6]:
# Text preprocessing function
# Translation table that deletes every character in string.punctuation. It is
# the same for every essay, so it is built once instead of once per call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the English stop-word set. It is populated on the
# first real call so that defining this cell never touches the NLTK corpus.
_STOP_WORDS_CACHE = {}


def preprocess_text(text):
    """Normalize one essay into a space-separated string of content words.

    Steps, in order: lowercase the text, delete ASCII punctuation
    (``string.punctuation``), split on whitespace, drop NLTK English stop
    words, and join the remaining tokens with single spaces.

    Args:
        text: The raw essay. ``None`` and NaN (what pandas yields for an empty
            cell) are treated as an empty essay; any other non-string value is
            converted with ``str()`` before processing.

    Returns:
        The cleaned essay as a ``str``; ``''`` when nothing remains.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Load the stop-word list only once: this function is applied to every
    # essay, and rebuilding the set on each call is pure repeated work.
    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words

    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every essay and store the result next to the raw text
data['processed_essay'] = data['essay'].apply(preprocess_text)
processed_essays = data['processed_essay']

# Fit the word-index vocabulary, then encode each essay as integer ids
# fitted to a fixed length of 500 ids per essay
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_essays.values)
X = pad_sequences(
    tokenizer.texts_to_sequences(processed_essays.values),
    maxlen=500,
)

# Regression target
y = data['domain1_score'].values

# Learn 100-dimensional Word2Vec vectors from the whitespace-tokenized essays
w2v_model = Word2Vec(
    processed_essays.apply(lambda x: x.split()),
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# One embedding row per tokenizer index (plus row 0); rows stay all-zero for
# words that Word2Vec has no vector for
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[i] = w2v_model.wv[word]

# Hold out 20% of the essays for validation (fixed random_state)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# LSTM regressor: non-trainable embeddings initialized from embedding_matrix,
# spatial dropout, one LSTM layer, and a single linear output unit
model = Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=500, trainable=False),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='linear'),
])



In [8]:
# Optimize mean squared error with Adam and report mean absolute error
model.compile(
    optimizer=Adam(),
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

# Train for 5 epochs in batches of 64, evaluating on the validation split
# after every epoch; the returned History object is kept in `history`
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64,
    verbose=2,
)

Epoch 1/5
163/163 - 145s - 889ms/step - loss: 58.4353 - mean_absolute_error: 4.3038 - val_loss: 25.9194 - val_mean_absolute_error: 2.4441
Epoch 2/5
163/163 - 133s - 816ms/step - loss: 28.9097 - mean_absolute_error: 2.8455 - val_loss: 14.9623 - val_mean_absolute_error: 2.0701
Epoch 3/5
163/163 - 137s - 841ms/step - loss: 13.7784 - mean_absolute_error: 2.0632 - val_loss: 7.4325 - val_mean_absolute_error: 1.5822
Epoch 4/5
163/163 - 136s - 837ms/step - loss: 8.0012 - mean_absolute_error: 1.6588 - val_loss: 13.3034 - val_mean_absolute_error: 1.8633
Epoch 5/5
163/163 - 136s - 833ms/step - loss: 6.6797 - mean_absolute_error: 1.6011 - val_loss: 4.2055 - val_mean_absolute_error: 1.2633


In [9]:
# Score the validation essays and show the raw predictions
y_pred = model.predict(X_val)
print(y_pred)

# Regression metrics on the validation split
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

print(
    f'R^2 Score: {r2}',
    f'Validation Mean Absolute Error: {mae}',
    f'Validation Mean Squared Error: {mse}',
    sep='\n',
)

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 101ms/step
[[ 1.0465763]
 [ 1.2016009]
 [ 8.597796 ]
 ...
 [ 9.877499 ]
 [ 2.3862283]
 [37.728992 ]]
R^2 Score: 0.9459846989013149
Validation Mean Absolute Error: 1.2633219671669562
Validation Mean Squared Error: 4.205516938330287


In [10]:
# Function to preprocess and predict score for a new essay
def predict_score(new_essay, model, tokenizer, max_length=500):
    """Predict the score of a single raw essay.

    The essay is cleaned with ``preprocess_text``, encoded to integer ids by
    ``tokenizer``, fitted to ``max_length`` ids with ``pad_sequences``, and
    passed to ``model.predict`` as a batch of one.

    Args:
        new_essay: Raw essay text.
        model: Object exposing ``predict(batch, verbose=...)``.
        tokenizer: Object exposing ``texts_to_sequences(list_of_texts)``.
        max_length: Sequence length handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Element ``[0][0]`` of the prediction output, i.e. the score for the
        one essay in the batch.
    """
    cleaned = preprocess_text(new_essay)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(token_ids, maxlen=max_length)
    return model.predict(model_input, verbose=0)[0][0]

# Try the trained model on one hand-written sample essay
new_essay = (
    "Reading opens doors to new worlds and ideas. "
    "It enriches the mind and broadens perspectives. "
    "Whether fiction or non-fiction, books offer endless opportunities for learning and imagination. "
    "They entertain, educate, and inspire. "
    "In a busy world, reading provides a tranquil escape, fostering growth and creativity in every reader."
)
predicted_score = predict_score(new_essay, model=model, tokenizer=tokenizer)
print(f"Predicted Score: {predicted_score}")

Predicted Score: 3.7132339477539062


In [12]:
# Save the model
import os
import pickle

# open() does not create missing parent directories, so on a fresh checkout
# without a models/ folder the tokenizer dump below would fail. Create the
# folder up front; exist_ok makes re-running this cell harmless.
os.makedirs('models', exist_ok=True)

model.save('models/essay_scoring_model.h5')

# Save the tokenizer: it holds the word-to-index mapping the model was trained
# with, so it has to be restored together with the model to score new essays.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open('models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved.")



Model and tokenizer saved.
