In [18]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MANASVI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Load data
data = pd.read_csv("dataset.tsv", sep='\t', encoding='ISO-8859-1')

In [20]:
# Text preprocessing
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

data['processed_essay'] = data['essay'].apply(preprocess_text)

# Train Word2Vec model
w2v_model = Word2Vec(data['processed_essay'], vector_size=100, window=5, min_count=2, workers=4)

# Convert essays to Word2Vec vectors
def essay_to_vector(essay, model):
    words = [word for word in essay if word in model.wv]
    if len(words) == 0:  # if no words in essay are in the model's vocabulary
        return np.zeros(100)
    return np.mean(model.wv[words], axis=0)

data['essay_vector'] = data['processed_essay'].apply(lambda x: essay_to_vector(x, w2v_model))
X = np.vstack(data['essay_vector'].values)
y = data['domain1_score'].values

In [21]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Define and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred1 = rf_model.predict(X_val)
mse1 = mean_squared_error(y_val, y_pred1)
print(f'Validation Mean Squared Error: {mse1}')

print("Score:",rf_model.score(X_val,y_val))

Validation Mean Squared Error: 8.423887916238657
Score: 0.8918043016139029


In [7]:
# Define and train the SVR model
svr_model = SVR(kernel='linear', C=1.0, epsilon=0.2)
svr_model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred2 = svr_model.predict(X_val)
mse2 = mean_squared_error(y_val, y_pred2)
print(f'Validation Mean Squared Error: {mse2}')

print("Score:",svr_model.score(X_val,y_val))

Validation Mean Squared Error: 45.26055949636236
Score: 0.4186772316124334
