In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from utils.common_utils import model_performance
from utils.processor import create_edited_sentences

In [2]:
"""
Create data frames for training and testing sets
"""
# Read both splits up front; each CSV supplies the columns
# 'original', 'edit' and 'meanGrade' that are pulled out below.
train_df = pd.read_csv('data/task-1/train.csv')
test_df = pd.read_csv('data/task-1/truth_task_1.csv')

# Training split: 'meanGrade' is the regression target used by the model cells.
training_data, training_edits, training_grades = (
    train_df['original'],
    train_df['edit'],
    train_df['meanGrade'],
)

# NOTE(review): create_edited_sentences is a project helper; presumably it
# applies each edit to its original sentence -- confirm in utils.processor.
edited_training = pd.Series(
    create_edited_sentences(training_data, training_edits)
)

# Test split: same three columns, same processing.
testing_data, testing_edits, testing_grades = (
    test_df['original'],
    test_df['edit'],
    test_df['meanGrade'],
)

edited_testing = pd.Series(
    create_edited_sentences(testing_data, testing_edits)
)

In [3]:
"""
Feature engineering for datasets
"""
# Bag-of-words token counts. The vectorizer is fitted on the training
# sentences only; the test sentences are encoded with that same fitted
# vectorizer so both matrices share one set of columns.
vectorizer = CountVectorizer()
training_bags_of_words = vectorizer.fit_transform(edited_training)
testing_bag_of_words = vectorizer.transform(edited_testing)

In [11]:
"""
Support Vector Regressor
"""
# RBF-kernel support vector regressor (C=0.1) trained on the training
# bag-of-words matrix; fit() returns the estimator, so the calls are chained.
model = SVR(C=0.1, kernel='rbf').fit(training_bags_of_words, training_grades)
predictions = model.predict(testing_bag_of_words)

# Kept as the final expression of the cell so its return value is displayed.
model_performance(
    predictions,
    testing_grades,
    print_output=True,
)

| MSE: 0.32 | RMSE: 0.57 |


(970.7414653797691, 0.32101238934516174)

In [14]:
"""
Random Forest Regressor
"""
# A random forest is stochastic (bootstrap sampling and feature selection),
# so pin random_state to make the reported score reproducible when the
# notebook is re-run from a fresh kernel.
model = RandomForestRegressor(random_state=42)
model = model.fit(training_bags_of_words, training_grades)
predictions = model.predict(testing_bag_of_words)

# Kept as the final expression of the cell so its return value is displayed.
model_performance(predictions, testing_grades, print_output=True)

| MSE: 0.33 | RMSE: 0.57 |


(997.373641835153, 0.32981932600368813)

In [15]:
"""
Linear Regression
"""
# Ordinary least squares directly on the raw bag-of-words counts;
# fit() returns the estimator, so construction and fitting are chained.
model = LinearRegression().fit(training_bags_of_words, training_grades)
predictions = model.predict(testing_bag_of_words)

# Kept as the final expression of the cell so its return value is displayed.
model_performance(
    predictions,
    testing_grades,
    print_output=True,
)

| MSE: 19.80 | RMSE: 4.45 |


(59884.301548905525, 19.80300977146347)

In [4]:
"""
Linear Regression with Polynomial Features
"""
# Expand the bag-of-words counts with degree-2 interaction terms
# (interaction_only=True: products of distinct features, no squared terms).
poly = PolynomialFeatures(degree=2, interaction_only=True)
training_poly = poly.fit_transform(training_bags_of_words)
# The test matrix must only be transformed with the expansion fitted on the
# training matrix; calling fit_transform here would re-fit on test data.
testing_poly = poly.transform(testing_bag_of_words)

model = LinearRegression()
model = model.fit(training_poly, training_grades)
predictions = model.predict(testing_poly)

# Kept as the final expression of the cell so its return value is displayed.
model_performance(predictions, testing_grades, print_output=True)

| MSE: 0.33 | RMSE: 0.57 |


(990.965240766977, 0.3277001457562755)

In [6]:
"""
Neural Network
"""
# MLP training is stochastic (random weight initialisation and batch
# shuffling), so pin random_state to make the reported score reproducible
# when the notebook is re-run from a fresh kernel.
model = MLPRegressor(random_state=42)
model = model.fit(training_bags_of_words, training_grades)
predictions = model.predict(testing_bag_of_words)

# Kept as the final expression of the cell so its return value is displayed.
model_performance(predictions, testing_grades, print_output=True)

| MSE: 0.55 | RMSE: 0.74 |


(1657.890442145098, 0.5482441938310509)