In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

from processor import *

In [5]:
# Read the task-1 CSV files. `test_df` is loaded from dev.csv and is not
# referenced again in the cells visible here.
train_df = pd.read_csv('data/task-1/train.csv')
test_df = pd.read_csv('data/task-1/dev.csv')

# Columns of the training frame used below.
training_data, training_edits, training_grades = (
    train_df['original'],
    train_df['edit'],
    train_df['meanGrade'],
)

# `create_edited_sentences` is not defined in this notebook; presumably it
# comes from the `processor` star-import and combines each original text
# with its edit -- TODO confirm against processor.py.
edited_training = pd.Series(create_edited_sentences(training_data, training_edits))

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
(
    training_dataset,
    validation_dataset,
    training_gradeset,
    validation_gradeset,
) = train_test_split(
    edited_training,
    training_grades,
    test_size=0.2,
    random_state=42,
)

In [14]:
def model_performance(output, target, print_output=False):
    """Compute squared-error metrics between predictions and true values.

    Both inputs are converted to NumPy arrays before subtracting, so they
    are compared element-by-element by position. Without the conversion,
    subtracting two pandas Series pairs values by index label (which
    mis-pairs rows when the indexes are ordered differently), and plain
    Python lists do not support ``-`` at all.

    Parameters
    ----------
    output : array-like
        Predicted values.
    target : array-like
        True values, in the same order as ``output``.
    print_output : bool, default False
        When true, print the MSE and RMSE formatted to two decimals.

    Returns
    -------
    tuple
        ``(sse, mse)``: the sum and the mean of the squared errors. The
        RMSE is only printed, never returned.
    """
    predicted = np.asarray(output)
    actual = np.asarray(target)
    sq_error = (predicted - actual) ** 2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)

    if print_output:
        # RMSE is needed for the report only, so compute it lazily.
        rmse = np.sqrt(mse)
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [15]:
# Learn the vocabulary from the training split only, then reuse that same
# vocabulary to encode the validation split.
vectorizer = CountVectorizer()
training_bags_of_words = vectorizer.fit_transform(training_dataset)
validation_bag_of_words = vectorizer.transform(validation_dataset)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `get_feature_names_out()` is its replacement and returns an ndarray.
vectorizer.get_feature_names_out()

['000',
 '02',
 '10',
 '100',
 '100k',
 '100m',
 '101',
 '103',
 '104',
 '106',
 '10g',
 '11',
 '12',
 '13',
 '130',
 '133m',
 '138',
 '1380',
 '14',
 '140',
 '15',
 '16',
 '168',
 '17',
 '18',
 '180',
 '19',
 '190',
 '192',
 '1928',
 '1973',
 '1984',
 '199',
 '1st',
 '20',
 '200',
 '2004',
 '2005',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2013',
 '2016',
 '2017',
 '2018',
 '2019',
 '202',
 '2020',
 '2024',
 '2050',
 '20m',
 '21',
 '21st',
 '22',
 '227',
 '230k',
 '235',
 '23m',
 '24',
 '25',
 '250',
 '26b',
 '27',
 '28',
 '29th',
 '2nd',
 '30',
 '300mw',
 '301',
 '30th',
 '31',
 '34',
 '35',
 '35s',
 '36',
 '361',
 '37',
 '38',
 '39',
 '40',
 '400',
 '41',
 '418m',
 '42',
 '43',
 '44',
 '45',
 '451',
 '462',
 '473',
 '50',
 '500',
 '53',
 '547',
 '55',
 '57',
 '59',
 '60',
 '600',
 '61',
 '62',
 '63rd',
 '64',
 '65',
 '666',
 '67',
 '69',
 '70',
 '700',
 '75',
 '76',
 '77',
 '773',
 '78',
 '80s',
 '81',
 '82',
 '83',
 '84',
 '85bn',
 '87',
 '88',
 '89',
 '8g',
 '90',
 '900',
 '9

In [25]:
# Fit an RBF-kernel support vector regressor on the training bag-of-words
# matrix; `fit` returns the estimator itself, so it can be chained.
model = SVR(kernel='rbf', C=0.1).fit(training_bags_of_words, training_gradeset)

# Predict grades for the held-out validation split.
predictions = model.predict(validation_bag_of_words)

In [26]:
# Report MSE/RMSE on the validation split; the returned (sse, mse) tuple is
# displayed as the cell output.
model_performance(
    output=predictions,
    target=validation_gradeset,
    print_output=True,
)

| MSE: 0.33 | RMSE: 0.57 |


(634.7940007193528, 0.32873847784534066)

(2419, 10397)