In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

import re
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin 

#metrics
from sklearn.metrics import mean_squared_error
#model selection
from sklearn.model_selection import cross_val_score

#load preprocessed dataset:
import joblib
#models
from sklearn.linear_model import LinearRegression, ElasticNet 
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Read the pickled training feature matrix from disk and show its dimensions.
# NOTE(review): joblib.load unpickles the file, so only point it at trusted artifacts.
features_path = "transformed_train.pkl"
X = joblib.load(features_path)
X.shape

(2471, 230)

In [3]:
# Location of the competition input files (relative to the working directory).
input_folder = "kaggle/input/linking-writing-processes-to-writing-quality/"

# Read the score table; the first row of the CSV is the header.
scores_csv_path = input_folder + "train_scores.csv"
train_scores = pd.read_csv(scores_csv_path, delimiter = ",", header = 0)

# Re-key the 'score' column by the 'id' column so each score can be looked up by id.
scores = pd.Series(
    train_scores['score'].values,
    index = train_scores['id'].values,
    name = 'score',
)

In [4]:
# Target vector: the 'score' values as a plain array, in the row order of train_scores.csv.
Y = scores.values

# Fail fast if the feature matrix and the targets disagree on the number of samples.
# NOTE(review): this only checks the count; that row i of X belongs to the same id as
# Y[i] is assumed, not verified here -- confirm against the preprocessing step.
assert len(Y) == X.shape[0], "X and Y must contain the same number of samples"

In [5]:
# display scores
def display_scores(rmse_neg_score):
    """Print per-fold RMSE values together with their mean and sample standard deviation.

    Parameters
    ----------
    rmse_neg_score : array-like
        Negated mean-squared-error values, one per fold (for example the result of
        ``cross_val_score(..., scoring="neg_mean_squared_error")``). Despite the
        parameter name, these are negated MSE values, not RMSE: the sign flip and
        the square root are applied here.

    Returns
    -------
    None
        The three summary lines are written to standard output.
    """
    # Coerce to an ndarray so plain Python lists/tuples work too (unary minus is
    # not defined for a list).
    neg_mse = np.asarray(rmse_neg_score)
    rmse_per_fold = np.sqrt(-neg_mse)
    print("score: {}".format(rmse_per_fold))
    print("Mean rmse: {}".format(rmse_per_fold.mean()))
    # ddof = 1 -> sample standard deviation across the folds.
    print("Sd rmse: {}".format(rmse_per_fold.std(ddof = 1)))
    

In [10]:
# Voting ensemble that averages three regressors: a bagged SVR, a random forest,
# and a bagged ElasticNet. It is scored with 5-fold cross-validation.
#
# The bagging and forest models draw random subsamples, so they are seeded here to
# make reruns reproduce the same CV scores.
# NOTE: scores recorded from earlier unseeded runs will differ slightly from a rerun.
ensemble_seed = 11

model_1 = BaggingRegressor(estimator = SVR(C=1000, epsilon = 0.001), max_samples = 800,
                           n_estimators = 60, random_state = ensemble_seed)
model_2 = RandomForestRegressor(random_state = ensemble_seed)
model_3 = BaggingRegressor(estimator = ElasticNet(alpha =0.1, l1_ratio = 0.5), max_samples = 800,
                           n_estimators = 60, random_state = ensemble_seed)
main_model = VotingRegressor(estimators = [('b_svr',model_1),('rfr', model_2), 
                                           ('b_elastic', model_3)])

# cross_val_score returns negated MSE per fold; display_scores converts it to RMSE.
display_scores(cross_val_score(estimator = main_model, X= X, y = Y,
                               scoring = "neg_mean_squared_error", cv = 5))

score: [0.66079266 0.68343441 0.66836185 0.68635073 0.70923515]
Mean rmse: 0.681634960238361
Sd rmse: 0.018693742445172013


In [11]:
# Fit a single bagged SVR on the full training set and report its error on that same data.
model_test = BaggingRegressor(estimator = SVR(C=1000, epsilon = 0.001), max_samples = 800,
                              n_estimators = 100, random_state = 11)
model_test.fit(X,Y)

# mean_squared_error takes (y_true, y_pred); the value is the same either way for
# plain MSE, but the documented order is used here for clarity.
# The predictions are made on the rows used for fitting, so this is an in-sample
# (training) RMSE and is expected to be lower than the cross-validated RMSE.
rmse = np.sqrt(mean_squared_error(Y, model_test.predict(X)))
print("RMSE: {}".format(rmse))



RMSE: 0.5497826176717799
