# Finding the best model

After processing our dataset, we need a model that can predict new values. We will explore several models to see which ones perform best on this particular dataset.

In [None]:
import numpy as np
from distributed import Client
import dask.dataframe as dd
from dask import compute
from sklearn import set_config
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from dask_ml.preprocessing import Categorizer, StandardScaler, DummyEncoder

# from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import Ridge

from sklearn.svm import SVR
from dask_ml.linear_model import LinearRegression

# from sklearn.model_selection import GridSearchCV
from dask_ml.model_selection import GridSearchCV

from dask_ml.model_selection import train_test_split
# from sklearn.model_selection import train_test_split

# Create the distributed client with 2 workers, 2 threads each and a 4GB memory limit.
cluster_options = {'n_workers': 2, 'threads_per_worker': 2, 'memory_limit': '4GB'}
client = Client(**cluster_options)

# Load the processed training set as a Dask DataFrame.
train_path = '/home/diego/Coding/code-challenge-2020/data_root/processed/train.parquet'
ddf = dd.read_parquet(train_path, engine='pyarrow')

# 'points' is the target; every remaining column is used as a feature.
y = ddf['points']
X = ddf.drop(['points'], axis=1)
# Pandas-only alternative: compute(ddf.drop(['points'], axis=1), ddf[['points']])
X.head()


In [None]:
# Render estimators and pipelines as diagrams when they are displayed.
set_config(display='diagram')

# Every column except 'price' goes through the categorical branch.
cat_cols = X.columns.to_list()
cat_cols.remove('price')

# 'price' is standardised; the remaining columns are categorised and dummy-encoded.
num_proc = make_pipeline(StandardScaler())
cat_proc = make_pipeline(Categorizer(), DummyEncoder())

preprocessor = make_column_transformer(
    (num_proc, ['price']),
    (cat_proc, cat_cols),
)


## Baseline - Linear Regression

First, we will create a baseline using a linear regression model.

In [None]:
# Hold out 5% of the rows (shuffled, fixed seed) to score the fitted models.
split = train_test_split(X, y, test_size=0.05, random_state=0, shuffle=True)
X_train, X_test, y_train, y_test = split

# Baseline: the shared preprocessing step followed by a linear regression.
linear_pipeline = make_pipeline(
    preprocessor,
    LinearRegression(),
)
linear_pipeline

In [None]:
# Fit the baseline on the training split.
linear_pipeline.fit(X_train, y_train)

# Predict once and reuse the result for all three metrics, instead of pushing
# X_test through the whole pipeline again for every print.
linear_pred = linear_pipeline.predict(X_test)
print(f"R2 score is : {r2_score(y_test, linear_pred)}")
print(f"MSE is : {mean_squared_error(y_test, linear_pred)}")
print(f"MAE score is : {mean_absolute_error(y_test, linear_pred)}")

## Ridge Regression

In [None]:
# Shared preprocessing followed by a ridge regressor with an initial alpha of 0.001.
ridge_pipeline = make_pipeline(
    preprocessor,
    Ridge(alpha=0.001),
)
ridge_pipeline

In [None]:
# Candidate regularisation strengths: 10 values log-spaced between 1 and 10.
parameters = {
    'ridge__alpha': np.logspace(0, 1, 10),
}

# To track several metrics at once, pass
# scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'] together with refit='r2'.
#
# cv is given explicitly so the run really is 5-fold: the default fold count
# has changed across scikit-learn releases and may differ in dask-ml, so
# relying on it would not guarantee what the comment promises.
grid_search = GridSearchCV(ridge_pipeline, parameters, cv=5)  # grid search with 5-fold cv
grid_search.fit(X_train, y_train)


In [None]:
# Predict once with the tuned model and reuse the result for every metric,
# instead of calling predict on X_test three times.
ridge_pred = grid_search.predict(X_test)
print(f"R2 score is : {r2_score(y_test, ridge_pred)}")
print(f"MSE is : {mean_squared_error(y_test, ridge_pred)}")
print(f"MAE score is : {mean_absolute_error(y_test, ridge_pred)}")

# Report only the hyper-parameters that were part of the search grid.
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print(f"\t{param_name}: {best_parameters[param_name]!r}")


In [None]:
# Display the 10 log-spaced values between 10**0 and 10**1.
np.logspace(start=0, stop=1, num=10)

## Support Vector Machine

In [None]:
# Shared preprocessing followed by a support vector regressor with default settings.
svr_pipeline = make_pipeline(
    preprocessor,
    SVR(),
)
svr_pipeline

In [None]:
# Rebuild the pipeline with an explicit RBF kernel and fit it on the training split.
svr_pipeline = make_pipeline(preprocessor, SVR(kernel='rbf'))
svr_pipeline.fit(X_train, y_train)

# Predict once and reuse the result for all three metrics, instead of
# re-running the pipeline on X_test for every print.
svr_pred = svr_pipeline.predict(X_test)
print(f"R2 score is : {r2_score(y_test, svr_pred)}")
print(f"MSE is : {mean_squared_error(y_test, svr_pred)}")
print(f"MAE score is : {mean_absolute_error(y_test, svr_pred)}")

In [None]:
# Search grid for the SVR step: a list of sub-grids (currently a single one)
# covering the RBF kernel with several gamma and C values.
param_grid = [
    {'svr__kernel': ['rbf'], 'svr__gamma': [1e-3, 1e-4], 'svr__C': [1, 10, 100, 1000]},
]
# To track several metrics at once, pass
# scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'] together with refit='r2'.
grid_search = GridSearchCV(svr_pipeline, param_grid)
grid_search.fit(X_train, y_train)

# Predict once with the tuned model and reuse the result for every metric.
svr_grid_pred = grid_search.predict(X_test)
print(f"R2 score is : {r2_score(y_test, svr_grid_pred)}")
print(f"MSE is : {mean_squared_error(y_test, svr_grid_pred)}")
print(f"MAE score is : {mean_absolute_error(y_test, svr_grid_pred)}")

print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# param_grid is a list of dicts, so it has no .keys(); collect the parameter
# names from every sub-grid instead.
searched_names = sorted({name for sub_grid in param_grid for name in sub_grid})
for param_name in searched_names:
    print(f"\t{param_name}: {best_parameters[param_name]!r}")

