## Importing libraries
_____

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, model_selection, metrics

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.svm import SVC
import lightgbm as lgb

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error

## Reading Files
_____

In [None]:
# Mounting Google Drive
# NOTE: google.colab is provided by the Colab runtime, so this cell is Colab-specific.
from google.colab import drive
# Mount at /content/drive; the CSV paths read in the following cells live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned training data from the mounted Drive folder
train_csv_path = "/content/drive/MyDrive/CSC 418 : Data Science /santander_prediction/data/cleaned_data/clean_train.csv"
clean_train = pd.read_csv(train_csv_path)

# Report (rows, columns), then preview the first rows as the cell output
train_shape = clean_train.shape
print(f'The shape of the dataset is: {train_shape}')
clean_train.head()

The shape of the dataset is: (4420, 77)


Unnamed: 0,ID,target,feature_1,feature_6,feature_7,feature_11,feature_14,feature_17,feature_20,feature_21,...,feature_174,feature_178,feature_179,feature_180,feature_181,feature_182,feature_185,feature_193,feature_194,feature_195
0,000d6aaf2,38000000.0,0.0,0.003238,0.003939,0.000355,0.006825,0.0,0.004047,0.005104,...,0.009863,0.006227,0.007668,0.0,0.0,3.8e-05,0.002089,0.004458,0.004871,0.000731
1,000fbd867,600000.0,0.0,0.006283,0.012509,0.021983,0.001761,0.000278,0.0,0.010881,...,0.002311,0.0,0.0,0.0,0.009721,0.014991,0.0,0.011652,0.006373,0.00661
2,0027d6b71,10000000.0,0.001278,0.000667,0.001381,0.002111,0.002571,0.0,0.003468,0.0,...,0.003329,0.001767,0.001102,0.0,0.000965,0.001258,0.000988,0.0,0.001824,0.001587
3,0028cbf45,2000000.0,0.0,0.003343,0.002699,0.006143,0.001584,0.0,0.002308,0.000662,...,0.006563,0.0,0.001147,0.000403,0.000172,0.00055,8.1e-05,0.004262,0.004592,0.001864
4,002a68644,14400000.0,0.0,0.0,0.0,0.004519,0.001464,0.0,0.000621,0.001411,...,0.006201,0.0,0.001922,0.0,0.002856,0.002095,0.000387,0.002601,0.006268,0.001264


In [None]:
# Load the cleaned test data from the mounted Drive folder
test_csv_path = "/content/drive/MyDrive/CSC 418 : Data Science /santander_prediction/data/cleaned_data/clean_test.csv"
clean_test = pd.read_csv(test_csv_path)

# Report (rows, columns), then preview the first rows as the cell output
test_shape = clean_test.shape
print(f'The shape of the dataset is: {test_shape}')
clean_test.head()

The shape of the dataset is: (48443, 76)


Unnamed: 0,ID,feature_1,feature_6,feature_7,feature_11,feature_14,feature_17,feature_20,feature_21,feature_23,...,feature_174,feature_178,feature_179,feature_180,feature_181,feature_182,feature_185,feature_193,feature_194,feature_195
0,000137c73,0.0,0.0,0.047692,0.0,0.0,0.033567,0.024573,0.144102,0.0,...,0.129965,0.120888,0.0,0.0,0.005642,0.129815,0.028257,0.0,0.0,0.104119
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.005134,0.002093,0.014857,0.0,...,0.007821,0.003665,0.000712,0.0,0.0,0.006838,0.00387,0.0,0.0,0.006105
2,0004d7953,0.005912,0.0,0.00222,0.0,0.0,0.005461,0.004475,0.022596,0.0,...,0.00706,0.006901,0.0,0.0,3.9e-05,0.014183,0.006599,0.0,0.0,0.01869
3,00056a333,0.008201,0.0,0.0,0.0,0.0,0.017106,0.013691,0.038653,0.0,...,0.024198,0.025699,0.002997,0.0,0.000815,0.024599,0.007541,0.0,0.0,0.023945
4,00056d8eb,0.0,0.0,0.005813,0.0,0.0,0.021753,0.012632,0.038338,0.0,...,0.023285,0.024585,0.0,0.0,0.009129,0.023452,0.023275,0.0,0.0,0.043653


## Splitting the Train Dataset into Train and Test for the Model

In [None]:
# Select main columns to be used in training.
# Index.difference returns the remaining column names in sorted order, so
# main_cols fixes the feature order of every frame that is indexed with it.
main_cols = clean_train.columns.difference(['ID', 'target'])
X = clean_train[main_cols]
# The target is modelled on the log1p scale; np.expm1 maps predictions back.
y = np.log1p(clean_train["target"].values)
# Index the test frame with the same main_cols so its columns line up with X.
# Dropping "ID" instead would keep the CSV column order, which differs from the
# sorted order of X and would hand features to the model in the wrong positions.
test_X = clean_test[main_cols]


In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the size of each feature partition
for feature_part in (X_train, X_test):
    print(feature_part.shape)

(3094, 75)
(1326, 75)


In [None]:
# Grouped Bar Chart for both training and validation data
def plot_result(x_label, y_label, plot_title, train_data, val_data):
    """Draw a grouped bar chart of per-fold training vs. validation scores.

    Parameters
    ----------
    x_label : str
        X-axis label, e.g. the name of the algorithm used for training.
    y_label : str
        Y-axis label, e.g. the name of the metric being visualized.
    plot_title : str
        Title of the plot.
    train_data : sequence of float
        One training score per fold.
    val_data : sequence of float
        One validation score per fold; must be as long as ``train_data``.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``; nothing is returned.

    Raises
    ------
    ValueError
        If ``train_data`` and ``val_data`` have different lengths.
    """
    n_folds = len(train_data)
    if len(val_data) != n_folds:
        raise ValueError(
            f"train_data has {n_folds} folds but val_data has {len(val_data)}"
        )

    def _ordinal(k):
        # 1 -> "1st", 2 -> "2nd", 3 -> "3rd", 4 -> "4th", 11 -> "11th", ...
        if 10 <= k % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(k % 10, "th")
        return f"{k}{suffix}"

    # One tick label per fold instead of a fixed list of five
    labels = [f"{_ordinal(k)} Fold" for k in range(1, n_folds + 1)]
    X_axis = np.arange(n_folds)

    # Set size of plot
    plt.figure(figsize=(12, 6))
    # The y-axis is clipped to [0.4, 1]; scores outside this range are not fully visible
    plt.ylim(0.40000, 1)
    # Two bars per fold, offset left/right of the tick position
    plt.bar(X_axis - 0.2, train_data, 0.4, color='blue', label='Training')
    plt.bar(X_axis + 0.2, val_data, 0.4, color='red', label='Validation')
    plt.title(plot_title, fontsize=30)
    plt.xticks(X_axis, labels)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.show()

## Testing Different Regression Algorithms

---

## Prediction
Continuous target prediction for the unprocessed data

### Metrics used
1. Train set score (R2)
2. Test set score (R2)
3. Mean Squared Error (MSE)
4. Root Mean Squared Error (RMSE)
5. Mean Absolute Error (MAE)
6. Mean Absolute Percentage Error (MAPE)
7. Number of features used

### How to interpret metrics
The R2 score normally lies between 0 and 1: 1 is a perfect fit, and 0 means the model explains none of the variance in the dependent variable (it is no better than always predicting the mean). On held-out data R2 can be negative, which indicates a very poor fit — worse than a horizontal line at the mean of the target.

### Baseline Model (Dummy Regressor)

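Always predicts the same constant value — here the median of the training targets (`strategy='median'`) — regardless of the features. It is the reference point the other models have to beat.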

In [None]:
# Baseline: always predict the median of the training targets
model_dr = DummyRegressor(strategy='median').fit(X_train, y_train)
Y_pred_lr = model_dr.predict(X_test)

print(f"constant: {model_dr.constant_}")

# Metrics
print(f"Train set score (R2): {model_dr.score(X_train, y_train)}")
print(f"Test set score (R2): {model_dr.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE, so take the square root
# to match the RMSLE label. This line is printed once (it used to appear twice).
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")

constant: [[14.53736679]]
Root Mean Squared Log Error (RMSlE): 0.013690360560561934
Train set score (R2): -0.0015880394162655076
Test set score (R2): -6.789789090033693e-05
Mean Squared Error (MSE): 3.0334827942851135
Root Mean Squared Error (RMSE): 1.7416896377613071
Root Mean Squared Log Error (RMSlE): 0.013690360560561934
Mean Absolute Error (MAE): 1.4324769861252766
Mean Absolute Percentage Error (MAPE): 0.10391389649202282


Let us do KFold cross validation and average the predictions of the test set.

### **Using LGBM**

In [None]:
def run_lgb(X, y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on test_X.

    Parameters
    ----------
    X, y
        Training features and targets.
    val_X, val_y
        Validation features and targets, used as the early-stopping set.
    test_X
        Features to predict on with the best iteration.

    Returns
    -------
    pred_test_y
        Predictions for ``test_X`` at ``model.best_iteration``.
    model
        The trained booster returned by ``lgb.train``.
    evals_result : dict
        Evaluation history recorded during training.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        # The parameter is named bagging_freq; "bagging_frequency" is not
        # recognised by LightGBM, which left bagging_fraction without effect.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(X, label=y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Up to 1000 boosting rounds; stop once the validation RMSE has not improved for 100 rounds.
    # NOTE: early_stopping_rounds / verbose_eval / evals_result were removed from
    # lgb.train in LightGBM 4.0; on newer versions pass the equivalent callbacks instead.
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=200, evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [None]:
# 5-fold cross validation: train one LightGBM model per fold and average
# the test-set predictions of all folds.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
pred_test_full = 0
for dev_index, val_index in kf.split(X):
    # kf.split yields positional row indices, so select with .iloc
    # (.loc only works while X happens to carry a default 0..n-1 index).
    dev_X, val_X = X.iloc[dev_index], X.iloc[val_index]
    dev_y, val_y = y[dev_index], y[val_index]
    pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)
    pred_test_full += pred_test
# Average over the actual number of folds rather than a hard-coded 5
pred_test_full /= kf.get_n_splits()
# The models are trained on log1p(target); map the averaged prediction back
pred_test_full = np.expm1(pred_test_full)

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 1.69404
[400]	valid_0's rmse: 1.68523
Early stopping, best iteration is:
[433]	valid_0's rmse: 1.68425
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 1.72362
Early stopping, best iteration is:
[244]	valid_0's rmse: 1.72319
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 1.67168
[400]	valid_0's rmse: 1.6708
Early stopping, best iteration is:
[353]	valid_0's rmse: 1.67052
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 1.58695
Early stopping, best iteration is:
[131]	valid_0's rmse: 1.58545
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 1.6729
Early stopping, best iteration is:
[262]	valid_0's rmse: 1.67132


### 1. Linear Regression

In [None]:
# Ordinary least squares on all selected features
model_LR = LinearRegression().fit(X_train, y_train)
Y_pred_lr = model_LR.predict(X_test)

print(f"intercept: {model_LR.intercept_}")
print(f"slope: {model_LR.coef_}")

# Metrics
print(f"Train set score (R2): {model_LR.score(X_train, y_train)}")
print(f"Test set score (R2): {model_LR.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")
# Count the coefficients that are not exactly zero
print(f"No of features used: {np.sum(model_LR.coef_ != 0)} of {X_train.shape[1]}")

intercept: 14.048935035681138
slope: [-46.17112127   1.5785475  -11.47115548   0.82753651  -3.83260666
   7.80761894   9.49334934  -0.2835381    1.15194209   2.31630543
  -3.09081072  12.42689797  -1.53380555 -24.10529503  -5.67530655
  16.8113514    0.475448    -8.98869814  16.05120068   8.0050902
   6.78353804   2.67113504   1.16881377  -2.79795763 -17.22426086
  -1.6197041   -5.39891833  -6.66709865 -10.77750655  -5.6151612
 -13.21985421  13.01508971  -0.10898339 -14.53588067   7.98578575
 -22.83101194  -1.0549675  -12.56450168  -0.97277564  11.05667737
  -3.61872587   2.04449195   7.33948764   3.2847566   15.17940038
  11.78953731  -5.87930444 -11.29569005   1.72787873  -9.00909618
   5.53705328   8.2275338   -2.4030907    1.30894635  14.81355055
  12.28652367   0.59171358  11.36471481   3.1762009   -5.53439546
  12.24604268   1.4811346   -6.48494109  -2.2272676  -10.79541761
   2.80461024   7.1498493   -7.37788407 -13.20397301  19.52452081
 -11.46574689  18.20067454   2.55645172  

#### Results interpretation
The model has a train set score of 0.9991, which is close to 1. However, the test set score (R2) is negative, which indicates that the model is overfitting: it performs very badly on the test data, so the model generalises poorly.

### 2. Ridge Regression

In [None]:
# Using default alpha of 1.0
model_ridge = Ridge().fit(X_train, y_train)
Y_pred_lr = model_ridge.predict(X_test)

print(f"intercept: {model_ridge.intercept_}")
print(f"slope: {model_ridge.coef_}")

# Metrics
print(f"Train set score (R2): {model_ridge.score(X_train, y_train)}")
print(f"Test set score (R2): {model_ridge.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")
# Count the coefficients that are not exactly zero
print(f"No of features used: {np.sum(model_ridge.coef_ != 0)} of {X_train.shape[1]}")

intercept: 14.061070245792232
slope: [-0.45423067  1.77719156 -0.76393707  0.5710757   0.55189114  2.0992016
  0.02198501  1.3008997   0.19104985  0.30927772  0.30504158  1.07903137
 -0.04135309 -1.22858969  0.45069859  0.07433169  1.92134532 -1.27551483
  2.44222696  1.35237656  0.04454932  0.29669243  0.24597662 -0.35646465
 -0.39748759  0.2637574  -0.95404392 -0.75365189 -0.86564035 -0.32934209
 -0.76014252  0.73190619  2.25412077 -0.00921438  0.20317733 -0.30640025
 -0.28993768 -0.21803998 -0.2934394   0.5372202   0.08891078  1.79054551
  0.64405014  0.03058309  1.97704284  1.56941329 -0.56839049 -0.52281471
  0.81660191 -1.02750582  0.86272324  1.70464187  0.14017434 -0.23881233
  0.8263383   0.05535054  0.26735787  1.21318118 -0.06901931  0.31950609
  2.3114603   0.06236112 -0.55477819 -0.39624101 -0.66687739  0.12857836
  1.51786753  0.71717209 -0.13621563  1.15164683 -0.63180572  1.00367923
  1.59221581  0.04070029  0.50584883]
Train set score (R2): 0.060488744694209395
Test se

#### Results interpretation (alpha 1.0)
- Poor fitting due to negative test R2
- Default alpha for the model was 1.0 which doesn't allow for much generalization

In [None]:
# Using alpha of 0.1
model_ridge = Ridge(alpha=0.1).fit(X_train, y_train)
Y_pred_lr = model_ridge.predict(X_test)

print(f"intercept: {model_ridge.intercept_}")
print(f"slope: {model_ridge.coef_}")

# Metrics
print(f"Train set score (R2): {model_ridge.score(X_train, y_train)}")
print(f"Test set score (R2): {model_ridge.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")
# Count the coefficients that are not exactly zero
print(f"No of features used: {np.sum(model_ridge.coef_ != 0)} of {X_train.shape[1]}")

intercept: 14.043972980710242
slope: [-4.52382337e+00  2.46598842e+00 -3.73215076e+00  1.08868200e+00
 -9.81389239e-01  4.42095640e+00  1.64539502e+00  1.30027378e+00
  5.75223921e-01 -6.17133445e-03 -2.74347681e+00  3.89876331e+00
 -1.08610764e+00 -8.86164383e+00 -1.01103881e+00  2.98848570e+00
  2.89056198e+00 -5.21206038e+00  8.51091366e+00  4.39678244e+00
  1.01195737e+00  5.53522854e-01  3.63900050e-01 -2.47331462e+00
 -2.98582754e+00 -5.32876865e-01 -5.26248492e+00 -3.80114722e+00
 -5.27155449e+00 -2.72473257e+00 -4.12916162e+00  4.37810299e+00
  3.76949770e+00 -1.84056282e+00  1.20960605e+00 -3.16383726e+00
 -2.07787542e+00 -3.58795038e+00 -1.10976592e+00  3.65368667e+00
 -3.43235282e-01  5.70726146e+00  2.51850362e+00  9.71311309e-01
  6.96428616e+00  5.02002505e+00 -3.34875029e+00 -4.24801110e+00
  1.21504599e+00 -4.73194464e+00  3.28323097e+00  5.61929457e+00
 -5.73421926e-01 -2.05352524e+00  4.37920286e+00  1.12817864e+00
  1.34179310e+00  3.32679849e+00  2.26828079e-01 -1.5

#### Results interpretation (alpha 0.1)
- Model still performs poorly, but ever so slightly better than alpha 1.0

### 3. Lasso Regression

In [None]:
# Using default alpha of 1.0
model_lasso = Lasso().fit(X_train, y_train)
Y_pred_lr = model_lasso.predict(X_test)

print(f"intercept: {model_lasso.intercept_}")
print(f"slope: {model_lasso.coef_}")

# Metrics
print(f"Train set score (R2): {model_lasso.score(X_train, y_train)}")
print(f"Test set score (R2): {model_lasso.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")
# Lasso can shrink coefficients to exactly zero; count the ones it kept
print(f"No of features used: {np.sum(model_lasso.coef_ != 0)} of {X_train.shape[1]}")

intercept: 14.467401277269948
slope: [-0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]
Train set score (R2): 0.0
Test set score (R2): -0.0010196783148335165
Mean Squared Error (MSE): 3.0363698078029233
Root Mean Squared Error (RMSE): 1.7425182374376813
Root Mean Squared Log Error (RMSlE): 0.013642545471176864
Mean Absolute Error (MAE): 1.4405402775378477
Mean Absolute Percentage Error (MAPE): 0.10396894692680184
No of features used: 0 of 75


#### Results interpretation (alpha 1.0)
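With alpha = 1.0 the L1 penalty shrinks every coefficient to zero (0 of 75 features used), so the model only predicts the intercept. The train set score is therefore 0.0 and the test set score is slightly negative (about -0.001), which is still a very poor fit.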

In [None]:
# Using alpha of 0.1
model_lasso = Lasso(alpha=0.1).fit(X_train, y_train)
Y_pred_lr = model_lasso.predict(X_test)

print(f"intercept: {model_lasso.intercept_}")
print(f"slope: {model_lasso.coef_}")

# Metrics
print(f"Train set score (R2): {model_lasso.score(X_train, y_train)}")
print(f"Test set score (R2): {model_lasso.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")
# Lasso can shrink coefficients to exactly zero; count the ones it kept
print(f"No of features used: {np.sum(model_lasso.coef_ != 0)} of {X_train.shape[1]}")

intercept: 14.467401277269948
slope: [-0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]
Train set score (R2): 0.0
Test set score (R2): -0.0010196783148335165
Mean Squared Error (MSE): 3.0363698078029233
Root Mean Squared Error (RMSE): 1.7425182374376813
Root Mean Squared Log Error (RMSlE): 0.013642545471176864
Mean Absolute Error (MAE): 1.4405402775378477
Mean Absolute Percentage Error (MAPE): 0.10396894692680184
No of features used: 0 of 75


In [None]:
# Using alpha of 10
model_lasso = Lasso(alpha=10, max_iter=10000).fit(X_train, y_train)
Y_pred_lr = model_lasso.predict(X_test)

print(f"intercept: {model_lasso.intercept_}")
print(f"slope: {model_lasso.coef_}")

# Metrics
print(f"Train set score (R2): {model_lasso.score(X_train, y_train)}")
print(f"Test set score (R2): {model_lasso.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")
# Lasso can shrink coefficients to exactly zero; count the ones it kept
print(f"No of features used: {np.sum(model_lasso.coef_ != 0)} of {X_train.shape[1]}")

intercept: 14.467401277269948
slope: [-0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]
Train set score (R2): 0.0
Test set score (R2): -0.0010196783148335165
Mean Squared Error (MSE): 3.0363698078029233
Root Mean Squared Error (RMSE): 1.7425182374376813
Root Mean Squared Log Error (RMSlE): 0.013642545471176864
Mean Absolute Error (MAE): 1.4405402775378477
Mean Absolute Percentage Error (MAPE): 0.10396894692680184
No of features used: 0 of 75


#### Results interpretation (alpha 0.1 and alpha 10)
Changing alpha did not affect the performance: with alpha 0.1 and with alpha 10 every coefficient is still zero, and all metrics are identical to the alpha 1.0 run.

### 4. SVM - Support Vector Machine

In [None]:
# Support vector regression with scikit-learn's default settings
model_SVR = svm.SVR().fit(X_train, y_train)
Y_pred_lr = model_SVR.predict(X_test)

# Metrics
print(f"Train set score (R2): {model_SVR.score(X_train, y_train)}")
print(f"Test set score (R2): {model_SVR.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")

Train set score (R2): 0.19501771722256311
Test set score (R2): 0.06368259098729889
Mean Squared Error (MSE): 2.840109912756645
Root Mean Squared Error (RMSE): 1.6852625649306534
Root Mean Squared Log Error (RMSlE): 0.01290889242834023
Mean Absolute Error (MAE): 1.3591387524202783
Mean Absolute Percentage Error (MAPE): 0.09932541201610866


#### Results Interpretation
Both the train (about 0.20) and test (about 0.06) set scores (R2) are positive but low, indicating a weak model fit. However, this is a significant improvement over the linear, ridge and lasso models.

### 5. Random Forest

In [None]:
# Random forest with 10 trees. random_state is fixed so the scores below are
# reproducible across runs (the forest is otherwise re-randomised on every fit).
model_RFR = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)
Y_pred_lr = model_RFR.predict(X_test)

# Metrics
print(f"Train set score (R2): {model_RFR.score(X_train, y_train)}")
print(f"Test set score (R2): {model_RFR.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")

Train set score (R2): 0.8201631437584741
Test set score (R2): 0.0007806917331907304
Mean Squared Error (MSE): 3.0309087870306883
Root Mean Squared Error (RMSE): 1.7409505412362203
Root Mean Squared Log Error (RMSlE): 0.01352128353242947
Mean Absolute Error (MAE): 1.416838944266821
Mean Absolute Percentage Error (MAPE): 0.1016857935510907


#### Results interpretation
Both the train and test set scores (R2) are within the accepted range of 0 to 1. The train set score is high (about 0.82 in the run shown above). However, the test set score is close to 0 (about 0.0008), which indicates that the model is overfitted to the training data and generalises poorly.

### 6. Gradient Boosting Regressor

In [None]:
# Gradient boosting with default hyper-parameters. random_state is fixed so
# the fitted model and the scores below are reproducible across runs.
model_GBR = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
Y_pred_lr = model_GBR.predict(X_test)

# Metrics
print(f"Train set score (R2): {model_GBR.score(X_train, y_train)}")
print(f"Test set score (R2): {model_GBR.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")

Train set score (R2): 0.3379824369377492
Test set score (R2): 0.07944078594541881
Mean Squared Error (MSE): 2.7923109449313004
Root Mean Squared Error (RMSE): 1.6710209289327589
Root Mean Squared Log Error (RMSlE): 0.012572225532220626
Mean Absolute Error (MAE): 1.3626974851563434
Mean Absolute Percentage Error (MAPE): 0.09836372236820774


#### Results interpretation
Though the training set R2 score is lower than that of the Random Forest model, the test set R2 score is higher, indicating less overfitting and better generalization. Overall, this model performs better than the ones above on this dataset.

### 7. MLP Regressor

In [None]:
# Small multi-layer perceptron (four hidden layers of 5 units, tanh, SGD).
# random_state is fixed so weight initialisation, and therefore the scores
# below, are reproducible across runs.
model_mlpr = MLPRegressor(hidden_layer_sizes=(5, 5, 5, 5),
                          max_iter=500, alpha=0.05, solver='sgd',
                          learning_rate='adaptive', activation='tanh',
                          random_state=42).fit(X_train, y_train)
Y_pred_lr = model_mlpr.predict(X_test)

# Metrics
print(f"Train set score (R2): {model_mlpr.score(X_train, y_train)}")
print(f"Test set score (R2): {model_mlpr.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lr)
print(f"Mean Squared Error (MSE): {mse}")
# np.sqrt(mse) matches the other model cells and avoids the `squared=False`
# keyword, which newer scikit-learn releases deprecate.
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# mean_squared_log_error returns the un-rooted MSLE; take the square root
# so the printed value matches its RMSLE label.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lr))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lr)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lr)}")

Train set score (R2): 2.5005464128757815e-05
Test set score (R2): -0.0010687302529537313
Mean Squared Error (MSE): 3.036518595910839
Root Mean Squared Error (RMSE): 1.7425609303295075
Root Mean Squared Log Error (RMSlE): 0.013641692910914742
Mean Absolute Error (MAE): 1.4407921678934594
Mean Absolute Percentage Error (MAPE): 0.10397401167317989


### 8. LightGBM Regressor

In [None]:
# LightGBM regressor with default hyper-parameters
model_lgbm = LGBMRegressor().fit(X_train, y_train)
Y_pred_lgbm = model_lgbm.predict(X_test)

# Metrics
print(f"Train set score (R2): {model_lgbm.score(X_train, y_train)}")
print(f"Test set score (R2): {model_lgbm.score(X_test, y_test)}")
mse = mean_squared_error(y_test, Y_pred_lgbm)
print(f"Mean Squared Error (MSE): {mse}")
# np.sqrt(mse) matches the other model cells and avoids the `squared=False`
# keyword, which newer scikit-learn releases deprecate.
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")
# Score this model's own predictions (Y_pred_lgbm); the stale Y_pred_lr from the
# previous cell was used here before. The square root makes the value a true RMSLE.
print(f"Root Mean Squared Log Error (RMSlE): {np.sqrt(mean_squared_log_error(y_test, Y_pred_lgbm))}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, Y_pred_lgbm)}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error(y_test, Y_pred_lgbm)}")

Train set score (R2): 0.8038397971199561
Test set score (R2): 0.032604688306304475
Mean Squared Error (MSE): 2.934377795231509
Root Mean Squared Error (RMSE): 1.713002567199334
Root Mean Squared Log Error (RMSlE): 0.013641692910914742
Mean Absolute Error (MAE): 1.3938327531558599
Mean Absolute Percentage Error (MAPE): 0.10041847243845886


Let us do KFold cross validation and average the predictions of the test set.

#### Results interpretation
This model generalises poorly: the train set score (about 0.80) is far above the test set score (about 0.03), which indicates overfitting. However, its test performance is better than that of the linear, lasso and ridge models.

## Predicting The Test Dataset with our Star Model

In [None]:
# Make prediction on the test set
# Select the test features by name with main_cols so they match the training columns.
test_df = clean_test[main_cols]
# NOTE(review): model_GBR was fitted on y = np.log1p(target), so these predictions
# are presumably on the log1p scale — confirm whether np.expm1 is applied before use.
predictions = model_GBR.predict(test_df)

In [None]:
# Bare last expression: displays the prediction array as the cell output
predictions

array([12.92019614, 13.91043689, 13.29703773, ..., 13.24147036,
       12.99785008, 14.262472  ])