# Imports

In [28]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from catboost import CatBoostRegressor # not sure to test
from sklearn.kernel_ridge import KernelRidge # not sure to test
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor

from src.evaluate_regression import get_rankings, average_spearman

In [29]:
# settings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format = 'retina'

# Global seaborn look applied to every plot created after this cell.
sns.set_style("whitegrid")
sns.set_palette("Set2")

# Seed NumPy's global RNG. Estimators that accept a seed are additionally
# given random_state=42 explicitly where the models dictionary is defined.
np.random.seed(42)

In [30]:
# get current directory
# The CSV paths used below are relative ("../../data/..."), so they are
# resolved against the directory displayed here.
os.getcwd()

'C:\\Users\\merti\\PycharmProjects\\phase-2\\notebooks\\week11'

# Load data

In [31]:
# NOTE(review): neither constant is used in the cells visible here; presumably
# they name the grouping columns and the index level expected by the ranking
# helpers imported from src.evaluate_regression - confirm against that module.
FACTORS = [
    "dataset",
    "model",
    "tuning",
    "scoring",
]
NEW_INDEX = "encoder"

In [32]:
# Load Train Data
# Features and target are stored as separate CSV files in the preprocessed
# data folder; the paths are relative to the notebook's working directory.
X_train = pd.read_csv("../../data/preprocessed/X_train.csv")
y_train = pd.read_csv("../../data/preprocessed/y_train.csv")

In [33]:
# Sanity check: features and target should report the same number of rows.
print("Shape of X_train ", X_train.shape)
print("Shape of y_train ", y_train.shape)

Shape of X_train  (26989, 112)
Shape of y_train  (26989, 1)


In [34]:
# Preview the first five rows of the training features (head() defaults to n=5).
X_train.head()

Unnamed: 0,dataset,model_DTC,model_KNC,model_LGBMC,model_LR,model_SVC,tuning_full,tuning_model,tuning_no,scoring_ACC,...,non_categorical_features_count,ratio_of_categorical_features_to_all,sum_of_all_categories,categorical_target_variables_count,non_categorical_target_variables_count,categorical_target_values_sum,total_feature_count,min_number_of_categories_per_cat_feature,max_number_of_categories_per_cat_feature,avg_number_of_categories_per_cat_feature
0,0.025297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.564327,0.045059,1.0,0.0,0.0,0.0,0.476695,0.0,0.705853,0.429681
1,0.025297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.564327,0.045059,1.0,0.0,0.0,0.0,0.476695,0.0,0.705853,0.429681
2,0.025297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.564327,0.045059,1.0,0.0,0.0,0.0,0.476695,0.0,0.705853,0.429681
3,0.025297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.564327,0.045059,1.0,0.0,0.0,0.0,0.476695,0.0,0.705853,0.429681
4,0.025297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.564327,0.045059,1.0,0.0,0.0,0.0,0.476695,0.0,0.705853,0.429681


In [35]:
# Load holdout data
# NOTE(review): X_holdout_original presumably holds the same holdout rows before
# feature encoding (the recorded shapes are 5 columns vs. 112) - confirm.
# Its file name is spelled "hold_out" while the other two files use "holdout".
X_holdout_original = pd.read_csv("../../data/preprocessed/X_hold_out_original.csv")
X_holdout = pd.read_csv("../../data/preprocessed/X_holdout.csv")
y_holdout = pd.read_csv("../../data/preprocessed/y_holdout.csv")

In [36]:
# Sanity check: all three holdout frames should report the same number of rows.
print("Shape of X_holdout_original ", X_holdout_original.shape)
print("Shape of X_holdout ", X_holdout.shape)
print("Shape of y_holdout ", y_holdout.shape)

Shape of X_holdout_original  (9065, 5)
Shape of X_holdout  (9065, 112)
Shape of y_holdout  (9065, 1)


# Modelling

In [37]:
# minmax scaling for y (target) values [0,1]
# The scaler is fitted on the train target only and then re-used for the holdout
# target, so no holdout information leaks into the scaling. Holdout values that
# lie outside the train min/max will therefore fall outside [0, 1].
scaler = MinMaxScaler()

# Build each scaled frame in a single step (the name is never bound to a bare
# ndarray) and keep the source index so rows stay aligned with the unscaled
# target even if it does not carry a default RangeIndex.
y_train_scaled = pd.DataFrame(
    scaler.fit_transform(y_train),
    columns=y_train.columns,
    index=y_train.index,
)
y_holdout_scaled = pd.DataFrame(
    scaler.transform(y_holdout),
    columns=y_holdout.columns,
    index=y_holdout.index,
)

y_train_scaled.head(5)

Unnamed: 0,rank
0,0.677419
1,0.612903
2,0.83871
3,0.387097
4,0.903226


In [38]:
# regression models to be evaluated
models = {
    # tree
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    # ensemble
    "RandomForestRegressor": RandomForestRegressor(random_state=42, n_jobs=-1),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=42, n_jobs=-1),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
    # linear
    "ElasticNet": ElasticNet(random_state=42),
    "SGDRegressor": SGDRegressor(random_state=42),
    "SVR": SVR(),
    "BayesianRidge": BayesianRidge(),
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    # others
    "CatBoostRegressor": CatBoostRegressor(random_state=42, verbose=False),
    "KernelRidge": KernelRidge(),
    "XGBRegressor": XGBRegressor(random_state=42, n_jobs=-1),
    "LGBMRegressor": LGBMRegressor(random_state=42, n_jobs=-1)
}

In [41]:
def fit_and_predict(models, X_train, y_train, X_holdout):
    """
    Fit every model on the train data and predict on the holdout data.

    Estimators are fitted in place: the objects stored in ``models`` are the
    same (now fitted) objects that are returned in ``fitted_models``.

    :param models: Dictionary mapping a model name to an estimator that exposes
        ``fit(X, y)`` and ``predict(X)``. Models are processed in dictionary order.
    :type models: dict
    :param X_train: Train data.
    :type X_train: pandas.core.frame.DataFrame
    :param y_train: Train target. May be a DataFrame, Series, numpy array or a
        plain sequence; a single-column 2D target is flattened to 1D.
    :type y_train: pandas.core.frame.DataFrame or array-like
    :param X_holdout: Holdout data.
    :type X_holdout: pandas.core.frame.DataFrame

    :return: ``(predictions, fitted_models)`` - two dictionaries keyed by model
        name, holding the holdout predictions and the fitted estimators.
    :rtype: tuple[dict, dict]
    """
    predictions = {}
    fitted_models = {}

    # Most estimators expect a 1D target. np.asarray accepts DataFrames, Series,
    # arrays and plain sequences alike; only a single-column 2D target is
    # flattened, so a genuine multi-output target keeps its shape.
    y_train = np.asarray(y_train)
    if y_train.ndim == 2 and y_train.shape[1] == 1:
        y_train = y_train.ravel()

    # Create a single progress bar for the loop
    progress_bar = tqdm(models.items(), total=len(models), desc="Models")

    for model_name, model in progress_bar:
        # tqdm.write prints the message without breaking the progress bar,
        # which a bare print() inside a tqdm loop does.
        tqdm.write("=" * 50)
        tqdm.write(f"Training {model_name}...")
        model.fit(X_train, y_train)
        fitted_models[model_name] = model

        tqdm.write(f"Predicting {model_name}...")
        predictions[model_name] = model.predict(X_holdout)

    return predictions, fitted_models

In [42]:
# fit and predict
predictions, fitted_models = fit_and_predict(models, X_train, y_train_scaled, X_holdout)

  0%|          | 0/15 [00:00<?, ?it/s]

Training DecisionTreeRegressor...


  7%|▋         | 1/15 [00:00<00:06,  2.22it/s]

Predicting DecisionTreeRegressor...
Training RandomForestRegressor...


 13%|█▎        | 2/15 [00:06<00:47,  3.69s/it]

Predicting RandomForestRegressor...
Training ExtraTreesRegressor...


 20%|██        | 3/15 [00:10<00:47,  3.93s/it]

Predicting ExtraTreesRegressor...
Training GradientBoostingRegressor...


 40%|████      | 6/15 [00:23<00:31,  3.54s/it]

Predicting GradientBoostingRegressor...
Training ElasticNet...
Predicting ElasticNet...
Training SGDRegressor...
Predicting SGDRegressor...
Training SVR...
Predicting SVR...


 47%|████▋     | 7/15 [02:27<04:52, 36.58s/it]

Training BayesianRidge...


 53%|█████▎    | 8/15 [02:28<03:05, 26.43s/it]

Predicting BayesianRidge...
Training LinearRegression...


 73%|███████▎  | 11/15 [02:28<00:41, 10.48s/it]

Predicting LinearRegression...
Training Ridge...
Predicting Ridge...
Training Lasso...
Predicting Lasso...
Training CatBoostRegressor...


 80%|████████  | 12/15 [02:42<00:33, 11.17s/it]

Predicting CatBoostRegressor...
Training KernelRidge...
Predicting KernelRidge...


 87%|████████▋ | 13/15 [09:12<03:37, 108.92s/it]

Training XGBRegressor...


 93%|█████████▎| 14/15 [09:15<01:20, 80.40s/it] 

Predicting XGBRegressor...
Training LGBMRegressor...


100%|██████████| 15/15 [09:15<00:00, 37.07s/it]

Predicting LGBMRegressor...



