In [233]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import make_scorer, mean_squared_error, r2_score, matthews_corrcoef
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate, train_test_split

In [183]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path

import src.evaluate_regression as er
import src.load_datasets as ld

from src.encoding import ohe_encode_train_data
from src.meta_information import add_dataset_meta_information
from src.feature_engineering import normalize_train_data
from src.data_cleaning import drop_pearson_correlated_features

In [184]:
# Render matplotlib figures inline in the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Global seaborn style and colour palette for the plots in this notebook
sns.set_style("whitegrid")
sns.set_palette("Set2") # "Paired" is presumably the alternative palette that was considered

In [185]:
# Single seed reused for the train/holdout split, the CV indices and the seeded models below
random_state = 42

# Load data

In [186]:
# NOTE(review): DATA_DIR is not referenced by any later cell visible here; the loaders
# below use hard-coded "../../data/..." paths instead — confirm which data root is intended.
DATA_DIR = Path("./data")

In [187]:
# Read the raw ranking table; a "cv_score" column is discarded when the file has one
df_train = ld.load_dataset("../../data/raw/dataset_rank_train.csv").drop(
    columns="cv_score", errors="ignore"
)
# Hold out 20% of the rows for the final evaluation.
# NOTE(review): the split is row-wise on the long-format table, so the encoders of one
# (dataset, model, tuning, scoring) combination can end up on both sides of the split.
df_train, df_holdout = train_test_split(
    df_train,
    test_size=0.2,
    random_state=random_state,
)

Loading data from '../../data/raw/dataset_rank_train.csv' ...


In [188]:
# Quick look at the long-format training split (one row per factor combination and encoder)
print("Shape of train data: ", df_train.shape)
print("Columns of train data: ", df_train.columns)
#df_train.sort_values(by=["dataset", "rank"], inplace=True)
df_train.head()

Shape of train data:  (28843, 6)
Columns of train data:  Index(['dataset', 'model', 'tuning', 'scoring', 'encoder', 'rank'], dtype='object')


Unnamed: 0,dataset,model,tuning,scoring,encoder,rank
21436,43922,LR,full,F1,SE,1.0
7324,42738,DTC,model,ACC,BUCV2RGLMME,26.0
14725,31,LR,model,ACC,ME01E,11.0
30041,43922,LR,no,F1,WOEE,0.0
644,56,LR,model,AUC,DTEM2,1.0


# Pivot data to get multi-output target of encoder ranking

In [189]:
# Column roles used by the pivot in the next cell
factors = ["dataset", "model", "tuning", "scoring"]  # identify one evaluation setting; become the pivot index
new_index = "encoder"  # values of this column become the pivoted columns
target = "rank"  # value placed in the pivoted cells (rebound to a list of column names further below)

In [190]:
def _pivot_encoder_ranks(long_df):
    """Return ``long_df`` in wide form: one row per ``factors`` combination, one rank column per encoder.

    Uses the notebook-level ``factors``, ``new_index`` and ``target`` names; duplicate
    entries for the same cell are resolved by taking the first value.
    """
    wide = long_df.pivot_table(index=factors, columns=new_index, values=target, aggfunc="first")
    return wide.reset_index()


# pivot encoder column with rank as value (same reshaping for both splits)
df_train_pivot = _pivot_encoder_ranks(df_train)
df_holdout_pivot = _pivot_encoder_ranks(df_holdout)

In [191]:
# Inspect the wide table: the factor columns followed by one rank column per encoder
print("Shape of train data: ", df_train_pivot.shape)
print("Columns of train data: ", df_train_pivot.columns)
df_train_pivot.head()

Shape of train data:  (1160, 36)
Columns of train data:  Index(['dataset', 'model', 'tuning', 'scoring', 'BE', 'BUCV10RGLMME',
       'BUCV10TE', 'BUCV2RGLMME', 'BUCV2TE', 'BUCV5RGLMME', 'BUCV5TE', 'CBE',
       'CE', 'CV10RGLMME', 'CV10TE', 'CV2RGLMME', 'CV2TE', 'CV5RGLMME',
       'CV5TE', 'DE', 'DTEM10', 'DTEM2', 'DTEM5', 'ME01E', 'ME10E', 'ME1E',
       'MHE', 'OE', 'OHE', 'PBTE0001', 'PBTE001', 'PBTE01', 'RGLMME', 'SE',
       'TE', 'WOEE'],
      dtype='object', name='encoder')


encoder,dataset,model,tuning,scoring,BE,BUCV10RGLMME,BUCV10TE,BUCV2RGLMME,BUCV2TE,BUCV5RGLMME,...,MHE,OE,OHE,PBTE0001,PBTE001,PBTE01,RGLMME,SE,TE,WOEE
0,3,DTC,full,ACC,,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,,1.0,2.0,1.0,,1.0,1.0
1,3,DTC,full,AUC,1.0,,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
2,3,DTC,full,F1,1.0,,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,3,DTC,model,AUC,12.0,14.0,0.0,18.0,,17.0,...,2.0,9.0,5.0,7.0,,23.0,19.0,1.0,3.0,3.0
4,3,DTC,model,F1,12.0,13.0,0.0,18.0,6.0,,...,2.0,9.0,5.0,,11.0,23.0,19.0,1.0,3.0,


In [192]:
# Features are the factor columns; targets are all remaining (per-encoder rank) columns
X_train, y_train = df_train_pivot[factors], df_train_pivot.drop(columns=factors)
X_holdout, y_holdout = df_holdout_pivot[factors], df_holdout_pivot.drop(columns=factors)
# From here on `target` holds the list of encoder column names instead of the string "rank"
target = list(y_train.columns)

In [193]:
# Check for missing values: total NaN count over all cells of each frame.
# NaNs in the targets are encoder ranks absent from the respective split after pivoting.
print("Missing values in train data: ", X_train.isnull().sum().sum())
print("Missing values in target: ", y_train.isnull().sum().sum())
print("Missing values in holdout data: ", X_holdout.isnull().sum().sum())
print("Missing values in holdout target: ", y_holdout.isnull().sum().sum())

Missing values in train data:  0
Missing values in target:  8277
Missing values in holdout data:  0
Missing values in holdout target:  29813


In [194]:
# Fill missing ranks with the per-encoder median of the TRAIN targets, for both splits
# (np.max() of the target would be an alternative fill value)
train_target_median = y_train.median()
y_train = y_train.fillna(train_target_median)
y_holdout = y_holdout.fillna(train_target_median)

In [195]:
# Re-check for missing values after the median fill of the targets
print("Missing values in train data: ", X_train.isnull().sum().sum())
print("Missing values in target: ", y_train.isnull().sum().sum())
print("Missing values in holdout data: ", X_holdout.isnull().sum().sum())
print("Missing values in holdout target: ", y_holdout.isnull().sum().sum())

Missing values in train data:  0
Missing values in target:  0
Missing values in holdout data:  0
Missing values in holdout target:  0


# Preprocess data

In [196]:
# Save copies of the unprocessed factor columns (before encoding / scaling); they are
# reused later to build the CV indices and to melt the holdout predictions back to long form
X_train_original = X_train.copy()
X_holdout_original = X_holdout.copy()

In [197]:
%%time
# Encode categorical features
X_train, _ = ohe_encode_train_data(X_train=X_train, cols_to_encode=["model", "tuning", "scoring"], verbosity=1)
X_holdout, _ = ohe_encode_train_data(X_train=X_holdout, cols_to_encode=["model", "tuning", "scoring"], verbosity=1)
# Add meta information
X_train = add_dataset_meta_information(df=X_train,
                                        path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                        nan_threshold=0.5,
                                        replacing_strategy="median")
X_holdout = add_dataset_meta_information(df=X_holdout,
                                        path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                        nan_threshold=0.5,
                                        replacing_strategy="median")
# Drop correlated features
X_train, _ = drop_pearson_correlated_features(train_data=X_train, test_data=None, threshold=0.7, verbosity=1)
X_holdout, _ = drop_pearson_correlated_features(train_data=X_holdout, test_data=None, threshold=0.7, verbosity=1)
# Normalize
X_train, scaler = normalize_train_data(X_train=X_train, method="minmax", verbosity=1)
X_holdout, _ = normalize_train_data(X_train=X_holdout, method="minmax", verbosity=1)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...
One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...
Drop pearson correlated features with threshold 0.7...
Filter correlated features
Drop pearson correlated features with threshold 0.7...
Filter correlated features
Normalizing train data using method 'minmax' ...
Normalizing train data using method 'minmax' ...
CPU times: total: 125 ms
Wall time: 606 ms


In [198]:
# Inspect the preprocessed train features (one-hot factors plus dataset meta features)
print("Shape of train data: ", X_train.shape)
print("Columns of train data: ", X_train.columns)
X_train.head()

Shape of train data:  (1160, 40)
Columns of train data:  Index(['dataset', 'model_DTC', 'model_KNC', 'model_LGBMC', 'model_LR',
       'model_SVC', 'tuning_full', 'tuning_model', 'tuning_no', 'scoring_ACC',
       'scoring_AUC', 'scoring_F1', 'Quartile1KurtosisOfNumericAtts',
       'J48.001.ErrRate', 'Dimensionality', 'Quartile2MutualInformation',
       'MinSkewnessOfNumericAtts', 'Quartile2AttributeEntropy',
       'MinorityClassSize', 'MajorityClassPercentage',
       'Quartile2StdDevOfNumericAtts', 'NumberOfBinaryFeatures',
       'Quartile1MutualInformation', 'Quartile1MeansOfNumericAtts',
       'MaxMutualInformation', 'AutoCorrelation', 'PercentageOfBinaryFeatures',
       'MinKurtosisOfNumericAtts', 'DecisionStumpErrRate',
       'PercentageOfNumericFeatures', 'NumberOfSymbolicFeatures',
       'MinMutualInformation', 'PercentageOfInstancesWithMissingValues',
       'MinNominalAttDistinctValues', 'NumberOfNumericFeatures',
       'rows_with_null_values_count', 'categorical_tar

Unnamed: 0,dataset,model_DTC,model_KNC,model_LGBMC,model_LR,model_SVC,tuning_full,tuning_model,tuning_no,scoring_ACC,...,NumberOfSymbolicFeatures,MinMutualInformation,PercentageOfInstancesWithMissingValues,MinNominalAttDistinctValues,NumberOfNumericFeatures,rows_with_null_values_count,categorical_target_variables_count,non_categorical_target_variables_count,categorical_target_values_sum,min_number_of_categories_per_cat_feature
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152


# Train and Evaluate models

In [234]:
# Candidate models for multi-output regression (one rank column per encoder).
# Each base regressor is wrapped in MultiOutputRegressor so it can be fitted on all
# target columns at once.
base_regressors = {
    "Random Forest": RandomForestRegressor(random_state=random_state),
    "XGBoost": XGBRegressor(random_state=random_state),
    "LightGBM": LGBMRegressor(random_state=random_state),
    "KNN": KNeighborsRegressor(),
    "CatBoost": CatBoostRegressor(random_state=random_state, silent=True),
    "Decision Tree": DecisionTreeRegressor(random_state=random_state),
    "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
    # Add other regression models here if needed
}
models = {
    model_label: MultiOutputRegressor(base_regressor)
    for model_label, base_regressor in base_regressors.items()
}

## via cross validation

In [218]:
# Build the CV indices from the un-encoded factor columns joined with the (imputed) targets
cv_indices = er.custom_cross_validated_indices(
    pd.concat([X_train_original, y_train], axis=1),
    factors,
    target,
    n_splits=5,
    shuffle=True,
    random_state=random_state,
)

In [237]:
# Scorers handed to cross_validate; the MSE scorer is sign-flipped (greater_is_better=False)
# so that a larger score is better for both entries
scoring = dict(
    neg_mean_squared_error=make_scorer(mean_squared_error, greater_is_better=False),
    r2=make_scorer(r2_score),
)

In [240]:
%%time
# Train and evaluate models
cv_results = []
for i, (model_name, model) in enumerate(models.items()):
    print("Training model: ", model_name, f"({i+1}/{len(models)})", "...")

    # MinMax scale target if necessary (for ExtraTreesRegressor)
    if model_name == "ExtraTrees":
        scaler = MinMaxScaler()
        y_scaled = scaler.fit_transform(y_train)
        y_train = pd.DataFrame(y_scaled, columns=y_train.columns)

    # Perform CV
    results = cross_validate(estimator=model, X=X_train, y=y_train, cv=cv_indices, scoring=scoring, return_train_score=True, error_score="raise")
    cv_results.append(results)

    # Iterate through the provided scoring (list) in cv_results and print results
    for scorer in scoring:
        print(f"CV Training {scorer}: {round(results['train_' + scorer].mean(), 4)} "
              f"+/- {round(results['train_' + scorer].std(), 4)} ")
        print(f"CV Test {scorer}: {round(results['test_' + scorer].mean(), 4)} "
                f"+/- {round(results['test_' + scorer].std(), 4)}")
    print("")

Training model:  Random Forest

CV Training neg_mean_squared_error: -3.9901 +/- 0.042 
CV Test neg_mean_squared_error: -28.6935 +/- 0.8862
CV Training r2: 0.9245 +/- 0.0006 
CV Test r2: 0.4524 +/- 0.0103

Training model:  XGBoost

CV Training neg_mean_squared_error: -0.6032 +/- 0.0116 
CV Test neg_mean_squared_error: -33.4201 +/- 0.9938
CV Training r2: 0.9886 +/- 0.0002 
CV Test r2: 0.3603 +/- 0.0104

Training model:  LightGBM

CV Training neg_mean_squared_error: -9.7467 +/- 0.097 
CV Test neg_mean_squared_error: -29.9793 +/- 0.9563
CV Training r2: 0.8154 +/- 0.0017 
CV Test r2: 0.4283 +/- 0.0127

Training model:  KNN

CV Training neg_mean_squared_error: -36.6004 +/- 0.2284 
CV Test neg_mean_squared_error: -54.5189 +/- 0.6054
CV Training r2: 0.3132 +/- 0.0041 
CV Test r2: -0.0312 +/- 0.0203

Training model:  CatBoost

CV Training neg_mean_squared_error: -3.7631 +/- 0.0147 
CV Test neg_mean_squared_error: -29.4076 +/- 0.8775
CV Training r2: 0.9289 +/- 0.0002 
CV Test r2: 0.4375 +/- 0.01

## via custom average Spearman on holdout set

In [None]:
%%time
# Train and evaluate models
holdout_results = []
for i, (model_name, model) in enumerate(models.items()):
    print("Training model: ", model_name, f"({i+1}/{len(models)})", "...")

    # MinMax scale target if necessary (for ExtraTreesRegressor)
    if model_name == "ExtraTrees":
        scaler = MinMaxScaler()
        y_scaled = scaler.fit_transform(y_train)
        y_train = pd.DataFrame(y_scaled, columns=y_train.columns)

    # Train model on train set
    model.fit(X_train, y_train)

    # Predict on holdout set
    y_holdout_pred = pd.DataFrame(model.predict(X_holdout), columns=y_holdout.columns, index=X_holdout.index)

    # Evaluate predictions
    df_pred = pd.merge(pd.concat([X_holdout_original, y_holdout], axis=1).melt(id_vars=factors, value_name="rank").dropna(axis=0),
                       pd.concat([X_holdout_original, y_holdout_pred], axis=1).melt(id_vars=factors, value_name="pred_rank").dropna(axis=0),
                       on=factors+"encoder", how="left")
    print(test)

    # Get rankings
    rankings_holdout = er.get_rankings(df=df_pred, factors=factors, target="rank")
    rankings_holdout_pred = er.get_rankings(df=df_pred, factors=factors, target="pred_rank")

    # Custom average spearman
    spearman = er.custom_average_spearman(rankings_holdout, rankings_holdout_pred)
    holdout_results.append(spearman)

    # Print results
    print(f"Average Spearman: {round(spearman, 4)}")
    print("")

Training model:  Random Forest (1/7) ...


## compare results in plots

In [None]:
# One row per cross-validated model, in the iteration order of `models`; columns are the
# keys of the cross_validate result dicts, so each cell holds that model's per-fold values
df_cv_results = pd.DataFrame(cv_results)

In [None]:
# One row per model, in the iteration order of `models`, holding its average Spearman
# value on the holdout split (presumably a scalar, giving a single unnamed column — TODO confirm)
df_holdout_results = pd.DataFrame(holdout_results)