In [1]:
from IPython.display import display, HTML
# Stretch the notebook container to 95% of the window so wide tables stay readable.
wide_layout_css = "<style>.container { width:95% !important; }</style>"
display(HTML(wide_layout_css))

In [2]:
import os
import pandas as pd
import numpy as np
import math

import sys
sys.path.append("../..")
from src.load_datasets import load_dataset, load_rankings, load_train_data
import src.evaluate_regression

# From example
import src.encoder_utils as eu
import src.evaluate_regression as er
import src.load_datasets as ld
import src.pairwise_utils as pu

from category_encoders import OneHotEncoder
from pathlib import Path
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Classifiers
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

# Example

- With CV
- Without preprocessing

In [3]:
# Ranking setup:
#   factors   - columns used as grouping / merge keys
#   new_index - column whose values are ranked against each other
#   target    - column holding the rank label
factors = ["dataset", "model", "tuning", "scoring"]
new_index, target = "encoder", "rank"

In [4]:
# Read the raw training rankings; an optional "cv_score" column is discarded.
df_train = load_dataset("../../data/raw/dataset_rank_train.csv")
df_train = df_train.drop(columns="cv_score", errors="ignore")

# Provisional feature/label split on the target column(s).
y_train = df_train[target]
X_train = df_train.drop(columns=target)

Loading data from '../../data/raw/dataset_rank_train.csv' ...


In [5]:
# For pairwise methods: one row per unique combination of the factor columns,
# sorted by the factors and with a fresh 0..n-1 index.
# `groupby(...).size()` enumerates the groups directly, instead of calling a
# Python lambda once per group only to throw its NaN result away.
X_train = df_train.groupby(factors).size().reset_index()[factors]
print(X_train.shape)
X_train.head()

(1161, 4)


Unnamed: 0,dataset,model,tuning,scoring
0,3,DTC,full,ACC
1,3,DTC,full,AUC
2,3,DTC,full,F1
3,3,DTC,model,AUC
4,3,DTC,model,F1


In [6]:
# Pairwise target table built from the ranks in df_train.
pairwise_target = pu.get_pairwise_target(
    df_train, features=factors, target="rank", column_to_compare="encoder"
)

# Align the targets row-by-row with X_train (left join on the factors), keep only
# the pairwise columns and fill pairs without a value with 0.
y_train = (
    X_train.merge(pairwise_target, on=factors, how="left")
    .drop(columns=factors)
    .fillna(0)
)
print(y_train.shape)
y_train.head()

(1161, 992)


Unnamed: 0,"(BUCV2RGLMME, BUCV2TE)","(BUCV2TE, BUCV2RGLMME)","(BUCV2RGLMME, CBE)","(CBE, BUCV2RGLMME)","(BUCV2RGLMME, CE)","(CE, BUCV2RGLMME)","(BUCV2RGLMME, CV10RGLMME)","(CV10RGLMME, BUCV2RGLMME)","(BUCV2RGLMME, CV10TE)","(CV10TE, BUCV2RGLMME)",...,"(OHE, TE)","(TE, OHE)","(OHE, WOEE)","(WOEE, OHE)","(OHE, SE)","(SE, OHE)","(SE, TE)","(TE, SE)","(SE, WOEE)","(WOEE, SE)"
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [7]:
# From here on every pairwise-comparison column is a target.
target = y_train.columns.tolist()

# Five shuffled, seeded folds computed on the joined feature/target frame.
cv_indices = er.custom_cross_validated_indices(
    pd.concat([X_train, y_train], axis=1),
    factors,
    target,
    n_splits=5,
    shuffle=True,
    random_state=1444,
)

In [8]:
# Baseline: cross-validate a decision tree on the one-hot-encoded factors and
# print the average Spearman correlation of true vs. predicted rankings per fold.
for fold in cv_indices:
    # fold[0] holds the training positions, fold[1] the test positions.
    X_tr = X_train.iloc[fold[0]]
    X_te = X_train.iloc[fold[1]]
    y_tr = y_train.iloc[fold[0]]
    # No y_te needed: the true ranks are taken from df_train further down.

    # Seed the tree so the printed fold scores are reproducible across runs
    # (consistent with the seeded models used later in this notebook).
    dummy_pipe = Pipeline([("encoder", eu.NoY(OneHotEncoder())),
                           ("model", DecisionTreeClassifier(random_state=42))])
    y_pred = pd.DataFrame(dummy_pipe.fit(X_tr, y_tr).predict(X_te), columns=y_tr.columns, index=X_te.index)
    # In contrast to the example, merge with df_train.
    # It should give the same result, since an inner join is used and the other
    # side of the join consists of X_te only.
    tmp = pu.join_pairwise2rankings(X_te, y_pred, factors)
    df_pred = pd.merge(df_train,
                       tmp,
                       on=factors + ["encoder"], how="inner")

    rankings_test = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank")
    rankings_pred = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank_pred")
    print(er.average_spearman(rankings_test, rankings_pred))

0.7303527078715193
0.764568338388426
0.7072889052019953
0.7655881493418606
0.7204404002609067


# Example
 
- With CV
- With preprocessing

In [9]:
# Reset the ranking setup for this section:
#   factors   - columns used as grouping / merge keys
#   new_index - column whose values are ranked against each other
#   target    - column holding the rank label
factors = ["dataset", "model", "tuning", "scoring"]
new_index, target = "encoder", "rank"

In [10]:
# Reload the raw training rankings; an optional "cv_score" column is discarded.
df_train = load_dataset("../../data/raw/dataset_rank_train.csv")
df_train = df_train.drop(columns="cv_score", errors="ignore")

# Provisional feature/label split on the target column(s).
y_train = df_train[target]
X_train = df_train.drop(columns=target)

Loading data from '../../data/raw/dataset_rank_train.csv' ...


In [11]:
# For pairwise methods: one row per unique combination of the factor columns,
# sorted by the factors and with a fresh 0..n-1 index.
# `groupby(...).size()` enumerates the groups directly, instead of calling a
# Python lambda once per group only to throw its NaN result away.
X_train = df_train.groupby(factors).size().reset_index()[factors]
print(X_train.shape)
X_train.head()

(1161, 4)


Unnamed: 0,dataset,model,tuning,scoring
0,3,DTC,full,ACC
1,3,DTC,full,AUC
2,3,DTC,full,F1
3,3,DTC,model,AUC
4,3,DTC,model,F1


In [12]:
# Pairwise target table built from the ranks in df_train.
pairwise_target = pu.get_pairwise_target(
    df_train, features=factors, target="rank", column_to_compare="encoder"
)

# Align the targets row-by-row with X_train (left join on the factors), keep only
# the pairwise columns and fill pairs without a value with 0.
y_train = (
    X_train.merge(pairwise_target, on=factors, how="left")
    .drop(columns=factors)
    .fillna(0)
)
print(y_train.shape)
y_train.head()

(1161, 992)


Unnamed: 0,"(BUCV2RGLMME, BUCV2TE)","(BUCV2TE, BUCV2RGLMME)","(BUCV2RGLMME, CBE)","(CBE, BUCV2RGLMME)","(BUCV2RGLMME, CE)","(CE, BUCV2RGLMME)","(BUCV2RGLMME, CV10RGLMME)","(CV10RGLMME, BUCV2RGLMME)","(BUCV2RGLMME, CV10TE)","(CV10TE, BUCV2RGLMME)",...,"(OHE, TE)","(TE, OHE)","(OHE, WOEE)","(WOEE, OHE)","(OHE, SE)","(SE, OHE)","(SE, TE)","(TE, SE)","(SE, WOEE)","(WOEE, SE)"
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


### Preprocessing

- Do not preprocess (embed) the encoder feature, because it is not part of the input features in this task

In [13]:
import src.encoding
from src.feature_engineering import normalize_train_data, normalize_test_data
from src.meta_information import add_dataset_meta_information
from src.data_cleaning import drop_pearson_correlated_features

In [14]:
# Keep an independent copy of the full training frame for the later merges.
base_df = df_train.copy(deep=True)
print(base_df.columns)

Index(['dataset', 'model', 'tuning', 'scoring', 'encoder', 'rank'], dtype='object')


In [15]:
# OHE encoding 
X_train, ohe = src.encoding.ohe_encode_train_data(X_train=X_train,
                                                  cols_to_encode=["model", "tuning", "scoring"],
                                                  verbosity=2)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...


In [16]:
# Apply the encoder obtained from X_train to the full frame, so base_df_mod
# gets the same one-hot columns while keeping "encoder" and "rank".
base_df_mod = src.encoding.ohe_encode_test_data(
    X_test=base_df,
    cols_to_encode=["model", "tuning", "scoring"],
    ohe=ohe,
    verbosity=2,
)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the test data ...


In [17]:
# Enrich X_train with dataset meta information read from dataset_agg.csv.
# NOTE(review): presumably columns with more than 50% NaN are dropped and the
# remaining gaps filled with the median - confirm in src.meta_information.
X_train = add_dataset_meta_information(
    df=X_train,
    path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
    nan_threshold=0.5,
    replacing_strategy="median",
)

In [18]:
#cols = list(base_df_mod.columns)
#X_train, scaler = normalize_train_data(X_train=X_train[cols], 
#                                       method="minmax",
#                                       verbosity=2)

In [19]:
#x_test_scaled = scaler.transform(base_df_mod)
## Transform back to pandas DataFrame
#base_df_mod_scaled = pd.DataFrame(x_test_scaled, columns=base_df_mod.columns, index=base_df_mod.index)
#base_df_mod_scaled.head()

In [20]:
#base_df_mod_scaled.head()

In [21]:
# Filter Pearson-correlated features with threshold 0.7. X_train is passed as
# both the train and the test frame; only the first returned frame is kept.
X_train, _ = drop_pearson_correlated_features(
    train_data=X_train,
    test_data=X_train,
    threshold=0.7,
    verbosity=2,
)

Drop pearson correlated features with threshold 0.7...
Filter correlated features


In [22]:
# Show the feature columns that remain after preprocessing.
print(X_train.columns.tolist())

['dataset', 'model_DTC', 'model_KNC', 'model_LGBMC', 'model_LR', 'model_SVC', 'tuning_full', 'tuning_model', 'tuning_no', 'scoring_ACC', 'scoring_AUC', 'scoring_F1', 'Quartile1KurtosisOfNumericAtts', 'J48.001.ErrRate', 'Dimensionality', 'Quartile2MutualInformation', 'MinSkewnessOfNumericAtts', 'Quartile2AttributeEntropy', 'MinorityClassSize', 'MajorityClassPercentage', 'Quartile2StdDevOfNumericAtts', 'NumberOfBinaryFeatures', 'Quartile1MutualInformation', 'Quartile1MeansOfNumericAtts', 'MaxMutualInformation', 'AutoCorrelation', 'PercentageOfBinaryFeatures', 'MinKurtosisOfNumericAtts', 'DecisionStumpErrRate', 'PercentageOfNumericFeatures', 'NumberOfSymbolicFeatures', 'MinMutualInformation', 'PercentageOfInstancesWithMissingValues', 'MinNominalAttDistinctValues', 'NumberOfNumericFeatures', 'rows_with_null_values_count', 'categorical_target_variables_count', 'non_categorical_target_variables_count', 'categorical_target_values_sum', 'min_number_of_categories_per_cat_feature']


In [23]:
# Show the columns of the one-hot-encoded full frame for comparison.
print(base_df_mod.columns.tolist())

['dataset', 'encoder', 'rank', 'model_DTC', 'model_KNC', 'model_LGBMC', 'model_LR', 'model_SVC', 'tuning_full', 'tuning_model', 'tuning_no', 'scoring_ACC', 'scoring_AUC', 'scoring_F1']


In [24]:
# Merge/grouping keys for the preprocessed data: every column of base_df_mod
# except the label ("rank") and the item being ranked ("encoder").
# Derived from the frame itself; the previous hard-coded list was overwritten
# immediately and therefore never used.
new_factors = list(base_df_mod.columns)
new_factors.remove("rank")     # raises ValueError if the column is missing
new_factors.remove("encoder")  # raises ValueError if the column is missing

### Predict with CV

In [25]:
# Targets are all pairwise columns; factors are now all preprocessed feature columns.
target = y_train.columns.tolist()
factors = X_train.columns.tolist()

# Five shuffled, seeded folds computed on the joined feature/target frame.
cv_indices = er.custom_cross_validated_indices(
    pd.concat([X_train, y_train], axis=1),
    factors,
    target,
    n_splits=5,
    shuffle=True,
    random_state=1444,
)

In [None]:
# Candidate tree-based classifiers, all with a fixed random_state.
models = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=-1),
    ExtraTreeClassifier(random_state=42),
    ExtraTreesClassifier(random_state=42, n_jobs=-1),
]

# For every model: run the cross-validation, collect the average Spearman
# correlation per fold and report mean +/- standard deviation over the folds.
for model in models:
    scores = []
    for fold in cv_indices:
        # fold[0] holds the training positions, fold[1] the test positions.
        train_idx, test_idx = fold[0], fold[1]
        X_tr, y_tr = X_train.iloc[train_idx].copy(), y_train.iloc[train_idx].copy()
        X_te, y_te = X_train.iloc[test_idx].copy(), y_train.iloc[test_idx].copy()

        # The scaler sits inside the pipeline, so it is fitted on the training fold only.
        pipeline = Pipeline([("scaler", MinMaxScaler()), ("model", model)])
        pipeline.fit(X_tr, y_tr)
        y_pred = pd.DataFrame(pipeline.predict(X_te), columns=y_tr.columns, index=X_te.index)

        # Attach the true ranks from base_df_mod to the predictions. The inner
        # join should keep only the rows that belong to X_te.
        tmp = pu.join_pairwise2rankings(X_te, y_pred, factors)
        df_pred = base_df_mod.merge(tmp, on=new_factors + ["encoder"], how="inner")

        rankings_test, rankings_pred = (
            er.get_rankings(df_pred, factors=new_factors, new_index=new_index, target=rank_col)
            for rank_col in ("rank", "rank_pred")
        )
        scores.append(er.average_spearman(rankings_test, rankings_pred))

    # Print scores
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(model)
    print(f"Avg Spearman: {round(mean_score, 4)} +/- {round(std_score, 4)}")

DecisionTreeClassifier(random_state=42)
Avg Spearman: 0.7425 +/- 0.0065
RandomForestClassifier(n_jobs=-1, random_state=42)
Avg Spearman: 0.7113 +/- 0.0195
ExtraTreeClassifier(random_state=42)
Avg Spearman: 0.5618 +/- 0.0156
