In [44]:
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor

import src.evaluate_regression as er
import src.load_datasets as ld

from src.encoding import ohe_encode_train_data
from meta_information import add_dataset_meta_information
from feature_engineering import normalize_train_data
from data_cleaning import drop_pearson_correlated_features


# Load Data

In [45]:
# Notebook-wide configuration.

# NOTE(review): DATA_DIR is not referenced by the cells visible in this review;
# the load / meta-information cells use hard-coded "../../data/..." paths instead.
DATA_DIR = Path("./data")

# Columns that identify one experiment; used as the pivot index and as the
# id columns when melting back to long format.
factors = [
    "dataset",
    "model",
    "tuning",
    "scoring",
]

# Name of the column holding the encoder label (passed to er.get_rankings).
new_index = "encoder"

# NOTE(review): `target` is rebound to a list of column names in a later cell.
target = "rank"

In [46]:
# Read the raw training table through the project loader (src.load_datasets).
df_train = ld.load_dataset("../../data/raw/dataset_rank_train.csv")

# Discard `cv_score` when the file provides it; a file without that column
# passes through unchanged.
df_train = df_train.drop(columns="cv_score", errors="ignore")

Loading data from '../../data/raw/dataset_rank_train.csv' ...


# Adapt to schema of the current week

In [47]:
# Reshape long -> wide: one row per `factors` combination, one column per
# encoder, cell value = rank. `reset_index` turns the factors back into columns.
# NOTE: this rebinds `df_train`; re-running the cell without re-running the load
# cell fails because the "encoder"/"rank" columns no longer exist afterwards.
df_train = (
    df_train
    .pivot(index=factors, columns="encoder", values="rank")
    .reset_index()
)
df_train.head()

encoder,dataset,model,tuning,scoring,BE,BUCV10RGLMME,BUCV10TE,BUCV2RGLMME,BUCV2TE,BUCV5RGLMME,...,MHE,OE,OHE,PBTE0001,PBTE001,PBTE01,RGLMME,SE,TE,WOEE
0,3,DTC,full,ACC,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
1,3,DTC,full,AUC,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
2,3,DTC,full,F1,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,3,DTC,model,AUC,12.0,14.0,0.0,18.0,6.0,17.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0
4,3,DTC,model,F1,12.0,13.0,0.0,18.0,6.0,16.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0


In [48]:
# Feature frame: only the identifying factor columns.
X_train = df_train[factors]

# Independent snapshot of the raw factors, taken before any encoding, so the
# original labels are still available when X_train is transformed later.
X_train_org = X_train.copy(deep=True)

X_train.head()

encoder,dataset,model,tuning,scoring
0,3,DTC,full,ACC
1,3,DTC,full,AUC
2,3,DTC,full,F1
3,3,DTC,model,AUC
4,3,DTC,model,F1


In [49]:
# Multi-output target: everything that is not a factor column, i.e. one rank
# column per encoder.
y_train = df_train.drop(columns=factors)

# Quick sanity check of the target layout (shape first, then column labels).
print(y_train.shape, y_train.columns, sep="\n")

y_train.head()

(1161, 32)
Index(['BE', 'BUCV10RGLMME', 'BUCV10TE', 'BUCV2RGLMME', 'BUCV2TE',
       'BUCV5RGLMME', 'BUCV5TE', 'CBE', 'CE', 'CV10RGLMME', 'CV10TE',
       'CV2RGLMME', 'CV2TE', 'CV5RGLMME', 'CV5TE', 'DE', 'DTEM10', 'DTEM2',
       'DTEM5', 'ME01E', 'ME10E', 'ME1E', 'MHE', 'OE', 'OHE', 'PBTE0001',
       'PBTE001', 'PBTE01', 'RGLMME', 'SE', 'TE', 'WOEE'],
      dtype='object', name='encoder')


encoder,BE,BUCV10RGLMME,BUCV10TE,BUCV2RGLMME,BUCV2TE,BUCV5RGLMME,BUCV5TE,CBE,CE,CV10RGLMME,...,MHE,OE,OHE,PBTE0001,PBTE001,PBTE01,RGLMME,SE,TE,WOEE
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,12.0,14.0,0.0,18.0,6.0,17.0,4.0,24.0,8.0,16.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0
4,12.0,13.0,0.0,18.0,6.0,16.0,4.0,25.0,8.0,15.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0


In [50]:
# From here on `target` is the list of encoder rank columns (it replaces the
# scalar "rank" set in the configuration cell).
target = y_train.columns.tolist()

# Build the fold indices with the project's custom splitter on the combined
# feature/target frame; the fixed random_state keeps the folds reproducible.
cv_indices = er.custom_cross_validated_indices(
    pd.concat([X_train, y_train], axis="columns"),
    factors,
    target,
    n_splits=5,
    shuffle=True,
    random_state=1444,
)

# Preprocess Train Data

In [51]:
# Build the model features from the raw factor snapshot.
#
# The pipeline always starts from a copy of `X_train_org` (identical to the
# un-encoded X_train on a linear run) instead of from `X_train` itself, so this
# cell can be re-run without trying to encode columns that a previous run
# already replaced.
#
# NOTE(review): the correlation filter and the scaler are fitted on all rows
# before the cross-validation split further down, so held-out folds influence
# the preprocessing - confirm this is acceptable for the reported score.

# One-hot encode the categorical factors (second return value is not needed here).
X_train, _ = ohe_encode_train_data(X_train=X_train_org.copy(),
                                   cols_to_encode=["model", "tuning", "scoring"],
                                   verbosity=1)

# Add per-dataset meta information from the pre-aggregated meta table.
X_train = add_dataset_meta_information(df=X_train,
                                        path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                        nan_threshold=0.5,
                                        replacing_strategy="median")

# Drop correlated features
X_train, _ = drop_pearson_correlated_features(train_data=X_train, test_data=None, threshold=0.7, verbosity=1)

# Normalize
X_train, scaler = normalize_train_data(X_train=X_train, method="minmax", verbosity=1)

X_train.head(5)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...
Drop pearson correlated features with threshold 0.7...
Filter correlated features
Normalizing train data using method 'minmax' ...


Unnamed: 0,dataset,model_DTC,model_KNC,model_LGBMC,model_LR,model_SVC,tuning_full,tuning_model,tuning_no,scoring_ACC,...,NumberOfSymbolicFeatures,MinMutualInformation,PercentageOfInstancesWithMissingValues,MinNominalAttDistinctValues,NumberOfNumericFeatures,rows_with_null_values_count,categorical_target_variables_count,non_categorical_target_variables_count,categorical_target_values_sum,min_number_of_categories_per_cat_feature
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.270073,4.793819e-08,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015152


In [52]:
# Fill missing ranks with the maximum rank observed in the same encoder column.
#
# The previous `np.max(y_train)` dispatched to `DataFrame.max(axis=None)`, which
# emits a FutureWarning on older pandas and returns a single scalar on newer
# pandas - silently changing the fill from per-column to global maximum.
# Reducing explicitly along axis 0 keeps the per-column behaviour on every version.
y_train = y_train.fillna(y_train.max(axis=0))

  return reduction(axis=axis, out=out, **passkwargs)


# Evaluate using custom CV

In [53]:
# Cross-validated evaluation: for every fold, fit the regressor on the training
# rows, predict the per-encoder ranks of the held-out rows, and score the
# predicted against the true rankings with the project's average Spearman metric.
scores = []
dummy_pipe = Pipeline([
    ("model", ExtraTreesRegressor(random_state=43))
])

for fold in cv_indices:
    # Each fold holds positional indices: train rows first, test rows second.
    train_idx, test_idx = fold[0], fold[1]

    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_te, y_te = X_train.iloc[test_idx], y_train.iloc[test_idx]

    # Raw (un-encoded) factors of the held-out rows; needed to rebuild the
    # long format expected by er.get_rankings.
    X_train_org_te = X_train_org.iloc[test_idx]

    # Pipeline.fit refits from scratch, so reusing one pipeline across folds is safe.
    y_pred = pd.DataFrame(
        dummy_pipe.fit(X_tr, y_tr).predict(X_te),
        columns=y_tr.columns,
        index=X_te.index,
    )

    # Wide -> long. `var_name` is passed explicitly so the encoder column is
    # named `new_index` even if the columns' name is lost along the way.
    # NOTE(review): y_train was NaN-filled in an earlier cell, so this dropna
    # presumably removes few or no rows - confirm whether scoring should use
    # the unfilled ranks instead.
    df_true = (
        pd.concat([X_train_org_te, y_te], axis=1)
        .melt(id_vars=factors, var_name=new_index, value_name="rank")
        .dropna(axis=0)
    )
    df_hat = (
        pd.concat([X_train_org_te, y_pred], axis=1)
        .melt(id_vars=factors, var_name=new_index, value_name="rank_pred")
    )

    # Left join keeps exactly the rows that have a true rank.
    df_pred = pd.merge(df_true, df_hat, on=factors + [new_index], how="left")

    rankings_test = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank")
    rankings_pred = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank_pred")
    scores.append(er.average_spearman(rankings_test, rankings_pred))

print(f"Average Spearman: {round(np.mean(scores), 4)} +/- {round(np.std(scores), 4)}")

Average Spearman: 0.7587 +/- 0.0219
