In [11]:
from IPython.display import display, HTML
# Inject a CSS rule into the notebook page that sets the `.container`
# element to 95% width.
display(
    HTML("<style>.container { width:95% !important; }</style>")
)

In [21]:
import numpy as np
import pandas as pd

import sys
sys.path.append("../..")

from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

import src.evaluate_regression as er
import src.load_datasets as ld

In [22]:
# Data directory constant. NOTE(review): none of the cells visible in this
# section reference it (the loader below is given a literal path) — confirm
# whether it is still needed.
DATA_DIR = Path("./data")

# Columns that identify one experimental configuration. They are used as the
# pivot index and as the id columns when melting predictions back to long form.
factors = [
    "dataset",
    "model",
    "tuning",
    "scoring",
]

# `new_index` is handed to er.get_rankings further down; `target` starts out
# as the name of the rank column and is later rebound to a list of columns.
new_index, target = "encoder", "rank"

In [23]:
# Read the training rankings through the project loader.
df_train = ld.load_dataset("../../data/raw/dataset_rank_train.csv")

# Discard the `cv_score` column when the file provides it; a missing column
# is silently tolerated, exactly like the explicit membership check would.
df_train = df_train.drop(columns=["cv_score"], errors="ignore")

Loading data from '../../data/raw/dataset_rank_train.csv' ...


In [24]:
# Reshape to wide form: one row per combination of `factors`, one column per
# encoder, cell value = that encoder's rank. The factors come back as regular
# columns via reset_index().
# NOTE: this overwrites `df_train`, so the cell cannot be re-run on its own —
# after the pivot the "encoder" and "rank" columns no longer exist. Re-run the
# loading cell first.
df_train = (
    df_train
    .pivot(index=factors, columns="encoder", values="rank")
    .reset_index()
)
df_train.head()

encoder,dataset,model,tuning,scoring,BE,BUCV10RGLMME,BUCV10TE,BUCV2RGLMME,BUCV2TE,BUCV5RGLMME,...,MHE,OE,OHE,PBTE0001,PBTE001,PBTE01,RGLMME,SE,TE,WOEE
0,3,DTC,full,ACC,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
1,3,DTC,full,AUC,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
2,3,DTC,full,F1,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,3,DTC,model,AUC,12.0,14.0,0.0,18.0,6.0,17.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0
4,3,DTC,model,F1,12.0,13.0,0.0,18.0,6.0,16.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0


In [25]:
# Feature matrix: only the configuration-identifying columns.
X_train = df_train.loc[:, factors]
X_train.head()

encoder,dataset,model,tuning,scoring
0,3,DTC,full,ACC
1,3,DTC,full,AUC
2,3,DTC,full,F1
3,3,DTC,model,AUC
4,3,DTC,model,F1


In [26]:
# Multi-output target: every column that is not a factor, i.e. one rank
# column per encoder.
y_train = df_train.drop(columns=factors)

# Quick sanity check of the target's dimensions and column labels.
print(y_train.shape)
print(y_train.columns)

y_train.head()

(1161, 32)
Index(['BE', 'BUCV10RGLMME', 'BUCV10TE', 'BUCV2RGLMME', 'BUCV2TE',
       'BUCV5RGLMME', 'BUCV5TE', 'CBE', 'CE', 'CV10RGLMME', 'CV10TE',
       'CV2RGLMME', 'CV2TE', 'CV5RGLMME', 'CV5TE', 'DE', 'DTEM10', 'DTEM2',
       'DTEM5', 'ME01E', 'ME10E', 'ME1E', 'MHE', 'OE', 'OHE', 'PBTE0001',
       'PBTE001', 'PBTE01', 'RGLMME', 'SE', 'TE', 'WOEE'],
      dtype='object', name='encoder')


encoder,BE,BUCV10RGLMME,BUCV10TE,BUCV2RGLMME,BUCV2TE,BUCV5RGLMME,BUCV5TE,CBE,CE,CV10RGLMME,...,MHE,OE,OHE,PBTE0001,PBTE001,PBTE01,RGLMME,SE,TE,WOEE
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,12.0,14.0,0.0,18.0,6.0,17.0,4.0,24.0,8.0,16.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0
4,12.0,13.0,0.0,18.0,6.0,16.0,4.0,25.0,8.0,15.0,...,2.0,9.0,5.0,7.0,11.0,23.0,19.0,1.0,3.0,3.0


In [27]:
# Replace missing ranks with the largest rank observed for the same encoder
# column.
# The previous `np.max(y_train)` forwarded `axis=None` to DataFrame.max, whose
# meaning changed in pandas 2.0: per-column maxima before, a single global
# scalar afterwards. Spelling out `axis=0` makes the fill value independent of
# the installed pandas version.
# NOTE(review): this pins the pre-2.0 (per-column) behaviour — confirm that a
# per-encoder fill, rather than the global maximum, is what is intended.
# NOTE(review): the fill uses all rows, i.e. it happens before the CV split
# below — confirm this is acceptable for the evaluation.
y_train = y_train.fillna(y_train.max(axis=0))

In [28]:
# From here on `target` holds the list of encoder rank columns (it was the
# single column name "rank" in the configuration cell).
target = y_train.columns.tolist()

# Build the fold indices with the project helper. It receives the features
# and targets side by side plus the factor/target column names; the
# cross-validation loop consumes the result as (train, test) position pairs.
cv_indices = er.custom_cross_validated_indices(
    pd.concat([X_train, y_train], axis=1),
    factors,
    target,
    n_splits=5,
    shuffle=True,
    random_state=1444,
)

In [29]:
# Cross-validated baseline: predict every encoder's rank from the one-hot
# encoded factors with a single multi-output decision tree, then score each
# fold by the average Spearman correlation between true and predicted rankings.
scores = []

# The pipeline is re-fitted from scratch on every fold by `fit`.
dummy_pipe = Pipeline([("encoder", OneHotEncoder()), ("model", DecisionTreeClassifier(random_state=43))])

for fold in cv_indices:
    # Each fold carries positional indices: [0] = training rows, [1] = test rows.
    train_idx, test_idx = fold[0], fold[1]
    X_tr, X_te = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_te = y_train.iloc[train_idx], y_train.iloc[test_idx]

    dummy_pipe.fit(X_tr, y_tr)
    # Wrap the raw prediction array so it lines up with the test rows and
    # carries the encoder column labels.
    y_pred = pd.DataFrame(dummy_pipe.predict(X_te), columns=y_tr.columns, index=X_te.index)

    # Back to long form: one row per (factors, encoder). `var_name` is given
    # explicitly so the merge key below does not depend on the column-axis
    # name surviving the concat, and so it stays in sync with `new_index`,
    # which er.get_rankings receives as well.
    observed = (
        pd.concat([X_te, y_te], axis=1)
        .melt(id_vars=factors, var_name=new_index, value_name="rank")
        .dropna(axis=0)
    )
    predicted = (
        pd.concat([X_te, y_pred], axis=1)
        .melt(id_vars=factors, var_name=new_index, value_name="rank_pred")
    )
    # Left join keeps exactly the observed (non-missing) rows.
    df_pred = pd.merge(observed, predicted, on=factors + [new_index], how="left")

    rankings_test = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank")
    rankings_pred = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank_pred")
    scores.append(er.average_spearman(rankings_test, rankings_pred))

print(f"Average Spearman: {round(np.mean(scores), 4)} +/- {round(np.std(scores), 4)}")

Average Spearman: 0.6654 +/- 0.0318
