In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [9]:
# Imports
import pandas as pd

from category_encoders import OneHotEncoder
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor


import xgboost

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

In [10]:
import sys
sys.path.append("../..")

from src.load_datasets import (
    load_dataset, 
    load_rankings, 
    load_train_data
)

from src.evaluate_regression import (
    get_rankings, 
    average_spearman, 
    custom_train_test_split
)

import src.utils
import src.load_datasets
import src.modelling
import src.mlflow_registry
import src.encoding
import src.evaluate_regression
from src.feature_engineering import normalize_train_data, normalize_test_data
from src.meta_information import add_dataset_meta_information
from src.evaluate_regression import custom_spearmanr_scorer

# Example code

Leave tuning out. 

In [14]:
factors = ["dataset", "model", "tuning", "scoring"]
new_index = "encoder"
target = "cv_score"
DATA_DIR = Path("../../data/raw/")


# ---- ... or split a dataset
df_train = load_dataset(DATA_DIR / "dataset_train.csv")
X_train, X_test, y_train, y_test = custom_train_test_split(df_train, factors, target)


# ---- predict ...
dummy_pipe = Pipeline([("encoder", OneHotEncoder()), ("model", DecisionTreeRegressor())])
y_pred = pd.Series(dummy_pipe.fit(X_train, y_train).predict(X_test), index=y_test.index, name="cv_score_pred")
df_pred = pd.concat([X_test, y_test, y_pred], axis=1)

# ---- convert to rankings and evaluate
rankings_test = get_rankings(df_pred, factors=factors, new_index=new_index, target="cv_score")
rankings_pred = get_rankings(df_pred, factors=factors, new_index=new_index, target="cv_score_pred")
print(average_spearman(rankings_test, rankings_pred))

Loading data ...
0.48754998890189793


In [6]:
# View the dataframes
df_pred.head()

Unnamed: 0,dataset,model,tuning,scoring,encoder,cv_score,cv_score_pred
0,43098,KNC,full,AUC,BE,0.701196,0.693207
1,43098,KNC,full,AUC,BUCV10RGLMME,0.849776,0.858937
2,43098,KNC,full,AUC,BUCV10TE,0.84667,0.844146
3,43098,KNC,full,AUC,BUCV2RGLMME,0.853203,0.860167
4,43098,KNC,full,AUC,BUCV2TE,0.866123,0.859091


In [7]:
rankings_test.head()

Unnamed: 0_level_0,3,3,3,3,3,3,3,29,29,29,...,43900,43900,43900,43922,43922,43922,43922,43922,43922,43922
Unnamed: 0_level_1,DTC,DTC,DTC,KNC,LGBMC,LR,LR,DTC,DTC,KNC,...,LGBMC,LR,LR,DTC,KNC,KNC,KNC,LGBMC,SVC,SVC
Unnamed: 0_level_2,full,no,no,full,no,full,no,full,model,full,...,no,model,no,full,full,full,no,no,no,no
Unnamed: 0_level_3,AUC,ACC,AUC,F1,F1,F1,F1,ACC,ACC,F1,...,ACC,ACC,AUC,AUC,ACC,F1,ACC,F1,ACC,F1
encoder,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
BE,1,1,1,22,11,17,5,4,6,7,...,9,3.0,28,10,0,0,0,2,4,4
BUCV10RGLMME,1,1,1,17,3,23,23,7,9,5,...,18,19.0,3,14,17,18,14,13,21,21
BUCV10TE,1,1,1,10,1,25,24,6,10,9,...,19,25.0,1,12,12,12,13,11,12,12
BUCV2RGLMME,1,1,1,16,14,14,13,9,8,13,...,21,14.0,22,15,8,8,9,16,7,8
BUCV2TE,1,1,1,11,7,16,6,8,8,11,...,17,15.0,17,12,9,9,8,12,8,7


# With our preprocessing

In [15]:
verbosity=2

# Make a copy of the original datasets
X_train_original = X_train.copy()
X_test_original = X_test.copy()
y_train_original = y_train.copy()
y_test_original = y_test.copy()

In [16]:
# Preprocess data as in main.py
X_train, ohe = src.encoding.ohe_encode_train_data(X_train=X_train,
                                                  cols_to_encode=["model", "tuning", "scoring"],
                                                  verbosity=verbosity)
X_test = src.encoding.ohe_encode_test_data(X_test=X_test,
                                           cols_to_encode=["model", "tuning", "scoring"],
                                           ohe=ohe, 
                                           verbosity=verbosity)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...
One Hot Encoding the features ['model', 'tuning', 'scoring'] of the test data ...


In [18]:
# Encoder encoding: Poincare Embeddings for feature "encoder"
X_train, _ = src.encoding.poincare_encoding(path_to_graph="../../data/raw/graph.adjlist",
                                            path_to_embeddings="../../data/preprocessed/embeddings.csv",
                                            data=X_train,
                                            column_to_encode="encoder",
                                            encode_dim=50,
                                            explode_dim=True,
                                            epochs=5000,
                                            dim_reduction=None,
                                            verbosity=verbosity)
X_test, _ = src.encoding.poincare_encoding(path_to_embeddings="../../data/preprocessed/embeddings.csv",
                                           data=X_test,
                                           column_to_encode="encoder",
                                           explode_dim=True,
                                           verbosity=verbosity)

(Poincare) Embedding the graph ...
Saving the embeddings to '../../data/preprocessed/embeddings.csv'...
Encoding the data feature 'encoder'...
Loading the embeddings from '../../data/preprocessed/embeddings.csv'...
Encoding the data feature 'encoder'...


In [19]:
# Add meta information
X_train = add_dataset_meta_information(df=X_train,
                                       path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                       nan_threshold=0.4,
                                       replacing_strategy="median")
X_test = add_dataset_meta_information(df=X_test,
                                      path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                      nan_threshold=0.4,
                                      replacing_strategy="median")

In [20]:
# Normalization
X_train, scaler = normalize_train_data(X_train=X_train, 
                                       method="minmax",
                                       verbosity=verbosity)
X_test = normalize_test_data(X_test=X_test, 
                             scaler=scaler, 
                             verbosity=verbosity)

Normalizing train data using method 'minmax' ...
Normalizing test data ...


### Training

In [21]:
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

### Evaluation

In [24]:
# Predict
y_pred = pd.Series(model.predict(X_test), index=y_test.index, name=str(y_test.name) + "_pred")
df_pred = pd.concat([X_test, y_test, y_pred], axis=1)

In [27]:
df_pred.head()

Unnamed: 0,dataset,model_DTC,model_KNC,model_LGBMC,model_LR,model_SVC,tuning_full,tuning_model,tuning_no,scoring_ACC,...,sum_of_all_categories,categorical_target_variables_count,non_categorical_target_variables_count,categorical_target_values_sum,total_feature_count,min_number_of_categories_per_cat_feature,max_number_of_categories_per_cat_feature,avg_number_of_categories_per_cat_feature,cv_score,cv_score_pred
0,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000168,0.0,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.701196,0.699061
1,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000168,0.0,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.849776,0.855542
2,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000168,0.0,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.84667,0.848686
3,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000168,0.0,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.853203,0.856451
4,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000168,0.0,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.866123,0.857787


In [35]:
# Map embeddings to encoder names for new index
## Load embeddings
emb_df = pd.read_csv("../../data/preprocessed/embeddings.csv", index_col=0)
## Rename
emb_df = emb_df.rename(columns={col: f"enc_dim_{col}" for col in emb_df.columns})
emb_df = emb_df.reset_index()
emb_df = emb_df.rename(columns={"index": "encoder"})
## Get all columns that start with enc_dim
enc_dim_cols = [col for col in emb_df.columns if col.startswith("enc_dim")]

In [29]:
emb_df.head()

Unnamed: 0,encoder,enc_dim_0,enc_dim_1,enc_dim_2,enc_dim_3,enc_dim_4,enc_dim_5,enc_dim_6,enc_dim_7,enc_dim_8,...,enc_dim_40,enc_dim_41,enc_dim_42,enc_dim_43,enc_dim_44,enc_dim_45,enc_dim_46,enc_dim_47,enc_dim_48,enc_dim_49
0,BE,0.01705,0.093248,0.17722,-0.010868,-0.021274,-0.107334,-0.09287,0.065657,0.127629,...,-0.028807,0.107018,-0.029541,0.259276,-0.19539,-0.103919,-0.003631,0.049735,-0.029539,0.291861
1,identifier,0.016755,0.090748,0.172885,-0.010455,-0.020559,-0.105024,-0.090359,0.063837,0.124562,...,-0.028155,0.104527,-0.028626,0.252764,-0.19081,-0.101375,-0.003598,0.04851,-0.028597,0.284649
2,BUCV10RGLMME,0.23211,-0.063714,-0.13136,-0.11745,-0.124077,0.120489,0.057724,0.317785,0.091357,...,-0.021935,-0.084162,-0.047769,-0.075806,0.16857,0.062137,-0.041558,-0.218176,-0.053083,0.114062
3,bucvglmm,0.208816,-0.056574,-0.119417,-0.106062,-0.112705,0.109952,0.052109,0.287805,0.082281,...,-0.018999,-0.076168,-0.041604,-0.067459,0.153232,0.054975,-0.036123,-0.196416,-0.048845,0.102078
4,BUCV10TE,0.032747,-0.004217,-0.115573,0.029377,0.095292,0.140222,-0.26192,-0.144816,-0.193528,...,-0.073051,-0.060565,-0.077507,-0.111169,0.168129,0.248395,-0.076465,-0.163055,0.060301,-0.236783


In [38]:
print(list(df_pred.columns))

['dataset', 'model_DTC', 'model_KNC', 'model_LGBMC', 'model_LR', 'model_SVC', 'tuning_full', 'tuning_model', 'tuning_no', 'scoring_ACC', 'scoring_AUC', 'scoring_F1', 'enc_dim_0', 'enc_dim_1', 'enc_dim_2', 'enc_dim_3', 'enc_dim_4', 'enc_dim_5', 'enc_dim_6', 'enc_dim_7', 'enc_dim_8', 'enc_dim_9', 'enc_dim_10', 'enc_dim_11', 'enc_dim_12', 'enc_dim_13', 'enc_dim_14', 'enc_dim_15', 'enc_dim_16', 'enc_dim_17', 'enc_dim_18', 'enc_dim_19', 'enc_dim_20', 'enc_dim_21', 'enc_dim_22', 'enc_dim_23', 'enc_dim_24', 'enc_dim_25', 'enc_dim_26', 'enc_dim_27', 'enc_dim_28', 'enc_dim_29', 'enc_dim_30', 'enc_dim_31', 'enc_dim_32', 'enc_dim_33', 'enc_dim_34', 'enc_dim_35', 'enc_dim_36', 'enc_dim_37', 'enc_dim_38', 'enc_dim_39', 'enc_dim_40', 'enc_dim_41', 'enc_dim_42', 'enc_dim_43', 'enc_dim_44', 'enc_dim_45', 'enc_dim_46', 'enc_dim_47', 'enc_dim_48', 'enc_dim_49', 'Dimensionality', 'Quartile2MutualInformation', 'Quartile2AttributeEntropy', 'MinorityClassSize', 'MajorityClassPercentage', 'NumberOfBinaryFeat

In [36]:
## Merge embeddings with df_pred
df_pred = df_pred.merge(emb_df, on=enc_dim_cols, how="left")

In [41]:
df_pred.head()

Unnamed: 0,dataset,model_DTC,model_KNC,model_LGBMC,model_LR,model_SVC,tuning_full,tuning_model,tuning_no,scoring_ACC,...,non_categorical_target_variables_count,categorical_target_values_sum,total_feature_count,min_number_of_categories_per_cat_feature,max_number_of_categories_per_cat_feature,avg_number_of_categories_per_cat_feature,cv_score,cv_score_pred,encoder_x,encoder_y
0,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.701196,0.699061,,
1,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.849776,0.855542,,
2,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.84667,0.848686,,
3,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.853203,0.856451,,
4,0.981238,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.004237,0.015152,0.000183,0.0004,0.866123,0.857787,,


In [43]:
unique_enc_dim_0 = list(emb_df.enc_dim_0.unique())
print(unique_enc_dim_0)

[0.0170499400517017, 0.0167548429572441, 0.2321096669278031, 0.2088156034225758, 0.0327467560786919, 0.0285859119833495, 0.2076704535524305, 0.0268558694245765, 0.2064133008505325, 0.0277798437850369, -0.0562227111928571, -0.0203100247944811, 0.0326221122814797, 0.0204827210990722, 0.0256723576441103, 0.0237935907985766, -0.2039062151959368, -0.1801794921527345, 0.0238655351443312, -0.1789837259549236, 0.0254938827052261, -0.1809019422653849, 0.0180096808033443, 0.0021002347173976, 0.0001371324713022, 0.0006660540880986, 0.0004468946197373, -0.0135613343631988, -0.0102144911551387, -0.0097233848285265, -0.0091866916912812, 0.019684380597816, 0.0166176335887179, 0.0166400525730744, -0.0376485765393947, -0.0250114300168241, -0.0243606701781919, -0.0252379708311986, 0.1184618760403427, 0.046265507774838, 0.0196356426159539, -0.1335613671169362, -0.042309789169428, -0.0179266735443363, 0.0001188921971814]


In [46]:
df_pred.iloc[0].enc_dim_0 in unique_enc_dim_0

False

The values of the embedding df and of the predictions df are different. Therefore, the mapping fails. 

Furthermore, the path to the embeddings in the ```custom_spearmanr_scorer``` is fix and does not refer to the flexible path in the config file. 
I do not know if the issue with the path can be fixed that easily. 
So at first get the function running. 

In [None]:
def custom_spearmanr_scorer(clf, X, y, **kwargs):
    # Predict
    y_pred = pd.Series(clf.predict(X), index=y.index, name=str(y.name) + "_pred")
    df_pred = pd.concat([X, y, y_pred], axis=1)
    # Map embeddings to encoder names for new index
    ## Load embeddings
    emb_df = pd.read_csv("data/preprocessed/embeddings.csv", index_col=0)
    ## Rename
    emb_df = emb_df.rename(columns={col: f"enc_dim_{col}" for col in emb_df.columns})
    emb_df = emb_df.reset_index()
    emb_df = emb_df.rename(columns={"index": "encoder"})
    ## Get all columns that start with enc_dim
    enc_dim_cols = [col for col in emb_df.columns if col.startswith("enc_dim")]
    ## Merge embeddings with df_pred
    df_pred = df_pred.merge(emb_df, on=enc_dim_cols, how="left")
    print(df_pred.head(10))
    # Convert to rankings
    NEW_INDEX = "encoder"
    FACTORS = [c for c in X.columns if c not in [NEW_INDEX, str(y.name)]]
    rankings_test = get_rankings(df_pred, factors=FACTORS, new_index=NEW_INDEX, target=str(y.name))
    rankings_pred = get_rankings(df_pred, factors=FACTORS, new_index=NEW_INDEX, target=str(y.name) + "_pred")
    # Evaluate
    return average_spearman(rankings_test, rankings_pred)
