# Imports

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

import src.load_datasets as ld
import src.pairwise_utils as pu
from src.encoding import poincare_encoding, ohe_encode_train_data
from src.meta_information import add_dataset_meta_information

In [23]:
# settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sns.set_style("whitegrid")
sns.set_palette("Set2")

np.random.seed(42)

# Import data

In [39]:
# Directory holding the raw CSV files (relative to this notebook)
DATA_DIR = "../../data/raw/"

# Columns used as grouping / join keys below
FACTORS = ["dataset", "model", "tuning", "scoring"]
NEW_INDEX = "encoder"

df_train = ld.load_dataset(DATA_DIR + "dataset_rank_train.csv")
#df_test = ld.load_dataset(DATA_DIR + "dataset_rank_test.csv")  # as usual, replace it with your own validation set

# One row per distinct factor combination, sorted by the factors, with a fresh RangeIndex.
# Rows with a missing factor are dropped first, matching the default behaviour of the
# groupby-based construction this replaces.
X_train = (
    df_train[FACTORS]
    .dropna()
    .drop_duplicates()
    .sort_values(FACTORS)
    .reset_index(drop=True)
)
#X_test = df_test[FACTORS].dropna().drop_duplicates().sort_values(FACTORS).reset_index(drop=True)

# Left-join the pairwise targets onto X_train so that row i of y_train belongs to row i of X_train;
# values that are missing after the join are filled with 0
pairwise_target = pu.get_pairwise_target(df_train, features=FACTORS, target="rank", column_to_compare="encoder")
y_train = (
    pd.merge(X_train, pairwise_target, on=FACTORS, how="left")
    .drop(FACTORS, axis=1)
    .fillna(0)
)

# A left join adds rows when the right-hand frame has duplicate keys, which would
# silently break the row alignment between X_train and y_train.
assert len(y_train) == len(X_train), "y_train is not row-aligned with X_train"

Loading data from '../../data/raw/dataset_rank_train.csv' ...


In [46]:
# Preview the first rows of the factor table
X_train.head()

Unnamed: 0,dataset,model,tuning,scoring
0,3,DTC,full,ACC
1,3,DTC,full,AUC
2,3,DTC,full,F1
3,3,DTC,model,AUC
4,3,DTC,model,F1


In [42]:
# Report the dimensions of the feature and target frames
for label, frame in (("Shape of X_train ", X_train), ("Shape of y_train ", y_train)):
    print(label, frame.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36054 entries, 0 to 36053
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dataset   36054 non-null  int64  
 1   model     36054 non-null  object 
 2   tuning    36054 non-null  object 
 3   scoring   36054 non-null  object 
 4   encoder   36054 non-null  object 
 5   cv_score  36054 non-null  float64
 6   rank      36054 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 1.9+ MB


# Baseline
Run the model on the baseline data (one-hot encoding only, no further preprocessing).

In [44]:
# Baseline preprocessing: one-hot encode every column of X_train.
# Note that `scaler` holds a OneHotEncoder, not a scaler.
scaler = OneHotEncoder().fit(X_train)
X_train_baseline = scaler.transform(X_train)

In [26]:
# Run model

# Model with Preprocessed Data
Run the model on the preprocessed data (full preprocessing pipeline).

In [27]:
# Copy of X_train intended as the starting point of the preprocessing pipeline.
# NOTE(review): this name is reassigned by the one-hot-encoding cell further down, which
# reads X_train rather than this copy — confirm whether the copy is still needed.
X_train_preprocessed = X_train.copy()

In [28]:
# Show column dtypes and non-null counts of the factor table
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161 entries, 0 to 1160
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  1161 non-null   int64 
 1   model    1161 non-null   object
 2   tuning   1161 non-null   object
 3   scoring  1161 non-null   object
dtypes: int64(1), object(3)
memory usage: 36.4+ KB


In [19]:
# General encodings: one-hot encode (OHE) a subset of the features
ohe_cols = ["model", "tuning", "scoring"]
X_train_preprocessed, ohe = ohe_encode_train_data(
    X_train=X_train,
    cols_to_encode=ohe_cols,
    verbosity=2,
)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...


In [None]:
# Add dataset_agg (= CSV file containing meta information about the datasets).
# The file can be created with the notebook from week 09.
# The result gets its own name instead of overwriting X_train, so the earlier cells that
# read X_train (baseline encoding, one-hot encoding) still see the original factor table
# when they are re-run.
X_train_with_meta = add_dataset_meta_information(df=X_train_preprocessed,
                                                 path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
                                                 nan_threshold=0.5,
                                                 replacing_strategy="median")

In [None]:
# TODO: drop correlated features

In [None]:
# TODO: add feature selection

In [None]:
# TODO: normalize