# Imports

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

import src.load_datasets as ld
import src.pairwise_utils as pu
from src.utils import load_config
from src.feature_engineering import feature_selection, normalize_train_data, normalize_test_data
from src.data_cleaning import drop_pearson_correlated_features
from src.meta_information import add_dataset_meta_information
from src.encoding import ohe_encode_train_data, ohe_encode_test_data

In [58]:
# settings
# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Seaborn style and colour palette applied to the plots created in this notebook.
sns.set_style("whitegrid")
sns.set_palette("Set2")

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# Import data

In [59]:
DATA_DIR = "../../data/raw/"

FACTORS = ["dataset", "model", "tuning", "scoring"]
NEW_INDEX = "encoder"
cfg = load_config("../../configs/config.yaml")

df_train = ld.load_dataset(DATA_DIR + "dataset_rank_train.csv")
#df_test = ld.load_dataset(DATA_DIR + "dataset_rank_test.csv")  # as usual, replace it with your own validation set

# One row per distinct FACTORS combination, in groupby (sorted-key) order.
# `size()` is used only to materialise the group keys; the counts themselves are discarded.
X_train = df_train.groupby(FACTORS).size().reset_index()[FACTORS]
#X_test = df_test.groupby(FACTORS).size().reset_index()[FACTORS]

# join to ensure X_train and y_train's indices are ordered the same
# Factor combinations without a pairwise target get 0 in every target column.
y_train = pd.merge(X_train,
                   pu.get_pairwise_target(df_train, features=FACTORS, target="rank", column_to_compare="encoder"),
                   on=FACTORS, how="left").drop(FACTORS, axis=1).fillna(0)

# A left merge multiplies rows if the right-hand side has duplicate keys; fail loudly in that case.
assert len(X_train) == len(y_train), "X_train and y_train must have the same number of rows"

Loading data from '../../data/raw/dataset_rank_train.csv' ...


In [60]:
# Preview the first rows of X_train (one row per FACTORS combination).
X_train.head()

Unnamed: 0,dataset,model,tuning,scoring
0,3,DTC,full,ACC
1,3,DTC,full,AUC
2,3,DTC,full,F1
3,3,DTC,model,AUC
4,3,DTC,model,F1


In [61]:
# Report the dimensions of the feature frame and the pairwise target frame.
for label, frame in (("Shape of X_train ", X_train), ("Shape of y_train ", y_train)):
    print(label, frame.shape)

Shape of X_train  (1161, 4)
Shape of y_train  (1161, 992)


# Baseline
Run the model on the baseline data (one-hot encoding only, no further preprocessing).

In [62]:
# Preprocess data - baseline (one-hot encoding)
scaler = OneHotEncoder()
X_train_baseline = scaler.fit_transform(X_train)

In [63]:
# Run model
# NOTE(review): `{""}` is a set literal holding a single empty string, not a dict —
# this looks like an unfinished placeholder for the baseline models; confirm intent.
models = {""}

# Model with Preprocessed Data
Run the model with preprocessed data (full preprocessing pipeline).

In [64]:
# Copy X_train so the preprocessing pipeline is fed a separate object from the raw X_train.
X_train_preprocessed = X_train.copy()

In [65]:
def preprocess_data_pairwise(X_train, y_train, X_test, cfg, verbosity, quantile=0.4,
                             path_to_meta_df="../../data/preprocessed/dataset_agg.csv"):
    """Run the full preprocessing pipeline for the pairwise model.

    The steps are applied in this order: one-hot encoding, joining the dataset
    meta information, dropping Pearson-correlated features, feature selection
    and normalization. The encoder and the scaler are obtained from the train
    data and then re-used for the test data.

    Parameters
    ----------
    X_train : train features.
    y_train : train targets; passed to the feature selection and returned unchanged.
    X_test : test features, or ``None`` to skip all test-only steps.
    cfg : nested config mapping. Keys read here:
        ``feature_engineering.features_to_ohe``,
        ``feature_engineering.dataset_meta_information.{nan_threshold, replacing_strategy}``,
        ``data_cleaning.pearson_correlation.threshold``,
        ``feature_engineering.normalize.method``.
    verbosity : verbosity level forwarded to every pipeline step that accepts one.
    quantile : quantile forwarded to ``feature_selection`` (default 0.4).
    path_to_meta_df : path of the csv-file with the dataset meta information.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, scaler)`` where ``scaler`` is the object
        returned by ``normalize_train_data``.
    """
    fe_cfg = cfg["feature_engineering"]

    # General encodings: One Hot Encode (OHE) subset of features
    cols_to_encode = fe_cfg["features_to_ohe"]
    X_train, ohe = ohe_encode_train_data(X_train=X_train, cols_to_encode=cols_to_encode, verbosity=verbosity)
    if X_test is not None:
        X_test = ohe_encode_test_data(X_test=X_test, cols_to_encode=cols_to_encode, ohe=ohe, verbosity=verbosity)

    # Add dataset_agg (= csv-file containing meta information about the datasets)
    # The file can be created with the notebook from week 09
    print("Add dataset meta information...")
    meta_cfg = fe_cfg["dataset_meta_information"]
    # Shared by the train and test call so both always use identical settings.
    meta_kwargs = {
        "path_to_meta_df": path_to_meta_df,
        "nan_threshold": meta_cfg["nan_threshold"],
        "replacing_strategy": meta_cfg["replacing_strategy"],
    }
    X_train = add_dataset_meta_information(df=X_train, **meta_kwargs)
    if X_test is not None:
        X_test = add_dataset_meta_information(df=X_test, **meta_kwargs)

    # Drop correlated features
    X_train, X_test = drop_pearson_correlated_features(
        train_data=X_train,
        test_data=X_test,
        threshold=cfg["data_cleaning"]["pearson_correlation"]["threshold"],
        verbosity=verbosity,
    )

    # Select features (verbosity is forwarded instead of being hard-coded to 2)
    X_train, X_test = feature_selection(X_train=X_train, X_test=X_test, y_train=y_train,
                                        quantile=quantile, verbosity=verbosity)

    # Normalize data
    X_train, scaler = normalize_train_data(X_train=X_train, method=fe_cfg["normalize"]["method"],
                                           verbosity=verbosity)
    if X_test is not None:
        X_test = normalize_test_data(X_test=X_test, scaler=scaler, verbosity=verbosity)

    return X_train, y_train, X_test, scaler

In [66]:
%%time
X_train_preprocessed, y_train_preprocessed, X_test_preprocessed, scaler = preprocess_data_pairwise(X_train=X_train_preprocessed, y_train=y_train, X_test=None, cfg=cfg, verbosity=2)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...
Add dataset meta information...
Drop pearson correlated features with threshold 0.7...
Filter correlated features
Feature selection...
Normalizing train data using method 'minmax' ...
CPU times: total: 141 ms
Wall time: 655 ms
