In [1]:
from IPython.display import HTML, display

# Inject a CSS rule that sets elements with class "container" to 95% width,
# so wide outputs use almost the whole browser window.
display(
    HTML("<style>.container { width:95% !important; }</style>")
)

In [2]:
import openml
import os
import pandas as pd
import numpy as np 
import math

import sys
sys.path.append("../..")
from src.load_datasets import load_dataset, load_rankings, load_train_data
import src.evaluate_regression

from category_encoders import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import KFold
from typing import Iterable, List

# Define functions

## Given functions

In [3]:
def custom_cross_validated_indices(df: pd.DataFrame, factors: Iterable[str], target: str,
                                   **kfoldargs) -> List[List[Iterable[int]]]:
    """
    Baseline ("given") fold generator, kept behaviour-identical for comparison.

    The frame is first reduced to one row per combination of `factors`
    (mean of `target`); KFold then splits those combinations, and each side
    of a split is merged back against the rows of `df`.

    Parameters:
        df: frame containing the `factors` columns and the `target` column.
        factors: column names identifying a group.
        target: name of the target column.
        **kfoldargs: forwarded unchanged to `KFold`.

    Returns:
        One `[train, test]` pair per fold.

    NOTE(review): each element is the `.index` of the merge result, i.e. the
    merge result's own row numbering, not the "index" column that carries the
    row labels of `df`. Train and test therefore both start at 0 and overlap.
    `my_custom_cross_validated_indices` in this notebook selects the "index"
    column instead.
    """
    group_means = df.groupby(factors)[target].mean().reset_index()
    group_keys = group_means.drop(target, axis=1)
    group_targets = group_means[target]

    def _merged_index(selected_groups):
        # Join the chosen factor combinations with every matching row of df.
        merged = pd.merge(group_keys.iloc[selected_groups], df.reset_index(), on=factors)
        return merged.index

    splitter = KFold(**kfoldargs)
    return [
        [_merged_index(train_groups), _merged_index(test_groups)]
        for train_groups, test_groups in splitter.split(group_keys, group_targets)
    ]

In [4]:
def get_rankings(df: pd.DataFrame, factors, new_index, target) -> pd.DataFrame:
    """
    Baseline ("given") ranking builder, kept behaviour-identical for comparison.

    For every combination of `factors`, the rows of that group are re-indexed
    by `new_index` and their `target` values are converted to ranks with
    `score2ranking(..., ascending=False)` (greater score = better rank).

    Returns a DataFrame with one column per group, indexed by `new_index` values.

    NOTE(review): `groupby(...).groups` yields index *labels*, while `iloc`
    selects by *position*; the selection is only right when the labels of
    `df` coincide with its row positions. `my_get_rankings` in this notebook
    uses `loc` instead.
    """
    assert set(factors).issubset(df.columns)

    def _rank_members(members):
        group_scores = df.iloc[members].set_index(new_index)[target]
        return score2ranking(group_scores, ascending=False)

    members_by_group = df.groupby(factors).groups
    return pd.DataFrame({
        group: _rank_members(members) for group, members in members_by_group.items()
    })

In [5]:
def score2ranking(score: pd.Series, ascending=True):
    """
    Convert scores into dense, zero-based ranks.

    Equal scores share the same rank, and ranks are consecutive integers
    starting at 0 for the best score.

    Ascending =
        True: lower score = better rank (for instance, if score is the result of a loss function or a ranking itself)
        False: greater score = better rank (for instance, if score is the result of a score such as roc_auc_score)

    Missing scores (NaN) are not ranked: they are excluded when the ranks are
    assigned and map to NaN in the result.

    Returns a Series of ranks with the same index as `score`.
    """
    c = 1 if ascending else -1
    # Sort the distinct, non-missing values once and number them in order;
    # this avoids re-sorting the whole set of values for every single value.
    distinct_sorted = sorted(score.dropna().unique(), key=lambda x: c * x)
    order_map = {value: rank for rank, value in enumerate(distinct_sorted)}
    return score.map(order_map)


def spearman_rho(x: Iterable, y: Iterable, nan_policy="omit"):
    """
    Return the Spearman rank correlation coefficient between `x` and `y`.

    Parameters:
        x, y: the two samples to correlate.
        nan_policy: forwarded to `scipy.stats.spearmanr` ("omit" by default).

    Returns:
        The first element of the `spearmanr` result (the correlation). SciPy's
        ConstantInputWarning is silenced for the duration of the call.
    """
    # Imported locally because the notebook's import cell does not bring these
    # names into scope; without them the function fails with a NameError.
    import warnings

    from scipy.stats import ConstantInputWarning, spearmanr

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConstantInputWarning)
        return spearmanr(x, y, nan_policy=nan_policy)[0]


def list_spearman(rf1: pd.DataFrame, rf2: pd.DataFrame) -> np.array:
    """
    Column-wise Spearman correlation between two equally shaped frames.

    Columns are paired by position and each pair is passed to `spearman_rho`
    with `nan_policy="omit"`.

    Returns:
        A NumPy array with one correlation per column.

    Raises:
        ValueError: if the frames do not share the same columns and index.
    """
    if not (rf1.columns.equals(rf2.columns) and rf1.index.equals(rf2.index)):
        raise ValueError("The two input dataframes should have the same index and columns.")

    correlations = []
    for position in range(rf1.shape[1]):
        left_column = rf1.iloc[:, position]
        right_column = rf2.iloc[:, position]
        correlations.append(spearman_rho(left_column, right_column, nan_policy="omit"))
    return np.array(correlations)


def average_spearman(rf1: pd.DataFrame, rf2: pd.DataFrame) -> np.array:
    """
    Mean of the column-wise Spearman correlations between `rf1` and `rf2`.

    NaN correlations are skipped by `np.nanmean`. Errors raised by
    `list_spearman` (mismatched frames) propagate unchanged.
    """
    # np.nanmean yields NaN, with a NumPy RuntimeWarning, if every correlation
    # is NaN; that warning is deliberately not suppressed here.
    per_column_rho = list_spearman(rf1, rf2)
    return np.nanmean(per_column_rho)

## New functions

In [6]:
def my_custom_cross_validated_indices(df: pd.DataFrame, factors, target: str,
                                   **kfoldargs):
    """
    Build train/test row indices such that each combination of `factors`
    ends up entirely on one side of every split.

    The frame is reduced to one row per factor combination (mean of `target`),
    KFold splits those combinations, and each side is merged back against
    `df.reset_index()` to recover the matching rows.

    Parameters:
        df: frame containing the `factors` columns and the `target` column.
        factors: column names identifying a group.
        target: name of the target column.
        **kfoldargs: forwarded unchanged to `KFold`.

    Returns:
        One `[train, test]` pair per fold; each element is the "index" column
        of the merge result, i.e. the row labels of `df`.

    NOTE(review): relies on `df.reset_index()` producing a column called
    "index" (unnamed index, no existing "index" column). The notebook feeds
    the result to `iloc`, which presumes those labels equal row positions.
    """
    per_group = df.groupby(factors)[target].mean().reset_index()
    group_keys = per_group.drop(target, axis=1)
    group_targets = per_group[target]

    def _row_labels(selected_groups):
        # "index" is the column created by reset_index, holding df's labels.
        merged = pd.merge(group_keys.iloc[selected_groups], df.reset_index(), on=factors)
        return merged['index']

    folds = []
    for train_groups, test_groups in KFold(**kfoldargs).split(group_keys, group_targets):
        folds.append([_row_labels(train_groups), _row_labels(test_groups)])
    return folds

In [7]:
def my_get_rankings(df: pd.DataFrame, factors, new_index, target) -> pd.DataFrame:
    """
    Rank the `target` values inside every group defined by `factors`.

    For each group, its rows are selected by label, re-indexed by
    `new_index`, and the `target` column is turned into ranks with
    `src.evaluate_regression.score2ranking(..., ascending=False)`.

    Returns a DataFrame with one column per group, indexed by `new_index` values.
    """
    assert set(factors).issubset(df.columns)

    def _rank_group(row_labels):
        # groupby(...).groups stores index labels, hence label-based `loc`.
        group_scores = df.loc[row_labels].set_index(new_index)[target]
        return src.evaluate_regression.score2ranking(group_scores, ascending=False)

    labels_by_group = df.groupby(factors).groups
    return pd.DataFrame({
        group: _rank_group(row_labels) for group, row_labels in labels_by_group.items()
    })

In [8]:
def validate_fold_indices(indices):
    """
    Print a short report per fold: train size, test size, and overlap.

    `indices` is a sequence of folds, where element 0 of a fold holds the
    train indices and element 1 the test indices. The overlap is the number
    of distinct values present in both (via `np.intersect1d`); a leak-free
    split reports 0. Nothing is returned.
    """
    for fold_number, fold in enumerate(indices):
        train_part, test_part = fold[0], fold[1]
        print(
            f"Fold: {fold_number}",
            f"Number of indices in train: {len(train_part)}",
            f"Number of indices in test : {len(test_part)}",
            sep="\n",
        )

        shared = np.intersect1d(train_part, test_part)
        print(f"Number of equal indices: {len(shared)}")

# Example

## Given functions

In [9]:
# Baseline experiment: 5-fold evaluation using the "given" helpers
# (custom_cross_validated_indices + get_rankings).

# Load data
# load_train_data is a project helper; its two results are used below as the
# feature frame and the target. The "rank" column is removed from the features.
X_train, y_train = load_train_data('../../data/raw/dataset_rank_train.csv')
X_train = X_train.drop("rank", axis=1)

# Define variables for ranking
# factors: columns whose combination defines one group (used for the CV split and the rankings)
# new_index: column that becomes the index of each ranking
# target: column averaged per group when building the folds
factors = ["dataset", "model", "tuning", "scoring"]
new_index = "encoder"
target = "cv_score"

# Get indices for CV
# The commented-out line is the alternative call to the fixed helper.
indices = custom_cross_validated_indices(pd.concat([X_train, y_train], axis=1), factors, target, n_splits=5, shuffle=True, random_state=1444)
#indices = my_custom_cross_validated_indices(pd.concat([X_train, y_train], axis=1), factors, target, n_splits=5, shuffle=True, random_state=1444)

for fold in indices:
    # Define data of current fold
    # fold[0] / fold[1] are applied positionally (iloc) to features and target.
    X_train_fold = X_train.iloc[fold[0]]
    X_test_fold = X_train.iloc[fold[1]]
    y_train_fold = y_train.iloc[fold[0]]
    y_test_fold = y_train.iloc[fold[1]]
    
    # Train model and predict
    # NOTE(review): DecisionTreeRegressor is created without random_state - confirm
    # whether the printed numbers are expected to be exactly reproducible.
    dummy_pipe = Pipeline([("encoder", OneHotEncoder()), ("model", DecisionTreeRegressor())])
    y_pred = pd.Series(dummy_pipe.fit(X_train_fold, y_train_fold).predict(X_test_fold), index=y_test_fold.index, name="cv_score_pred")
    df_pred = pd.concat([X_test_fold, y_test_fold, y_pred], axis=1)
    
    # Rankings and avg_spearman
    # Build per-group rankings from the true and from the predicted scores,
    # then print the value returned by the project's average_spearman for the pair.
    rankings_test = get_rankings(df_pred, factors=factors, new_index=new_index, target="cv_score")
    rankings_pred = get_rankings(df_pred, factors=factors, new_index=new_index, target="cv_score_pred")
    print(src.evaluate_regression.average_spearman(rankings_test, rankings_pred))

Loading train data from '../../data/raw/dataset_rank_train.csv'...
0.9891831469088159
0.9879437888849684
0.9877661360418818
0.9890122937636548
0.9891292937952844


In [10]:
# Show indices
# Displays the [train, test] pair of the first fold produced by the cell above.
indices[0]

[RangeIndex(start=0, stop=28876, step=1),
 RangeIndex(start=0, stop=7178, step=1)]

In [11]:
# Report train/test sizes and the number of shared indices for every fold.
validate_fold_indices(indices)

Fold: 0
Number of indices in train: 28876
Number of indices in test : 7178
Number of equal indices: 7178
Fold: 1
Number of indices in train: 28807
Number of indices in test : 7247
Number of equal indices: 7247
Fold: 2
Number of indices in train: 28910
Number of indices in test : 7144
Number of equal indices: 7144
Fold: 3
Number of indices in train: 28805
Number of indices in test : 7249
Number of equal indices: 7249
Fold: 4
Number of indices in train: 28818
Number of indices in test : 7236
Number of equal indices: 7236


## New functions

In [12]:
# Same experiment as in the "Given functions" section, but using the "new" helpers
# (my_custom_cross_validated_indices + my_get_rankings).

# Load data
# load_train_data is a project helper; its two results are used below as the
# feature frame and the target. The "rank" column is removed from the features.
X_train, y_train = load_train_data('../../data/raw/dataset_rank_train.csv')
X_train = X_train.drop("rank", axis=1)

# Define variables for ranking
# factors: columns whose combination defines one group (used for the CV split and the rankings)
# new_index: column that becomes the index of each ranking
# target: column averaged per group when building the folds
factors = ["dataset", "model", "tuning", "scoring"]
new_index = "encoder"
target = "cv_score"

# Get indices for CV
# The commented-out line is the alternative call to the given helper.
#indices = custom_cross_validated_indices(pd.concat([X_train, y_train], axis=1), factors, target, n_splits=5, shuffle=True, random_state=1444)
indices = my_custom_cross_validated_indices(pd.concat([X_train, y_train], axis=1), factors, target, n_splits=5, shuffle=True, random_state=1444)

for fold in indices:
    # Define data of current fold
    # fold[0] / fold[1] hold row labels of the concatenated frame and are applied
    # positionally (iloc) - presumably valid because the frame has a default 0..n-1 index; verify.
    X_train_fold = X_train.iloc[fold[0]]
    X_test_fold = X_train.iloc[fold[1]]
    y_train_fold = y_train.iloc[fold[0]]
    y_test_fold = y_train.iloc[fold[1]]
    
    # Train model and predict
    # NOTE(review): DecisionTreeRegressor is created without random_state - confirm
    # whether the printed numbers are expected to be exactly reproducible.
    dummy_pipe = Pipeline([("encoder", OneHotEncoder()), ("model", DecisionTreeRegressor())])
    y_pred = pd.Series(dummy_pipe.fit(X_train_fold, y_train_fold).predict(X_test_fold), index=y_test_fold.index, name="cv_score_pred")
    df_pred = pd.concat([X_test_fold, y_test_fold, y_pred], axis=1)
    
    # Rankings and avg_spearman
    # Build per-group rankings from the true and from the predicted scores,
    # then print the value returned by the project's average_spearman for the pair.
    rankings_test = my_get_rankings(df_pred, factors=factors, new_index=new_index, target="cv_score")
    rankings_pred = my_get_rankings(df_pred, factors=factors, new_index=new_index, target="cv_score_pred")
    print(src.evaluate_regression.average_spearman(rankings_test, rankings_pred))

Loading train data from '../../data/raw/dataset_rank_train.csv'...
0.4250098422065961
0.5003065019900456
0.4722530960922732
0.48726084567346123
0.5030920623852049


In [13]:
# Show (sample of) indices
# First 20 train entries, then first 20 test entries, of the first fold.
print(indices[0][0][:20])
print(indices[0][1][:20])

0     16858
1     16859
2     16860
3     16861
4     16862
5     16863
6     16864
7     16865
8     16866
9     16867
10    16868
11    16869
12    16870
13    16871
14    16872
15    16873
16    16874
17    16875
18    16876
19    16877
Name: index, dtype: int64
0     6502
1     6503
2     6504
3     6505
4     6506
5     6507
6     6508
7     6509
8     6510
9     6511
10    6512
11    6513
12    6514
13    6515
14    6516
15    6517
16    6518
17    6519
18    6520
19    6521
Name: index, dtype: int64


In [14]:
# Report train/test sizes and the number of shared indices for every fold.
validate_fold_indices(indices)

Fold: 0
Number of indices in train: 28876
Number of indices in test : 7178
Number of equal indices: 0
Fold: 1
Number of indices in train: 28807
Number of indices in test : 7247
Number of equal indices: 0
Fold: 2
Number of indices in train: 28910
Number of indices in test : 7144
Number of equal indices: 0
Fold: 3
Number of indices in train: 28805
Number of indices in test : 7249
Number of equal indices: 0
Fold: 4
Number of indices in train: 28818
Number of indices in test : 7236
Number of equal indices: 0
