In [1]:
from IPython.display import display, HTML

# Widen the notebook's main container to 95% of the browser window.
wide_container_css = "<style>.container { width:95% !important; }</style>"
display(HTML(wide_container_css))

In [2]:
import openml
import os
import pandas as pd
import math

In [3]:
import sys

# Put the directory two levels above the working directory on the import path so that
# the project-local `src.load_datasets` module imported below can be resolved.
# NOTE(review): the path is relative, so this presumably assumes the kernel's working
# directory is the notebook's own folder — confirm.
sys.path.append("../..")
from src.load_datasets import load_dataset, load_rankings, load_train_data

# Load data

In [4]:
# Read the three raw CSV inputs through the project loaders from src.load_datasets.
# load_train_data returns a pair, unpacked here into X_train and y_train
# (presumably features and target — verify against the loader).
dataset = load_dataset('../../data/raw/dataset.csv')
rankings = load_rankings('../../data/raw/rankings.csv')
X_train, y_train = load_train_data('../../data/raw/dataset_train.csv')

Loading data ...
Loading rankings ...
Loading train data ...


In [5]:
# Distinct values of each experimental factor found in the loaded table.
unique_encoders = dataset["encoder"].unique()
unique_models = dataset["model"].unique()
unique_scoring = dataset["scoring"].unique()
unique_datasets = dataset["dataset"].unique()
# Not collected at the moment: unique_tuning = dataset["tuning"].unique()

# Description

This notebook extends the notebook from week 09 with meta information. 
It uses OpenML tasks, flows and runs to evaluate a set of categorical encoders.

- No tuning is used for all
- Standard parameters used for all

After some tests I found out that OpenML tasks and runs can only be created for datasets whose status is active. 

### Encoders

https://contrib.scikit-learn.org/category_encoders/

| Abbreviation | Name | From | Note |
| :- | :- | :- | :- |
| BE | Binary | category_encoders |  |
| CBE | CatBoost | category_encoders |  |
| CE | Count | category_encoders |  |
| DE | Drop |  |  |
| ME | Mean-Estimate | category_encoders | aka TargetEncoder: https://contrib.scikit-learn.org/category_encoders/targetencoder.html; also try M-Estimate as ME |
| MHE | Min-Hash | skrub | https://skrub-data.org/stable/api.html |
| OE | Ordinal | sklearn.preprocessing |  |
| OHE | One-Hot | sklearn.preprocessing |  |
| RGLMME | GLMM | category_encoders |  |
| SE | Sum | category_encoders | Not sure whether this refers to the Sum encoder or the Summary encoder |
| TE | Mean-Target (not to be confused with Target) |  |  |
| WOEE | Weight-of-Evidence | category_encoders |  |
| BUCVX | BlowUp Cross-Validated X | ToDo: Implementation | Implemented by Federico |
| CVX | Cross-Validated X |  |  |
| DX | Discretized X | ToDo: Implementation | Implemented by Federico |
| PBX | PreBinned X | ToDo: Implementation | Implemented by Federico |

Also use:
- HelmertEncoder from category_encoders
- James-Stein may be another name for Mean-Target
- LeaveOneOutEncoder from category_encoders
- ... and everything else that comes to hand :)

Links:
- https://skrub-data.org/stable/install.html
- https://contrib.scikit-learn.org/category_encoders/index.html

In [6]:
# Import encoders
# All encoder classes offered by category_encoders that this notebook evaluates.
from category_encoders import (
    BackwardDifferenceEncoder, 
    BaseNEncoder, 
    BinaryEncoder, 
    CatBoostEncoder, 
    CountEncoder, 
    GLMMEncoder, 
    GrayEncoder, 
    HashingEncoder, 
    HelmertEncoder, 
    JamesSteinEncoder, 
    LeaveOneOutEncoder, 
    MEstimateEncoder, 
    OneHotEncoder,
    OrdinalEncoder, 
    PolynomialEncoder, 
    QuantileEncoder, 
    RankHotEncoder, 
    SumEncoder, 
    SummaryEncoder, 
    TargetEncoder, 
    WOEEncoder
)

# NOTE: skrub exports a class named TargetEncoder as well. Because this import runs
# after the category_encoders import above, the bare name `TargetEncoder` refers to
# skrub's class from here on; the category_encoders class of the same name is no
# longer reachable under that name.
from skrub import (
    GapEncoder, 
    MinHashEncoder, 
    SimilarityEncoder, 
    TargetEncoder
)

In [16]:
# category_encoders and skrub each export a class called TargetEncoder, and importing
# that bare name from both packages leaves only one of them bound to it. Import the two
# classes under distinct aliases here so that both can be registered below.
# OneHotEncoder is pinned to category_encoders the same way because a later cell imports
# sklearn's OneHotEncoder under the same bare name, which would change this registry if
# the cell were re-run afterwards.
from category_encoders import OneHotEncoder as CategoryEncodersOneHotEncoder
from category_encoders import TargetEncoder as CategoryEncodersTargetEncoder
from skrub import TargetEncoder as SkrubTargetEncoder

# Registry of encoder instances (all with default parameters), keyed by the label under
# which their results are recorded. Keys must be unique: a repeated key in a dict literal
# silently replaces the earlier entry.
available_encoders = {
    # --- category_encoders ---
    "BackwardDifferenceEncoder": BackwardDifferenceEncoder(),
    "BaseNEncoder"             : BaseNEncoder(),
    "BinaryEncoder"            : BinaryEncoder(),
    "CatBoostEncoder"          : CatBoostEncoder(),
    "CountEncoder"             : CountEncoder(),
    "GLMMEncoder"              : GLMMEncoder(),
    "GrayEncoder"              : GrayEncoder(),
    "HashingEncoder"           : HashingEncoder(),
    "HelmertEncoder"           : HelmertEncoder(),
    "JamesSteinEncoder"        : JamesSteinEncoder(),
    "LeaveOneOutEncoder"       : LeaveOneOutEncoder(),
    "MEstimateEncoder"         : MEstimateEncoder(),
    "OneHotEncoder"            : CategoryEncodersOneHotEncoder(),
    "OrdinalEncoder"           : OrdinalEncoder(),
    "PolynomialEncoder"        : PolynomialEncoder(),
    "QuantileEncoder"          : QuantileEncoder(),
    "RankHotEncoder"           : RankHotEncoder(),
    "SumEncoder"               : SumEncoder(),
    "SummaryEncoder"           : SummaryEncoder(),
    "TargetEncoder"            : CategoryEncodersTargetEncoder(),
    "WOEEncoder"               : WOEEncoder(),

    # --- skrub ---
    "GapEncoder"               : GapEncoder(),
    "MinHashEncoder"           : MinHashEncoder(),
    "SimilarityEncoder"        : SimilarityEncoder(),
    "SkrubTargetEncoder"       : SkrubTargetEncoder(),
}

In [17]:
# List every registered encoder next to its repr so non-default parameters are visible.
for encoder_name, encoder_instance in available_encoders.items():
    print(encoder_name, encoder_instance, sep="   ---   ")

BackwardDifferenceEncoder   ---   BackwardDifferenceEncoder()
BaseNEncoder   ---   BaseNEncoder()
BinaryEncoder   ---   BinaryEncoder()
CatBoostEncoder   ---   CatBoostEncoder()
CountEncoder   ---   CountEncoder(combine_min_nan_groups=True)
GLMMEncoder   ---   GLMMEncoder()
GrayEncoder   ---   GrayEncoder()
HashingEncoder   ---   HashingEncoder(max_process=2)
HelmertEncoder   ---   HelmertEncoder()
JamesSteinEncoder   ---   JamesSteinEncoder()
LeaveOneOutEncoder   ---   LeaveOneOutEncoder()
MEstimateEncoder   ---   MEstimateEncoder()
OneHotEncoder   ---   OneHotEncoder()
OrdinalEncoder   ---   OrdinalEncoder()
PolynomialEncoder   ---   PolynomialEncoder()
QuantileEncoder   ---   QuantileEncoder()
RankHotEncoder   ---   RankHotEncoder()
SumEncoder   ---   SumEncoder()
SummaryEncoder   ---   SummaryEncoder()
TargetEncoder   ---   TargetEncoder()
WOEEncoder   ---   WOEEncoder()
GapEncoder   ---   GapEncoder()
MinHashEncoder   ---   MinHashEncoder()
SimilarityEncoder   ---   SimilarityEnco

In [70]:
# One independent, initially empty accumulator per column of the results table.
(
    list_dataset_id,
    list_model,      # unique_models = ['DTC' 'KNC' 'LGBMC' 'LR' 'SVC']
    list_encoding,
    list_scoring,    # unique_scoring = ['ACC', 'AUC', 'F1']
    list_folds,      # Will be constant in my case, may leave it
    list_cv_score,
    list_std_dev,
) = ([] for _ in range(7))

In [71]:
#%%time

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC

from sklearn.impute import SimpleImputer
# NOTE(review): this rebinds the bare name OneHotEncoder to sklearn's class for the rest
# of the kernel session; the name is not used in this cell.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

import numpy as np
import time


# Estimation procedure shared by every task: OpenML procedure id 1 is listed under the
# name '10-fold Crossvalidation' in the task table.
CV_PROCEDURE_NAME = '10-fold Crossvalidation'
CV_PROCEDURE_ID = 1
N_FOLDS = 10
RUN_SEED = 42
# Pause before asking the server for evaluations of a run that carries no fold results.
EVALUATION_WAIT_SECONDS = 3

# Model abbreviation -> classifier class, instantiated with default parameters.
# "LR" is logistic regression: every task here is a classification task, so a
# LinearRegression regressor cannot be used.
CLASSIFIER_CLASSES = {
    "DTC": DecisionTreeClassifier,
    "KNC": KNeighborsClassifier,
    "LGBMC": LGBMClassifier,
    "LR": LogisticRegression,
    "SVC": SVC,
}


def _find_or_create_task(tasks_frame, dataset_id, measure):
    """Return the 10-fold CV classification task for a dataset and evaluation measure.

    Parameters
    ----------
    tasks_frame : DataFrame returned by ``openml.tasks.list_tasks(..., output_format="dataframe")``.
    dataset_id : OpenML dataset id.
    measure : OpenML evaluation measure name, e.g. ``'predictive_accuracy'``.

    Returns
    -------
    The first listed task on ``dataset_id`` that uses 10-fold cross-validation and whose
    evaluation measure is either unset or equal to ``measure``. If none is listed, a new
    task is created for this dataset and measure, published, and returned.
    """
    is_match = (
        (tasks_frame['did'] == dataset_id)
        & (tasks_frame['estimation_procedure'] == CV_PROCEDURE_NAME)
        & (tasks_frame['evaluation_measures'].isna() | (tasks_frame['evaluation_measures'] == measure))
    )
    matching_tasks = tasks_frame[is_match]
    if matching_tasks.shape[0] > 0:
        return openml.tasks.get_task(task_id=int(matching_tasks.iloc[0].tid))

    print("Create task")
    # The task must describe the dataset currently being processed, so the target column
    # is taken from the dataset's own metadata rather than hard-coded.
    target_name = openml.datasets.get_dataset(int(dataset_id), download_data=False).default_target_attribute
    new_task = openml.tasks.create_task(
        task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
        dataset_id=int(dataset_id),
        target_name=target_name,
        evaluation_measure=measure,
        estimation_procedure_id=CV_PROCEDURE_ID,
    )
    new_task.publish()
    # publish() stores the server-assigned id on the task, so the task can be fetched
    # directly instead of re-listing all tasks and filtering again.
    return openml.tasks.get_task(task_id=new_task.id)


def _run_or_reuse(pipeline, task):
    """Return the id of a run of exactly this pipeline on ``task``.

    The pipeline is executed and published unless the server already holds a run of the
    same setup on the task; in that case OpenML raises ``OpenMLRunsExistError`` and one of
    the existing run ids is returned. Runs of other flows on the same task are never reused.
    """
    try:
        run = openml.runs.run_model_on_task(
            model=pipeline, task=task, seed=RUN_SEED, avoid_duplicate_runs=True
        )
    except openml.exceptions.OpenMLRunsExistError as already_run:
        return min(already_run.run_ids)
    run.publish()
    return run.id


def _score_run(run_id, measure):
    """Return ``(cv_score, std_dev)`` of ``measure`` for the run with id ``run_id``.

    If the downloaded run carries per-fold evaluations, the mean and standard deviation
    over the folds of repeat 0 are returned. Otherwise the aggregated value reported by
    the evaluation listing is returned with ``nan`` as standard deviation; when no
    evaluation is listed (yet), both values are ``nan``.
    """
    run_result = openml.runs.get_run(run_id)

    # For some reason sometimes the results are contained in the run object and sometimes not
    if run_result.fold_evaluations is not None:
        fold_scores = list(run_result.fold_evaluations[measure][0].values())
        return np.mean(fold_scores), np.std(fold_scores)

    # Maybe needs to sleep for short after publishing the run
    time.sleep(EVALUATION_WAIT_SECONDS)
    listed = list(openml.evaluations.list_evaluations(function=measure, runs=[run_id]).values())
    if not listed:
        # Record a missing value instead of aborting the whole benchmark with an IndexError.
        print(f"No '{measure}' evaluation available yet for run {run_id}")
        return math.nan, math.nan
    return listed[0].value, math.nan


datasets_active_state = openml.datasets.check_datasets_active(unique_datasets)
#eval_measures = ['predictive_accuracy', 'area_under_roc_curve', 'f_measure']
evaluation_metrics = {
    'ACC': 'predictive_accuracy',
    'AUC': 'area_under_roc_curve',
    'F1' : 'f_measure'
}

# Listing all classification tasks is a network call that does not depend on the loop
# variables, so it is done once up front.
existing_tasks = openml.tasks.list_tasks(
    task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
# A task depends only on (dataset, measure); remember it so that a task created during
# this loop is found again for the next model/encoder instead of being created twice.
tasks_by_dataset_and_measure = {}


for dataset_id in unique_datasets:
    print(dataset_id)

    if not datasets_active_state[dataset_id]:
        # Only active datasets are processed.
        continue

    # Preprocessing steps shared by all pipelines of this dataset
    imputer = SimpleImputer(strategy='most_frequent')
    scaler = StandardScaler()

    # Iterate over models
    for model_string in unique_models:
        print(f"  {model_string}")

        # Choose classifier
        classifier_class = CLASSIFIER_CLASSES.get(model_string)
        if classifier_class is None:
            print(f"Classifier '{model_string}' is not implemented!")
            continue
        classifier = classifier_class()

        # Iterate over available encoders
        for encoder_string, encoder in available_encoders.items():
            print(f"    {encoder_string}")

            # Set up the pipeline
            pipeline = Pipeline(steps=[
                ('imputer', imputer),
                ('encoder', encoder),
                ('scaler', scaler),
                ('classifier', classifier)
            ])

            for scoring, measure in evaluation_metrics.items():
                print(f"      {scoring} --- {measure}")

                task_key = (dataset_id, measure)
                if task_key not in tasks_by_dataset_and_measure:
                    tasks_by_dataset_and_measure[task_key] = _find_or_create_task(
                        existing_tasks, dataset_id, measure
                    )
                task = tasks_by_dataset_and_measure[task_key]

                run_id = _run_or_reuse(pipeline, task)
                cv_score, std_dev = _score_run(run_id, measure)

                # Append results to list
                list_dataset_id.append(dataset_id)
                list_model.append(model_string)
                list_encoding.append(encoder_string)
                list_scoring.append(scoring)
                list_folds.append(N_FOLDS)
                list_cv_score.append(cv_score)
                list_std_dev.append(std_dev)

3
  DTC
    BackwardDifferenceEncoder
      ACC --- predictive_accuracy
      AUC --- area_under_roc_curve


KeyboardInterrupt: 

In [None]:
# Assemble the result accumulators column-wise into one table and write it to CSV.
evaluations_data = {
    'dataset_id': list_dataset_id,
    'model': list_model,
    'encoding': list_encoding,
    'scoring': list_scoring,
    'folds': list_folds,
    'cv_score': list_cv_score,
    'std_dev': list_std_dev,
}

evaluations = pd.DataFrame(data=evaluations_data)
evaluations.to_csv('../../data/preprocessed/evaluations.csv')

## How to integrate the data in the pipeline?

Add all encoder evaluations as features to the data. 
Add one column `suggested_score`, in which I try to map the calculated cv_scores onto the rows via the columns `[model, encoder, scoring, dataset]`.

In [None]:
# Mapping from encoder class name to its abbreviation.
# An empty string marks an encoder for which no abbreviation is listed here.
# Each key appears exactly once: a repeated key in a dict literal would silently
# replace the earlier entry.
mapping_of_encoder_abbreviations = {
    "BackwardDifferenceEncoder": "",
    "BaseNEncoder"             : "",
    "BinaryEncoder"            : "BE",      # Given by Federico
    "CatBoostEncoder"          : "CBE",     # Given by Federico
    "CountEncoder"             : "CE",      # Given by Federico
    "GLMMEncoder"              : "RGLMME",  # Given by Federico
    "GrayEncoder"              : "",
    "HashingEncoder"           : "",
    "HelmertEncoder"           : "",
    "JamesSteinEncoder"        : "",
    "LeaveOneOutEncoder"       : "",
    "MEstimateEncoder"         : "",
    "OneHotEncoder"            : "OHE",     # Given by Federico
    "OrdinalEncoder"           : "OE",      # Given by Federico
    "PolynomialEncoder"        : "",
    "QuantileEncoder"          : "",
    "RankHotEncoder"           : "",
    "SumEncoder"               : "SE",      # Given by Federico
    "SummaryEncoder"           : "",
    "TargetEncoder"            : "",
    "WOEEncoder"               : "WOEE",

    "GapEncoder"               : "",
    "MinHashEncoder"           : "MHE",     # Given by Federico
    "SimilarityEncoder"        : "",
}

In [None]:
# Calculate correlations between all calculated cv_scores for all encoders with actual target