# Import

In [1]:
import pandas as pd
from src.load_datasets import load_dataset, load_rankings

# Get data

In [2]:
# Load the dataset and the rankings from their CSV files in the raw data folder.
raw_dir = '../../data/raw'
data = load_dataset(f'{raw_dir}/dataset.csv')
rankings = load_rankings(f'{raw_dir}/rankings.csv')

Loading data ...
Loading rankings ...


### Checkout data
* Overview
* NaN values
* Unique values
* Distribution

In [3]:
# Preview the first six rows of the dataset.
data.iloc[:6]

Unnamed: 0,encoder,dataset,model,tuning,scoring,cv_score
0,BE,3,DTC,full,ACC,0.908341
1,BE,3,DTC,full,AUC,0.906854
2,BE,3,DTC,full,F1,0.916533
3,BE,3,DTC,model,ACC,0.967145
4,BE,3,DTC,model,AUC,0.967233
5,BE,3,DTC,model,F1,0.968161


Findings: The data consists of the following columns, of which the first five are the features used later on for training.
* **encoder**: The encoder used for the training.
* **dataset**: The ID of the dataset on which the model is trained.
* **model**: The model used for the training.
* **tuning**: Indicates whether and how the model was tuned (`full`, `model`, or `no`).
* **scoring**: The scoring used for the training.
* **cv_score**: The cross-validation score of the selected model on a dataset, given the encoder, the kind of tuning, and the scoring used for the training.

In [4]:
# Count the missing entries per column and print them below a starred separator.
nan_counts = data.isna().sum()
separator = "*" * 20
print("Sum of Nan values for each feature:", "\n", separator, "\n", nan_counts)

Sum of Nan values for each feature: 
 ******************** 
 encoder     0
dataset     0
model       0
tuning      0
scoring     0
cv_score    0
dtype: int64


Findings: None of the columns contains NaN values.

In [10]:
# Print the distinct values of each categorical column, each followed by a
# row of asterisks. The unique values are computed once per column and reused
# for both the count in the label and the listing.
separator = "*" * 20

encoders = data["encoder"].unique()
print(f"Encoders ({len(encoders)}): ", "\n", encoders, "\n", separator)

dataset_ids = data["dataset"].unique()
print(f"Datasets Id ({len(dataset_ids)}): ", "\n", dataset_ids, "\n", separator)

# Count the distinct models; len(data) would report the number of rows instead.
models = data["model"].unique()
print(f"Models ({len(models)}): ", "\n", models, "\n", separator)

# No count is interpolated for these two labels, so plain strings suffice.
print("Tuning Info: ", "\n", data["tuning"].unique(), "\n", separator)
print("Scoring: ", "\n", data["scoring"].unique(), "\n", separator)

Encoders (32):  ['BE' 'BUCV10RGLMME' 'BUCV10TE' 'BUCV2RGLMME' 'BUCV2TE' 'BUCV5RGLMME'
 'BUCV5TE' 'CBE' 'CE' 'CV10RGLMME' 'CV10TE' 'CV2RGLMME' 'CV2TE'
 'CV5RGLMME' 'CV5TE' 'DE' 'DTEM10' 'DTEM2' 'DTEM5' 'ME01E' 'ME10E' 'ME1E'
 'MHE' 'OE' 'OHE' 'PBTE0001' 'PBTE001' 'PBTE01' 'RGLMME' 'SE' 'TE' 'WOEE'] 
 ********************
Datasets Id:  [    3    29    31    38    50    51    56   333   334   451   470   881
   956   959   981  1037  1111  1112  1114  1169  1235  1461  1463  1486
  1506  1511  1590  6332 23381 40536 40945 40981 40999 41005 41007 41162
 41224 42178 42343 42344 42738 42750 43098 43607 43890 43892 43896 43897
 43900 43922] 
 ********************
Models:  ['DTC' 'KNC' 'LGBMC' 'LR' 'SVC'] 
 ********************
Tuning Info:  ['full' 'model' 'no'] 
 ********************
Scoring:  ['ACC' 'AUC' 'F1'] 
 ********************


### Checkout rankings
* NaN values
* Best and Worst Encoder (via mean of all rankings, min of all rankings, max of all rankings)
* Best and Worst Encoder for different feature combinations

In [4]:
# Show the rankings frame returned by load_rankings as this cell's output.
# In the rendered output the rows are encoders and the columns carry four
# header levels (dataset id, model, tuning, scoring).
rankings

Unnamed: 0_level_0,3,3,3,3,3,3,3,3,3,3,...,43922,43922,43922,43922,43922,43922,43922,43922,43922,43922
Unnamed: 0_level_1,DTC,DTC,DTC,DTC,DTC,DTC,DTC,DTC,DTC,KNC,...,LR,LR,LR,LR,SVC,SVC,SVC,SVC,SVC,SVC
Unnamed: 0_level_2,full,full,full,model,model,model,no,no,no,full,...,model,no,no,no,full,full,full,no,no,no
Unnamed: 0_level_3,ACC,AUC,F1,ACC,AUC,F1,ACC,AUC,F1,ACC,...,F1,ACC,AUC,F1,ACC,AUC,F1,ACC,AUC,F1
encoder,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
BE,1,1,1,12,12,12,1,1,1,21,...,7,8,8,6,0,0,4,4,4,4
BUCV10RGLMME,1,1,1,14,14,13,1,1,1,16,...,12,10,10,10,14,14,10,21,21,21
BUCV10TE,1,1,1,0,0,0,1,1,1,9,...,11,9,9,9,14,14,10,12,12,12
BUCV2RGLMME,1,1,1,18,18,18,1,1,1,15,...,18,14,14,14,19,19,19,7,7,8
BUCV2TE,1,1,1,6,6,6,1,1,1,8,...,15,13,13,13,21,21,21,8,8,7
BUCV5RGLMME,1,1,1,16,17,16,1,1,1,14,...,14,12,12,11,13,13,9,14,15,15
BUCV5TE,1,1,1,4,4,4,1,1,1,7,...,13,11,11,12,13,13,9,9,9,9
CBE,3,3,4,25,24,25,4,3,4,27,...,4,5,5,5,2,2,0,1,1,0
CE,1,1,1,8,8,8,1,1,1,26,...,5,2,2,4,25,24,24,0,0,2
CV10RGLMME,1,1,1,15,16,15,1,1,1,18,...,21,22,23,24,20,20,20,23,23,23
