## Example analysis of results

### Biblioteki i funkcje (do wczytania)

In [1]:
import pandas as pd
import numpy as np
import json
from math import ceil

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Anchor sizes: 70 values growing geometrically from 16, doubling every 8 steps,
# each rounded up to the next integer.
anchor_exponents = np.arange(70) / 8
anchor_list_denser = np.ceil(16 * 2 ** anchor_exponents).astype(int)

In [2]:
# Function computing the integral of a learning curve
def LC_integral_value(y_array, x_values=None):
    """Integrate a learning curve over anchor sizes rescaled to [0, 1].

    The curve is truncated at its first NaN, the matching anchor sizes are
    min-max normalised to [0, 1], and the trapezoidal rule is applied.

    Parameters
    ----------
    y_array : 1-D array-like of float
        Curve values, one per anchor. Everything from the first NaN onwards
        is ignored.
    x_values : 1-D array-like of float, optional
        Anchor sizes matching ``y_array`` position by position. When omitted,
        the 70 anchors ``ceil(16 * 2 ** (i / 8))`` for ``i = 0..69`` are used.

    Returns
    -------
    float
        The trapezoidal integral of the truncated curve. NaN is returned when
        fewer than two valid points precede the first NaN, or when the anchors
        span a zero-width range, because there is no interval to integrate
        over.

    Raises
    ------
    ValueError
        If the curve has more leading valid points than there are anchors.
    """
    y_all = np.asarray(y_array, dtype=float)
    if x_values is None:
        x_all = np.ceil(16 * 2 ** (np.arange(70) / 8)).astype(float)
    else:
        x_all = np.asarray(x_values, dtype=float)

    # Position of the first NaN in the curve (or its full length if there is none)
    nan_mask = np.isnan(y_all)
    valid_length = int(np.argmax(nan_mask)) if nan_mask.any() else y_all.size

    if valid_length > x_all.size:
        raise ValueError(
            f"curve has {valid_length} valid points but only {x_all.size} anchors are available"
        )
    # With zero or one point the normalisation below would divide by zero
    if valid_length < 2:
        return np.nan

    # Keep only the NaN-free prefix
    y = y_all[:valid_length]
    x = x_all[:valid_length]

    # Rescale x to the interval [0, 1]
    x_min = np.min(x)
    x_max = np.max(x)
    if x_max == x_min:
        return np.nan
    x_norm = (x - x_min) / (x_max - x_min)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    integral = trapezoid(y, x_norm)
    return integral

### 1. Wczytanie pliku csv i przekształcenie ramki danych

In [15]:
### FILL IN ###
# Path to the CSV file that should be loaded
input_file = "__RESULTS_DT__.csv"
# Here we load the results for the Decision Tree model

In [None]:
## This cell loads the data from the CSV file and reshapes it into the transformed_df
## data frame, which the rest of the analysis works on

df = pd.read_csv(input_file)

# Dimensions of the per-configuration Results array.
# They are the same for every group, so they are set once, before the loop.
outer_splits = 5
inner_splits = 5
train_val_test_splits = 3
scores_size = 70

# Collects one record per (dataset, preprocessing method, hyperparameters) combination
transformed_data = []

grouped = df.groupby(['Dataset_name', 'Preprocessing_method', 'Hyperparameters'], sort = False)
# sort = False keeps the groups in order of first appearance instead of sorting the keys alphabetically
for (dataset_name, preprocessing_method, hyperparameters), group in grouped:
    hyperparameters = json.loads(hyperparameters)

    # Slots that never receive a score stay NaN
    results = np.full((outer_splits, inner_splits, scores_size, train_val_test_splits), np.nan)

    # Iterate the needed columns directly instead of building a Series per row
    for outer, inner, split, scores_json in zip(
        group['Outer_split'], group['Inner_split'], group['Train-val-test'], group['Scores']
    ):
        i, j, k = int(outer), int(inner), int(split)
        # dtype=float turns a JSON null into NaN instead of producing an object array
        scores = np.array(json.loads(scores_json), dtype=float)
        results[i, j, :, k] = scores

    transformed_data.append({
        'Dataset Name': dataset_name,
        'Preprocessing Method': preprocessing_method,
        'Hyperparameters': hyperparameters,
        'Results': results
    })

transformed_df = pd.DataFrame(transformed_data)

### 2. Odczytywanie wyników

In [None]:
transformed_df
# Contains the columns
# 'Dataset Name', 'Preprocessing Method', 'Hyperparameters', 'Results'

Unnamed: 0,Dataset Name,Preprocessing Method,Hyperparameters,Results
0,banknote-authentication,unprocessed,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.23387097 0.19565217], [0. ..."
1,banknote-authentication,unprocessed,"{'min_samples_split': 6, 'min_samples_leaf': 1...","[[[[0. 0.16935484 0.10144928], [0. ..."
2,banknote-authentication,unprocessed,"{'min_samples_split': 28, 'min_samples_leaf': ...","[[[[0.5 0.44354839 0.44202899], [0.4444..."
3,banknote-authentication,unprocessed,"{'min_samples_split': 20, 'min_samples_leaf': ...","[[[[0.5 0.44354839 0.44202899], [0.4444..."
4,banknote-authentication,unprocessed,"{'min_samples_split': 50, 'min_samples_leaf': ...","[[[[0.5 0.44354839 0.44202899], [0.5 ..."
...,...,...,...,...
485,credit-g-mod,med_knn_none,"{'min_samples_split': 15, 'min_samples_leaf': ...","[[[[0.25 0.3 0.3 ], [0.27777778 0.3 0...."
486,credit-g-mod,med_knn_none,"{'min_samples_split': 52, 'min_samples_leaf': ...","[[[[0.5 0.3 0.3], [0.44444444 0.3 0.3 ..."
487,credit-g-mod,med_knn_none,"{'min_samples_split': 8, 'min_samples_leaf': 1...","[[[[0.125 0.36666667 0.32 ], [0.1111..."
488,credit-g-mod,med_knn_none,"{'min_samples_split': 14, 'min_samples_leaf': ...","[[[[0.0625 0.24444444 0.34 ], [0.2222..."


In [12]:
# Dataset names
transformed_df['Dataset Name'].unique()

array(['banknote-authentication', 'blood-transfusion', 'breast_w',
       'credit-approval', 'credit_g', 'diabetes', 'kr_vs_kp', 'phoneme',
       'phoneme-mod', 'credit-g-mod'], dtype=object)

In [14]:
# Preprocessing methods
transformed_df['Preprocessing Method'].unique()

array(['unprocessed', 'min_knn_BORUTA', 'min_knn_MI', 'min_knn_none',
       'med_knn_BORUTA', 'med_knn_MI', 'med_knn_none'], dtype=object)

Hiperparametry: dla każdego zbioru danych i każdej metody preprocessingu rozważanych jest 7 kombinacji hiperparametrów (innych dla każdego zbioru danych). Są to kolejno:
- domyślne hiperparametry (pierwszy wiersz),
- trzy najlepsze kombinacje hiperparametrów (trzy kolejne wiersze),
- trzy najgorsze kombinacje hiperparametrów (trzy ostatnie wiersze).

In [16]:
# Suppose we want the results for the 'banknote-authentication' dataset across the
# different preprocessing methods, but only with the default hyperparameters
default_hyperparameters = {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'criterion': 'gini'}

banknote_authentication_df = transformed_df[transformed_df['Dataset Name'] == 'banknote-authentication'].reset_index(drop=True)
# The mask must come from the frame being filtered. A mask taken from transformed_df
# is re-aligned by index label against the re-indexed subset, which selects the right
# rows only when the dataset happens to occupy the first rows of transformed_df.
is_default = banknote_authentication_df['Hyperparameters'].apply(
    lambda params: params == default_hyperparameters
).astype(bool)
default_df = banknote_authentication_df[is_default].reset_index(drop=True)
default_df

  default_df = banknote_authentication_df[transformed_df['Hyperparameters']== {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'criterion': 'gini'}].reset_index(drop=True)


Unnamed: 0,Dataset Name,Preprocessing Method,Hyperparameters,Results
0,banknote-authentication,unprocessed,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.23387097 0.19565217], [0. ..."
1,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.16935484 0.13768116], [0. ..."
2,banknote-authentication,min_knn_MI,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.13709677 0.10869565], [0. ..."
3,banknote-authentication,min_knn_none,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.15322581 0.10869565], [0. ..."
4,banknote-authentication,med_knn_BORUTA,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.04032258 0.06521739], [0. ..."
5,banknote-authentication,med_knn_MI,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.24193548 0.14492754], [0. ..."
6,banknote-authentication,med_knn_none,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.19354839 0.10144928], [0. ..."


In [18]:
# Suppose we want the results for the 'banknote-authentication' dataset and the
# 'min_knn_BORUTA' preprocessing method, across the different hyperparameters.
# The mask is built from banknote_authentication_df itself: a mask taken from
# transformed_df would be re-aligned by index label against this re-indexed subset.
is_min_knn_BORUTA = banknote_authentication_df['Preprocessing Method'] == "min_knn_BORUTA"
min_knn_BORUTA_df = banknote_authentication_df[is_min_knn_BORUTA].reset_index(drop=True)
min_knn_BORUTA_df

  min_knn_BORUTA_df = banknote_authentication_df[transformed_df['Preprocessing Method'] == "min_knn_BORUTA"].reset_index(drop=True)


Unnamed: 0,Dataset Name,Preprocessing Method,Hyperparameters,Results
0,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.16935484 0.13768116], [0. ..."
1,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 6, 'min_samples_leaf': 1...","[[[[0. 0.16935484 0.19565217], [0.0555..."
2,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 28, 'min_samples_leaf': ...","[[[[0.3125 0.44354839 0.44202899], [0.3888..."
3,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 20, 'min_samples_leaf': ...","[[[[0.375 0.44354839 0.44202899], [0.3888..."
4,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 50, 'min_samples_leaf': ...","[[[[0.4375 0.44354839 0.44202899], [0.4444..."
5,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 46, 'min_samples_leaf': ...","[[[[0.375 0.44354839 0.44202899], [0.4444..."
6,banknote-authentication,min_knn_BORUTA,"{'min_samples_split': 47, 'min_samples_leaf': ...","[[[[0.4375 0.55645161 0.55797101], [0.5 ..."


In [19]:
# To look at the results of each hyperparameter set separately:
# row 0 holds the default hyperparameters, rows 1-3 the max1..max3 sets
# and rows 4-6 the min1..min3 sets.
default, max1, max2, max3, min1, min2, min3 = (
    min_knn_BORUTA_df.iloc[position]['Results'] for position in range(7)
)

In [21]:
# Interpreting Results - using `default` as the example
default.shape

(5, 5, 70, 3)

`Results` to 4-wymiarowa tablica (ndarray):
- pierwszy wymiar (0-4): numer zewnętrznego splitu,
- drugi wymiar (0-4): numer wewnętrznego splitu,
- trzeci wymiar (0-69): numer anchora (jego rozmiar można odczytać z `anchor_list_denser` na tej samej pozycji),
- czwarty wymiar (0-2): 0 - zbiór treningowy, 1 - zbiór walidacyjny, 2 - zbiór testowy.

In [22]:
# Scores of every (outer, inner) split at the anchor with index 20 on the validation set
train_val_test = 1
anchor_index = 20
default[:, :, anchor_index, train_val_test]

array([[0.10483871, 0.05645161, 0.05645161, 0.07258065, 0.13709677],
       [0.05645161, 0.09677419, 0.08870968, 0.02419355, 0.13709677],
       [0.05645161, 0.08870968, 0.12903226, 0.13709677, 0.09677419],
       [0.19354839, 0.07258065, 0.0483871 , 0.08064516, 0.07258065],
       [0.17741935, 0.05645161, 0.17741935, 0.07258065, 0.04032258]])

In [24]:
# Scores at every anchor for the outer split with index 0 and the inner split with index 0
default[0, 0, :, train_val_test]
# from some point on the values are NaN because the maximum training-sample size has been reached

array([0.16935484, 0.16935484, 0.16129032, 0.16129032, 0.17741935,
       0.19354839, 0.19354839, 0.19354839, 0.19354839, 0.18548387,
       0.16129032, 0.18548387, 0.12903226, 0.10483871, 0.10483871,
       0.10483871, 0.10483871, 0.09677419, 0.10483871, 0.10483871,
       0.10483871, 0.10483871, 0.10483871, 0.05645161, 0.08064516,
       0.06451613, 0.07258065, 0.07258065, 0.07258065, 0.04032258,
       0.08064516, 0.06451613, 0.05645161, 0.05645161, 0.08870968,
       0.05645161, 0.05645161, 0.0483871 , 0.04032258, 0.03225806,
       0.04032258, 0.04032258, 0.02419355, 0.00806452, 0.04032258,
       0.01612903, 0.02419355, 0.01612903, 0.04032258,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan])

In [None]:
# Mean scores at every anchor, averaged over all splits, on the validation set - this is effectively the data the LC plot is drawn from
np.nanmean(default[:, :, :, train_val_test], axis=(0, 1))

  np.nanmean(default[:, :, :, train_val_test], axis=(0, 1))


array([0.18      , 0.1816129 , 0.17290323, 0.17258065, 0.16935484,
       0.15612903, 0.14870968, 0.14354839, 0.13612903, 0.13677419,
       0.14193548, 0.13193548, 0.12548387, 0.11548387, 0.11935484,
       0.11129032, 0.10645161, 0.10064516, 0.09774194, 0.09096774,
       0.09322581, 0.08      , 0.08225806, 0.07322581, 0.07258065,
       0.07032258, 0.07290323, 0.06322581, 0.06096774, 0.05419355,
       0.05354839, 0.04870968, 0.04258065, 0.04225806, 0.04064516,
       0.03645161, 0.03580645, 0.03129032, 0.03193548, 0.02967742,
       0.03064516, 0.02741935, 0.02580645, 0.02322581, 0.02387097,
       0.02032258, 0.02032258, 0.02064516, 0.02      ,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan])

In [None]:
# To build the learning curve averaged over the best hyperparameters:
# add the three score arrays element-wise, average over both split axes, divide by 3.
best_scores_sum = (
    max1[:, :, :, train_val_test]
    + max2[:, :, :, train_val_test]
    + max3[:, :, :, train_val_test]
)
np.nanmean(best_scores_sum, axis=(0, 1)) / 3

  np.nanmean(max1[:, :, :, train_val_test] + max2[:, :, :, train_val_test] + max3[:, :, :, train_val_test], axis=(0, 1)) / 3


array([0.3731401 , 0.37681159, 0.2821256 , 0.28995169, 0.28096618,
       0.28038647, 0.27729469, 0.17342995, 0.18193237, 0.16975845,
       0.16415459, 0.15951691, 0.15942029, 0.15130435, 0.14531401,
       0.14347826, 0.14376812, 0.1410628 , 0.14048309, 0.1315942 ,
       0.1257971 , 0.12      , 0.12280193, 0.11536232, 0.11072464,
       0.10898551, 0.09980676, 0.09487923, 0.09188406, 0.08753623,
       0.08676329, 0.07613527, 0.07004831, 0.06541063, 0.06057971,
       0.05777778, 0.05623188, 0.05256039, 0.04869565, 0.04444444,
       0.03797101, 0.03884058, 0.03487923, 0.03381643, 0.03207729,
       0.02975845, 0.02647343, 0.02714976, 0.02705314,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan])

### 3. Policzenie całek dla krzywych LC

In [27]:
# Computes the learning-curve integrals separately for train / valid / test
integrals = [[], [], []]
# One (row label, split index, exception) entry per curve that could not be integrated,
# so that NaN integrals can be traced back to their cause
integral_errors = []
for idx, results in transformed_df["Results"].items():
    # split_idx: 0 - train, 1 - validation, 2 - test.
    # A dedicated loop name keeps the notebook-level `train_val_test` selector
    # (set to 1 in an earlier cell) from being overwritten by this loop.
    for split_idx in range(3):
        try:
            y_values = np.nanmean(results[:, :, :, split_idx], axis=(0, 1))
            value = LC_integral_value(y_values)
        except Exception as exc:
            # Best effort: the row still gets NaN, but the failure is recorded
            integral_errors.append((idx, split_idx, exc))
            value = np.nan
        integrals[split_idx].append(value)

# Add the integral columns to the data frame
transformed_df["Integral_train"] = integrals[0]
transformed_df['Integral_valid'] = integrals[1]
transformed_df['Integral_test'] = integrals[2]

  y_values = np.nanmean(results[:, :, :, train_val_test], axis=(0, 1))


In [28]:
transformed_df

Unnamed: 0,Dataset Name,Preprocessing Method,Hyperparameters,Results,Integral_train,Integral_valid,Integral_test
0,banknote-authentication,unprocessed,"{'min_samples_split': 2, 'min_samples_leaf': 1...","[[[[0. 0.23387097 0.19565217], [0. ...",0.000000,0.039602,0.042920
1,banknote-authentication,unprocessed,"{'min_samples_split': 6, 'min_samples_leaf': 1...","[[[[0. 0.16935484 0.10144928], [0. ...",0.002975,0.037933,0.041737
2,banknote-authentication,unprocessed,"{'min_samples_split': 28, 'min_samples_leaf': ...","[[[[0.5 0.44354839 0.44202899], [0.4444...",0.037805,0.067238,0.071462
3,banknote-authentication,unprocessed,"{'min_samples_split': 20, 'min_samples_leaf': ...","[[[[0.5 0.44354839 0.44202899], [0.4444...",0.026701,0.062341,0.066377
4,banknote-authentication,unprocessed,"{'min_samples_split': 50, 'min_samples_leaf': ...","[[[[0.5 0.44354839 0.44202899], [0.5 ...",0.112081,0.131164,0.153786
...,...,...,...,...,...,...,...
485,credit-g-mod,med_knn_none,"{'min_samples_split': 15, 'min_samples_leaf': ...","[[[[0.25 0.3 0.3 ], [0.27777778 0.3 0....",0.209097,0.300858,0.295495
486,credit-g-mod,med_knn_none,"{'min_samples_split': 52, 'min_samples_leaf': ...","[[[[0.5 0.3 0.3], [0.44444444 0.3 0.3 ...",0.221175,0.294862,0.291626
487,credit-g-mod,med_knn_none,"{'min_samples_split': 8, 'min_samples_leaf': 1...","[[[[0.125 0.36666667 0.32 ], [0.1111...",0.059128,0.317352,0.312016
488,credit-g-mod,med_knn_none,"{'min_samples_split': 14, 'min_samples_leaf': ...","[[[[0.0625 0.24444444 0.34 ], [0.2222...",0.104053,0.312906,0.299889
