In this notebook we want to verify whether the hyperparameter W consistently varies with the sample size of the training set and the noise in the data. We will test this on all the datasets considered. 

In [1]:
import uci_dataset as data
import random
random.seed(10)
n_rep = 10
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import functions
import pandas as pd
import pickle
import xarray as xr
import numpy as np

In [2]:
# Set values of the hyperparameters and load the dictionary with all the datasets and class features

cv = 10
budget = 12

with open('datasets_classlabels.pkl', 'rb') as fp:
    DFs = pickle.load(fp)

The following chunks will produce the values of W over 10 repetitions for every dataset, taking different sizes for the training set.

In [3]:
# Create a dictionary with the datasets' names 

w_df = dict()

for name in DFs['names']:
    w_df[name] = []
    
    
n = len(w_df)
n_repetitions = 10
test_sizes = [0.1, 0.2, 0.33, 0.5, 0.66]  # We will study the cases where the test size is this proportion of the total

In [14]:
#create 3D dataset
xarray_3d = xr.Dataset(
    {"thyroid": (("test_size", "repetition"), np.zeros((len(test_sizes), n_repetitions)))},
    coords={
        "test_size": test_sizes,
        "repetition": list(range(1, n_repetitions + 1)),
    },)
    
results = xarray_3d.to_dataframe()

for j in range(n):
    
    name = DFs['names'][j]
    df = DFs['dfs'][j]
    ws = []
    class_feat = DFs['class_feats'][j]
    pos_class = DFs['pos_classes'][j]
    
    # If the dataset is different from thyroid first adjourn it adding a new column
    if name != 'thyroid':
        results[name] = np.zeros(len(test_sizes)*len(n_repetitions))
    
    for test_size in test_sizes:
        
        for i in range(n_repetitions):

            X_train, _ = train_test_split(df, test_size = test_size)
            w = functions.param_selection(X_train, class_feat, pos_class, cv = cv, verbosity = False)
            results.loc[(i + 1, test_size), name] = w
        print(name + ' with test size ' + str(test_size) + ' completed')


KeyboardInterrupt: 

Brutta copia:

In [11]:
DFs['class_feat'][0]

KeyError: 'class_feat'

In [14]:
list(range(1,11))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [80]:
#create 3D dataset
xarray_3d = xr.Dataset(
    {"product_A": (("year", "quarter"), np.zeros((2,4)))},
    coords={
        "year": [2021, 2022],
        "quarter": ["Q1", "Q2", "Q3", "Q4"],
        "product_B": 0,
       },
)

In [81]:
d = xarray_3d.to_dataframe()
d

Unnamed: 0_level_0,Unnamed: 1_level_0,product_A,product_B
quarter,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Q1,2021,0.0,0
Q1,2022,0.0,0
Q2,2021,0.0,0
Q2,2022,0.0,0
Q3,2021,0.0,0
Q3,2022,0.0,0
Q4,2021,0.0,0
Q4,2022,0.0,0


In [87]:
v=d.index.get_level_values('year') == '2021'

In [110]:
d.loc['Q2','product_C'] = [2,3]

In [111]:
d

Unnamed: 0_level_0,Unnamed: 1_level_0,product_A,product_B,product_C
quarter,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q1,2021,0.0,0,1.0
Q1,2022,0.0,0,1.0
Q2,2021,0.0,0,2.0
Q2,2022,0.0,0,3.0
Q3,2021,0.0,0,0.0
Q3,2022,0.0,0,0.0
Q4,2021,0.0,0,0.0
Q4,2022,0.0,0,0.0


In [72]:
 np.random.randn(2,4)

array([[-0.75063736, -1.86914845,  2.52901982, -1.37515322],
       [-0.80331296, -0.71204998,  0.22428133, -1.39845241]])

In [77]:
np.zeros((2,4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [29]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,thyroid,sdbbbv
repetition,test_size,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.1,0.0,0.0
1,0.2,0.0,0.0
1,0.33,0.0,0.0
1,0.5,0.0,0.0
1,0.66,0.0,0.0
2,0.1,0.0,0.0
2,0.2,0.0,0.0
2,0.33,0.0,0.0
2,0.5,2.0,0.0
2,0.66,0.0,0.0


In [19]:
results.loc[(2,0.5), 'thyroid'] = 2

In [28]:
results.loc[(10,0.1),'sdbbbv'] = 20

In [21]:
np.zeros((len(test_sizes), n_repetitions))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])