# Model selection

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

import matplotlib.pyplot as plt

Usiamo il dataset *Heart Disease* ([documentazione qui](https://archive.ics.uci.edu/ml/datasets/Heart+Disease)).

In [2]:
# Load the dataset from a CSV file located in the working directory.
data = pd.read_csv('heart.csv')
# Show the loaded frame as the cell output.
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Drop every row that contains at least one missing value.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that an
# earlier cell has already displayed.
data = data.dropna(axis=0)

In [4]:
# List the distinct values found in the 'target' column.
np.unique(data['target'])

array([0, 1], dtype=int64)

In [5]:
# Get the labels: pop() removes the 'target' column from `data` and returns it.
# NOTE: because pop() mutates `data`, this cell cannot be re-run without
# re-loading the CSV first.
y = data.pop('target')
# Encode the labels as one 0/1 indicator column and flatten it to a 1-D array.
# dtype=int is passed explicitly because pandas >= 2.0 returns boolean dummies
# by default, whereas the following cells (e.g. np.bincount) expect integers.
y = pd.get_dummies(y, drop_first=True, dtype=int).values.squeeze()

In [6]:
# check the class imbalance: relative frequency of each label value
np.bincount(y)/len(y)

array([0.45544554, 0.54455446])

In [7]:
# Columns treated as categorical features.
categorical = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'thal',
]

# Columns treated as numerical features.
numerical = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
    'ca',
]

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Standardise the numerical columns.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/validation/test split performed further down, so statistics of the
# held-out rows leak into the features. Consider fitting on training rows only.
X_num = StandardScaler().fit_transform(data[numerical].values)

# One-hot encode the categorical columns; drop='if_binary' keeps a single
# indicator for two-valued columns.
# The keyword that requested dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and `sparse` was removed in 1.4, so keep
# the default (sparse) output and densify explicitly: this runs on any version.
ohe = OneHotEncoder(drop='if_binary')
X_cat = ohe.fit_transform(data[categorical].values).toarray()

# Final design matrix: scaled numerical columns followed by the one-hot columns.
X = np.concatenate([X_num, X_cat], axis=1)

# Column names in the same order as the columns of X.
feature_names = numerical + ohe.get_feature_names_out(categorical).tolist()


print(X.shape, y.shape)
print(feature_names)

(303, 23) (303,)
['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca', 'sex_1', 'cp_0', 'cp_1', 'cp_2', 'cp_3', 'fbs_1', 'restecg_0', 'restecg_1', 'restecg_2', 'exang_1', 'slope_0', 'slope_1', 'slope_2', 'thal_0', 'thal_1', 'thal_2', 'thal_3']


In [10]:
# Inspect the one-hot encoded categorical block.
X_cat

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 1.],
       [1., 1., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [11]:
# Seed passed as `random_state` to the splitters below so the splits are repeatable.
rnd = 12

### Separazione Train-Val-Test
- separiamo un insieme di test e uno di validazione ($X_{test},~y_{test},~X_{val},~y_{val}$), stratificando in base alla classe

- [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html)

### Validazione iper-parametri
- scegliere 2 o 3 iper-parametri da validare. 
- definire una grid-search sugli iper-parametri scelti
- implementare la procedura di model selection
- stampare la migliore configurazione di iper-parametri
 
 
**iper-parametri**:

 - max_depth: profondità massima dell'albero
 - min_samples_split: numero minimo di esempi da considerare per dividere un nodo
 - min_samples_leaf: numero minimo di esempi per una foglia
 - max_leaf_nodes: numero massimo di foglie

 
 #### strumenti utili
- [ParameterGrid](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html#sklearn.model_selection.ParameterGrid)
- [np.linspace](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html)
- [np.logspace](https://numpy.org/doc/stable/reference/generated/numpy.logspace.html)

#### utile progress bar
- progress bar: [tqdm](https://tqdm.github.io/) (pip install tqdm)

In [14]:
!pip install tqdm



In [15]:
# Debugging aid: list the variables currently defined in the kernel namespace.
%whos

Variable                 Type             Data/Info
---------------------------------------------------
DecisionTreeClassifier   ABCMeta          <class 'sklearn.tree._cla<...>.DecisionTreeClassifier'>
KNeighborsClassifier     ABCMeta          <class 'sklearn.neighbors<...>on.KNeighborsClassifier'>
OneHotEncoder            type             <class 'sklearn.preproces<...>_encoders.OneHotEncoder'>
StandardScaler           type             <class 'sklearn.preproces<...>ng._data.StandardScaler'>
X                        ndarray          303x23: 6969 elems, type `float64`, 55752 bytes
X_cat                    ndarray          303x17: 5151 elems, type `float64`, 41208 bytes
X_num                    ndarray          303x6: 1818 elems, type `float64`, 14544 bytes
categorical              list             n=7
data                     DataFrame             age  sex  cp  trestb<...>\n[303 rows x 13 columns]
feature_names            list             n=23
np                       module           <m

In [16]:
# TODO
# Display `data` again as the cell output.
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [26]:
# Train/test split with a plain (non-stratified) shuffle splitter.
from sklearn.model_selection import ShuffleSplit

# One random split that holds out 40% of the samples as the test set.
splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.4,
    random_state=rnd,
)
splitter

ShuffleSplit(n_splits=1, random_state=12, test_size=0.4, train_size=None)

In [19]:
# X and y have already been created

array([[ 0.9521966 ,  0.76395577, -0.25633371, ...,  1.        ,
         0.        ,  0.        ],
       [-1.91531289, -0.09273778,  0.07219949, ...,  0.        ,
         1.        ,  0.        ],
       [-1.47415758, -0.09273778, -0.81677269, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.50364073,  0.70684287, -1.029353  , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.29046364, -0.09273778, -2.2275329 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.29046364, -0.09273778, -0.19835726, ...,  0.        ,
         1.        ,  0.        ]])

In [27]:
# split() only returns a generator object; indices are produced when it is iterated.
splitter.split(X)

<generator object BaseShuffleSplit.split at 0x000001DD04DC97B0>

In [28]:
# Option 1: pull the first (and, with n_splits=1, only) split out of the generator
next(splitter.split(X))

(array([292, 298, 190, 220,  13,  61,  36,  55, 129, 219,  98,  66,  19,
         30, 178, 289, 218, 267,   5, 145, 142,  62, 188, 205,   8, 185,
         40, 201, 242, 120,  45,  33, 128,  15, 231, 161,  39, 103, 168,
         17, 113,  95,  96, 293, 192, 222, 166, 126,  93, 108, 265, 230,
        147,   4,  26, 221, 291, 213, 132,  79, 288,  56,  22, 206, 154,
         72,  76, 138,  46, 164, 177, 286,  54, 115,  71, 163, 254, 180,
        210, 176, 270,   2,  28, 302, 272,   9,  69, 150,  44, 153, 196,
         97, 102, 184, 248,  57, 215, 195,  80, 148, 165, 112,   0, 301,
         53, 287, 162,  91, 114,  70, 116, 249,  18, 244, 276,  43, 179,
        197, 105, 157,  20, 290, 237, 235, 212, 183, 182, 121,  99,  84,
         59,  58, 202, 156, 299, 280, 198, 170,  60, 175, 107, 266, 181,
        245, 203,  65, 260, 225, 246,  25, 273, 187,  27, 110, 160, 240,
        173,  73, 109, 123, 134, 208, 146, 119, 158, 100,  89,  82, 269,
        204, 104,  74, 118, 141,  49, 278, 259, 130

In [29]:
# Option 2: iterate over the generator; each item is a (train indices, test indices) pair
for train_index, test_index in splitter.split(X):
    print("TRAIN: ", train_index, "TEST: ", test_index)

TRAIN:  [292 298 190 220  13  61  36  55 129 219  98  66  19  30 178 289 218 267
   5 145 142  62 188 205   8 185  40 201 242 120  45  33 128  15 231 161
  39 103 168  17 113  95  96 293 192 222 166 126  93 108 265 230 147   4
  26 221 291 213 132  79 288  56  22 206 154  72  76 138  46 164 177 286
  54 115  71 163 254 180 210 176 270   2  28 302 272   9  69 150  44 153
 196  97 102 184 248  57 215 195  80 148 165 112   0 301  53 287 162  91
 114  70 116 249  18 244 276  43 179 197 105 157  20 290 237 235 212 183
 182 121  99  84  59  58 202 156 299 280 198 170  60 175 107 266 181 245
 203  65 260 225 246  25 273 187  27 110 160 240 173  73 109 123 134 208
 146 119 158 100  89  82 269 204 104  74 118 141  49 278 259 130 241 253
 155] TEST:  [ 92  85  75 233 243  78   1 236 279 209 258 207 193  37 127 171 229 271
  21  42 172 263 264 125  48 294 200  12  86 262  67  11 268  77 228   3
 167  14 296 284 216   6  16 217 122 251 234 169   7  41 159 136 111 282
 194  34 275  35  52 214 101 2

In [30]:
# Keep the two index arrays of the split so X and y can be sliced with them.
train_idx , test_idx =  next(splitter.split(X))

In [34]:
# Rows of X selected by the training indices.
X[train_idx]

array([[ 0.40075247,  2.19177836, -0.41093757, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.29046364,  0.47839125, -0.10172985, ...,  0.        ,
         0.        ,  1.        ],
       [-0.37126932, -0.09273778,  1.13510102, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.5110413 ,  2.42022998,  0.05287401, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.39335191, -1.80612489,  1.01914812, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.40075247, -0.09273778, -0.95205107, ...,  0.        ,
         1.        ,  0.        ]])

In [31]:
# Slice features and labels with the same index arrays so rows stay aligned.
X_train = X[train_idx]
X_test = X[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

In [32]:
# Sizes of the training and test feature matrices.
X_train.shape, X_test.shape

((181, 23), (122, 23))

In [35]:
# Class proportions in the training set and in the test set.
np.bincount(y_train)/len(y_train),np.bincount(y_test)/len(y_test)

(array([0.43646409, 0.56353591]), array([0.48360656, 0.51639344]))

In [36]:
# When the dataset is imbalanced, the splitter classes to use are the ones
# whose names start with "Stratified"; they receive the labels as well.
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=rnd)

# A single split is generated, so take it directly from the generator.
train_idx, test_idx = next(splitter.split(X, y))

# Features first, then labels, sliced with the same indices.
X_train = X[train_idx]
X_test = X[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

X_train.shape, X_test.shape


((181, 23), (122, 23))

In [37]:
# Class proportions in the training set and in the test set after the stratified split.
np.bincount(y_train)/len(y_train),np.bincount(y_test)/len(y_test)

(array([0.45303867, 0.54696133]), array([0.45901639, 0.54098361]))

In [None]:
# at this point we have the train and test sets
# next we have to build the validation set


In [38]:
# Carve a validation set (40%) out of the training portion, stratified by class.
splitter = StratifiedShuffleSplit(random_state=rnd, test_size=0.4, n_splits=1)

# Rebuild the full training portion from X / y instead of re-slicing X_train in
# place: re-running this cell then always starts from the same rows rather than
# shrinking the training set a bit more on every execution.
X_trainval, y_trainval = X[train_idx], y[train_idx]

# The inner training indices get their own name so that `train_idx` keeps
# pointing at rows of X, which is what makes the line above repeatable.
fit_idx, val_idx = next(splitter.split(X_trainval, y_trainval))

X_val, y_val = X_trainval[val_idx], y_trainval[val_idx]
X_train, y_train = X_trainval[fit_idx], y_trainval[fit_idx]

X_train.shape, X_val.shape, X_test.shape


((108, 23), (73, 23), (122, 23))

In [None]:
# the train-validation-test split is now complete

### Implementare la GridSearch

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# the hyper-parameters considered here are max_depth and min_samples_leaf

In [40]:
# Size of the training set that the grid search fits on.
X_train.shape

(108, 23)

In [41]:
# Manual grid search over two DecisionTreeClassifier hyper-parameters.
# Every configuration is fitted on the training set and scored with F1 on the
# validation set; the best score and its configuration are kept.
max_depth_values = np.arange(2, 30)
min_sample_values = np.arange(4, 10)

best_score = 0
best_params = {}

for max_depth in max_depth_values:
    for min_samples in min_sample_values:

        # Seed the tree so the search gives the same result on every run
        # (an unseeded tree may break ties between equally good splits
        # differently from one fit to the next).
        dt = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_leaf=min_samples,
            random_state=rnd,
        )
        dt.fit(X_train, y_train)

        # Evaluate on the validation set, never on the test set.
        y_pred_val = dt.predict(X_val)
        score = f1_score(y_val, y_pred_val)

        # Strict '>' keeps the first configuration that reaches the best score.
        if score > best_score:
            best_score = score
            # Keys must match the estimator's parameter names exactly (no stray
            # whitespace) so the dict can be expanded as keyword arguments;
            # values are stored as plain Python ints.
            best_params = {
                'max_depth': int(max_depth),
                'min_samples_leaf': int(min_samples),
            }
    

In [42]:
# Best validation F1 score and the configuration that achieved it.
best_score, best_params

(0.8717948717948718, {'max_depth': 14, 'min_samples_leaf ': 8})

In [43]:
# Build the final classifier from the configuration selected by the grid search
# instead of hard-coding its values, so this cell stays in sync when the search
# is re-run. Key names are stripped so that stray whitespace in a key cannot
# break the keyword expansion.
final_params = {name.strip(): value for name, value in best_params.items()}
# Seeded for a repeatable final model.
dt = DecisionTreeClassifier(random_state=rnd, **final_params)

In [44]:
# Refit on training + validation rows stacked together.
X_fit = np.concatenate([X_train, X_val], axis=0)
y_fit = np.concatenate([y_train, y_val])
dt.fit(X_fit, y_fit)

DecisionTreeClassifier(max_depth=14, min_samples_leaf=8)

In [45]:
# Evaluate the refitted model on the held-out test set with the F1 score.
y_pred_test = dt.predict(X_test)
f1_score(y_test, y_pred_test)

0.7826086956521738

### libreria
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [13]:
# TODO: presumably redo the model selection with the GridSearchCV class linked above (confirm)