In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
# Import dataset

df = pd.read_csv('~/Marek/2_PhD/ISI/Work/data/new_isi_data.csv')
# Remove unnamed column: ,index_col=0
df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,channel_name,resected,onset_channel,outcome,HFO_80.0-250.0,HFO_250.0-600.0,spike_rate,power_1.0-4.0,...,lin_corr_median_80.0-250.0,ren_80.0-250.0,phase_sync_80.0-250.0,coherence_80.0-250.0,lin_corr_median_250.0-600.0,ren_250.0-600.0,phase_sync_250.0-600.0,coherence_250.0-600.0,segm_type,segm_number
0,0,583,ra1,0,NON_SOZ,11.0,11,34,22,0.533375,...,0.995615,0.003025,0.985229,0.997413,0.990955,0.004121,0.970733,0.994151,0,0
1,1,583,ra2,0,NON_SOZ,11.0,16,40,49,0.518898,...,0.999169,0.000649,0.996365,0.999503,0.997006,0.001574,0.987965,0.997992,0,0
2,2,583,ra3,0,NON_SOZ,11.0,20,35,61,0.466505,...,0.999172,0.000648,0.996397,0.999509,0.996981,0.001719,0.987949,0.997969,0,0
3,3,583,ra4,0,NON_SOZ,11.0,27,37,66,0.444008,...,0.997592,0.001411,0.991613,0.998744,0.99482,0.002903,0.984822,0.997324,0,0
4,4,583,ra5,0,NON_SOZ,11.0,11,58,63,0.588096,...,0.994437,0.003369,0.981503,0.996723,0.996824,0.001831,0.987556,0.997855,0,0


In [3]:
# Drop index column

df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,patient_id,channel_name,resected,onset_channel,outcome,HFO_80.0-250.0,HFO_250.0-600.0,spike_rate,power_1.0-4.0,power_4.0-8.0,...,lin_corr_median_80.0-250.0,ren_80.0-250.0,phase_sync_80.0-250.0,coherence_80.0-250.0,lin_corr_median_250.0-600.0,ren_250.0-600.0,phase_sync_250.0-600.0,coherence_250.0-600.0,segm_type,segm_number
0,583,ra1,0,NON_SOZ,11.0,11,34,22,0.533375,0.781969,...,0.995615,0.003025,0.985229,0.997413,0.990955,0.004121,0.970733,0.994151,0,0
1,583,ra2,0,NON_SOZ,11.0,16,40,49,0.518898,0.780688,...,0.999169,0.000649,0.996365,0.999503,0.997006,0.001574,0.987965,0.997992,0,0
2,583,ra3,0,NON_SOZ,11.0,20,35,61,0.466505,0.740345,...,0.999172,0.000648,0.996397,0.999509,0.996981,0.001719,0.987949,0.997969,0,0
3,583,ra4,0,NON_SOZ,11.0,27,37,66,0.444008,0.721173,...,0.997592,0.001411,0.991613,0.998744,0.99482,0.002903,0.984822,0.997324,0,0
4,583,ra5,0,NON_SOZ,11.0,11,58,63,0.588096,0.937518,...,0.994437,0.003369,0.981503,0.996723,0.996824,0.001831,0.987556,0.997855,0,0


In [4]:
# Drop "nan" channels across all segments

df_nans = df[df.isnull().any(axis=1)]
for pts in df_nans.patient_id.unique():
    for chns in list(df_nans.loc[df_nans.patient_id == pts,'channel_name'].unique()):
        df = df.drop(df[(df.patient_id == pts) & (df.channel_name == chns)].index)

In [5]:
# Dataframe preparation, target column 

outcome_11 = 11

df = df[df.outcome <= outcome_11]
mask = df.onset_channel == 'NON_SOZ'
df.loc[mask, 'onset_channel'] = 0
mask = df.onset_channel == 'SOZ'
df.loc[mask, 'onset_channel'] = 1
tmp = df['resected'].values * df['onset_channel'].values
df['target'] = tmp.astype(int)
df = df.rename(columns={'spike_rate':'spike_rate_0.0-0.0',
                        'pse':'pse_0.0-0.0',
                        'pac':'pac_0.0-0.0',
                        'fac':'fac_0.0-0.0',
                        'lfr':'lfr_0.0-0.0'})

features = [x for x in df if any(c.isdigit() for c in x)]
features.sort()

In [6]:
# Drop no target pts

target = 'target'
for pts in df.patient_id.unique():
    if sum(df.loc[df.patient_id == pts,target]) == 0:
        #print('patient',str(pts),'does not have target - droping this patient')
        df = df[df.patient_id != pts]

In [7]:
# Keep only segm_type = 4 

df = df.loc[df['segm_type'] == 4]
print(df.shape)
df.head()

(1959, 68)


Unnamed: 0,patient_id,channel_name,resected,onset_channel,outcome,HFO_80.0-250.0,HFO_250.0-600.0,spike_rate_0.0-0.0,power_1.0-4.0,power_4.0-8.0,...,ren_80.0-250.0,phase_sync_80.0-250.0,coherence_80.0-250.0,lin_corr_median_250.0-600.0,ren_250.0-600.0,phase_sync_250.0-600.0,coherence_250.0-600.0,segm_type,segm_number,target
34674,583,ra1,0,0,11.0,19,56,50,0.526201,0.474988,...,0.008171,0.962278,0.992315,0.951701,0.015386,0.894617,0.968141,4,0,0
34675,583,ra2,0,0,11.0,35,62,86,0.560877,0.560381,...,0.001929,0.989986,0.99841,0.982513,0.007849,0.952962,0.988048,4,0,0
34676,583,ra3,0,0,11.0,36,66,101,0.558763,0.576171,...,0.002038,0.989737,0.998386,0.982065,0.007308,0.951814,0.987657,4,0,0
34677,583,ra4,0,0,11.0,47,66,102,0.554024,0.565439,...,0.005044,0.975965,0.995656,0.975849,0.009962,0.941062,0.984107,4,0,0
34678,583,ra5,0,0,11.0,32,75,96,0.649021,0.658895,...,0.009224,0.94479,0.987954,0.980963,0.007094,0.949647,0.986945,4,0,0


In [8]:
# Number of unique patients

print(len(df.patient_id.unique()))
patients = df.patient_id.unique()
patients = np.sort(patients)
patients

18


array([  61,   63,   71,   77,   82,   89,   93,  583,  657,  717,  723,
        953,  965, 1002, 1021, 1041, 1043, 1630])

In [9]:
# Scale selected features with standard scaler (z-score)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder        
scaler = StandardScaler()
for pts in df.patient_id.unique(): 
    df.loc[df.patient_id == pts,features] = scaler.fit_transform(df.loc[df.patient_id == pts,features])

In [10]:
# Create a list of the columns
columns_to_keep = ['patient_id'] + [col for col in df.columns[5:65]] + ['target']

# A new DataFrame with only those columns
lazy_df = df[columns_to_keep]

In [11]:
# View lazy_df

lazy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1959 entries, 34674 to 40452
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   patient_id                   1959 non-null   int64  
 1   HFO_80.0-250.0               1959 non-null   float64
 2   HFO_250.0-600.0              1959 non-null   float64
 3   spike_rate_0.0-0.0           1959 non-null   float64
 4   power_1.0-4.0                1959 non-null   float64
 5   power_4.0-8.0                1959 non-null   float64
 6   power_8.0-12.0               1959 non-null   float64
 7   power_12.0-20.0              1959 non-null   float64
 8   power_20.0-45.0              1959 non-null   float64
 9   power_65.0-80.0              1959 non-null   float64
 10  power_80.0-250.0             1959 non-null   float64
 11  power_250.0-600.0            1959 non-null   float64
 12  hlx_0.0-0.0                  1959 non-null   float64
 13  hlx_1.0-4.0  

In [None]:
# Archive legacy Lazy model 

'''
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from IPython.display import display
from sklearn.model_selection import LeaveOneGroupOut
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

# Define the group column
group = 'patient_id'

# Split the data into X and y
X = lazy_df.drop(columns=target)
y = lazy_df[target]

logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(X, y, groups=lazy_df[group]):
    X_train, X_test = X.iloc[train_index].drop(columns=[group]), X.iloc[test_index].drop(columns=[group])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    selected_patient_id = lazy_df.iloc[test_index][group].unique()
    print("Selected patient ID: ", selected_patient_id)
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models,predictions = clf.fit(X_train, X_test, y_train, y_test)
'''

In [12]:
# Revised Lazy model, including display object, a library for handling imbalanced datasets in machine learning

import pandas as pd
pd.set_option('display.max_colwidth', -1)
from IPython.display import display
from sklearn.model_selection import LeaveOneGroupOut
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Define the group column
group = 'patient_id'

# Split the data into X and y
X = lazy_df.drop(columns=target)
y = lazy_df[target]

logo = LeaveOneGroupOut()

for train_index, test_index in logo.split(X, y, groups=lazy_df[group]):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    selected_patient_id = lazy_df.iloc[test_index][group].unique()
    print("Selected patient ID: ", selected_patient_id)
    print("X_train shape: ", X_train.shape)
    print("X_test shape: ", X_test.shape)
    print("y_train shape: ", y_train.shape)
    print("y_test shape: ", y_test.shape)
    # oversampling the minority class using SMOTE
    sm = SMOTE(sampling_strategy='auto')
    X_train, y_train = sm.fit_resample(X_train, y_train)
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models,predictions = clf.fit(X_train.drop(columns=[group]), X_test.drop(columns=[group]), y_train, y_test)
    display(models.style.set_properties(**{'font-size': '9pt'}))


Selected patient ID:  [61]
X_train shape:  (1809, 61)
X_test shape:  (150, 61)
y_train shape:  (1809,)
y_test shape:  (150,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.44it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,0.813333,0.8487,0.8487,0.859006,0.060872
RidgeClassifier,0.813333,0.8487,0.8487,0.859006,0.028867
CalibratedClassifierCV,0.833333,0.807329,0.807329,0.8715,1.357625
LinearSVC,0.82,0.800236,0.800236,0.862496,0.386101
BaggingClassifier,0.86,0.717494,0.717494,0.885972,1.206248
NuSVC,0.853333,0.713948,0.713948,0.881586,0.40016
NearestCentroid,0.733333,0.702128,0.702128,0.802233,0.013656
RandomForestClassifier,0.913333,0.693853,0.693853,0.919058,1.349251
LogisticRegression,0.806667,0.689125,0.689125,0.85094,0.055938
RidgeClassifierCV,0.8,0.685579,0.685579,0.846538,0.029601


Selected patient ID:  [63]
X_train shape:  (1839, 61)
X_test shape:  (120, 61)
y_train shape:  (1839,)
y_test shape:  (120,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.35it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CalibratedClassifierCV,0.816667,0.905983,0.905983,0.879178,1.379496
LinearSVC,0.8,0.897436,0.897436,0.868571,0.387227
LogisticRegression,0.783333,0.888889,0.888889,0.857813,0.044806
LinearDiscriminantAnalysis,0.775,0.884615,0.884615,0.852372,0.037469
PassiveAggressiveClassifier,0.708333,0.850427,0.850427,0.807176,0.018286
XGBClassifier,0.908333,0.790598,0.790598,0.934,0.364334
RidgeClassifierCV,0.783333,0.726496,0.726496,0.857619,0.041256
RidgeClassifier,0.783333,0.726496,0.726496,0.857619,0.015988
Perceptron,0.758333,0.713675,0.713675,0.841436,0.0162
BernoulliNB,0.75,0.709402,0.709402,0.835951,0.015481


Selected patient ID:  [71]
X_train shape:  (1790, 61)
X_test shape:  (169, 61)
y_train shape:  (1790,)
y_test shape:  (169,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.50it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BaggingClassifier,0.970414,0.887805,0.887805,0.973691,1.152354
AdaBoostClassifier,0.91716,0.860366,0.860366,0.93818,0.990874
SGDClassifier,0.83432,0.817683,0.817683,0.887017,0.073406
GaussianNB,0.704142,0.75061,0.75061,0.801207,0.012548
LinearDiscriminantAnalysis,0.804734,0.705488,0.705488,0.867863,0.05378
RidgeClassifierCV,0.804734,0.705488,0.705488,0.867863,0.042861
RidgeClassifier,0.804734,0.705488,0.705488,0.867863,0.016789
PassiveAggressiveClassifier,0.781065,0.693293,0.693293,0.852829,0.020796
XGBClassifier,0.952663,0.684756,0.684756,0.956462,0.32363
LGBMClassifier,0.934911,0.67561,0.67561,0.945256,0.163309


Selected patient ID:  [77]
X_train shape:  (1791, 61)
X_test shape:  (168, 61)
y_train shape:  (1791,)
y_test shape:  (168,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.37it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.988095,0.993976,0.993976,0.990043,0.185447
RandomForestClassifier,0.982143,0.990964,0.990964,0.985888,1.378026
ExtraTreesClassifier,0.97619,0.987952,0.987952,0.981998,0.298851
XGBClassifier,0.970238,0.98494,0.98494,0.978278,0.368887
NuSVC,0.946429,0.972892,0.972892,0.964226,0.388267
LogisticRegression,0.910714,0.954819,0.954819,0.943846,0.050304
CalibratedClassifierCV,0.89881,0.948795,0.948795,0.937037,1.331754
LinearSVC,0.892857,0.945783,0.945783,0.933617,0.40279
AdaBoostClassifier,0.880952,0.939759,0.939759,0.92674,1.003282
RidgeClassifierCV,0.85119,0.924699,0.924699,0.909273,0.046118


Selected patient ID:  [82]
X_train shape:  (1874, 61)
X_test shape:  (85, 61)
y_train shape:  (1874,)
y_test shape:  (85,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.30it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTreeClassifier,0.776471,0.562039,0.562039,0.78757,0.281519
KNeighborsClassifier,0.694118,0.55344,0.55344,0.734602,0.014535
AdaBoostClassifier,0.823529,0.550369,0.550369,0.811351,1.043661
PassiveAggressiveClassifier,0.611765,0.54484,0.54484,0.674468,0.020714
BernoulliNB,0.741176,0.541769,0.541769,0.763439,0.015367
GaussianNB,0.6,0.538084,0.538084,0.665079,0.013057
NearestCentroid,0.658824,0.53317,0.53317,0.708742,0.014265
LGBMClassifier,0.858824,0.531941,0.531941,0.822107,0.189023
CalibratedClassifierCV,0.717647,0.528256,0.528256,0.747227,1.367205
LogisticRegression,0.717647,0.528256,0.528256,0.747227,0.048028


Selected patient ID:  [89]
X_train shape:  (1789, 61)
X_test shape:  (170, 61)
y_train shape:  (1789,)
y_test shape:  (170,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.50it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.776471,0.704475,0.704475,0.838732,0.015113
NearestCentroid,0.676471,0.652006,0.652006,0.769593,0.013811
DecisionTreeClassifier,0.888235,0.64429,0.64429,0.906756,0.254235
GaussianNB,0.605882,0.614969,0.614969,0.716288,0.013312
BaggingClassifier,0.941176,0.612654,0.612654,0.937155,1.275943
Perceptron,0.764706,0.579475,0.579475,0.829427,0.020118
AdaBoostClassifier,0.852941,0.566358,0.566358,0.882829,1.005267
SGDClassifier,0.623529,0.564815,0.564815,0.730602,0.039943
RandomForestClassifier,0.935294,0.550154,0.550154,0.928125,1.320373
LGBMClassifier,0.935294,0.550154,0.550154,0.928125,0.147638


Selected patient ID:  [93]
X_train shape:  (1799, 61)
X_test shape:  (160, 61)
y_train shape:  (1799,)
y_test shape:  (160,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.41it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,0.89375,0.784632,0.784632,0.919034,0.055743
RidgeClassifierCV,0.89375,0.784632,0.784632,0.919034,0.049875
RidgeClassifier,0.89375,0.784632,0.784632,0.919034,0.017633
ExtraTreeClassifier,0.89375,0.784632,0.784632,0.919034,0.013537
LogisticRegression,0.8875,0.781385,0.781385,0.91511,0.043808
CalibratedClassifierCV,0.88125,0.778139,0.778139,0.911196,1.329264
LinearSVC,0.86875,0.771645,0.771645,0.903386,0.36562
AdaBoostClassifier,0.85625,0.765152,0.765152,0.895577,0.996627
NearestCentroid,0.6875,0.677489,0.677489,0.783989,0.0138
BernoulliNB,0.68125,0.674242,0.674242,0.77951,0.01496


Selected patient ID:  [583]
X_train shape:  (1872, 61)
X_test shape:  (87, 61)
y_train shape:  (1872,)
y_test shape:  (87,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.30it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTreeClassifier,0.908046,0.952381,0.952381,0.93202,0.247707
PassiveAggressiveClassifier,0.827586,0.910714,0.910714,0.880711,0.016208
GaussianNB,0.804598,0.89881,0.89881,0.865812,0.013047
Perceptron,0.781609,0.886905,0.886905,0.850673,0.015219
NearestCentroid,0.758621,0.875,0.875,0.835249,0.013971
BernoulliNB,0.701149,0.845238,0.845238,0.795198,0.014571
KNeighborsClassifier,0.632184,0.809524,0.809524,0.743781,0.016772
ExtraTreesClassifier,0.91954,0.797619,0.797619,0.936592,0.2815
LGBMClassifier,0.896552,0.785714,0.785714,0.922154,0.176574
BaggingClassifier,0.885057,0.779762,0.779762,0.915025,1.331668


Selected patient ID:  [657]
X_train shape:  (1914, 61)
X_test shape:  (45, 61)
y_train shape:  (1914,)
y_test shape:  (45,)


100%|███████████████████████████████████████████| 29/29 [00:09<00:00,  3.21it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CalibratedClassifierCV,0.822222,0.904762,0.904762,0.863659,1.500021
LinearSVC,0.8,0.892857,0.892857,0.848,0.440921
SGDClassifier,0.755556,0.869048,0.869048,0.816223,0.114906
KNeighborsClassifier,0.711111,0.845238,0.845238,0.783494,0.01409
ExtraTreesClassifier,0.844444,0.761905,0.761905,0.874875,0.285946
SVC,0.822222,0.75,0.75,0.859829,0.232682
LinearDiscriminantAnalysis,0.8,0.738095,0.738095,0.844755,0.054027
RidgeClassifierCV,0.8,0.738095,0.738095,0.844755,0.039281
RidgeClassifier,0.8,0.738095,0.738095,0.844755,0.017003
RandomForestClassifier,0.8,0.738095,0.738095,0.844755,1.423121


Selected patient ID:  [717]
X_train shape:  (1889, 61)
X_test shape:  (70, 61)
y_train shape:  (1889,)
y_test shape:  (70,)


100%|███████████████████████████████████████████| 29/29 [00:09<00:00,  3.17it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearSVC,0.842857,0.914062,0.914062,0.873048,0.449746
CalibratedClassifierCV,0.842857,0.914062,0.914062,0.873048,1.431165
SGDClassifier,0.814286,0.898438,0.898438,0.852075,0.060518
LogisticRegression,0.8,0.890625,0.890625,0.841565,0.063465
LinearDiscriminantAnalysis,0.857143,0.846354,0.846354,0.880952,0.044712
RidgeClassifierCV,0.857143,0.846354,0.846354,0.880952,0.04115
RidgeClassifier,0.857143,0.846354,0.846354,0.880952,0.016112
AdaBoostClassifier,0.857143,0.846354,0.846354,0.880952,1.056086
SVC,0.857143,0.770833,0.770833,0.87744,0.245851
DecisionTreeClassifier,0.828571,0.755208,0.755208,0.857143,0.256893


Selected patient ID:  [723]
X_train shape:  (1873, 61)
X_test shape:  (86, 61)
y_train shape:  (1873,)
y_test shape:  (86,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.26it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PassiveAggressiveClassifier,0.953488,0.973333,0.973333,0.956429,0.017673
SGDClassifier,0.918605,0.953333,0.953333,0.926436,0.038151
LogisticRegression,0.953488,0.934545,0.934545,0.955112,0.055471
LinearSVC,0.94186,0.927879,0.927879,0.944756,0.40869
CalibratedClassifierCV,0.94186,0.927879,0.927879,0.944756,1.447814
KNeighborsClassifier,0.755814,0.86,0.86,0.795565,0.014519
XGBClassifier,0.94186,0.850303,0.850303,0.940669,0.341926
GaussianNB,0.837209,0.829091,0.829091,0.856831,0.012943
RidgeClassifierCV,0.860465,0.803636,0.803636,0.872508,0.030579
RidgeClassifier,0.860465,0.803636,0.803636,0.872508,0.015983


Selected patient ID:  [953]
X_train shape:  (1871, 61)
X_test shape:  (88, 61)
y_train shape:  (1871,)
y_test shape:  (88,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.30it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.75,0.864198,0.864198,0.806746,0.01432
BernoulliNB,0.647727,0.808642,0.808642,0.727385,0.015256
LinearDiscriminantAnalysis,0.886364,0.80776,0.80776,0.901224,0.052294
RidgeClassifierCV,0.886364,0.80776,0.80776,0.901224,0.031343
RidgeClassifier,0.886364,0.80776,0.80776,0.901224,0.016484
AdaBoostClassifier,0.875,0.801587,0.801587,0.893011,1.042586
CalibratedClassifierCV,0.875,0.801587,0.801587,0.893011,1.424652
PassiveAggressiveClassifier,0.863636,0.795414,0.795414,0.884888,0.018528
LogisticRegression,0.863636,0.795414,0.795414,0.884888,0.042484
SGDClassifier,0.852273,0.789242,0.789242,0.876831,0.040056


Selected patient ID:  [965]
X_train shape:  (1873, 61)
X_test shape:  (86, 61)
y_train shape:  (1873,)
y_test shape:  (86,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.28it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Perceptron,0.825581,0.909639,0.909639,0.879211,0.016423
SGDClassifier,0.813953,0.903614,0.903614,0.871684,0.041419
LinearSVC,0.802326,0.89759,0.89759,0.864102,0.42808
LinearDiscriminantAnalysis,0.802326,0.89759,0.89759,0.864102,0.047864
CalibratedClassifierCV,0.802326,0.89759,0.89759,0.864102,1.44436
RidgeClassifierCV,0.802326,0.89759,0.89759,0.864102,0.031586
RidgeClassifier,0.802326,0.89759,0.89759,0.864102,0.016634
LogisticRegression,0.802326,0.89759,0.89759,0.864102,0.044621
PassiveAggressiveClassifier,0.709302,0.849398,0.849398,0.800748,0.019558
AdaBoostClassifier,0.837209,0.75502,0.75502,0.88513,1.044186


Selected patient ID:  [1002]
X_train shape:  (1892, 61)
X_test shape:  (67, 61)
y_train shape:  (1892,)
y_test shape:  (67,)


100%|███████████████████████████████████████████| 29/29 [00:09<00:00,  3.21it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SGDClassifier,0.80597,0.893443,0.893443,0.844847,0.079239
XGBClassifier,0.895522,0.79235,0.79235,0.904653,0.345763
NuSVC,0.865672,0.775956,0.775956,0.882556,0.425918
SVC,0.865672,0.775956,0.775956,0.882556,0.266377
BaggingClassifier,0.865672,0.775956,0.775956,0.882556,1.363916
Perceptron,0.865672,0.775956,0.775956,0.882556,0.014874
KNeighborsClassifier,0.716418,0.769126,0.769126,0.77658,0.014576
RidgeClassifierCV,0.835821,0.759563,0.759563,0.861068,0.047358
RidgeClassifier,0.835821,0.759563,0.759563,0.861068,0.017998
LinearDiscriminantAnalysis,0.835821,0.759563,0.759563,0.861068,0.049926


Selected patient ID:  [1021]
X_train shape:  (1900, 61)
X_test shape:  (59, 61)
y_train shape:  (1900,)
y_test shape:  (59,)


100%|███████████████████████████████████████████| 29/29 [00:09<00:00,  3.11it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,0.898305,0.848916,0.848916,0.894314,1.061138
Perceptron,0.864407,0.840108,0.840108,0.864407,0.017078
SGDClassifier,0.847458,0.812331,0.812331,0.846197,0.052517
GaussianNB,0.813559,0.803523,0.803523,0.81719,0.012758
BernoulliNB,0.779661,0.794715,0.794715,0.787313,0.017425
NuSVC,0.864407,0.79336,0.79336,0.85547,0.428342
LogisticRegression,0.864407,0.79336,0.79336,0.85547,0.046306
LinearDiscriminantAnalysis,0.847458,0.781165,0.781165,0.839539,0.060017
RidgeClassifierCV,0.847458,0.781165,0.781165,0.839539,0.030554
RidgeClassifier,0.847458,0.781165,0.781165,0.839539,0.017616


Selected patient ID:  [1041]
X_train shape:  (1852, 61)
X_test shape:  (107, 61)
y_train shape:  (1852,)
y_test shape:  (107,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.29it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SGDClassifier,0.766355,0.682349,0.682349,0.788725,0.059804
PassiveAggressiveClassifier,0.728972,0.634615,0.634615,0.757696,0.018797
KNeighborsClassifier,0.766355,0.630838,0.630838,0.78265,0.015422
CalibratedClassifierCV,0.757009,0.625343,0.625343,0.775722,1.369294
LinearSVC,0.747664,0.619849,0.619849,0.768795,0.474968
LogisticRegression,0.747664,0.619849,0.619849,0.768795,0.048814
NuSVC,0.766355,0.605082,0.605082,0.778842,0.418489
LabelPropagation,0.803738,0.601305,0.601305,0.801109,0.469028
LabelSpreading,0.803738,0.601305,0.601305,0.801109,0.638245
LinearDiscriminantAnalysis,0.728972,0.583104,0.583104,0.751669,0.052575


Selected patient ID:  [1043]
X_train shape:  (1888, 61)
X_test shape:  (71, 61)
y_train shape:  (1888,)
y_test shape:  (71,)


100%|███████████████████████████████████████████| 29/29 [00:09<00:00,  3.13it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PassiveAggressiveClassifier,0.549296,0.694444,0.694444,0.61822,0.020264
AdaBoostClassifier,0.732394,0.65681,0.65681,0.770122,1.062137
NuSVC,0.788732,0.641577,0.641577,0.807258,0.420869
SGDClassifier,0.676056,0.624552,0.624552,0.72739,0.052601
LogisticRegression,0.71831,0.601254,0.601254,0.756257,0.042193
SVC,0.774648,0.586022,0.586022,0.791379,0.238345
DecisionTreeClassifier,0.84507,0.578853,0.578853,0.831407,0.268785
NearestCentroid,0.676056,0.577061,0.577061,0.72501,0.014082
ExtraTreeClassifier,0.71831,0.553763,0.553763,0.751933,0.013292
CalibratedClassifierCV,0.704225,0.545699,0.545699,0.741947,1.461133


Selected patient ID:  [1630]
X_train shape:  (1788, 61)
X_test shape:  (171, 61)
y_train shape:  (1788,)
y_test shape:  (171,)


100%|███████████████████████████████████████████| 29/29 [00:08<00:00,  3.48it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PassiveAggressiveClassifier,0.74269,0.863354,0.863354,0.810778,0.021298
LinearDiscriminantAnalysis,0.883041,0.844099,0.844099,0.905974,0.052627
RidgeClassifierCV,0.877193,0.840994,0.840994,0.901983,0.02959
RidgeClassifier,0.877193,0.840994,0.840994,0.901983,0.015493
SGDClassifier,0.789474,0.79441,0.79441,0.842636,0.04454
LGBMClassifier,0.877193,0.794099,0.794099,0.900509,0.151831
XGBClassifier,0.877193,0.794099,0.794099,0.900509,0.33243
LogisticRegression,0.812865,0.759938,0.759938,0.857533,0.04206
KNeighborsClassifier,0.707602,0.750932,0.750932,0.785132,0.016056
LinearSVC,0.795322,0.750621,0.750621,0.845761,0.385993


Explanation:

1) This code uses the following libraries:

pandas: a library for data manipulation and analysis
sklearn (Scikit-learn): a library for machine learning in Python
IPython.display: a library for creating display objects for the Jupyter notebook
lazypredict: a library for making machine learning easier and faster
imblearn: a library for handling imbalanced datasets in machine learning.


2) The "group" variable:

In this code represents the "patient_id" column in the dataset and is used as a grouping variable in the LeaveOneGroupOut cross-validation method. 


3) LeaveOneGroupOut (LOGO)

Is a cross-validation method used in machine learning to evaluate the performance of a model. The idea behind LOGO is to perform leave-one-out cross-validation (LOOCV) on group labels, instead of data samples. In this method, the data is divided into groups, and in each iteration, one entire group is left out for testing, and the rest of the groups are used for training. This way, the model is trained on data from all but one group, and then evaluated on the left-out group.The method "logo.split" is used to perform the cross-validation by splitting the data into training and test sets.

More practical:

The code is using the Leave-One-Group-Out cross-validation strategy, where each patient is considered a group. For each iteration of the loop, one patient is selected for testing and the remaining patients are used for training the model. In your example, if patient 61 is selected for testing, the model is trained on the data from patients 63, 71, 77, 82, 89, 93, 583, 657, 717, 723, 953, 965, 1002, 1021, 1041, 1043, and 1630. This is repeated for all patients, so each patient is used once for testing and all other patients are used for training.


4) SMOTE (Synthetic Minority Over-sampling Technique)

Is a popular oversampling method used to handle imbalanced datasets in machine learning. Imbalanced datasets occur when the number of samples belonging to one class is much higher than the number of samples belonging to the other class. This can result in a biased model that performs poorly on the minority class. SMOTE works by creating synthetic samples of the minority class instead of simply duplicating the existing samples. The algorithm does this by selecting a sample from the minority class and finding its k nearest neighbors. Then, it generates synthetic samples by interpolating between the selected sample and its neighbors. This process helps to create more diverse and representative samples for the minority class and thus helps to balance the class distribution.
In this code, the SMOTE method is used to oversample the minority class in the training data. The SMOTE object "sm" is created using the "SMOTE" class from the imblearn library, with the sampling_strategy set to 'auto'. The "fit_resample" method is then used to apply the SMOTE method to the training data. The result is a balanced training set with an equal number of samples from each class.


5) An explanation of each evaluation metric:

Accuracy: The proportion of correctly classified samples out of the total number of samples. It is a simple and commonly used metric to evaluate the performance of a classifier. However, it may not be suitable when the classes are imbalanced.

Balanced Accuracy: Similar to accuracy, but takes into account the imbalance of the classes. It is the average of sensitivity (true positive rate) and specificity (true negative rate).

ROC AUC: Receiver Operating Characteristic (ROC) curve is a plot of the true positive rate (sensitivity) versus the false positive rate (1 - specificity) for different classification thresholds. Area Under the Curve (AUC) of the ROC curve is a metric that measures the performance of the classifier across all possible thresholds. It is a useful metric when the classes are imbalanced, and it gives an idea of how well the classifier can discriminate between positive and negative samples.

F1 Score: The harmonic mean of precision and recall. It is a metric that balances the importance of precision (the proportion of true positives out of all predicted positives) and recall (the proportion of true positives out of all actual positives). It is a good metric when the classes are imbalanced, and it gives a balanced view of the classifier's performance.
