In [90]:
import os 
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, KFold, GroupKFold
from xgboost import XGBClassifier
import xgboost as xgb

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GroupShuffleSplit
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from typing import Any, Dict, Union
from yellowbrick import model_selection as ms
from yellowbrick.model_selection import validation_curve

import shap 
from sklearn import metrics

os.chdir('/home/melissa/PROJECT_DIRECTORIES/EEGFeatureExtraction/Scripts/XGBoost/')
%run dataset_prep.py
%run prepare_dataset.py
%run groupedkfold.py

## Import DataFrame

Make sure data is one-hot encoded for categorical features

In [10]:
conn_human = pd.read_csv('/home/melissa/RESULTS/GRAPH/HUMAN/' + 'all_patients_encoded_connectivity.csv')

In [11]:
conn_human.drop('Unnamed: 0', axis = 1, inplace = True)

In [13]:
## split patient_IDs into train and test
conn_human

Unnamed: 0,Patient_ID,Frequency,Channel,Genotype,Value,Idx,Metric_coh,Metric_pli,Metric_plv,Metric_wpli
0,P10 N1,1,3,0,0.582767,165,1,0,0,0
1,P10 N1,1,4,0,0.572689,165,1,0,0,0
2,P10 N1,1,5,0,0.907305,165,1,0,0,0
3,P10 N1,1,0,0,0.722517,165,1,0,0,0
4,P10 N1,1,1,0,0.531733,165,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1583990,P6 N2,0,7,0,0.246244,1542,0,0,0,1
1583991,P6 N2,0,8,0,0.276410,1542,0,0,0,1
1583992,P6 N2,0,9,0,0.756640,1542,0,0,0,1
1583993,P6 N2,0,6,0,0.277802,1542,0,0,0,1


## Splitting IDs into train and test

In [14]:
clean_ids_file = [ 'P10 N1','P15 N2','P16 N1',
             'P20 N1','P21 N1','P5 N1', 'P17 N1','P21 N2', 'P21 N3',
             'P27 N1', 'P24 N1', 'P26 N2', 'P28 N1', 'P28 N2', 'P6 N1', 'P7 N1', 'P1 N1']

SYNGAP_ls = ['P3 N1', 'P3 N2', 'P5 N1', 'P5 N2', 'P6 N1', 'P6 N2', 'P7 N1', 'P7 N2',
          'P10 N1', 'P10 N2', 'P15 N1', 'P15 N2', 'P16 N1', 'P16 N2', 'P20 N1', 'P20 N2', 'P26 N1', 'P26 N2']
WT_ls = ['P1 N1', 'P1 N2', 'P4 N1', 'P4 N2', 'P17 N1', 'P17 N2', 'P24 N1', 'P24 N2',
      'P27 N1', 'P27 N2', 'P21 N1', 'P21 N2', 'P21 N3', 'P23 N1', 'P23 N2',
      'P28 N1', 'P28 N2']

In [23]:
genotype = []
for animal in clean_ids_file:
    if animal in SYNGAP_ls:
        gen = 1
        genotype.append(gen)
    elif animal in WT_ls:
        gen = 0
        genotype.append(gen)

train_test_dict = {'Animal_ID': clean_ids_file, 'Genotype': genotype}
train_test_df = pd.DataFrame(data = train_test_dict)

In [43]:
splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 1)
split = splitter.split(train_test_df, groups=train_test_df['Animal_ID'])
train_inds, test_inds = next(split)

train = train_test_df.iloc[train_inds]
test = train_test_df.iloc[test_inds]

In [61]:
def custom_split(data, group_column, test_size=0.2, random_state=None):
    unique_groups = data[group_column].str[:2].unique()  # Change to [:3] if you want the first 3 letters
    train_groups, test_groups = train_test_split(unique_groups, test_size=test_size, random_state=random_state)
    
    train_mask = data[group_column].str[:2].isin(train_groups)  # Change to [:3] if you want the first 3 letters
    test_mask = ~train_mask
    
    return data[train_mask], data[test_mask]

# Assuming 'Animal_ID' is the column containing the IDs
train, test = custom_split(train_test_df, group_column='Animal_ID', test_size=0.3, random_state=7)

In [66]:
train_ids = train['Animal_ID']
train_ids

3     P20 N1
4     P21 N1
5      P5 N1
7     P21 N2
8     P21 N3
9     P27 N1
10    P24 N1
11    P26 N2
12    P28 N1
13    P28 N2
15     P7 N1
Name: Animal_ID, dtype: object

In [67]:
test_ids = test['Animal_ID']
test_ids

0     P10 N1
1     P15 N2
2     P16 N1
6     P17 N1
14     P6 N1
16     P1 N1
Name: Animal_ID, dtype: object

In [103]:
train_conn_df = conn_human.loc[conn_human['Patient_ID'].isin(train_ids)]
train_conn_df

Unnamed: 0,Patient_ID,Frequency,Channel,Genotype,Value,Idx,Metric_coh,Metric_pli,Metric_plv,Metric_wpli
204719,P21 N1,1,3,1,0.833039,57,1,0,0,0
204720,P21 N1,1,4,1,0.867293,57,1,0,0,0
204721,P21 N1,1,5,1,0.940457,57,1,0,0,0
204722,P21 N1,1,0,1,0.630438,57,1,0,0,0
204723,P21 N1,1,1,1,0.635571,57,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1562634,P20 N1,0,7,0,0.791240,632,0,0,0,1
1562635,P20 N1,0,8,0,0.714989,632,0,0,0,1
1562636,P20 N1,0,9,0,0.234917,632,0,0,0,1
1562637,P20 N1,0,6,0,0.347220,632,0,0,0,1


In [104]:
test_conn_df = conn_human.loc[conn_human['Patient_ID'].isin(test_ids)]
test_conn_df

Unnamed: 0,Patient_ID,Frequency,Channel,Genotype,Value,Idx,Metric_coh,Metric_pli,Metric_plv,Metric_wpli
0,P10 N1,1,3,0,0.582767,165,1,0,0,0
1,P10 N1,1,4,0,0.572689,165,1,0,0,0
2,P10 N1,1,5,0,0.907305,165,1,0,0,0
3,P10 N1,1,0,0,0.722517,165,1,0,0,0
4,P10 N1,1,1,0,0.531733,165,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1184394,P1 N1,0,7,1,0.218607,1634,0,0,0,1
1184395,P1 N1,0,8,1,0.171348,1634,0,0,0,1
1184396,P1 N1,0,9,1,0.199631,1634,0,0,0,1
1184397,P1 N1,0,6,1,0.376458,1634,0,0,0,1


In [98]:
group_by_patient_id = train_conn_df.groupby(['Patient_ID'])
groups_by_patient_id_list = np.array(train_conn_df['Patient_ID'].values)
groups_by_patient_id_list 

array(['P21 N1', 'P21 N1', 'P21 N1', ..., 'P20 N1', 'P20 N1', 'P20 N1'],
      dtype=object)

In [112]:
genotype_column_train = train_conn_df.pop('Genotype')  # Remove the 'Genotype' column and store it
train_conn_df.insert(1, 'Genotype', genotype_column_train)  # Insert the 'Genotype' column at the beginning
genotype_column_test = test_conn_df.pop('Genotype')  # Remove the 'Genotype' column and store it
test_conn_df.insert(1, 'Genotype', genotype_column_test)  # Insert the 'Genotype' column at the beginning

In [113]:
X_train = train_conn_df.iloc[:, 2:]
y_train = train_conn_df.iloc[:, 1]

In [114]:
n_splits = 3
group_kfold = GroupKFold(n_splits = n_splits)
print(group_kfold.get_n_splits(X_train, y_train, groups = groups_by_patient_id_list))

result = []
y_result = []
for train_idx, val_idx in group_kfold.split(X_train, y_train, groups = groups_by_patient_id_list):
    train_fold = X_train.iloc[train_idx]
    val_fold = X_train.iloc[val_idx]
    train_y_fold = y_train.iloc[train_idx]
    val_y_fold = y_train.iloc[val_idx]
    result.append((train_fold, val_fold))
    y_result.append((train_y_fold, val_y_fold))
    
train_fold_1, val_fold_1 = result[0][0],result[0][1]
train_fold_2, val_fold_2 = result[1][0],result[1][1]
train_fold_3, val_fold_3 = result[2][0],result[2][1]

y_train_fold_1, y_val_fold_1 = y_result[0][0],y_result[0][1]
y_train_fold_2, y_val_fold_2 = y_result[1][0],y_result[1][1]
y_train_fold_3, y_val_fold_3 = y_result[2][0],y_result[2][1]

3


In [115]:
y_train_fold_3

487679     0
487680     0
487681     0
487682     0
487683     0
          ..
1532394    1
1532395    1
1532396    1
1532397    1
1532398    1
Name: Genotype, Length: 360480, dtype: int64

In [116]:
X_train = train_conn_df.iloc[:, 2:]
y_train = train_conn_df.iloc[:, 1]

In [117]:
X_train

Unnamed: 0,Frequency,Channel,Value,Idx,Metric_coh,Metric_pli,Metric_plv,Metric_wpli
204719,1,3,0.833039,57,1,0,0,0
204720,1,4,0.867293,57,1,0,0,0
204721,1,5,0.940457,57,1,0,0,0
204722,1,0,0.630438,57,1,0,0,0
204723,1,1,0.635571,57,1,0,0,0
...,...,...,...,...,...,...,...,...
1562634,0,7,0.791240,632,0,0,0,1
1562635,0,8,0.714989,632,0,0,0,1
1562636,0,9,0.234917,632,0,0,0,1
1562637,0,6,0.347220,632,0,0,0,1


In [118]:
y_train

204719     1
204720     1
204721     1
204722     1
204723     1
          ..
1562634    0
1562635    0
1562636    0
1562637    0
1562638    0
Name: Genotype, Length: 550320, dtype: int64

In [119]:
options = {'max_depth': hp.quniform('max_depth', 1, 8, 1), #tree
            'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
            'subsample': hp.uniform('subsample', 0.5, 1), #stochastic
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
            'reg_alpha': hp.uniform('reg_alpha', 0, 10), 
            'reg_lambda': hp.uniform('reg_lambda', 1, 10),
            'gamma': hp.loguniform('gamma', -10, 10),
            'learning_rate': hp.loguniform('learning_rate', -7, 0), 
            'random_state': 42
          }

In [120]:
def hyperparameter_tuning(space: Dict[str, Union[float, int]],
                         X_train: pd.DataFrame, y_train: pd.Series, 
                         X_test: pd.DataFrame, y_test: pd.Series, 
                         early_stopping_rounds: int = 50, 
                         metric: callable = roc_auc_score) -> Dict[str, Any]:
    
    '''Perform hyperparameter runing for an XGBoost classifier. 
    
    This function takes a dictionary of hyperparameters, training and test data, and an optional value
    for early stopping rounds, and returns a dictionary with the loss and model resulting from 
    the tuning process. The model is trained using the training data and evaluated on the test 
    data. The loss is computed as the negative of the accuracy score.
    
    space: Dict[str, Union[float, int]]
    A dictionary of hyperparameters for the XGBoost classifier
    
    X_train: pd.DataFrame
    The training data
    
    y_train: pd.Series
    The training target
    
    X_test: pd.Dataframe
    The test data
    
    y_test: pd.Series
    The test target
    
    early_stopping rounds: int, optional 
    The number of early stopping rounds to use. The deault is 50
    
    metric: callable
    Metric to maximise. Default is accuracy
    
    Returns: 
    Dict[str, Any]
        A dictionary with the loss and model resulting from the tuning process. 
        The loss is a float, and the model is an XGBoost classifier'''
    
    int_vals = ['max_depth', 'reg_alpha']
    
    space = {k: (int(val) if k in int_vals else val)
            for k, val in space.items()}
    
    space['early_stopping_rounds'] = early_stopping_rounds
    
    model = xgb.XGBClassifier(**space)
    evaluation = [(X_train, y_train), 
                 (X_test, y_test)]
    model.fit(X_train, y_train, eval_set = evaluation, verbose = False)
    
    score = metrics.roc_auc_score(y_test, model.predict(X_test))
    return {'loss': -score, 'status': STATUS_OK, 'model': model}
    

In [121]:
trials = Trials()
best = fmin(fn = lambda space: hyperparameter_tuning(space, X_train = train_fold_1, y_train = y_train_fold_1,
                                                     X_test = val_fold_1, y_test = y_val_fold_1),
            space = options,
            algo = tpe.suggest,
            max_evals = 500,
            trials = trials)

 12%|▌    | 60/500 [21:32<2:38:00, 21.55s/trial, best loss: -0.7449438202247192]


KeyboardInterrupt: 