In [53]:
import os
import numpy as np 
import pandas as pd 

from sklearn import model_selection, preprocessing 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GroupShuffleSplit,  StratifiedGroupKFold, train_test_split, cross_val_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import resample
#from iterstrat.ml_stratifiers import StratifiedGroupKFold

import xgboost as xgb
from xgboost import XGBClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from typing import Any, Dict, Union
from yellowbrick import model_selection as ms
from yellowbrick.model_selection import validation_curve

import mlflow
import mlflow.xgboost
import urllib
import zipfile

### Load data 

In [27]:
os.chdir('/home/melissa/PROJECT_DIRECTORIES/EEGFeatureExtraction/Scripts/Preprocessing/')
%run constants.py

all_features = pd.read_csv('/home/melissa/PROJECT_DIRECTORIES/EEGFeatureExtraction/Results/all_features.csv', index_col = 0)
all_features['Genotype'] = np.where(all_features['Animal_ID'].isin(WT_ls), 0, 
                                    np.where(all_features['Animal_ID'].isin(GAP_ls), 1, -1))
columns = ['Genotype'] + [col for col in all_features.columns if col != 'Genotype']
all_features = all_features[columns]
X = all_features.iloc[:, 1:].drop(['Animal_ID'], axis = 1)
y = all_features.iloc[:, 0]
animal_ids = all_features['Animal_ID']
# Initial train-test split by group
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=animal_ids))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
train_ids = np.unique(animal_ids.iloc[train_idx])
test_ids = np.unique(animal_ids.iloc[test_idx])

## Load selected features 

In [31]:
results_path = '/home/melissa/PROJECT_DIRECTORIES/EEGFeatureExtraction/Results/'
feature_df = pd.read_csv(f'{results_path}acceptedfeatures.csv')
features = feature_df['AcceptedFeatures'].to_list()

In [60]:
X_train = X_train.loc[:, features]
X_test = X_test.loc[:, features]
train_groups = animal_ids.iloc[train_idx] 

In [70]:
def hyperparameter_tuning(space: Dict[str, Union[float, int]],
                          X: pd.DataFrame, y: pd.Series, 
                          groups: pd.Series, 
                          early_stopping_rounds: int = 50, 
                          metric: callable = accuracy_score) -> Dict[str, Any]:
    
    int_vals = ['max_depth','n_estimators']
    
    space = {k: (int(val) if k in int_vals else val)
            for k, val in space.items()}
    
    model = xgb.XGBClassifier(**space, eval_metric='logloss', early_stopping_rounds=early_stopping_rounds)
    sgkf = StratifiedGroupKFold(n_splits=5)
    
    cross_val_scores = []
    for train_idx, val_idx in sgkf.split(X, y, groups):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]
        
        # Check if both classes are present in the validation set
        if len(np.unique(y_val_cv)) < 2:
            continue
        
        model.fit(X_train_cv, y_train_cv, eval_set=[(X_val_cv, y_val_cv)], verbose=False)
        preds = model.predict(X_val_cv)
        score = roc_auc_score(y_val_cv, preds)
        cross_val_scores.append(score)
    
    if not cross_val_scores:
        # If no valid cross-validation scores, return a high loss
        return {'loss': 1.0, 'status': STATUS_OK, 'model': model}
    
    mean_score = np.mean(cross_val_scores)
    
    return {'loss': -mean_score, 'status': STATUS_OK, 'model': model}

In [71]:
# Create or set the experiment
experiment_name = 'ex1'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(name=experiment_name)
mlflow.set_experiment(experiment_name=experiment_name)

<Experiment: artifact_location='file:///home/melissa/PROJECT_DIRECTORIES/EEGFeatureExtraction/Scripts/Preprocessing/mlruns/753569834781238875', creation_time=1719226914833, experiment_id='753569834781238875', last_update_time=1719226914833, lifecycle_stage='active', name='ex1', tags={}>

In [None]:
with mlflow.start_run():
    params = {'random_state': 42}
    
    rounds = [{
    'max_depth': hp.quniform('max_depth', 1, 15, 1),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'reg_alpha': hp.loguniform('reg_alpha', -3, 2),
    'reg_lambda': hp.loguniform('reg_lambda', -3, 2),
    'gamma': hp.loguniform('gamma', -10, 1),
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'random_state': 42}]

    # Perform bootstrapping on animal IDs
    bootstraps = 5  # Number of bootstrap samples
    for i in range(bootstraps):
        bootstrapped_ids = resample(train_groups.unique(), replace=True)
        bootstrapped_idx = train_groups.isin(bootstrapped_ids)
        
        X_train_bootstrap = X_train[bootstrapped_idx]
        y_train_bootstrap = y_train[bootstrapped_idx]
        train_groups_bootstrap = train_groups[bootstrapped_idx]
        
        for round_params in rounds:
            params = {**params, **round_params}
            trials = Trials()
            with mlflow.start_run(nested=True):
                best = fmin(fn=lambda space: hyperparameter_tuning(space, X_train_bootstrap, y_train_bootstrap, train_groups_bootstrap),
                            space=params,
                            algo=tpe.suggest,
                            max_evals=50,
                            trials=trials,
                            timeout=60*5)
                params = {**params, **best}
                for param, val in params.items():
                    mlflow.log_param(param, val)
                params['max_depth'] = int(params['max_depth'])
    
    xg = xgb.XGBClassifier(eval_metric='logloss', early_stopping_rounds=50, **params)
    xg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
    
    for metric in [accuracy_score, precision_score, recall_score, f1_score]:
        mlflow.log_metric(metric.__name__, metric(y_test, xg.predict(X_test)))
    
    model_info = mlflow.xgboost.log_model(xg, artifact_path='model')

 58%|████▋   | 29/50 [05:02<03:39, 10.43s/trial, best loss: -0.7512093381366937]
 32%|██▌     | 16/50 [05:05<10:49, 19.10s/trial, best loss: -0.7611073497904333]
 32%|██▌     | 16/50 [05:01<10:41, 18.87s/trial, best loss: -0.7941846297556318]
 94%|███████▌| 47/50 [05:01<00:19,  6.43s/trial, best loss: -0.7796012935427409]
  8%|▋        | 4/50 [00:45<07:57, 10.37s/trial, best loss: -0.7736417492010298]

In [None]:
#model variables point to a directory that stores information about the model 
print(experiment_name)
model_info.run_id