In [1]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow
from mlflow.models import infer_signature

from sklearn.model_selection import GridSearchCV


import sys
sys.path.append('../')

import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Classification_Helpers as helpers

In [2]:
# Show every column when a wide DataFrame is displayed (None removes the column limit)
pd.set_option("display.max_columns", None)

# Set up MLflow

In [3]:
# Start MLFlow
#!mlflow server --host 127.0.0.1 --port 8080

In [4]:
# Send all runs to the local tracking server (start it with the command in the cell above)
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Import and Concatenate Data

In [5]:
# Subject identifiers; they are used as directory names below "Features/" when the CSVs are read
subject_list = ["293", "294", "298"]
# Class labels handed to the fold-filtering helpers (note: 6 is not among them)
label_list  = [1, 2, 3, 4, 5, 7]

## Dataframes that do not depend on folds

In [6]:
def import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory):
    """
    Import and concatenate the feature datasets of all subjects.

    For every subject and each data type ("EEG", "EMG"), the files named in
    `list_of_filenames` are read from
    `<parent_directory>/Features/<subject>/<data_type>/` (files that do not
    exist are skipped) and joined column-wise. A "Subject" column holding the
    subject's position in `subject_list` is added, and the per-subject tables
    are stacked row-wise.

    Args:
    - subject_list (list): List of subject names.
    - list_of_filenames (list): Names of the feature CSV files to read.
    - parent_directory (str): Directory that contains the "Features" folder.

    Returns:
    - pd.DataFrame: Concatenated feature DataFrame without the "Unnamed: 0"
      index column, de-duplicated with `helpers.keep_first_duplicate_columns`.

    Raises:
    - ValueError: If none of the requested files exists for a subject and
      data type, or if `subject_list` is empty.
    """
    subject_feature_dfs = {}

    for subject_idx, subject in enumerate(subject_list):
        subject_df = pd.DataFrame()

        for data_type in ["EEG", "EMG"]:
            directory = os.path.join(str(parent_directory), "Features", str(subject), str(data_type))
            candidate_paths = [os.path.join(directory, file) for file in list_of_filenames]
            data_frames = [pd.read_csv(path) for path in candidate_paths if os.path.exists(path)]

            if not data_frames:
                # Fail with the offending location instead of pandas' generic
                # "No objects to concatenate".
                raise ValueError(
                    "No feature files found for subject " + str(subject)
                    + " (" + str(data_type) + ") in '" + directory
                    + "'; looked for " + str(list(list_of_filenames))
                )

            df_data_type = pd.concat(data_frames, axis=1)

            # errors="ignore": CSVs saved without an index column have no "Unnamed: 0"
            if not subject_df.empty:
                subject_df = pd.concat([subject_df, df_data_type], axis=1).drop(
                    columns=["Unnamed: 0"], errors="ignore"
                )
                subject_df = helpers.keep_first_duplicate_columns(subject_df)
            else:
                df_data_type = helpers.keep_first_duplicate_columns(df_data_type)
                subject_df = df_data_type.drop(columns=["Unnamed: 0"], errors="ignore")

        subject_df["Subject"] = subject_idx
        subject_feature_dfs[subject] = subject_df

    feature_df = pd.concat(list(subject_feature_dfs.values()), ignore_index=True)

    # For duplicate columns, only keep one
    feature_df = helpers.keep_first_duplicate_columns(feature_df)

    return feature_df

In [7]:
# Feature files that are identical for every cross-validation fold
list_of_filenames = [
    "Topological_Summary_Statistics.csv",
    "Advanced_Features.csv",
    "Signature_Statistics.csv",
]

feature_df = import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
)

# Label column of the combined table
all_labels = feature_df["Label"]

## Dataframes that DO depend on folds

In [8]:
# Features read from per-fold files, used for cross-validation
list_of_filenames = ["Vectorization_Features.csv"]
fold_dependant_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
)

# Counterpart of the same features for the final train/test split
list_of_filenames = ["Vectorization_Features_for_Final_Test.csv"]
fold_dependant_final_test_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
)

# Experiments with Single Features

In [9]:
# Column count of the fold-independent feature table
print(f"There are {len(feature_df.columns)} features in the main dataframe.")

There are 1955 features in the main dataframe.


## Save features for Data Exploration

In [11]:
# Persist the combined feature table for later data exploration.
# The DataFrame index is written too (to_csv default), so re-reading the file yields an "Unnamed: 0" column.
feature_df.to_csv("Features/All_Features.csv")

## Filter

In [12]:
# Substrings of column names to remove from the feature table.
# Alternative used in earlier experiments:
#list_of_strings_in_column_name = ["Persistence_image_Statistic_", "Persistence_Landscape_Statistic_", "Vectorization"]
# "Nothing" is a placeholder: the column count printed below is unchanged, i.e. nothing is removed.
list_of_strings_in_column_name = ["Nothing"]

feature_df = helpers.remove_columns_with_str(
    feature_df,
    list_of_strings_in_column_name,
)

print(f"There now are {len(feature_df.columns)} features in the main dataframe.")

There now are 1955 features in the main dataframe.


# Preprocess

In [13]:
# Replace missing feature values by 0 (rebinding instead of mutating in place)
feature_df = feature_df.fillna(0)

In [14]:
# Load the train / validation / test indices of the cross-validation folds
# TODO This can be in the helper file as well
train_indices, validation_indices, test_indices = helpers.load_folds(
    subject_list,
    parent_directory="",
)

## Features that do not depend on folds

In [15]:
# Select the rows (and labels) of the fold-independent features for every fold
train_features_dfs_all_folds, train_labels_all_folds = helpers.filter_dataframe_with_indices(
    feature_df,
    train_indices,
    label_list,
)
validation_features_dfs_all_folds, validation_labels_all_folds = helpers.filter_dataframe_with_indices(
    feature_df,
    validation_indices,
    label_list,
)

## Fold-dependent Features

In [16]:
# Same selection for the fold-dependent features; the second return value is discarded
# because the labels were already obtained from the fold-independent table.
train_fold_dependant_features_dfs_all_folds, _ = helpers.filter_fold_dependant_dataframe_with_indices(
    fold_dependant_feature_df,
    train_indices,
    label_list,
)
validation_fold_dependant_features_dfs_all_folds, _ = helpers.filter_fold_dependant_dataframe_with_indices(
    fold_dependant_feature_df,
    validation_indices,
    label_list,
)


## Reformat data

In [17]:
# Merge the fold-dependent features into the per-fold feature tables.
# Comment out if you want to leave out fold-dependant features
train_features_dfs_all_folds = helpers.combine_all_features(
    train_features_dfs_all_folds,
    train_fold_dependant_features_dfs_all_folds,
)
validation_features_dfs_all_folds = helpers.combine_all_features(
    validation_features_dfs_all_folds,
    validation_fold_dependant_features_dfs_all_folds,
)

In [18]:
# Get train and validation sets.
# Note: X_test / y_test hold the *validation* data of each fold, not the final test set.
# TO DO What is this step for?
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(
    train_features_dfs_all_folds,
    train_labels_all_folds,
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
)

# MLflow & Cross-Validation

In [19]:
# Reload the fold indices; this repeats the helpers.load_folds call from the preprocessing section
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory="")

In [20]:
# Concatenate the data of all folds such that the function GridSearchCV works on them.
# Layout of the combined table: [train fold 0, ..., train fold n-1, validation fold 0, ..., validation fold n-1]
n_folds = len(X_train)

train_dfs = [X_train[fold_idx] for fold_idx in range(n_folds)]
concatenated_X_train = pd.concat(train_dfs, ignore_index=True)
test_dfs = [X_test[fold_idx] for fold_idx in range(n_folds)]
concatenated_X_test = pd.concat(test_dfs, ignore_index=True)
concatenated_X = pd.concat([concatenated_X_train, concatenated_X_test], ignore_index=True)

# Flatten label by label instead of using `+`, which would add element-wise
# (rather than concatenate) if the labels were arrays instead of lists.
concatenated_y_train = [label for fold_idx in range(n_folds) for label in y_train[fold_idx]]
concatenated_y_test = [label for fold_idx in range(n_folds) for label in y_test[fold_idx]]
concatenated_y = concatenated_y_train + concatenated_y_test


In [21]:
# Define the row ranges of each fold's training and validation set inside the
# concatenated table: all training folds come first, followed by all validation folds.
n_cv_folds = len(X_train)

train_test_splits = []
current_train_start = 0
# The validation blocks start right after the last training block
current_test_start = sum(len(X_train[i]) for i in range(n_cv_folds))

for i in range(n_cv_folds):
    train_len = len(X_train[i])
    test_len = len(X_test[i])

    indices_for_training = np.arange(current_train_start, current_train_start + train_len)
    indices_for_testing = np.arange(current_test_start, current_test_start + test_len)

    train_test_splits.append((indices_for_training, indices_for_testing))

    # Advance both cursors to the beginning of the next fold's block
    current_train_start += train_len
    current_test_start += test_len

# Define a custom generator for cross-validation
class CustomCV:
    """
    Cross-validation splitter that replays precomputed (train, test) index pairs.

    It provides the `split` / `get_n_splits` methods that scikit-learn expects
    from an object passed as `cv`.
    """

    def __init__(self, train_test_splits):
        """
        Args:
        - train_test_splits (list): One (train_indices, test_indices) tuple per fold.
        """
        self.train_test_splits = train_test_splits

    def split(self, X=None, y=None, groups=None):
        """Yield the stored (train_indices, test_indices) pairs; X, y and groups are ignored."""
        for train_idx, test_idx in self.train_test_splits:
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of stored folds; all arguments are ignored and optional."""
        return len(self.train_test_splits)

# Create the custom cross-validation object; it is passed as `cv=` to GridSearchCV below
# so the search uses the predefined folds.
custom_cv = CustomCV(train_test_splits)

## Random Forest

In [36]:
import warnings

# Suppress all warnings
# NOTE(review): this filter is process-wide and stays active for every later cell.
warnings.filterwarnings('ignore')

# The grids below document the search history. Each one gets its own name so that
# earlier grids are not silently overwritten; only `param_grid` is searched.

# Parameter grid after first initial manual experiments
param_grid_initial = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4, 6]
}
# Result: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

# Another parameter grid
param_grid_second = {
    'n_estimators': [500],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1]
}
# Result: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}

# Another parameter grid for finer tuning (the one that is actually searched)
param_grid = {
    'n_estimators': [300, 400, 500, 600],
    'max_depth': [None, 50, 70],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}
# Result: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

rf = RandomForestClassifier(random_state=42)

# custom_cv makes the search use the predefined folds
grid_search = GridSearchCV(rf, param_grid, cv=custom_cv, scoring='accuracy', verbose=3)

grid_search.fit(concatenated_X, concatenated_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.951 total time=   5.0s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.939 total time=   5.2s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.956 total time=   5.2s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.954 total time=   4.9s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.957 total time=   5.1s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.951 total time=   6.7s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.937 total time=   6.8s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.95

In [49]:
import math  # NOTE(review): not used in this cell

# Evaluate one fixed Random Forest configuration on every predefined fold
params = {"random_state": 42, "n_estimators": 500, "min_samples_split": 2}
rf = RandomForestClassifier(**params)
all_accuracies = []

# After the loop, `rf` is the model fitted on the last fold and `fold` is that fold's index
for fold in range(len(X_train)):
    rf.fit(X_train[fold], y_train[fold])
    y_pred = rf.predict(X_test[fold])
    # scikit-learn's argument order is (y_true, y_pred)
    accuracy = accuracy_score(y_test[fold], y_pred)
    all_accuracies.append(accuracy)
    print("Accuracy for fold", fold + 1, ":", accuracy)

average_accuracy = np.mean(all_accuracies)
print("Average Accuracy:", average_accuracy)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.9508021390374332


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.9387755102040817


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.9547900968783638


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.9535637149028078
Accuracy for fold 5 : 0.9565217391304348
Average Accuracy: 0.9508906400306243


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [52]:
# Parameters to log: the model hyperparameters plus the feature names that were used.
# Build a *new* dict; aliasing `params` would add the "features" key to the
# hyperparameters as well and break a later RandomForestClassifier(**params).
features = X_train[0].columns
mlflow_params = {**params, "features": features}

In [53]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(mlflow_params)

    # Log the accuracy metrics of the cross-validation
    mlflow.log_metric("accuracy", average_accuracy)
    mlflow.log_metric("minimal accuracy",  np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy",  np.max(all_accuracies))

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Random Forest Sleep Data")

    # `fold` is the last fold of the cross-validation loop, i.e. the fold `rf` was fitted on.
    # Only a few rows are stored as input example: passing the whole training fold makes
    # logging very slow. A DataFrame (not `.values`) is used because the model was
    # fitted on a DataFrame with feature names.
    input_example = X_train[fold].head(5)

    # Infer the model signature
    signature = infer_signature(input_example, rf.predict(input_example))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="random_forest-sleep-data",
        signature=signature,
        input_example=input_example,
        registered_model_name="random_forest-sleep-data",
    )


KeyboardInterrupt



# XGBoost

In [29]:
# Change the labels to adequate labels for XGBoost (consecutive integers starting at 0).
# Derived from label_list so both stay in sync; for [1, 2, 3, 4, 5, 7] this gives
# {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}.
mapping = {label: encoded for encoded, label in enumerate(label_list)}

# NOTE: this rewrites y_train / y_test in place, so run the cell only once per kernel session;
# a second run would map the already encoded labels again.
for fold in range(len(X_train)):
    y_train[fold] = [mapping[num] for num in y_train[fold]]
    y_test[fold] = [mapping[num] for num in y_test[fold]]

In [23]:
# Remove the subject index from the grid-search inputs.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
concatenated_X = concatenated_X.drop(columns="Subject", errors="ignore")

In [24]:
# Define the parameter grid after first initial manual experiments for approximately finding a sweet spot (with MLFlow, code below)

# The grids below document the search history. Each one gets its own name so that
# earlier grids are not silently overwritten; only `param_grid` is searched.

# Main parameter grid for finetuning
param_grid_main = {
    "learning_rate": [0.17, 0.19, 0.21],
    "n_estimators": [250, 300],
    "max_depth": [7, 8, 9],
    "min_child_weight": [0],
    "subsample": [0.5]
}
# Result: params = {'learning_rate': 0.19, 'max_depth': 7, 'min_child_weight': 0, 'n_estimators': 300, 'subsample': 0.5}

# Another parameter grid for finer tuning
param_grid_second = {
    "learning_rate": [0.19],
    "n_estimators": [300, 400],
    "max_depth": [5, 6, 7],
    "min_child_weight": [0],
    "max_delta_step": [0, 1],
    "subsample": [0.5]
}
# Result: params = {'learning_rate': 0.19, 'max_delta_step': 1, 'max_depth': 5, 'min_child_weight': 0, 'n_estimators': 400, 'subsample': 0.5}

# Another parameter grid for finer tuning (the one that is actually searched)
param_grid = {
    "learning_rate": [0.18, 0.19, 0.2],
    "n_estimators": [400, 500],
    "max_depth": [3, 4, 5],
    "min_child_weight": [0, 1],
    "max_delta_step": [1, 2],
    "subsample": [0.5]
}
# Result:  {'learning_rate': 0.18, 'max_delta_step': 1, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 400, 'subsample': 0.5}

# concatenated_y was built before the labels were re-encoded, so it still holds the
# original labels. Recent XGBoost releases only accept class labels 0..n_classes-1,
# therefore encode them here with `mapping` unless they already are encoded.
if set(concatenated_y) <= set(mapping.values()):
    concatenated_y_xgb = list(concatenated_y)
else:
    concatenated_y_xgb = [mapping[label] for label in concatenated_y]

xb = xgb.XGBClassifier(seed=1)

# GridSearchCV
grid_search = GridSearchCV(xb, param_grid, cv=custom_cv, scoring='accuracy', verbose=3)

grid_search.fit(concatenated_X, concatenated_y_xgb)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END learning_rate=0.18, max_delta_step=1, max_depth=3, min_child_weight=0, n_estimators=400, subsample=0.5;, score=0.982 total time=  10.7s



KeyboardInterrupt



In [31]:
# Manual experiments
params = {"seed": 1, 'learning_rate': 0.18, 'max_delta_step': 1, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 400, 'subsample': 0.5}

#params = best_params
xb = xgb.XGBClassifier(**params)
all_accuracies = []
all_feature_importances = []

# After the loop, `xb` is the model fitted on the last fold and `fold` is that fold's index
for fold in range(len(X_train)):
    # Remove duplicate columns
    # NOTE(review): train uses the project helper, validation uses pandas directly;
    # both presumably keep the first occurrence - confirm they select the same columns.
    X_train[fold] = helpers.keep_first_duplicate_columns(X_train[fold])
    X_test[fold] = X_test[fold].loc[:, ~X_test[fold].columns.duplicated()]

    xb.fit(X_train[fold], y_train[fold])

    y_pred = xb.predict(X_test[fold])
    # scikit-learn's argument order is (y_true, y_pred)
    accuracy = accuracy_score(y_test[fold], y_pred)
    all_accuracies.append(accuracy)
    # Include the fold number (it was missing from the message)
    print("Accuracy for fold", fold + 1, ":", accuracy)

    # Get feature importances for the current fold
    feature_importances = xb.feature_importances_
    all_feature_importances.append(feature_importances)

average_accuracy = np.mean(all_accuracies)
print("Mean Accuracy:", average_accuracy)

Accuracy for fold 0.9807486631016042
Accuracy for fold 0.9849624060150376
Accuracy for fold 0.984930032292788
Accuracy for fold 0.9773218142548596
Accuracy for fold 0.9826086956521739
Mean Accuracy: 0.9821143222632926


In [None]:
# list_of_strings_in_column_name = ["Persistence_image_Statistic_", "Persistence_Landscape_Statistic_", "Vectorization"]

# params = {"seed": 1, "learning_rate": 0.2, "n_estimators": 300, "max_depth": 8, "min_child_weight": 0, "max_delta_step": 1, "subsample":0.5}
# Mean Accuracy: 0.9806019593752222

In [None]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters of the XGBoost experiment (`params` from the manual
    # experiments cell) together with the feature names; `mlflow_params` still
    # holds the Random Forest settings and must not be reused here.
    xgb_mlflow_params = {**params, "features": X_train[fold].columns}
    mlflow.log_params(xgb_mlflow_params)

    # Log the accuracy metrics of the cross-validation
    mlflow.log_metric("accuracy", average_accuracy)
    mlflow.log_metric("minimal accuracy",  np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy",  np.max(all_accuracies))

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "XGBoost Sleep Data")

    # `fold` is the last fold of the cross-validation loop, i.e. the fold `xb` was fitted on.
    # Only a few rows are stored as input example; the whole training fold would
    # make logging very slow.
    input_example = X_train[fold].head(5)

    # Infer the model signature from the XGBoost model (not the Random Forest `rf`)
    signature = infer_signature(input_example, xb.predict(input_example))

    # Log the model; XGBClassifier follows the scikit-learn estimator API
    model_info = mlflow.sklearn.log_model(
        sk_model=xb,
        artifact_path="xgboost-sleep-data",
        signature=signature,
        input_example=input_example,
        registered_model_name="xgboost-sleep-data",
    )

# Final Test Set

In [22]:
def concatenate_final_training_and_test_indices(train_indices, validation_indices, subject_list, label_list):
    """
    Build the final training indices.

    The new training data consists of the previous training plus the previous
    validation data.

    Args:
    - train_indices (dict): train_indices[subject]["Label_<label>"]["Fold_0"]
      holds the training indices (array, list or a single scalar index).
    - validation_indices (dict): Same layout, holding the validation indices.
    - subject_list (list): Subjects to process.
    - label_list (list): Labels to process.

    Returns:
    - dict: final_train_indices[subject]["Label_<label>"] is a NumPy array with
      the training indices followed by the validation indices.
    """
    final_train_indices = {}

    for subject in subject_list:
        # Initialize
        final_train_indices[subject] = {}

        train_indices_for_subject = train_indices[subject]
        validation_indices_for_subject = validation_indices[subject]

        for label in label_list:
            label_key = "Label_" + str(label)

            # It does not matter which fold we choose, so simply choose fold 0.
            # np.atleast_1d turns a single index (Python/NumPy scalar or 0-d array)
            # into a one-element array and leaves lists and arrays unchanged.
            train_indices_to_combine = np.atleast_1d(train_indices_for_subject[label_key]["Fold_0"])
            validation_indices_to_combine = np.atleast_1d(validation_indices_for_subject[label_key]["Fold_0"])

            final_train_indices[subject][label_key] = np.concatenate(
                (train_indices_to_combine, validation_indices_to_combine)
            )

    return final_train_indices
      
    

In [23]:
# Get final training indices (training + validation indices per subject and label)
final_train_indices = concatenate_final_training_and_test_indices(
    train_indices,
    validation_indices,
    subject_list,
    label_list,
)

In [24]:
# Concatenate ATOL features with all other features (column-wise, aligned on the row index)
feature_df = pd.concat(
    (feature_df, fold_dependant_final_test_feature_df),
    axis="columns",
)

In [25]:
# The column-wise concat above can introduce repeated column names (both tables come from
# the same kind of loader); the helper presumably keeps only the first occurrence - confirm in Utils.
feature_df = helpers.keep_first_duplicate_columns(feature_df)

In [27]:
# Get final test sets: train on the merged train+validation indices, evaluate on the test indices
X_train_final, y_train_final, X_test_final, y_test_final = helpers.create_final_input_data_dicts(
    feature_df,
    final_train_indices,
    test_indices,
    label_list,
)

In [None]:
# Final evaluation of the Random Forest: repeat training with ten seeds to
# measure how much the test accuracy depends on the random state.
final_accuracies = []

for seed in range(10):
    params = dict(
        random_state=seed,
        max_depth=None,
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=300,
    )
    rf = RandomForestClassifier(**params)
    rf.fit(X_train_final, y_train_final)

    y_pred = rf.predict(X_test_final)
    accuracy = accuracy_score(y_pred, y_test_final)
    print("Accuracy:", accuracy)
    final_accuracies.append(accuracy)

print(f"Mean accuracy {np.mean(final_accuracies)!s}, with standard deviation {np.std(final_accuracies)!s}.")

In [30]:
# Encode the final labels for XGBoost with the same mapping as the per-fold labels.
# Run only once: a second run would map the already encoded labels again.
y_train_final = list(map(mapping.__getitem__, y_train_final))
y_test_final = list(map(mapping.__getitem__, y_test_final))

In [None]:
# Final evaluation of XGBoost: repeat training with ten seeds and keep the
# feature importances of every run for the analysis below.
final_accuracies = []
importances = []

for seed in range(10):
    params = dict(
        seed=seed,
        learning_rate=0.18,
        max_delta_step=1,
        max_depth=4,
        min_child_weight=1,
        n_estimators=400,
        subsample=0.5,
    )
    xb = xgb.XGBClassifier(**params)
    xb.fit(X_train_final, y_train_final)

    y_pred = xb.predict(X_test_final)
    accuracy = accuracy_score(y_pred, y_test_final)
    print("Accuracy:", accuracy)
    final_accuracies.append(accuracy)

    feature_importances = xb.feature_importances_
    importances.append(feature_importances)

print(f"Mean accuracy {np.mean(final_accuracies)!s}, with standard deviation {np.std(final_accuracies)!s}.")

Accuracy: 0.9794344473007712
Accuracy: 0.9802913453299057


In [None]:
# Summed importance of all columns whose name contains `feature`, per seed
feature = "_Persistence_Landscape_Statistic_Mean"

# The column selection does not depend on the run, so compute it once.
# regex=False: match the name literally instead of as a regular expression.
mask = X_train_final.columns.str.contains(feature, regex=False)
# Use numpy to get the indices where the mask is True
indices = np.where(mask)[0]

importances_of_single_feature = [sum(imp[indices]) for imp in importances]


print("Mean feature importance " + str(np.mean(importances_of_single_feature)) + ", with standard deviation "+str(np.std(importances_of_single_feature)) + ".")

In [None]:
# LaTeX-ready "mean \pm std"; the raw string avoids the invalid escape sequence "\p"
print(str(round(np.mean(importances_of_single_feature), 4)) + r" \pm " + str(round(np.std(importances_of_single_feature), 4)))