In [22]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow
from mlflow.models import infer_signature
from sklearn.model_selection import GridSearchCV

import sys
sys.path.append('../')

import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Classification_Helpers as helpers

In [2]:
pd.set_option('display.max_columns', None)  # Use 'None' to display all columns

# Set up MLFlow

In [3]:
# Start MLFlow
#!mlflow server --host 127.0.0.1 --port 8080

In [4]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Import and Concatenate Data

In [5]:
subject_list = ["293", "294", "298"]
label_list  = [1, 2, 3, 4, 5, 7]

## Dataframes that do not depend on folds

In [6]:
def import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory):
    """
    Import and concatenate feature datasets for each subject.

    Args:
    - subject_list (list): List of subject names.

    Returns:
    - pd.DataFrame: Concatenated feature DataFrame.
    - list: List of all labels.
    """
    subject_feature_dfs = {}

    for subject_idx, subject in enumerate(subject_list):
        subject_feature_dfs[subject] = pd.DataFrame()

        for data_type in ["EEG", "EMG"]:
            data_frames = []

            for file in list_of_filenames:
                path = os.path.join(str(parent_directory), "Features", str(subject), str(data_type), file)
                if os.path.exists(path):
                    data_frames.append(pd.read_csv(path))

            df_both_data_types = pd.concat(data_frames, axis=1)

            if not subject_feature_dfs[subject].empty:
                subject_feature_dfs[subject] = pd.concat([subject_feature_dfs[subject], df_both_data_types], axis=1).drop(columns=['Unnamed: 0'], inplace=False)
                subject_feature_dfs[subject] = helpers.keep_first_duplicate_columns(subject_feature_dfs[subject])
            else:
                df_both_data_types = helpers.keep_first_duplicate_columns(df_both_data_types)
                subject_feature_dfs[subject] = df_both_data_types.drop(columns=['Unnamed: 0'], inplace=False)

        subject_feature_dfs[subject]["Subject"] = subject_idx

    feature_df = pd.concat(subject_feature_dfs.values(), ignore_index=True)

    # For duplicate columns, only keep one
    feature_df = helpers.keep_first_duplicate_columns(feature_df)
    
    return feature_df

In [8]:
list_of_filenames = ["Statistical_Features_KATS_Statistics.csv", "Statistical_Features_Additional_Features.csv", "Statistical_Features_Level_Shift_Features.csv", "Statistical_Features_Autocorrelation_Features.csv"]

feature_df = import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

all_labels = feature_df["Label"]

# Experiments with Single Features

In [9]:
print("There are "+str(len(feature_df.columns))+" features in the main dataframe.")

There are 56 features in the main dataframe.


## Save features for Data Exploration

In [11]:
feature_df.to_csv("Features/All_Features.csv")

## Filter

In [12]:
list_of_strings_in_column_name = ["Nothing"]

feature_df = helpers.remove_columns_with_str(feature_df, list_of_strings_in_column_name)

print("There now are "+str(len(feature_df.columns))+" features in the main dataframe.")

There now are 56 features in the main dataframe.


# Preprocess

In [13]:
feature_df.fillna(0, inplace=True)

In [14]:
# TODO This can be in the helper file as well
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory = "")

In [15]:
train_features_dfs_all_folds, train_labels_all_folds = helpers.filter_dataframe_with_indices(feature_df, train_indices, label_list)
validation_features_dfs_all_folds, validation_labels_all_folds = helpers.filter_dataframe_with_indices(feature_df, validation_indices, label_list)

In [16]:
# Get train and validation sets
# TO DO What is this step for?
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds)

# MLFLow & Cross Validation

In [17]:
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory="")

In [28]:
concatenated_X  = pd.concat([X_train[0], X_test[0]], ignore_index=True)

concatenated_y = y_train[0] + y_test[0]

## Random Forest

### Grid Search

In [33]:
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Main parameter grid
param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4, 6]
}

# Result: params = {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}

# Another parameter grid for finer tuning 
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [20, 30, 40],
    'min_samples_split': [2],
    'min_samples_leaf': [1, 2, 3]
}

# Result: params = {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


# Another parameter grid for finer tuning 
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [40, 50, 60],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Result: params =  {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}

rf = RandomForestClassifier(random_state = 42)

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', verbose = 3)

grid_search.fit(concatenated_X, concatenated_y)


best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.909 total time=   1.4s
[CV 2/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.916 total time=   1.3s
[CV 3/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.913 total time=   1.3s
[CV 4/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.914 total time=   1.4s
[CV 5/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.917 total time=   1.4s
[CV 1/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.913 total time=   2.0s
[CV 2/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.920 total time=   2.0s
[CV 3/5] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.912 total time=   2

In [18]:
import math 

params = {"random_state": 42, "n_estimators": 500, "min_samples_split": 2}
rf = RandomForestClassifier(**params)
all_accuracies = []

for fold in range(5):
    rf.fit(X_train[fold], y_train[fold])
    y_pred = rf.predict(X_test[fold])
    accuracy = accuracy_score(y_pred, y_test[fold])
    all_accuracies.append(accuracy)
    print("Accuracy for fold", fold + 1, ":", accuracy)

average_accuracy = np.mean(all_accuracies)
print("Average Accuracy:", average_accuracy)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.906951871657754


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.9119226638023631


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.9149623250807319


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.9168466522678186
Accuracy for fold 5 : 0.9119565217391304
Average Accuracy: 0.9125280069095597


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [52]:
mlflow_params = params
features = X_train[0].columns
mlflow_params["features"] = features

In [53]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(mlflow_params)

    # Log the loss metric
    mlflow.log_metric("accuracy", average_accuracy)
    mlflow.log_metric("minimal accuracy",  np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy",  np.max(all_accuracies))


    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Random Forest Sleep Data")

    # Infer the model signature
    signature = infer_signature(X_train[fold].values, rf.predict(X_train[fold].values))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="random_forest-sleep-data",
        signature=signature,
        input_example=X_train[fold],
        registered_model_name="random_forest-sleep-data",
    )


KeyboardInterrupt



## XGBoost

### GridSearch

In [49]:
# Main parameter grid for finetuning after first manual experiments
param_grid = {
    "learning_rate": [0.19, 0.21, 0.23],
    "n_estimators": [400, 500, 600],
    "max_depth": [5, 8, 15],
    "min_child_weight": [0],
    "subsample": [0.5]
}

# Result: {'learning_rate': 0.19, 'max_depth': 8, 'min_child_weight': 0, 'n_estimators': 600, 'subsample': 0.5}

# Another parameter grid for finer tuning
param_grid = {
    "learning_rate": [0.18, 0.19, 0.2],
    "n_estimators": [600],
    "max_depth": [7, 8, 9],
    "min_child_weight": [0],
    "subsample": [0.5]
}

# Result: {'learning_rate': 0.19, 'max_depth': 8, 'min_child_weight': 0, 'n_estimators': 600, 'subsample': 0.5}


xb = xgb.XGBClassifier(seed=1)

# GridSearchCV
grid_search = GridSearchCV(xb, param_grid, cv=5, scoring='accuracy', verbose = 3)

grid_search.fit(concatenated_X, concatenated_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END learning_rate=0.18, max_depth=7, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.913 total time=  32.9s
[CV 2/5] END learning_rate=0.18, max_depth=7, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.917 total time=  31.5s
[CV 3/5] END learning_rate=0.18, max_depth=7, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.921 total time=  29.6s
[CV 4/5] END learning_rate=0.18, max_depth=7, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.918 total time=  29.6s
[CV 5/5] END learning_rate=0.18, max_depth=7, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.917 total time=  35.3s
[CV 1/5] END learning_rate=0.18, max_depth=8, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.912 total time=  37.8s
[CV 2/5] END learning_rate=0.18, max_depth=8, min_child_weight=0, n_estimators=600, subsample=0.5;, score=0.917 total time=  33.8s
[CV 3/5] END learning_r

In [55]:
# Change the labels to adequate labels for XGBoost
mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}

for fold in range(5):
    y_train[fold] = [mapping[num] for num in y_train[fold]]
    y_test[fold] = [mapping[num] for num in y_test[fold]]

KeyError: 0

In [47]:
params = {"seed": 1, "learning_rate": 0.2, "n_estimators": 500, "max_depth": 15, "min_child_weight": 0, "subsample":0.5}

#params = best_params
xb = xgb.XGBClassifier(**params)
all_accuracies = []
all_feature_importances = []

for fold in range(len(X_train)):
    # Remove duplicate columns
    X_train[fold] = helpers.keep_first_duplicate_columns(X_train[fold])

    xb.fit(X_train[fold], y_train[fold])

    X_test[fold] = X_test[fold].loc[:, ~X_test[fold].columns.duplicated()]

    y_pred = xb.predict(X_test[fold])
    accuracy = accuracy_score(y_pred, y_test[fold])
    all_accuracies.append(accuracy)
    print("Accuracy for fold", accuracy)

    # Get feature importances for the current fold
    feature_importances = xb.feature_importances_
    all_feature_importances.append(feature_importances)

average_accuracy = np.mean(all_accuracies)
print("Mean Accuracy:", average_accuracy)

KeyboardInterrupt: 

In [None]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(mlflow_params)

    # Log the loss metric
    mlflow.log_metric("accuracy", average_accuracy)
    mlflow.log_metric("minimal accuracy",  np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy",  np.max(all_accuracies))


    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "XGBoost Sleep Data")

    # Infer the model signature
    signature = infer_signature(X_train[fold].values, rf.predict(X_train[fold].values))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="xgboost-sleep-data",
        signature=signature,
        input_example=X_train[fold],
        registered_model_name="xgboost-sleep-data",
    )

# Final Test Set

In [57]:
def concatenate_final_training_and_test_indices(train_indices, validation_indices, subject_list, label_list):
    """
    The new training data consists of the previous training plus the previous validation data
    """

    final_train_indices = {}

    for subject in subject_list:

        # Initialize
        final_train_indices[subject] = {}
        
        train_indices_for_subject = train_indices[subject]
        validation_indices_for_subject = validation_indices[subject]

        for label in label_list:
            # It does not matter which fold we choose, so simply choose fold 0
            if isinstance(train_indices_for_subject["Label_"+str(label)]["Fold_0"], (np.ndarray, list)):
                train_indices_to_combine = train_indices_for_subject["Label_"+str(label)]["Fold_0"]
            else:
                train_indices_to_combine = [train_indices_for_subject["Label_"+str(label)]["Fold_0"]]

            if isinstance(validation_indices_for_subject["Label_"+str(label)]["Fold_0"], (np.ndarray, list)):
                validation_indices_to_combine = validation_indices_for_subject["Label_"+str(label)]["Fold_0"]
            else:
                validation_indices_to_combine = [validation_indices_for_subject["Label_"+str(label)]["Fold_0"]]

            final_train_indices[subject]["Label_"+str(label)] = np.concatenate((train_indices_to_combine, validation_indices_to_combine))


    return final_train_indices
      
    

In [58]:
# Get final training indices
final_train_indices = concatenate_final_training_and_test_indices(train_indices, \
                            validation_indices, subject_list, label_list)

In [59]:
# Get final test sets
X_train_final, y_train_final, X_test_final, y_test_final = helpers.create_final_input_data_dicts(feature_df,          
                                        final_train_indices, test_indices, label_list)

# XGBoost

In [63]:
# Change the labels to adequate labels for XGBoost
mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}

y_train_final = [mapping[num] for num in y_train_final]
y_test_final = [mapping[num] for num in y_test_final]

In [64]:
final_accuracies = [] 

for seed in range(10):

    params = {'seed': seed, 'learning_rate': 0.19, 'max_depth': 8, 'min_child_weight': 0, 'n_estimators': 600, 'subsample': 0.5}

    xb = xgb.XGBClassifier(**params)

    xb.fit(X_train_final, y_train_final)
    y_pred = xb.predict(X_test_final)
    accuracy = accuracy_score(y_pred, y_test_final)
    print("Accuracy:", accuracy)

    final_accuracies.append(accuracy)

print("Mean accuracy " + str(np.mean(final_accuracies)) + ", with standard deviation "+str(np.std(final_accuracies)) + ".")

Accuracy: 0.9177377892030848
Accuracy: 0.9177377892030848
Accuracy: 0.921165381319623
Accuracy: 0.9177377892030848
Accuracy: 0.9143101970865467
Accuracy: 0.9177377892030848
Accuracy: 0.9160239931448158
Accuracy: 0.9185946872322194
Accuracy: 0.9203084832904884
Accuracy: 0.9160239931448158
Mean accuracy 0.9177377892030847, with standard deviation 0.0019160822429304186.


### Random Forest

In [39]:
final_accuracies = [] 

for seed in range(10):

    params =  {'random_state': seed, 'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}

    rf = RandomForestClassifier(**params)

    rf.fit(X_train_final, y_train_final)
    y_pred = rf.predict(X_test_final)
    accuracy = accuracy_score(y_pred, y_test_final)
    print("Accuracy:", accuracy)
    
    final_accuracies.append(accuracy)

print("Mean accuracy " + str(np.mean(final_accuracies)) + ", with standard deviation "+str(np.std(final_accuracies)) + ".")

Accuracy: 0.9134532990574121
Accuracy: 0.9134532990574121
Accuracy: 0.9143101970865467
Accuracy: 0.9160239931448158
Accuracy: 0.9177377892030848
Accuracy: 0.9134532990574121
Accuracy: 0.9151670951156813
Accuracy: 0.9160239931448158
Accuracy: 0.9143101970865467
Accuracy: 0.9177377892030848
Mean accuracy 0.9151670951156812, with standard deviation 0.001580041894994496.
