In [4]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow
from mlflow.models import infer_signature
from sklearn.model_selection import GridSearchCV
import math 


import sys
sys.path.append('../')

import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Classification_Helpers as helpers

In [None]:
# Lift the pandas column cap so wide feature tables are displayed in full (None = no limit).
pd.options.display.max_columns = None

# Set up MLflow

In [None]:
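# Start the MLflow tracking server (run the command below in a separate terminal before executing the following cells)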
#!mlflow server --host 127.0.0.1 --port 8080

In [None]:
# Address of the MLflow tracking server on the local machine (loopback, port 8080).
MLFLOW_TRACKING_URI = "http://127.0.0.1:8080"
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# Import and Concatenate Data

In [None]:
# Subject identifiers; passed to the data loader and to helpers.load_folds below.
subject_list = ["293", "294", "298"]
# Class labels passed to the fold-filtering helpers below (6 is not part of the list).
label_list  = [1, 2, 3, 4, 5, 7]

## Dataframes that do not depend on folds

In [None]:
def import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory):
    """
    Import and concatenate the feature CSV files of every subject.

    For each subject and each data type ("EEG", "EMG"), every file of
    ``list_of_filenames`` that exists under
    ``<parent_directory>/Features/<subject>/<data_type>/`` is read. The frames of
    one subject are joined column-wise, the per-subject frames are then stacked
    row-wise into a single DataFrame.

    Args:
    - subject_list (list): Subject identifiers; each one names a sub-folder of "Features".
    - list_of_filenames (list): CSV file names to look for in every data-type folder.
      Files that do not exist are skipped.
    - parent_directory (str): Directory that contains the "Features" folder
      ("" resolves relative to the current working directory).

    Returns:
    - pd.DataFrame: Feature DataFrame of all subjects with an additional "Subject"
      column that holds the subject's position in ``subject_list``.

    Raises:
    - ValueError: If ``subject_list`` is empty or if none of the requested files
      exists for a subject / data type combination.
    """
    per_subject_frames = []

    for subject_idx, subject in enumerate(subject_list):
        # Feature tables of this subject: EEG files first, then EMG files.
        subject_frames = []

        for data_type in ["EEG", "EMG"]:
            data_type_dir = os.path.join(str(parent_directory), "Features", str(subject), str(data_type))
            existing_paths = [
                path
                for path in (os.path.join(data_type_dir, file) for file in list_of_filenames)
                if os.path.exists(path)
            ]

            # Fail with an actionable message instead of pandas' generic
            # "No objects to concatenate" further down.
            if not existing_paths:
                raise ValueError(
                    f"No feature files found for subject {subject!r} ({data_type}) in "
                    f"{data_type_dir!r}; looked for {list(list_of_filenames)}"
                )

            subject_frames.extend(pd.read_csv(path) for path in existing_paths)

        # Join all tables column-wise in one go instead of growing a frame step by step.
        subject_df = pd.concat(subject_frames, axis=1)

        # "Unnamed: 0" is the name read_csv assigns to an unlabeled first column
        # (typically a saved index); errors="ignore" tolerates files exported without it.
        subject_df = subject_df.drop(columns=["Unnamed: 0"], errors="ignore")

        # Columns shared by several files (or by both data types) are kept only once.
        subject_df = helpers.keep_first_duplicate_columns(subject_df)

        subject_df["Subject"] = subject_idx
        per_subject_frames.append(subject_df)

    if not per_subject_frames:
        raise ValueError("subject_list is empty; there is nothing to import.")

    feature_df = pd.concat(per_subject_frames, ignore_index=True)

    # For duplicate columns, only keep one
    feature_df = helpers.keep_first_duplicate_columns(feature_df)

    return feature_df

In [None]:
# Feature files that are looked up in every subject's EEG and EMG folder.
list_of_filenames = [
    "Statistical_Features_KATS_Statistics.csv",
    "Statistical_Features_Additional_Features.csv",
    "Statistical_Features_Level_Shift_Features.csv",
    "Statistical_Features_Autocorrelation_Features.csv",
]

# An empty parent directory makes the loader resolve "Features/..." relative to the working directory.
feature_df = import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
)

# Label column of the combined frame.
all_labels = feature_df["Label"]

# Experiments with Single Features

In [None]:
# Report the number of columns of the combined frame (this count includes every column of feature_df).
print(f"There are {len(feature_df.columns)} features in the main dataframe.")

## Save features for Data Exploration

In [None]:
# Persist the unfiltered feature table so it can be inspected in the data exploration.
all_features_path = "Features/All_Features.csv"
feature_df.to_csv(all_features_path)

## Filter

In [None]:
# Substrings handed to the column filter.
# NOTE(review): "Nothing" presumably matches no column, i.e. no filtering is applied — confirm.
list_of_strings_in_column_name = ["Nothing"]

feature_df = helpers.remove_columns_with_str(
    feature_df,
    list_of_strings_in_column_name,
)

print(f"There now are {len(feature_df.columns)} features in the main dataframe.")

# Preprocess

In [None]:
# Replace every missing value in feature_df with 0. The frame is modified in place and
# the replacement applies to all columns, not only to the feature columns.
feature_df.fillna(0, inplace=True)

In [None]:
# TODO This can be in the helper file as well
# Load the per-fold train / validation / test indices for the listed subjects.
fold_indices = helpers.load_folds(subject_list, parent_directory="")
train_indices, validation_indices, test_indices = fold_indices

In [None]:
# Select the training rows of every fold together with their labels.
train_features_dfs_all_folds, train_labels_all_folds = helpers.filter_dataframe_with_indices(
    feature_df, train_indices, label_list
)

# Same selection for the validation rows of every fold.
validation_features_dfs_all_folds, validation_labels_all_folds = helpers.filter_dataframe_with_indices(
    feature_df, validation_indices, label_list
)

In [None]:
# Get train and validation sets
# NOTE: X_test / y_test are built from the *validation* frames passed in here,
# not from the held-out test indices.
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(
    train_features_dfs_all_folds,
    train_labels_all_folds,
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
)

# MLflow & Cross Validation

In [None]:
# NOTE(review): same call with the same arguments as the load_folds cell in the Preprocess
# section; looks redundant unless load_folds is not deterministic — confirm before removing.
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory="")

In [None]:
# Merge the training and validation data of fold 0 into one set for the grid search below.
concatenated_X = pd.concat([X_train[0], X_test[0]], ignore_index=True)

# Build the label list explicitly: "+" only concatenates plain lists, whereas NumPy arrays
# or pandas Series would be added element-wise. list(...) keeps the result a concatenation
# whatever container the helper returned.
concatenated_y = list(y_train[0]) + list(y_test[0])

## Random Forest

In [None]:
import warnings

# Suppress warnings
# NOTE: this silences every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# The grids below document the successive tuning rounds. Each one has its own name so
# that none of them is silently overwritten; only the grid assigned to `param_grid`
# further down is searched.

# Main parameter grid
rf_param_grid_main = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4, 6]
}

# Another parameter grid for finer tuning
rf_param_grid_finer = {
    'n_estimators': [150, 200, 250],
    'max_depth': [20, 30, 40],
    'min_samples_split': [2],
    'min_samples_leaf': [1, 2, 3]
}

# Another parameter grid for finer tuning
rf_param_grid_finest = {
    'n_estimators': [100, 150],
    'max_depth': [40, 50, 60],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Active grid: the last tuning round (switch to one of the grids above to repeat an earlier round).
param_grid = rf_param_grid_finest

rf_for_cross_validation = RandomForestClassifier(random_state = 42)

# The third argument (5) is presumably the number of cross-validation folds — confirm in the helper.
best_params, best_score = helpers.perform_grid_search(rf_for_cross_validation, param_grid, 5, concatenated_X, concatenated_y)

In [None]:
# Hand-picked random-forest settings for a single manual tuning run.
params_for_manual_tuning = {
    "random_state": 42,
    "n_estimators": 500,
    "min_samples_split": 2,
}
rf_for_manual_tuning = RandomForestClassifier(**params_for_manual_tuning)

# The parameter dict is passed along with the model and the fold data.
helpers.manual_tuning(
    rf_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    params_for_manual_tuning,
)

## XGBoost

In [None]:
# Change the labels to adequate labels for XGBoost (consecutive class ids starting at 0).
# The mapping is derived from label_list so that it cannot drift from the labels used for
# filtering; for label_list = [1, 2, 3, 4, 5, 7] it is {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}.
mapping = {label: class_id for class_id, label in enumerate(sorted(label_list))}

# NOTE: y_train / y_test are overwritten, so this cell must run exactly once per kernel
# session. A second run would remap the already-mapped labels or fail with a KeyError.
for fold in range(5):
    y_train[fold] = [mapping[num] for num in y_train[fold]]
    y_test[fold] = [mapping[num] for num in y_test[fold]]
    

In [None]:
# Rebuild the merged fold-0 set now that the labels have been remapped for XGBoost.
concatenated_X = pd.concat([X_train[0], X_test[0]], ignore_index=True)

# Build the label list explicitly: "+" only concatenates plain lists, whereas NumPy arrays
# or pandas Series would be added element-wise.
concatenated_y = list(y_train[0]) + list(y_test[0])

In [None]:
# The grids below document the successive tuning rounds. Each one has its own name so
# that none of them is silently overwritten; only the grid assigned to `param_grid`
# further down is searched.

# Main parameter grid for finetuning after first manual experiments
xgb_param_grid_main = {
    "learning_rate": [0.19, 0.21, 0.23],
    "n_estimators": [400, 500, 600],
    "max_depth": [5, 8, 15],
    "min_child_weight": [0],
    "subsample": [0.5]
}

# Another parameter grid for finer tuning
xgb_param_grid_finer = {
    "learning_rate": [0.18, 0.19, 0.2],
    "n_estimators": [600],
    "max_depth": [7, 8, 9],
    "min_child_weight": [0],
    "subsample": [0.5]
}

# Active grid: the last tuning round (switch to the grid above to repeat the earlier round).
param_grid = xgb_param_grid_finer

xb_for_cross_validation = xgb.XGBClassifier(seed=1)

# The third argument (5) is presumably the number of cross-validation folds — confirm in the helper.
best_params, best_score = helpers.perform_grid_search(xb_for_cross_validation, param_grid, 5, concatenated_X, concatenated_y)

In [None]:
# For manual finetuning
params_for_manual_tuning = {
    "seed": 1,
    "learning_rate": 0.2,
    "n_estimators": 500,
    "max_depth": 15,
    "min_child_weight": 0,
    "subsample": 0.5,
}

xb_for_manual_tuning = xgb.XGBClassifier(**params_for_manual_tuning)

# The parameter dict is passed along with the model and the fold data.
helpers.manual_tuning(
    xb_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    params_for_manual_tuning,
)

# Final Test Set

In [None]:
# Indices of the final training set, built by the helper from the train and validation
# fold indices (presumably their union, so that only the test indices stay held out — confirm).
final_train_indices = helpers.get_indices_of_final_training_set(train_indices, validation_indices, subject_list, label_list)

In [None]:
# Get final test sets
# Inputs for the final run are built from the final training indices and the test indices.
final_input_data = helpers.create_final_input_data_dicts(
    feature_df, final_train_indices, test_indices, label_list
)
X_train_final, y_train_final, X_test_final, y_test_final = final_input_data

## Random Forest

In [None]:
# Random-forest settings for the final evaluation, with a fixed random_state.
params_with_initialized_random_state = {
    "random_state": 0,
    "max_depth": 40,
    "min_samples_leaf": 1,
    "min_samples_split": 2,
    "n_estimators": 150,
}

rf = RandomForestClassifier(**params_with_initialized_random_state)

# NOTE: mean_accuracy / importances are reused (and overwritten) by the XGBoost evaluation below.
rf_final_results = helpers.final_evaluation(
    rf,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)
mean_accuracy, importances = rf_final_results

## XGBoost

In [None]:
# Change the labels to adequate labels for XGBoost (consecutive class ids starting at 0).
# The mapping is derived from label_list so that it cannot drift from the labels used for
# filtering; for label_list = [1, 2, 3, 4, 5, 7] it is {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}.
mapping = {label: class_id for class_id, label in enumerate(sorted(label_list))}

# NOTE: the final label containers are overwritten, so this cell must run exactly once per
# kernel session and only after the random-forest evaluation above.
# NOTE(review): this iterates y_train_final / y_test_final directly, i.e. it assumes flat
# label sequences rather than per-fold dicts — confirm against create_final_input_data_dicts.
y_train_final = [mapping[num] for num in y_train_final]
y_test_final = [mapping[num] for num in y_test_final]

In [None]:
# XGBoost settings for the final evaluation, with a fixed seed.
params_with_initialized_random_state = {
    "seed": 0,
    "learning_rate": 0.19,
    "max_depth": 8,
    "min_child_weight": 0,
    "n_estimators": 600,
    "subsample": 0.5,
}

xb = xgb.XGBClassifier(**params_with_initialized_random_state)

# Overwrites the mean_accuracy / importances of the random-forest evaluation.
xgb_final_results = helpers.final_evaluation(
    xb,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)
mean_accuracy, importances = xgb_final_results