In [None]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow

from sklearn.model_selection import PredefinedSplit, GridSearchCV


from mlflow.models import infer_signature
# Import helper functions from other Jupyter notebooks (made importable by import_ipynb)
import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Brain_Imaging_Classification_Helpers as bi_helpers
import Utils.Classification_Helpers as helpers

# Set Up MLflow

In [None]:
# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment
# variable takes precedence, so the notebook can log to a different server
# without editing this cell; the local default below is used otherwise.
mlflow.set_tracking_uri(uri=os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8080"))

# Import and Concatenate Data

In [None]:
# Subject identifiers whose data is loaded below
subject_list = [
    "m292",
    "m294",
    "m298",
    "m299",
    "m300",
]

# Labels passed to the fold-filtering helpers further down
label_list = list(range(5))

## EEG/EMG Data

In [None]:
# Feature files that do not depend on the folds
list_of_filenames = [
    "Statistical_Features_KATS_Statistics.csv",
    "Statistical_Features_Additional_Features.csv",
    "Statistical_Features_Level_Shift_Features.csv",
    "Statistical_Features_Autocorrelation_Features.csv",
]

# Read the listed files for the given subjects through the time-series helper
time_series_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="Time_Series",
)

## Brain Imaging

In [None]:
# Brain-imaging feature files to load
list_of_filenames = [
    "Traditional_Features_Statistics.csv",
    "Traditional_Features_Direct_Coordinates.csv",
]

# The helper returns two values; only the second one is used in this notebook
_, subject_feature_df = bi_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="Brain_Imaging",
)

# Presumably trims the brain-imaging rows to the time-series length (going by
# the helper's name) - confirm in Utils.Brain_Imaging_Classification_Helpers
brain_imaging_feature_df = bi_helpers.cut_dataframe_to_same_length_as_TS(
    subject_feature_df,
    subject_list,
)

## Merging

In [None]:
# Combine the EEG/EMG time-series features with the brain-imaging features.
merged_feature_df = helpers.merge_feature_dfs(time_series_feature_df, brain_imaging_feature_df)

# Which features feed the classifiers is an explicit switch. An unconditional
# `feature_df = brain_imaging_feature_df` at this point would silently discard
# the merge (and with it every time-series feature).
USE_ONLY_BRAIN_IMAGING_FEATURES = False  # set to True for the brain-imaging-only experiment

if USE_ONLY_BRAIN_IMAGING_FEATURES:
    # Copy so that later preprocessing of feature_df cannot alter brain_imaging_feature_df
    feature_df = brain_imaging_feature_df.copy()
else:
    feature_df = merged_feature_df

## Preprocessing

In [None]:
# Replace missing feature values with 0. Reassigning the result (instead of
# inplace=True) leaves any other variable that refers to the same dataframe
# untouched and keeps this cell safe to re-run.
feature_df = feature_df.fillna(0)

## Save Features for Data Exploration

In [None]:
# Save the preprocessed features for data exploration.
# Create the target directory first so that a fresh checkout does not fail here.
features_output_path = "Features/All_Statistical_Features.csv"
os.makedirs(os.path.dirname(features_output_path), exist_ok=True)
feature_df.to_csv(features_output_path)

## Feature Selection

In [None]:
# Report the number of columns before any filtering
print(f"There are {len(feature_df.columns)} features.")

In [None]:
# Substrings of column names to remove; ["Nothing"] means all features are used
list_of_strings_in_column_names_to_remove = ["Nothing"]

feature_df = helpers.remove_columns_with_str(
    feature_df,
    list_of_strings_in_column_names_to_remove,
)

In [None]:
# Name of the single feature to classify with; it only takes effect when the
# commented-out call below is enabled.
feature = "BI_Intensity_Histograms_mean_intensities"

# Uncomment to restrict feature_df to this one feature:
#feature_df = helpers.only_use_one_feature_for_classification(feature, [feature_df])[0]

In [None]:
# Report the number of columns that remain after the filtering cells above
print(f"There are {len(feature_df.columns)} features after filtering.")

# Cross Validation

In [None]:
# Load the train / validation / test indices for the listed subjects
train_indices, validation_indices, test_indices = helpers.load_folds(
    subject_list,
    parent_directory="Time_Series",
)

In [None]:
# Select features and labels for the training indices ...
(
    train_features_dfs_all_folds,
    train_labels_all_folds,
) = helpers.filter_dataframe_with_indices(feature_df, train_indices, label_list)

# ... and for the validation indices
(
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
) = helpers.filter_dataframe_with_indices(feature_df, validation_indices, label_list)

In [None]:
# NOTE: X_test / y_test are built from the *validation* splits here; the
# held-out test indices are not passed to this helper.
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(
    train_features_dfs_all_folds,
    train_labels_all_folds,
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
)

## MLflow and Grid Search

Concatenate the training and validation data of all folds into a single dataset for the grid search.

In [None]:
# Combine the per-fold training and validation inputs for the grid search
concatenated_X, concatenated_y = helpers.concatenate_data(
    X_train,
    X_test,
    y_train,
    y_test,
)

In [None]:
# Presumably the start/end positions of each fold inside the concatenated data
# (going by the helper's name) - confirm in Utils.Classification_Helpers
train_test_splits = helpers.define_fold_start_and_end_indices(
    X_train,
    X_test,
)

### Random Forest

In [None]:
rf_for_cross_validation = RandomForestClassifier(random_state=42)

# The grids below record the successive tuning rounds. Each round has its own
# name so that none is silently overwritten by the next; only the grid assigned
# to `param_grid` further down is actually searched.

# Round 1: parameter grid defined after the first initial manual experiments
rf_param_grid_initial = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4, 6]
}

# Round 2: parameter grid for finer tuning
rf_param_grid_fine = {
    'n_estimators': [400, 600],
    'max_depth': [20, 30],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2, 3]
}

# Round 3: parameter grid for even finer tuning
rf_param_grid_finest = {
    'n_estimators': [500, 600, 700, 800],
    'max_depth': [20],
    'min_samples_split': [3, 4, 5],
    'min_samples_leaf': [1]
}

# Select the tuning round to run (the most recent one by default)
param_grid = rf_param_grid_finest

# NOTE(review): the third argument (5) is presumably the number of folds, and
# `train_test_splits` computed in the previous cell is not passed on - confirm
# against helpers.perform_grid_search that the intended folds are used.
best_params, best_score = helpers.perform_grid_search(
    rf_for_cross_validation,
    param_grid,
    5,
    concatenated_X,
    concatenated_y,
)

In [None]:
# Manual fine-tuning: evaluate one fixed random-forest configuration
params_for_manual_tuning = {
    "random_state": 42,
    "max_depth": 20,
    "min_samples_leaf": 1,
    "min_samples_split": 3,
    "n_estimators": 700,
}

rf_for_manual_tuning = RandomForestClassifier(**params_for_manual_tuning)

# The parameter dict is passed along with the model and the fold data
helpers.manual_tuning(
    rf_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    params_for_manual_tuning,
)

### XGBoost

In [None]:
# The parameter grids were defined after first initial manual experiments for
# approximately finding a sweet spot (with MLflow, see the manual-tuning cell below).
# Each tuning round has its own name so that none is silently overwritten by the
# next; only the grid assigned to `param_grid` further down is actually searched.

# Round 1: main parameter grid for fine-tuning
xgb_param_grid_main = {
    "learning_rate": [0.17, 0.19, 0.21],
    "n_estimators": [250, 300],
    "max_depth": [3, 4, 5],
    "min_child_weight": [0],
    "subsample": [0.5]
}

# Round 2: parameter grid for finer tuning
xgb_param_grid_fine = {
    "learning_rate": [0.18, 0.19, 0.2],
    "n_estimators": [300, 350],
    "max_depth": [5, 6],
    "min_child_weight": [0],
    "subsample": [0.5]
}

# Select the tuning round to run (the most recent one by default)
param_grid = xgb_param_grid_fine

xb_for_cross_validation = xgb.XGBClassifier(seed=1)

# NOTE(review): the third argument (5) is presumably the number of folds -
# confirm against helpers.perform_grid_search.
best_params, best_score = helpers.perform_grid_search(
    xb_for_cross_validation,
    param_grid,
    5,
    concatenated_X,
    concatenated_y,
)


In [None]:
# Manual fine-tuning: evaluate one fixed XGBoost configuration
params_for_manual_tuning = {
    "seed": 1,
    "learning_rate": 0.19,
    "max_depth": 5,
    "min_child_weight": 0,
    "n_estimators": 350,
    "subsample": 0.5,
}

xb_for_manual_tuning = xgb.XGBClassifier(**params_for_manual_tuning)

# The parameter dict is passed along with the model and the fold data
helpers.manual_tuning(
    xb_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    params_for_manual_tuning,
)

Note that even with a fixed seed, some parts of the XGBoost algorithm can still vary between runs, so we cannot be sure
to always reach exactly the same mean accuracy with the same parameters and the same features.

# Final Test Set

In [None]:
# Derive the indices of the final training set from the train and validation indices
final_train_indices = helpers.get_indices_of_final_training_set(
    train_indices,
    validation_indices,
    subject_list,
    label_list,
)

In [None]:
# Build the final training and test inputs from the final training indices
# and the held-out test indices
(
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
) = helpers.create_final_input_data_dicts(feature_df, final_train_indices, test_indices, label_list)

## Random Forest

In [None]:
# Random-forest configuration used for the final evaluation
params_with_initialized_random_state = {
    "random_state": 0,
    "max_depth": 20,
    "min_samples_leaf": 1,
    "min_samples_split": 3,
    "n_estimators": 700,
}

rf = RandomForestClassifier(**params_with_initialized_random_state)

# Evaluate on the final train/test data; returns the mean accuracy and the importances
mean_accuracy, importances = helpers.final_evaluation(
    rf,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)

## XGBoost

In [None]:
# XGBoost configuration used for the final evaluation
params_with_initialized_random_state = {
    "seed": 0,
    "learning_rate": 0.19,
    "n_estimators": 350,
    "max_depth": 5,
    "min_child_weight": 0,
    "subsample": 0.5,
}

xb = xgb.XGBClassifier(**params_with_initialized_random_state)

# Evaluate on the final train/test data; this rebinds mean_accuracy and importances
mean_accuracy, importances = helpers.final_evaluation(
    xb,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)