In [None]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow

from sklearn.model_selection import GridSearchCV

from mlflow.models import infer_signature
# Import functions from other Jupyter notebook
import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Brain_Imaging_Classification_Helpers as bi_helpers
import Utils.Classification_Helpers as helpers
import warnings

# Silence every Python warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and future warnings
# raised by the libraries imported above are hidden as well.
warnings.simplefilter('ignore')

# Set Up MLflow

In [None]:
# Point MLflow at the tracking server.
# The MLFLOW_TRACKING_URI environment variable takes precedence when it is set,
# so the notebook can run against another server without editing this cell;
# otherwise the local server address below is used.
mlflow.set_tracking_uri(
    uri=os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8080")
)

# Import and Concatenate Data

In [None]:
# Identifiers of the subjects whose features are loaded below.
subject_list = [
    "m292",
    "m294",
    "m298",
    "m299",
    "m300",
]

# Integer labels handed to the helper functions alongside subject_list.
# NOTE(review): presumably one class label per subject, in the same order - confirm.
label_list = list(range(5))

In [None]:
# Feature files for the time-series data, grouped into three lists:
#   [0] general feature files,
#   [1] vectorization feature file,
#   [2] vectorization feature file for the final test.
time_series_filenames = (
    [
        "Topological_Summary_Statistics.csv",
        "Signature_Statistics.csv",
        "Advanced_Features.csv",
    ],
    ["Vectorization_Features.csv"],
    ["Vectorization_Features_for_Final_Test.csv"],
)

# Expose each group under its own name as well (same list objects as in the tuple).
(
    list_of_filenames_time_series,
    atol_vectorization_filename_time_series,
    atol_vectorization_for_final_testing_filename_time_series,
) = time_series_filenames

In [None]:
# Feature files for the brain-imaging data, grouped into three lists:
#   [0] general feature files,
#   [1] ATOL vectorization feature file,
#   [2] ATOL vectorization feature file for the final test.
brain_imaging_filenames = (
    [
        "Topological_Summary_Statistics.csv",
        "Signature_Statistics.csv",
        "Advanced_Features.csv",
        "Direct_Coordinate_Features.csv",
    ],
    ["ATOL_Vectorization_Features.csv"],
    ["ATOL_Vectorization_Features_for_Final_Test.csv"],
)

# Expose each group under its own name as well (same list objects as in the tuple).
(
    list_of_filenames_brain_imaging,
    atol_vectorization_filename_brain_imaging,
    atol_vectorization_for_final_testing_filename_brain_imaging,
) = brain_imaging_filenames


In [None]:
# Build the merged feature frames for all subjects from the time-series and
# brain-imaging file groups defined above. The helper yields three objects:
# the general features, the fold-dependent features, and the fold-dependent
# features for the final test.
merged_feature_frames = helpers.create_and_merge_feature_dataframes(
    subject_list, time_series_filenames, brain_imaging_filenames
)
(
    feature_df,
    fold_dependant_feature_df,
    fold_dependant_final_test_feature_df,
) = merged_feature_frames

In [None]:
# Save features for Data Exploration.
# Create the output directory first so the write also succeeds on a fresh
# checkout where "Features/" does not exist yet; to_csv does not create
# missing directories itself.
os.makedirs("Features", exist_ok=True)
feature_df.to_csv("Features/All_Features.csv")

## Feature Selection

In [None]:
# Bundle the three feature frames so the feature-selection helpers can
# process them in one call.
dfs = [
    feature_df,
    fold_dependant_feature_df,
    fold_dependant_final_test_feature_df,
]

In [None]:
# Substrings passed to helpers.remove_features to select the columns to drop.
# NOTE(review): "Persistence_image_Statistic" uses a lowercase "i", unlike the
# capitalisation of the other pattern - confirm it matches the real column names.
list_of_strings_in_column_names_to_remove = [
    "Persistence_image_Statistic",
    "Persistence_Landscape_Statistic",
]

# Apply the removal to all three frames and rebind the filtered results.
filtered_feature_frames = helpers.remove_features(
    dfs, list_of_strings_in_column_names_to_remove
)
(
    feature_df,
    fold_dependant_feature_df,
    fold_dependant_final_test_feature_df,
) = filtered_feature_frames

In [None]:
# Name of the single feature to classify on when the optional step below is enabled.
feature = "Persistence_Landscape_Statistic_Kurtosis"

# Optional (disabled): restrict all frames to the single feature above.
# NOTE(review): `dfs` was built before remove_features rebound the three frame
# names, so it may still hold the unfiltered frames; this feature name also
# contains a substring listed in list_of_strings_in_column_names_to_remove.
# Confirm both before re-enabling.
#feature_df, fold_dependant_feature_df, fold_dependant_final_test_feature_df = helpers.only_use_one_feature_for_classification(feature, dfs)

## Creating Training and Validation Sets

In [None]:
# Fetch the train / validation / test indices for the listed subjects,
# using "Time_Series" as the parent directory.
fold_indices = helpers.load_folds(
    subject_list,
    parent_directory="Time_Series",
)
train_indices, validation_indices, test_indices = fold_indices

In [None]:
# Assemble the model inputs from the train and validation indices.
# NOTE(review): X_test / y_test are built from validation_indices here, so they
# presumably hold validation data rather than the final test data - confirm.
training_and_validation_sets = helpers.create_training_and_validation_sets(
    feature_df,
    fold_dependant_feature_df,
    train_indices,
    validation_indices,
    label_list,
)
X_train, y_train, X_test, y_test = training_and_validation_sets

# MLflow and Grid Search

Concatenate the training and validation sets for the grid search and define a custom cross-validation split over them.

In [None]:
# Merge the X_train / X_test inputs and y_train / y_test labels into a single
# (X, y) pair for the grid search.
concatenated_data = helpers.concatenate_data(
    X_train,
    X_test,
    y_train,
    y_test,
)
concatenated_X, concatenated_y = concatenated_data


In [None]:
# Split definition derived from X_train and X_test; it is handed to the custom
# cross-validation object.
train_test_splits = helpers.define_fold_start_and_end_indices(
    X_train,
    X_test,
)

In [None]:
# Wrap the predefined splits in the project's cross-validation object so the
# grid search uses them instead of generating its own folds.
custom_cv = helpers.CustomCV(
    train_test_splits,
)

## Random Forest

In [None]:
# Base estimator whose hyperparameters are searched.
rf_for_cross_validation = RandomForestClassifier(random_state=42)

# Coarse parameter grid (defined for reference; it is not the grid passed to
# the search below).
first_param_grid = dict(
    n_estimators=[100, 200, 400],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[2, 4, 6],
)

# Narrower parameter grid for finer tuning; this is the one that is searched.
finer_param_grid = dict(
    n_estimators=[100],
    max_depth=[None],
    min_samples_split=[4, 5, 6],
    min_samples_leaf=[1, 2, 3],
)

# Run the grid search over the concatenated data with the custom splits.
rf_search_result = helpers.perform_grid_search(
    rf_for_cross_validation,
    finer_param_grid,
    custom_cv,
    concatenated_X,
    concatenated_y,
)
best_params, best_score = rf_search_result

In [None]:
# Hand-picked Random Forest hyperparameters for a single manual run.
params_for_manual_tuning = dict(
    random_state=1,
    n_estimators=1700,
    min_samples_split=6,
    max_depth=15,
)

rf_for_manual_tuning = RandomForestClassifier(**params_for_manual_tuning)

# The parameter dict is passed along with the model and the data sets.
helpers.manual_tuning(
    rf_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    params_for_manual_tuning,
)

## XGBoost

In [None]:
# Initial parameter grid for the XGBoost search.
param_grid_for_cross_validation = dict(
    learning_rate=[0.1, 0.18, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2, 3],
    subsample=[0.5],
)

# Base estimator with a fixed seed.
xb_for_cross_validation = xgb.XGBClassifier(seed=1)

# Run the grid search over the concatenated data with the custom splits.
# Note: this rebinds best_params / best_score.
xgb_search_result = helpers.perform_grid_search(
    xb_for_cross_validation,
    param_grid_for_cross_validation,
    custom_cv,
    concatenated_X,
    concatenated_y,
)
best_params, best_score = xgb_search_result


In [None]:
# Hand-picked XGBoost hyperparameters for a single manual run.
# This cell previously had no parameter dict of its own and silently reused the
# Random Forest dict from the earlier cell, which contains "min_samples_split" -
# not an XGBoost parameter - and passed that same dict on to
# helpers.manual_tuning. The dict below keeps the remaining settings and drops
# the Random-Forest-only key.
# NOTE(review): the values are carried over from the Random Forest run; tune
# them for XGBoost as needed.
xgb_params_for_manual_tuning = {
    "random_state": 1,
    "n_estimators": 1700,
    "max_depth": 15,
}

xb_for_manual_tuning = xgb.XGBClassifier(**xgb_params_for_manual_tuning)

helpers.manual_tuning(
    xb_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    xgb_params_for_manual_tuning,
)

Note that even with a fixed seed, some aspects of the XGBoost algorithm can still vary, so the same parameters and the same
features are not guaranteed to reproduce exactly the same mean accuracy.

# Final Test Set

In [None]:
# Indices of the final training set, derived from the train and validation
# indices of every subject.
final_train_indices = helpers.get_indices_of_final_training_set(
    train_indices,
    validation_indices,
    subject_list,
    label_list,
)

In [None]:
# Build the final training and test inputs from the final training indices and
# the held-out test indices.
# NOTE(review): only feature_df is passed here; fold_dependant_final_test_feature_df
# is not used, unlike the validation-set cell which also passes the
# fold-dependent frame - confirm this is intended.
final_input_data = helpers.create_final_input_data_dicts(
    feature_df,
    final_train_indices,
    test_indices,
    label_list,
)
X_train_final, y_train_final, X_test_final, y_test_final = final_input_data

## Random Forest

In [None]:
# Random Forest hyperparameters for the final evaluation, with a fixed random state.
params_with_initialized_random_state = dict(
    random_state=0,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
)

rf = RandomForestClassifier(**params_with_initialized_random_state)

# Evaluate on the final train / test data; the helper returns the mean accuracy
# and the feature importances.
rf_final_result = helpers.final_evaluation(
    rf,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)
mean_accuracy, importances = rf_final_result

## XGBoost

In [None]:
# XGBoost hyperparameters for the final evaluation, with a fixed seed.
# Note: this rebinds params_with_initialized_random_state.
params_with_initialized_random_state = dict(
    seed=0,
    learning_rate=0.19,
    n_estimators=250,
    max_depth=4,
    min_child_weight=0,
    max_delta_step=1,
    subsample=0.5,
)

xb = xgb.XGBClassifier(**params_with_initialized_random_state)

# Evaluate on the final train / test data; this rebinds mean_accuracy and
# importances.
xgb_final_result = helpers.final_evaluation(
    xb,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)
mean_accuracy, importances = xgb_final_result

# Decision Tree Feature Importance

In [None]:
# Feature name passed to the importance helper (rebinds `feature`).
feature = "BC_Vectorization_Coord"

# `importances` is whatever the most recently executed final-evaluation cell
# returned; when the notebook is run top to bottom that is the XGBoost cell.
helpers.compute_decision_tree_feature_importance(
    importances,
    X_train_final,
    feature,
)