In [None]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow
from mlflow.models import infer_signature
import math 


from sklearn.model_selection import GridSearchCV


import sys
sys.path.append('../')

import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Classification_Helpers as helpers

In [None]:
# Remove pandas' column limit so wide feature tables are displayed in full.
pd.options.display.max_columns = None

# Set up MLflow

In [None]:
# Start the MLflow tracking server. The command is left commented out; uncomment it
# (or run the same command without the leading "!" in a terminal) so that a server
# is listening on 127.0.0.1:8080.
#!mlflow server --host 127.0.0.1 --port 8080

In [None]:
# Address of the local MLflow tracking server (host 127.0.0.1, port 8080).
MLFLOW_TRACKING_URI = "http://127.0.0.1:8080"
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# Import and Concatenate Data

In [None]:
# Subject IDs (kept as strings) that are handed to the data-loading helpers.
subject_list = [
    "293",
    "294",
    "298",
]

# Class labels used for classification; note that 6 is not among them.
label_list = [
    1, 2, 3, 4, 5,
    7,
]

## Dataframes that do not depend on folds

In [None]:
# Feature files whose content does not depend on the cross-validation fold.
list_of_filenames = [
    "Topological_Summary_Statistics.csv",
    "Advanced_Features.csv",
    "Signature_Statistics.csv",
]

# Load the files for all subjects and replace missing values with 0.
feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
).fillna(0)

# Label column of the combined feature table.
all_labels = feature_df["Label"]

## Dataframes that DO depend on folds

In [None]:
# Vectorization features used during cross-validation.
# NOTE(review): unlike feature_df, missing values are not filled here - confirm that is intended.
list_of_filenames = ["Vectorization_Features.csv"]
fold_dependant_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
)

# Vectorization features prepared for the final test.
list_of_filenames = ["Vectorization_Features_for_Final_Test.csv"]
fold_dependant_final_test_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="",
)

## Save features for Data Exploration

In [None]:
# Persist the fold-independent feature table for data exploration.
# DataFrame.to_csv does not create missing folders, so make sure the
# "Features" directory exists before writing (no-op when it already does).
os.makedirs("Features", exist_ok=True)
feature_df.to_csv("Features/All_Features.csv")

## Filter

In [None]:
# Strings identifying the columns to drop (presumably matched against column
# names - confirm in helpers.remove_features). The list is currently empty.
list_of_strings_in_column_names_to_remove = []

# The three feature tables are filtered together so they are treated the same way.
dfs = [
    feature_df,
    fold_dependant_feature_df,
    fold_dependant_final_test_feature_df,
]

(
    feature_df,
    fold_dependant_feature_df,
    fold_dependant_final_test_feature_df,
) = helpers.remove_features(dfs, list_of_strings_in_column_names_to_remove)

In [None]:
# Feature name for the optional single-feature experiment below.
feature = "Persistence_Landscape_Statistic_Kurtosis"

# Disabled experiment: uncomment to classify with only `feature`.
# NOTE(review): `dfs` still refers to the tables as they were before filtering.
#feature_df, fold_dependant_feature_df, fold_dependant_final_test_feature_df = helpers.only_use_one_feature_for_classification(feature, dfs)

## Creating training and validation sets

In [None]:
# Load the train / validation / test index sets for the selected subjects.
(
    train_indices,
    validation_indices,
    test_indices,
) = helpers.load_folds(subject_list, parent_directory="")

In [None]:
# Build the per-fold model inputs from both feature tables.
# Note: X_test / y_test are created from validation_indices here, i.e. they hold
# validation data; test_indices is not used in this cell.
X_train, y_train, X_test, y_test = helpers.create_training_and_validation_sets(
    feature_df,
    fold_dependant_feature_df,
    train_indices,
    validation_indices,
    label_list,
)

# MLflow & Cross-Validation

In [None]:
# Combine the per-fold training and validation sets into one X / y pair for the
# grid search (exact layout is defined by helpers.concatenate_data).
concatenated_X, concatenated_y = helpers.concatenate_data(
    X_train,
    X_test,
    y_train,
    y_test,
)

In [None]:
# Presumably the start/end positions of every fold inside the concatenated
# data - confirm against helpers.define_fold_start_and_end_indices.
train_test_splits = helpers.define_fold_start_and_end_indices(
    X_train,
    X_test,
)

In [None]:
# Cross-validation object built from the splits defined above; it is handed to
# the grid searches further down.
custom_cv = helpers.CustomCV(
    train_test_splits,
)

## Random Forest

In [None]:
# History of the Random Forest grids that were explored. Only the entry selected
# as `param_grid` below is searched; the earlier ones are kept for reference.
rf_param_grids = {
    # Defined after the first manual experiments.
    "initial": {
        'n_estimators': [100, 200, 400],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [2, 4, 6],
    },
    # Second round.
    "second": {
        'n_estimators': [500],
        'max_depth': [None, 20, 30],
        'min_samples_split': [2, 3],
        'min_samples_leaf': [1],
    },
    # Finer tuning.
    "fine": {
        'n_estimators': [300, 400, 500, 600],
        'max_depth': [None, 50, 70],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
    },
}

# Grid that is actually used for the search.
param_grid = rf_param_grids["fine"]

# Fixed random_state so the search is repeatable.
rf_for_cross_validation = RandomForestClassifier(random_state=1)

best_params, best_score = helpers.perform_grid_search(
    rf_for_cross_validation,
    param_grid,
    custom_cv,
    concatenated_X,
    concatenated_y,
)

In [None]:
# Random Forest configuration evaluated by hand on the per-fold sets.
params_for_manual_tuning = {
    "random_state": 1,
    "n_estimators": 500,
    "min_samples_split": 2,
}

rf_for_manual_tuning = RandomForestClassifier(**params_for_manual_tuning)

# The parameter dict is passed along with the model so the helper knows the configuration.
helpers.manual_tuning(
    rf_for_manual_tuning,
    X_train,
    X_test,
    y_train,
    y_test,
    params_for_manual_tuning,
)

## XGBoost

In [None]:
# Change the labels to adequate labels for XGBoost: consecutive integers from 0.
# The mapping is derived from label_list so the two cannot drift apart; for
# label_list = [1, 2, 3, 4, 5, 7] it is {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}.
mapping = {label: index for index, label in enumerate(sorted(label_list))}

# This cell rewrites y_train / y_test in place, so it must run exactly once per
# freshly created y_train / y_test. A second run would meet labels that are no
# longer in the mapping; fail with an explanation instead of a bare KeyError.
for fold in range(5):
    for labels_per_fold in (y_train, y_test):
        unknown_labels = set(labels_per_fold[fold]).difference(mapping)
        if unknown_labels:
            raise ValueError(
                f"Fold {fold} contains labels {sorted(unknown_labels)} that are not in label_list. "
                "If this cell was already executed, re-create y_train / y_test before running it again."
            )
        labels_per_fold[fold] = [mapping[num] for num in labels_per_fold[fold]]

In [None]:
# Rebuild the concatenated data so concatenated_y carries the remapped labels.
concatenated_X, concatenated_y = helpers.concatenate_data(
    X_train,
    X_test,
    y_train,
    y_test,
)


In [None]:
# Parameter grids defined after first initial manual experiments for approximately
# finding a sweet spot (with MLFlow, code below). Only the entry selected as
# `param_grid` is searched; the earlier grids are kept for reference.
xgb_param_grids = {
    # Main parameter grid for finetuning.
    "main": {
        "learning_rate": [0.17, 0.19, 0.21],
        "n_estimators": [250, 300],
        "max_depth": [7, 8, 9],
        "min_child_weight": [0],
        "subsample": [0.5],
    },
    # Finer tuning, first pass.
    "finer_1": {
        "learning_rate": [0.19],
        "n_estimators": [300, 400],
        "max_depth": [5, 6, 7],
        "min_child_weight": [0],
        "max_delta_step": [0, 1],
        "subsample": [0.5],
    },
    # Finer tuning, second pass.
    "finer_2": {
        "learning_rate": [0.18, 0.19, 0.2],
        "n_estimators": [400, 500],
        "max_depth": [3, 4, 5],
        "min_child_weight": [0, 1],
        "max_delta_step": [1, 2],
        "subsample": [0.5],
    },
}

# Grid that is actually used for the search.
param_grid = xgb_param_grids["finer_2"]

# Fixed seed so the search is repeatable.
xb_for_cross_validation = xgb.XGBClassifier(seed=1)

best_params, best_score = helpers.perform_grid_search(
    xb_for_cross_validation,
    param_grid,
    custom_cv,
    concatenated_X,
    concatenated_y,
)


In [None]:
# XGBoost configuration evaluated by hand on the per-fold sets.
params = {
    "seed": 1,
    'learning_rate': 0.18,
    'max_delta_step': 1,
    'max_depth': 4,
    'min_child_weight': 1,
    'n_estimators': 400,
    'subsample': 0.5,
}

xb_for_manual_tuning = xgb.XGBClassifier(**params)

# Pass the XGBoost `params` that built this model. The Random Forest dict
# `params_for_manual_tuning` was passed here before, which attached the wrong
# configuration to the XGBoost run.
helpers.manual_tuning(xb_for_manual_tuning, X_train, X_test, y_train, y_test, params)

# Final Test Set

In [None]:
# Indices of the final training set, derived from the train and validation indices.
final_train_indices = helpers.get_indices_of_final_training_set(
    train_indices,
    validation_indices,
    subject_list,
    label_list,
)

In [None]:
# Append the final-test vectorization (ATOL) features column-wise to all other features.
# Note: this overwrites feature_df, so cells above see the widened table if re-run afterwards.
tables_to_join = [feature_df, fold_dependant_final_test_feature_df]
feature_df = pd.concat(tables_to_join, axis=1)

In [None]:
# The column-wise concat can produce repeated column names; per the helper's
# name only the first occurrence of each is kept (confirm in helpers).
feature_df = helpers.keep_first_duplicate_columns(
    feature_df,
)

In [None]:
# Build the final training and test inputs from the combined feature table.
(
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
) = helpers.create_final_input_data_dicts(
    feature_df,
    final_train_indices,
    test_indices,
    label_list,
)

In [None]:
# Change the labels to adequate labels for XGBoost: consecutive integers from 0.
# The mapping is derived from label_list so the two cannot drift apart; for
# label_list = [1, 2, 3, 4, 5, 7] it is {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}.
mapping = {label: index for index, label in enumerate(sorted(label_list))}

# This cell overwrites y_train_final / y_test_final, so it must run exactly once
# per freshly created pair. Check both before changing either, and fail with an
# explanation instead of a bare KeyError.
for split_name, split_labels in (("y_train_final", y_train_final), ("y_test_final", y_test_final)):
    unknown_labels = set(split_labels).difference(mapping)
    if unknown_labels:
        raise ValueError(
            f"{split_name} contains labels {sorted(unknown_labels)} that are not in label_list. "
            "If this cell was already executed, re-create the final sets before running it again."
        )

y_train_final = [mapping[num] for num in y_train_final]
y_test_final = [mapping[num] for num in y_test_final]

In [None]:
# XGBoost configuration for the final evaluation.
# NOTE(review): the seed is 0 here, while the grid search and manual tuning cells
# use seed 1 - confirm this difference is intended.
params_with_initialized_random_state = {
    "seed": 0,
    "learning_rate": 0.18,
    "max_delta_step": 1,
    "max_depth": 4,
    "min_child_weight": 1,
    "n_estimators": 400,
    "subsample": 0.5,
}

xb = xgb.XGBClassifier(**params_with_initialized_random_state)

# Evaluate on the final train / test sets; the helper returns two values, stored
# here as the mean accuracy and the importances.
mean_accuracy, importances = helpers.final_evaluation(
    xb,
    params_with_initialized_random_state,
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
)