In [2]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow
from mlflow.models import infer_signature


# Import helper functions from other Jupyter notebooks
import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Brain_Imaging_Classification_Helpers as bi_helpers
import Utils.Classification_Helpers as helpers

# Set Up MLflow

In [6]:
# Send all MLflow logging in this notebook to the tracking server at this URI.
mlflow.set_tracking_uri(
    uri="http://127.0.0.1:8080",
)

# Import and Concatenate Data

In [7]:
# Subject identifiers and class labels passed to the loading and fold-filtering helpers.
subject_list = [
    "m292",
    "m294",
]
label_list = list(range(5))  # [0, 1, 2, 3, 4]

## EEG/EMG Data

In [8]:
# Time-series (EEG/EMG) feature files; these do not depend on folds.
list_of_filenames = [
    "Topological_Summary_Statistics.csv",
    "Signature_Statistics.csv",
    "Advanced_Features.csv",
]
time_series_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="Time_Series",
)


## Brain Imaging

In [9]:
# Feature files to read for the brain-imaging modality.
list_of_filenames = [
    "Topological_Summary_Statistics.csv",
    "Signature_Statistics.csv",
    "Advanced_Features.csv",
]
# Only the second return value of the import helper is used here.
_, subject_feature_df = bi_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="Brain_Imaging",
)

# NOTE(review): judging by the helper's name, this trims the brain-imaging
# rows to the length of the time-series data — confirm in the helper.
brain_imaging_feature_df = bi_helpers.cut_dataframe_to_same_length_as_TS(
    subject_feature_df,
    subject_list,
)

## Merging

In [10]:
# Combine the time-series and brain-imaging features into one dataframe.
feature_df = helpers.merge_feature_dfs(
    time_series_feature_df,
    brain_imaging_feature_df,
)

In [11]:
# Report how many columns the merged dataframe has.
print(f"There are {len(feature_df.columns)} features in the main dataframe.")

There are 671 features in the main dataframe.


In [12]:
# Column-name substrings handed to the column-removal helper.
list_of_strings_in_column_names_to_remove = ["_Vectorization_Coordinate_"]

feature_df = helpers.remove_columns_with_str(
    feature_df,
    list_of_strings_in_column_names_to_remove,
)

# Report the column count again after filtering.
print(f"There are {len(feature_df.columns)} features in the main dataframe after filtering.")

There are 71 features in the main dataframe after filtering.


# Cross Validation

In [13]:
# TODO: this could live in the helper file as well.
# Load the train / validation / test indices for the listed subjects.
train_indices, validation_indices, test_indices = helpers.load_folds(
    subject_list,
    parent_directory="Time_Series",
)

In [14]:
# Select features and labels for the training and validation index sets
# (one entry per fold, judging by the variable names — confirm in the helper).
(
    train_features_dfs_all_folds,
    train_labels_all_folds,
) = helpers.filter_dataframe_with_indices(feature_df, train_indices, label_list)
(
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
) = helpers.filter_dataframe_with_indices(feature_df, validation_indices, label_list)

## Reformat Data

In [15]:
# The validation features/labels are passed in here and assigned to
# X_test / y_test, so the "test" names used afterwards refer to validation data.
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(
    train_features_dfs_all_folds,
    train_labels_all_folds,
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
)

In [32]:
# Persist the fold data to disk; the MLflow runs later attach these files as artifacts.
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("Features", exist_ok=True)

# dtype=object makes NumPy pickle the fold containers, so reading these files
# back requires np.load(..., allow_pickle=True).
np.save("Features/All_Train_Features.npy", np.array(X_train, dtype=object), allow_pickle=True)
np.save("Features/All_Train_Labels.npy", np.array(y_train, dtype=object), allow_pickle=True)

np.save("Features/All_Test_Features.npy", np.array(X_test, dtype=object), allow_pickle=True)
np.save("Features/All_Test_Labels.npy", np.array(y_test, dtype=object), allow_pickle=True)


# MLflow

## Random Forest

In [26]:
# TODO use utils function again

# Random Forest hyperparameters used for every fold.
params = {"random_state": 42, "n_estimators": 900, "min_samples_split": 2}
rf = RandomForestClassifier(**params)
all_accuracies = []

# Iterate over however many folds were loaded rather than assuming exactly five.
for fold in range(len(X_train)):
    rf.fit(X_train[fold], y_train[fold])
    y_pred = rf.predict(X_test[fold])
    # accuracy_score takes (y_true, y_pred).
    accuracy = accuracy_score(y_test[fold], y_pred)
    all_accuracies.append(accuracy)
    # Folds are reported 1-based.
    print("Accuracy for fold", fold + 1, ":", accuracy)

# Mean accuracy over all folds.
average_accuracy = np.mean(all_accuracies)
print("Average Accuracy:", average_accuracy)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.9833333333333333


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.9666666666666667


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.9583333333333334


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.9666666666666667
Accuracy for fold 5 : 1.0
Average Accuracy: 0.975


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [27]:
# Copy the hyperparameters before adding logging-only entries. Aliasing
# `params` would add "features" to it as well, and building the model again
# from **params would then fail on the unexpected keyword.
mlflow_params = dict(params)
features = X_train[0].columns
# NOTE(review): MLflow stores param values as strings with a length limit;
# confirm the full column list fits.
mlflow_params["features"] = features

In [35]:
# Record the Random Forest cross-validation results as one MLflow run.
with mlflow.start_run():
    # Hyperparameters and any other entries held in mlflow_params.
    mlflow.log_params(mlflow_params)

    # Mean, lowest and highest fold accuracy.
    mlflow.log_metric("accuracy", average_accuracy)
    mlflow.log_metric("minimal accuracy", np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy", np.max(all_accuracies))

    # Attach the saved fold data to the run.
    for artifact_path in (
        "Features/All_Train_Features.npy",
        "Features/All_Train_Labels.npy",
        "Features/All_Test_Features.npy",
        "Features/All_Test_Labels.npy",
    ):
        mlflow.log_artifact(artifact_path)

    # Tags describing what this run was for; "mlflow.runName" is the run's name tag.
    run_label = "Random Forest - Both Modalities"
    mlflow.set_tag("Training Info", run_label)
    mlflow.set_tag("mlflow.runName", run_label)

## XGBoost

In [None]:
# XGBoost hyperparameters used for every fold.
params = {"seed": 1, "learning_rate": 0.1, "n_estimators": 100, "max_depth": 5}
xb = xgb.XGBClassifier(**params)
all_accuracies = []

for fold in range(len(X_train)):
    # Remove duplicate columns
    X_train[fold] = helpers.keep_first_duplicate_columns(X_train[fold])

    xb.fit(X_train[fold], y_train[fold])

    # Drop repeated column names from the evaluation data too, keeping the first occurrence.
    X_test[fold] = X_test[fold].loc[:, ~X_test[fold].columns.duplicated()]

    y_pred = xb.predict(X_test[fold])
    # accuracy_score takes (y_true, y_pred).
    accuracy = accuracy_score(y_test[fold], y_pred)
    all_accuracies.append(accuracy)
    print("Accuracy for fold", fold, ":", accuracy)

# Named `average_accuracy`, as in the Random Forest section, so this value
# replaces that section's result instead of leaving it stale.
average_accuracy = np.mean(all_accuracies)
print("Mean Accuracy:", average_accuracy)

Accuracy for fold 0 : 0.975
Accuracy for fold 1 : 0.9833333333333333
Accuracy for fold 2 : 0.975


In [None]:
# Copy the hyperparameters before adding logging-only entries, so `params`
# itself is not mutated and still holds only model arguments.
mlflow_params = dict(params)
features = X_train[0].columns
# NOTE(review): MLflow stores param values as strings with a length limit;
# confirm the full column list fits.
mlflow_params["features"] = features

In [None]:
# Record the XGBoost cross-validation results as one MLflow run.
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(mlflow_params)

    # Compute the mean from all_accuracies here rather than reading a separately
    # stored average, so the logged value always matches the min/max logged
    # alongside it.
    mlflow.log_metric("accuracy", np.mean(all_accuracies))
    mlflow.log_metric("minimal accuracy", np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy", np.max(all_accuracies))

    # Log data
    mlflow.log_artifact("Features/All_Train_Features.npy")
    mlflow.log_artifact("Features/All_Train_Labels.npy")
    mlflow.log_artifact("Features/All_Test_Features.npy")
    mlflow.log_artifact("Features/All_Test_Labels.npy")

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "XGBoost - Both Modalities")
    mlflow.set_tag('mlflow.runName', 'XGBoost - Both Modalities')