In [1]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os

# Import helper functions from other Jupyter notebooks (made importable by import_ipynb)
import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Brain_Imaging_Classification_Helpers as bi_helpers
import Utils.Classification_Helpers as helpers

# Import and Concatenate Data

In [2]:
# Subject identifiers passed to the data-loading and fold-loading helpers below.
subject_list = ["m292", "m294"]
# Label values handed to the fold-filtering helper further down (0 through 4).
label_list = list(range(5))

## EEG/EMG Data

In [3]:
# These feature tables do not depend on the folds, so they are imported once here.
list_of_filenames = [
    "Topological_Summary_Statistics.csv",
    "Signature_Statistics.csv",
    "Advanced_Features.csv",
]
time_series_feature_df = ts_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="Time_Series",
)


## Brain Imaging

In [4]:
# Feature files to read for each subject from the brain imaging directory.
list_of_filenames = [
    "Topological_Summary_Statistics.csv",
    "Signature_Statistics.csv",
    "Advanced_Features.csv",
]
# The helper returns two values; only the second one is used in this notebook.
_, subject_feature_df = bi_helpers.import_and_concatenate_datasets(
    subject_list,
    list_of_filenames,
    parent_directory="Brain_Imaging",
)

# NOTE(review): judging by the helper's name this trims the brain imaging rows to
# the length of the time series data — confirm against the helper notebook.
brain_imaging_feature_df = bi_helpers.cut_dataframe_to_same_length_as_TS(
    subject_feature_df,
    subject_list,
)

## Merging

In [29]:
def merge_feature_dfs(time_series_feature_df, brain_imaging_feature_df):
    """
    Merge time series and brain imaging feature DataFrames.

    The two frames are joined on their indices with pandas' default (inner)
    join. A column name that occurs in both frames is kept unchanged for the
    time series column and gets the suffix ``_BI`` for the brain imaging
    column. The merged frame is returned with a fresh ``0..n-1`` index; the
    index values used for the join are discarded.

    Args:
    - time_series_feature_df (pandas.DataFrame): DataFrame containing time series features.
    - brain_imaging_feature_df (pandas.DataFrame): DataFrame containing brain imaging features.

    Returns:
    - pandas.DataFrame: Merged DataFrame containing both time series and brain imaging features.
    """
    feature_df = pd.merge(
        time_series_feature_df,
        brain_imaging_feature_df,
        left_index=True,
        right_index=True,
        suffixes=("", "_BI"),
    )

    # drop=True throws the old index away directly. Materialising it as a
    # column and then dropping "index" fails with a KeyError when the index is
    # named (or is a MultiIndex), and would delete a genuine feature column
    # that happens to be called "index".
    return feature_df.reset_index(drop=True)

# Combine both modalities into the single feature table used by the rest of the notebook.
feature_df = merge_feature_dfs(
    time_series_feature_df=time_series_feature_df,
    brain_imaging_feature_df=brain_imaging_feature_df,
)

In [30]:
# Report the column count of the merged frame.
print(f"There are {len(feature_df.columns)} features in the main dataframe.")

There are 671 features in the main dataframe.


In [31]:
# Substrings passed to the column-removal helper.
# NOTE(review): presumably every column whose name contains one of these is dropped — confirm in the helper.
list_of_strings_in_column_name = ["_Vectorization_Coordinate_", "L1"]
feature_df = helpers.remove_columns_with_str(
    feature_df,
    list_of_strings_in_column_name,
)

# Report the column count again so the effect of the filter is visible.
print(f"There are {len(feature_df.columns)} features in the main dataframe after filtering.")

There are 55 features in the main dataframe after filtering.


In [49]:
# Boolean mask over the rows of feature_df, True where a row is flagged as a duplicate.
duplicate_rows = feature_df.duplicated()

# Print only the flagged rows; an empty frame means no duplicates were found.
print(feature_df.loc[duplicate_rows])

0      False
1      False
2      False
3      False
4      False
       ...  
735    False
736    False
737    False
738    False
739    False
Length: 740, dtype: bool

# Cross Validation

In [32]:
# TODO: this could be moved into the helper file as well
(
    train_indices,
    validation_indices,
    test_indices,
) = helpers.load_folds(subject_list, parent_directory="Time_Series")

In [33]:
# Apply the same filtering helper to the training and the validation indices;
# each call returns a (features, labels) pair.
train_features_dfs_all_folds, train_labels_all_folds = helpers.filter_dataframe_with_indices(
    feature_df,
    train_indices,
    label_list,
)
validation_features_dfs_all_folds, validation_labels_all_folds = helpers.filter_dataframe_with_indices(
    feature_df,
    validation_indices,
    label_list,
)

## Reformat Data

In [39]:
# Note: X_test / y_test are built from the *validation* folds passed in here,
# not from test_indices.
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(
    train_features_dfs_all_folds,
    train_labels_all_folds,
    validation_features_dfs_all_folds,
    validation_labels_all_folds,
)

## Random Forest

In [40]:
# Run the random forest cross-validation helper on the prepared fold data.
# NOTE(review): the recorded accuracies are at or near 1.0 on every fold — worth
# confirming that no information leaks between the training and validation folds.
helpers.train_rf_cross_validation(
    X_train,
    y_train,
    X_test,
    y_test,
)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 1.0


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 1.0


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 1.0


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.9916666666666667
Accuracy for fold 5 : 1.0
Average Accuracy: 0.9983333333333334


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


## XGBoost

In [44]:
# Run the XGBoost cross-validation helper on the same fold data and keep its return value.
mean_accuracy_xgb = helpers.train_xgb_cross_validation(
    X_train,
    y_train,
    X_test,
    y_test,
)

Accuracy for fold 0 : 1.0
Accuracy for fold 1 : 1.0
Accuracy for fold 2 : 1.0
Accuracy for fold 3 : 1.0
Accuracy for fold 4 : 1.0
Mean Accuracy: 1.0
