In [4]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os

import sys
sys.path.append('../../')

import Anesthesia_Data.Helpers as helpers

importing Jupyter notebook from /Users/piabaronetzky/Desktop/Helmholtz/Code/Anesthesia_Data/Time_Series/../../Anesthesia_Data/Helpers.ipynb


# Import and Concatenate Data

In [18]:
subject_list = ["m292", "m294"]
label_list  = [0, 1, 2, 3, 4]

In [19]:
def import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory):
    """
    Import and concatenate feature datasets for each subject.

    Args:
    - subject_list (list): List of subject names.

    Returns:
    - pd.DataFrame: Concatenated feature DataFrame.
    - list: List of all labels.
    """
    subject_feature_dfs = {}

    for subject_idx, subject in enumerate(subject_list):
        subject_feature_dfs[subject] = pd.DataFrame()

        for data_type in ["EEG", "EMG"]:
            data_frames = []

            for file in list_of_filenames:
                path = os.path.join(str(parent_directory), "Features", str(subject), str(data_type), file)
                if os.path.exists(path):
                    data_frames.append(pd.read_csv(path))

            df_both_data_types = pd.concat(data_frames, axis=1)

            if not subject_feature_dfs[subject].empty:
                subject_feature_dfs[subject] = pd.concat([subject_feature_dfs[subject], df_both_data_types], axis=1)
            else:
                subject_feature_dfs[subject] = df_both_data_types

        subject_feature_dfs[subject]["Subject"] = subject_idx

    feature_df = pd.concat(subject_feature_dfs.values(), ignore_index=True)

    # For duplicate columns, only keep one
    feature_df = feature_df.loc[:, ~feature_df.columns.duplicated()]

    feature_df.drop(columns=['Unnamed: 0'], inplace=True)
    
    return feature_df

In [20]:
# Import dataframes that do not depend on folds
list_of_filenames = ["Topological_Summary_Statistics.csv", "Signature_Statistics.csv", "Advanced_Features.csv"]

    
feature_df = import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

all_labels = feature_df["Label"]

In [21]:
# Dataframes that DO depend on folds

# Import dataframes that do not depend on folds
list_of_filenames = ["Vectorization_Features.csv"]

    
fold_dependant_feature_df = import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

# Experiments with Single Features

In [23]:
print("There are "+str(len(feature_df.columns))+" features in the main dataframe.")

There are 41 features in the main dataframe.


In [24]:
list_of_strings_in_column_name = ["_Vectorization_Coordinate_", "L1"]

feature_df = helpers.remove_columns_with_str(feature_df, list_of_strings_in_column_name)

print("There now are "+str(len(feature_df.columns))+" features in the main dataframe.")

There now are 41 features in the main dataframe.


# Cross Validation

## Features that do not depend on folds

In [25]:
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory="")

AttributeError: module 'Anesthesia_Data.Helpers' has no attribute 'load_folds'

In [20]:
def filter_dataframe_with_indices(feature_df, indices_dict_all_subjects, label_list, n_folds = 5):
    
    feature_df_all_folds = {}
    all_labels = {}

    for fold_idx in range(n_folds):

        fold_df = pd.DataFrame()
        
        
        for subject_idx, subject in enumerate(indices_dict_all_subjects.keys()):
            filtered_subject_df = feature_df[feature_df["Subject"] == subject_idx]

            for label in label_list:                
                
                filtered_label_df = filtered_subject_df.loc[filtered_subject_df["Label"] == label]

                set_indices_in_filtered_df = indices_dict_all_subjects[subject]["Label_"+str(label)]["Fold_"+str(fold_idx)]

                feature_df_with_set_indices = filtered_label_df.iloc[set_indices_in_filtered_df]

                fold_df = pd.concat([fold_df, feature_df_with_set_indices], ignore_index=True)

        all_labels["Fold_"+str(fold_idx)] = fold_df["Label"]
        fold_df.drop(columns=["Label"], inplace=True)

        feature_df_all_folds["Fold_"+str(fold_idx)] = fold_df

    return feature_df_all_folds, all_labels

train_features_dfs_all_folds, train_labels_all_folds = filter_dataframe_with_indices(feature_df, train_indices, label_list)
validation_features_dfs_all_folds, validation_labels_all_folds = filter_dataframe_with_indices(feature_df, validation_indices, label_list)

## Fold-dependant Features

In [24]:
def filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, indices_dict_all_subjects, label_list, n_folds = 5):

    # TODO remove duplicate code (see above)
    
    feature_df_all_folds = {}

    for fold_idx in range(n_folds):

        filtered_fold_df = fold_dependant_feature_df[fold_dependant_feature_df["Fold"] == fold_idx]

        fold_df = pd.DataFrame()
        
        for subject_idx, subject in enumerate(indices_dict_all_subjects.keys()):
            filtered_subject_df = filtered_fold_df[filtered_fold_df["Subject"] == subject_idx]


            for label in label_list:                
                
                filtered_label_df = filtered_subject_df.loc[filtered_subject_df["Label"] == label]

                set_indices_in_filtered_df = indices_dict_all_subjects[subject]["Label_"+str(label)]["Fold_"+str(fold_idx)]

                feature_df_with_set_indices = filtered_label_df.iloc[set_indices_in_filtered_df]

                fold_df = pd.concat([fold_df, feature_df_with_set_indices], ignore_index=True)

        feature_df_all_folds["Fold_"+str(fold_idx)] = fold_df


    return feature_df_all_folds

train_fold_dependant_features_dfs_all_folds = filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, train_indices, label_list)
validation_fold_dependant_features_dfs_all_folds = filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, validation_indices, label_list)

In [25]:
def combine_all_features(features_dfs_all_folds, fold_dependant_features_dfs_all_folds, n_folds = 5):

    features_df_all_folds_all_features = {}

    for fold in range(n_folds):
        features_df_all_folds_all_features["Fold_"+str(fold)] = pd.concat([features_dfs_all_folds["Fold_"+str(fold)], fold_dependant_features_dfs_all_folds["Fold_"+str(fold)]], axis=1)
        
    return features_df_all_folds_all_features

train_features_df_all_folds_all_features = combine_all_features(train_features_dfs_all_folds, train_fold_dependant_features_dfs_all_folds)
validation_features_df_all_folds_all_features = combine_all_features(validation_features_dfs_all_folds, validation_fold_dependant_features_dfs_all_folds)

## Reformat data

In [26]:
def initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds, n_folds = 5):
    """
    Initialize dictionaries with folds as keys and assign features and labels accordingly.
    
    Args:
    - train_features_dfs_all_folds (dict): Dictionary containing training features for all folds.
    - train_labels_all_folds (dict): Dictionary containing training labels for all folds.
    - validation_features_dfs_all_folds (dict): Dictionary containing validation features for all folds.
    - validation_labels_all_folds (dict): Dictionary containing validation labels for all folds.
    - n_folds (int): Number of folds.
    
    Returns:
    - dict: Dictionary containing training features for each fold.
    - dict: Dictionary containing training labels for each fold.
    - dict: Dictionary containing validation features for each fold.
    - dict: Dictionary containing validation labels for each fold.
    """
    X_train = {}
    y_train = {}
    X_test = {}
    y_test = {}

    for fold in range(n_folds):
        # Set random seed for reproducibility
        np.random.seed(42)

        # Shuffle indices
        indices_train = np.random.permutation(len(train_features_dfs_all_folds["Fold_" + str(fold)]))
        indices_test = np.random.permutation(len(validation_features_dfs_all_folds["Fold_" + str(fold)]))

        # Shuffle rows of X_train[fold] and y_train[fold]
        X_train_fold = train_features_dfs_all_folds["Fold_" + str(fold)].iloc[indices_train]
        y_train_fold = [train_labels_all_folds["Fold_" + str(fold)][index] for index in indices_train]

        # Shuffle rows of X_test[fold] and y_test[fold]
        X_test_fold = validation_features_dfs_all_folds["Fold_" + str(fold)].iloc[indices_test]
        y_test_fold = [validation_labels_all_folds["Fold_" + str(fold)][index] for index in indices_test]

        X_train[fold] = X_train_fold
        y_train[fold] = y_train_fold
        X_test[fold] = X_test_fold
        y_test[fold] = y_test_fold

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = initialize_fold_dicts(train_features_df_all_folds_all_features, train_labels_all_folds, validation_features_df_all_folds_all_features, validation_labels_all_folds)

# Random Forest

## Cross Validation

In [27]:
def train_rf_cross_validation(X_train, y_train, X_test, y_test, n_estimators=900, n_folds = 5, random_state=5):
    """
    Train RandomForestClassifier using cross-validation, print accuracy for each fold, and calculate average accuracy.
    
    Args:
    - X_train (dict): Dictionary containing training features for each fold.
    - y_train (dict): Dictionary containing training labels for each fold.
    - X_test (dict): Dictionary containing validation features for each fold.
    - y_test (dict): Dictionary containing validation labels for each fold.
    - n_estimators (int): Number of trees in the forest (default=900).
    - random_state (int): Random seed (default=5).
    
    Returns:
    - float: Average accuracy across all folds.
    """
    
    rf = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    all_accuracies = []

    for fold in range(5):
        rf.fit(X_train[fold], y_train[fold])
        y_pred = rf.predict(X_test[fold])
        accuracy = accuracy_score(y_pred, y_test[fold])
        all_accuracies.append(accuracy)
        print("Accuracy for fold", fold + 1, ":", accuracy)

    average_accuracy = np.mean(all_accuracies)
    print("Average Accuracy:", average_accuracy)
    pass

train_rf_cross_validation(X_train, y_train, X_test, y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.9916666666666667


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.95


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.95


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.9333333333333333
Accuracy for fold 5 : 0.9636363636363636
Average Accuracy: 0.9577272727272728


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


## Final Test set

In [454]:
# TODO