In [59]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb


In [60]:
subject_list = ["m292", "m294"]

In [72]:
def import_and_concatenate_brain_imaging_feature_datasets(subjects):
    
    """
    Load feature DataFrames for specified subjects.
    
    Args:
    - subjects (list): List of subject names.
    
    Returns:
    - dict: Dictionary containing subject feature DataFrames.
    - list: List of all labels across subjects.
    """
    subject_feature_dfs = {}
    all_labels = []

    for subject in subjects:
        subject_feature_dfs[subject] = pd.DataFrame()
        single_data_frames = []

        # Topological Features
        #single_data_frames.append(pd.read_csv("Features/"+str(subject)+"/Topological_Summary_Statistics.csv"))
        #single_data_frames.append(pd.read_csv("Features/"+str(subject)+"/Advanced_Features.csv"))
        #single_data_frames.append(pd.read_csv("Features/"+str(subject)+"/Signature_Statistics.csv"))

        # Non-topological Features
        single_data_frames.append(pd.read_csv("Features/"+str(subject)+"/Traditional_Features.csv"))

        
        for df_idx, df in enumerate(single_data_frames):
            df.drop(df.columns[df.columns.str.contains('unnamed',case=False)], axis=1, inplace=True)

            if len(subject_feature_dfs[subject].index) > 0:
                subject_feature_dfs[subject] = pd.concat([subject_feature_dfs[subject], df], axis=1)
            else:
                subject_feature_dfs[subject] = pd.concat([subject_feature_dfs[subject], df], ignore_index=True)
                subject_feature_dfs[subject].drop(subject_feature_dfs[subject].columns[subject_feature_dfs[subject].columns.str.contains('_left',case=False)], axis=1, inplace=True)

        for label in [0, 1, 2, 3, 4]:
            if len(single_data_frames) > 1:
                subject_feature_dfs[subject] = subject_feature_dfs[subject].drop(subject_feature_dfs[subject][subject_feature_dfs[subject]["Label"]==label].index[-1])
            else:
                subject_feature_dfs[subject] = subject_feature_dfs[subject].drop(subject_feature_dfs[subject][subject_feature_dfs[subject]["Label"]==label].index[-1])

        
        subject_feature_dfs[subject]["Subject"] = subjects.index(subject)

        if len(single_data_frames) > 1:
            all_labels.extend(list(subject_feature_dfs[subject]["Label"].iloc[:, 0]))
        else: 
            all_labels.extend(list(subject_feature_dfs[subject]["Label"]))


    brain_imaging_feature_df = pd.concat([subject_feature_dfs[subject] for subject in subjects], ignore_index=True)
    brain_imaging_feature_df.drop(columns=["Label"], inplace=True)

    return subject_feature_dfs, all_labels, brain_imaging_feature_df

subject_feature_dfs, all_labels, feature_df = import_and_concatenate_brain_imaging_feature_datasets(subject_list)

# Save Concatenated Features for Data Exploration

In [73]:
# Save dataframe for data exploration
feature_df.to_csv("Features/All_Features.csv")

# Cross Validation

In [74]:
def load_indices_all_subjects(subject_list):
    """
    Load train, validation, and test set indices for all subjects.
    
    Args:
    - subject_list (list): List of subject names.
    
    Returns:
    - dict: Dictionary containing train indices for all subjects.
    - dict: Dictionary containing validation indices for all subjects.
    - dict: Dictionary containing test indices for all subjects.
    """
    train_indices = {}
    validation_indices = {}
    test_indices_dict_all_subjects = {}

    for subject in subject_list:
        # Train indices
        train_indices[subject] = np.load("../Time_Series/Train_Test_Splitting/"+str(subject)+"/Train_Indices_All_Labels_All_Folds.npy", allow_pickle=True).item()

        # Validation indices
        validation_indices[subject] = np.load("../Time_Series/Train_Test_Splitting/"+str(subject)+"/Validation_Indices_All_Labels_All_Folds.npy", allow_pickle=True).item()

        # Final Test set indices
        test_indices_dict_all_subjects[subject] = np.load("../Time_Series/Train_Test_Splitting/"+str(subject)+"/Final_Test_Set_Indices_All_Labels.npy", allow_pickle=True).item()

    return train_indices, validation_indices, test_indices_dict_all_subjects

train_indices, validation_indices, test_indices = load_indices_all_subjects(subject_list)

In [75]:
def filter_dataframe_with_indices(feature_df, all_labels, indices_dict_all_subjects):
    """
    indices_dict_all_subjects: Dictionary. Structure {subject: {label: {fold: indices list}}}.
    The indices list is a list of indices within each "label" dataframe (for each subject). If there are
    71 segments of each label (for a subject), then the indices list would be a subset of range(0, 71).
    """

    features_dfs_all_folds = {}
    labels_all_folds = {}

    # Initialize dictionarys with folds as keys and the train/validation sets/ their labels as values
    subject = list(indices_dict_all_subjects.keys())[0]
    for fold, fold_key in enumerate(indices_dict_all_subjects[subject][list(indices_dict_all_subjects[subject].keys())[0]]):
        features_dfs_all_folds[fold_key] = pd.DataFrame()
        labels_all_folds[fold_key] = []

    
    for subject_idx, subject in enumerate(indices_dict_all_subjects.keys()):

        # dictionarys with folds as labels and the train/validation sets/ their labels as values 
        labels_with_set_indices = {}
        features_dfs_for_subject = {}

        filtered_subject_df = feature_df[feature_df["Subject"] == subject_idx]

        indices_filtered_subject_df = filtered_subject_df.index # Save indices for labels
        labels_for_subject = [all_labels[idx] for idx in indices_filtered_subject_df]
        
        filtered_subject_df = filtered_subject_df.reset_index()


        for fold, fold_key in enumerate(indices_dict_all_subjects[subject][list(indices_dict_all_subjects[subject].keys())[0]]):
            
            features_dfs_for_subject[fold_key] = pd.DataFrame()
            labels_with_set_indices[fold_key] = []

        

        for label, label_key in enumerate(train_indices[subject].keys()):

            indices_of_label_within_subject_dataframe = [index for index, value in enumerate(labels_for_subject) if value == label]


            filtered_label_df = filtered_subject_df.loc[indices_of_label_within_subject_dataframe]

            #filtered_label_df.drop(columns = ["level_0"], inplace = True)

            filtered_label_df = filtered_label_df.reset_index()


            for fold, fold_key in enumerate(indices_dict_all_subjects[subject][label_key].keys()):

                set_indices_in_filtered_df = indices_dict_all_subjects[subject][label_key][fold_key]


                feature_df_with_set_indices = filtered_label_df.loc[set_indices_in_filtered_df]

                labels_with_set_indices[fold_key].extend([label]*len(feature_df_with_set_indices.index))
                
        
                features_dfs_for_subject[fold_key] = pd.concat([features_dfs_for_subject[fold_key], feature_df_with_set_indices], ignore_index=True)

        
        for fold, fold_key in enumerate(indices_dict_all_subjects[subject][list(indices_dict_all_subjects[subject].keys())[0]]):
            features_dfs_all_folds[fold_key] =  pd.concat([features_dfs_for_subject[fold_key], features_dfs_for_subject[fold_key]], ignore_index=True)

            labels_all_folds[fold_key].extend(labels_with_set_indices[fold_key])
            
            # Postprocessing
            features_dfs_all_folds[fold_key].drop(columns = ["level_0"], inplace = True)
            features_dfs_all_folds[fold_key].drop(columns = ["index"], inplace = True)


    
    return features_dfs_all_folds, labels_all_folds

train_features_dfs_all_folds, train_labels_all_folds = filter_dataframe_with_indices(feature_df, all_labels, train_indices)
validation_features_dfs_all_folds, validation_labels_all_folds = filter_dataframe_with_indices(feature_df, all_labels, validation_indices)

In [76]:
def initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds, n_folds = 5):
    """
    Initialize dictionaries with folds as keys and assign features and labels accordingly.
    
    Args:
    - train_features_dfs_all_folds (dict): Dictionary containing training features for all folds.
    - train_labels_all_folds (dict): Dictionary containing training labels for all folds.
    - validation_features_dfs_all_folds (dict): Dictionary containing validation features for all folds.
    - validation_labels_all_folds (dict): Dictionary containing validation labels for all folds.
    - n_folds (int): Number of folds.
    
    Returns:
    - dict: Dictionary containing training features for each fold.
    - dict: Dictionary containing training labels for each fold.
    - dict: Dictionary containing validation features for each fold.
    - dict: Dictionary containing validation labels for each fold.
    """
    X_train = {}
    y_train = {}
    X_test = {}
    y_test = {}

    for fold in range(n_folds):
        # Set random seed for reproducibility
        np.random.seed(42)

        # Shuffle indices
        indices_train = np.random.permutation(len(train_features_dfs_all_folds["Fold_" + str(fold)]))
        indices_test = np.random.permutation(len(validation_features_dfs_all_folds["Fold_" + str(fold)]))

        # Shuffle rows of X_train[fold] and y_train[fold]
        X_train_fold = train_features_dfs_all_folds["Fold_" + str(fold)].iloc[indices_train]
        y_train_fold = [train_labels_all_folds["Fold_" + str(fold)][index] for index in indices_train]

        # Shuffle rows of X_test[fold] and y_test[fold]
        X_test_fold = validation_features_dfs_all_folds["Fold_" + str(fold)].iloc[indices_test]
        y_test_fold = [validation_labels_all_folds["Fold_" + str(fold)][index] for index in indices_test]

        X_train[fold] = X_train_fold
        y_train[fold] = y_train_fold
        X_test[fold] = X_test_fold
        y_test[fold] = y_test_fold

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds)

# Random Forest

In [77]:
def train_rf_cross_validation(X_train, y_train, X_test, y_test, n_estimators=500, random_state=5):
    """
    Train RandomForestClassifier using cross-validation and calculate mean accuracy.
    
    Args:
    - X_train (dict): Dictionary containing training features for each fold.
    - y_train (dict): Dictionary containing training labels for each fold.
    - X_test (dict): Dictionary containing validation features for each fold.
    - y_test (dict): Dictionary containing validation labels for each fold.
    - n_estimators (int): Number of trees in the forest (default=900).
    - random_state (int): Random seed (default=5).
    
    Returns:
    - float: Mean accuracy across all folds.
    """
    rf = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    all_accuracies = []

    for fold in range(len(X_train)):
        rf.fit(X_train[fold], y_train[fold])
        y_pred = rf.predict(X_test[fold])
        accuracy = accuracy_score(y_pred, y_test[fold])
        all_accuracies.append(accuracy)
        print("Accuracy for fold", fold, ":", accuracy)

    mean_accuracy = np.mean(all_accuracies)
    print("Mean Accuracy:", mean_accuracy)
    pass

train_rf_cross_validation(X_train, y_train, X_test, y_test)


# Use less data for parameter tuning

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 0 : 0.7666666666666667


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.6


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.75


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.7833333333333333
Accuracy for fold 4 : 0.8363636363636363
Mean Accuracy: 0.7472727272727273


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


# XGBoost

In [58]:
def train_xgb_cross_validation(X_train, y_train, X_test, y_test, seed=41):
    """
    Train XGBoost Classifier using cross-validation and calculate mean accuracy.
    
    Args:
    - X_train (dict): Dictionary containing training features for each fold.
    - y_train (dict): Dictionary containing training labels for each fold.
    - X_test (dict): Dictionary containing validation features for each fold.
    - y_test (dict): Dictionary containing validation labels for each fold.
    - seed (int): Random seed (default=41).
    
    Returns:
    - float: Mean accuracy across all folds.
    """
    model = xgb.XGBClassifier(seed=seed)
    all_accuracies = []

    for fold in range(len(X_train)):
        # Remove duplicate columns
        X_train[fold] = X_train[fold].loc[:, ~X_train[fold].columns.duplicated()]

        model.fit(X_train[fold], y_train[fold])

        X_test[fold] = X_test[fold].loc[:, ~X_test[fold].columns.duplicated()]

        y_pred = model.predict(X_test[fold])
        accuracy = accuracy_score(y_pred, y_test[fold])
        all_accuracies.append(accuracy)
        print("Accuracy for fold", fold, ":", accuracy)

    mean_accuracy = np.mean(all_accuracies)
    print("Mean Accuracy:", mean_accuracy)
    return mean_accuracy

mean_accuracy_xgb = train_xgb_cross_validation(X_train, y_train, X_test, y_test)

Accuracy for fold 0 : 0.9833333333333333
Accuracy for fold 1 : 0.9333333333333333
Accuracy for fold 2 : 1.0
Accuracy for fold 3 : 0.9833333333333333
Accuracy for fold 4 : 0.9818181818181818
Mean Accuracy: 0.9763636363636363
