In [128]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import os

In [129]:
subject_list = ["m292", "m294"]
label_list  = [0, 1, 2, 3, 4]

In [134]:
def import_and_concatenate_datasets(subjects, list_of_filenames, parent_directory):
    """
    Load feature DataFrames for specified subjects.

    Args:
    - subjects (list): List of subject names.
    - list_of_filenames (list): List of filenames to load for each subject.
    - parent_directory (str): Parent directory where subject data is stored.

    Returns:
    - dict: Dictionary containing subject feature DataFrames.
    - list: List of all labels across subjects.
    - pd.DataFrame: Concatenated feature DataFrame.
    """
    subject_feature_dfs = {}
    all_labels = []

    for subject in subjects:
        single_data_frames = []

        for filename in list_of_filenames:
            path = os.path.join(parent_directory, "Features", subject, filename)
            if os.path.exists(path):
                single_data_frames.append(pd.read_csv(path))

        path = os.path.join(parent_directory, "Features", subject, "Traditional_Features.csv")
        if os.path.exists(path):
            single_data_frames.append(pd.read_csv(path))

        combined_df = pd.concat(single_data_frames, axis=1)

        # Drop unnamed columns
        combined_df = combined_df.loc[:, ~combined_df.columns.str.contains('unnamed', case=False)]

        # Remove _left columns
        combined_df.drop(combined_df.filter(like='_left').columns, axis=1, inplace=True)

        # Remove duplicates based on Label column
        combined_df.drop_duplicates(subset='Label', inplace=True)

        # Add Subject column
        combined_df['Subject'] = subjects.index(subject)

        subject_feature_dfs[subject] = combined_df

        all_labels.extend(combined_df['Label'])

    # Concatenate all DataFrames
    feature_df = pd.concat(subject_feature_dfs.values(), ignore_index=True)

    # Drop Label column
    feature_df.drop(columns=["Label"], inplace=True)

    return subject_feature_dfs, all_labels, feature_df


In [135]:
list_of_filenames = ["Topological_Summary_Statistics.csv", "Signature_Statistics.csv", "Advanced_Features.csv"]

subject_feature_dfs, all_labels, feature_df = import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory="")

# Save Concatenated Features for Data Exploration

In [123]:
# Save dataframe for data exploration
feature_df.to_csv("Features/All_Features.csv")

# Cross Validation

In [124]:
def load_indices_all_subjects(subject_list):
    """
    Load train, validation, and test set indices for all subjects.
    
    Args:
    - subject_list (list): List of subject names.
    
    Returns:
    - dict: Dictionary containing train indices for all subjects.
    - dict: Dictionary containing validation indices for all subjects.
    - dict: Dictionary containing test indices for all subjects.
    """
    train_indices = {}
    validation_indices = {}
    test_indices_dict_all_subjects = {}

    for subject in subject_list:
        # Train indices
        train_indices[subject] = np.load("../Time_Series/Train_Test_Splitting/"+str(subject)+"/Train_Indices_All_Labels_All_Folds.npy", allow_pickle=True).item()

        # Validation indices
        validation_indices[subject] = np.load("../Time_Series/Train_Test_Splitting/"+str(subject)+"/Validation_Indices_All_Labels_All_Folds.npy", allow_pickle=True).item()

        # Final Test set indices
        test_indices_dict_all_subjects[subject] = np.load("../Time_Series/Train_Test_Splitting/"+str(subject)+"/Final_Test_Set_Indices_All_Labels.npy", allow_pickle=True).item()

    return train_indices, validation_indices, test_indices_dict_all_subjects

train_indices, validation_indices, test_indices = load_indices_all_subjects(subject_list)

In [125]:
def filter_dataframe_with_indices(feature_df, all_labels, indices_dict_all_subjects):
    """
    indices_dict_all_subjects: Dictionary. Structure {subject: {label: {fold: indices list}}}.
    The indices list is a list of indices within each "label" dataframe (for each subject). If there are
    71 segments of each label (for a subject), then the indices list would be a subset of range(0, 71).
    """

    features_dfs_all_folds = {}
    labels_all_folds = {}

    # Initialize dictionarys with folds as keys and the train/validation sets/ their labels as values
    subject = list(indices_dict_all_subjects.keys())[0]
    for fold, fold_key in enumerate(indices_dict_all_subjects[subject][list(indices_dict_all_subjects[subject].keys())[0]]):
        features_dfs_all_folds[fold_key] = pd.DataFrame()
        labels_all_folds[fold_key] = []

    
    for subject_idx, subject in enumerate(indices_dict_all_subjects.keys()):

        # dictionarys with folds as labels and the train/validation sets/ their labels as values 
        labels_with_set_indices = {}
        features_dfs_for_subject = {}

        filtered_subject_df = feature_df[feature_df["Subject"] == subject_idx]

        indices_filtered_subject_df = filtered_subject_df.index # Save indices for labels
        labels_for_subject = [all_labels[idx] for idx in indices_filtered_subject_df]
        
        filtered_subject_df = filtered_subject_df.reset_index()


        for fold, fold_key in enumerate(indices_dict_all_subjects[subject][list(indices_dict_all_subjects[subject].keys())[0]]):
            
            features_dfs_for_subject[fold_key] = pd.DataFrame()
            labels_with_set_indices[fold_key] = []

        

        for label, label_key in enumerate(train_indices[subject].keys()):

            indices_of_label_within_subject_dataframe = [index for index, value in enumerate(labels_for_subject) if value == label]


            filtered_label_df = filtered_subject_df.loc[indices_of_label_within_subject_dataframe]

            #filtered_label_df.drop(columns = ["level_0"], inplace = True)

            filtered_label_df = filtered_label_df.reset_index()


            for fold, fold_key in enumerate(indices_dict_all_subjects[subject][label_key].keys()):

                set_indices_in_filtered_df = indices_dict_all_subjects[subject][label_key][fold_key]


                feature_df_with_set_indices = filtered_label_df.loc[set_indices_in_filtered_df]

                labels_with_set_indices[fold_key].extend([label]*len(feature_df_with_set_indices.index))
                
        
                features_dfs_for_subject[fold_key] = pd.concat([features_dfs_for_subject[fold_key], feature_df_with_set_indices], ignore_index=True)

        
        for fold, fold_key in enumerate(indices_dict_all_subjects[subject][list(indices_dict_all_subjects[subject].keys())[0]]):
            features_dfs_all_folds[fold_key] =  pd.concat([features_dfs_for_subject[fold_key], features_dfs_for_subject[fold_key]], ignore_index=True)

            labels_all_folds[fold_key].extend(labels_with_set_indices[fold_key])
            
            # Postprocessing
            features_dfs_all_folds[fold_key].drop(columns = ["level_0"], inplace = True)
            features_dfs_all_folds[fold_key].drop(columns = ["index"], inplace = True)


    
    return features_dfs_all_folds, labels_all_folds

train_features_dfs_all_folds, train_labels_all_folds = filter_dataframe_with_indices(feature_df, all_labels, train_indices)
validation_features_dfs_all_folds, validation_labels_all_folds = filter_dataframe_with_indices(feature_df, all_labels, validation_indices)

KeyError: "None of [Index([14, 22, 70, 56, 37, 39, 17, 32, 67, 51, 33, 15, 45, 59, 60, 57, 69, 16,\n       40, 66,  6, 48,  0, 47, 46, 29, 26, 49, 13, 38, 72, 71, 36, 50, 20, 10,\n       18, 42, 35,  1, 11,  4,  9, 19, 62, 54, 43],\n      dtype='int64')] are in the [index]"

In [126]:
def initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds, n_folds = 5):
    """
    Initialize dictionaries with folds as keys and assign features and labels accordingly.
    
    Args:
    - train_features_dfs_all_folds (dict): Dictionary containing training features for all folds.
    - train_labels_all_folds (dict): Dictionary containing training labels for all folds.
    - validation_features_dfs_all_folds (dict): Dictionary containing validation features for all folds.
    - validation_labels_all_folds (dict): Dictionary containing validation labels for all folds.
    - n_folds (int): Number of folds.
    
    Returns:
    - dict: Dictionary containing training features for each fold.
    - dict: Dictionary containing training labels for each fold.
    - dict: Dictionary containing validation features for each fold.
    - dict: Dictionary containing validation labels for each fold.
    """
    X_train = {}
    y_train = {}
    X_test = {}
    y_test = {}

    for fold in range(n_folds):
        # Set random seed for reproducibility
        np.random.seed(42)

        # Shuffle indices
        indices_train = np.random.permutation(len(train_features_dfs_all_folds["Fold_" + str(fold)]))
        indices_test = np.random.permutation(len(validation_features_dfs_all_folds["Fold_" + str(fold)]))

        # Shuffle rows of X_train[fold] and y_train[fold]
        X_train_fold = train_features_dfs_all_folds["Fold_" + str(fold)].iloc[indices_train]
        y_train_fold = [train_labels_all_folds["Fold_" + str(fold)][index] for index in indices_train]

        # Shuffle rows of X_test[fold] and y_test[fold]
        X_test_fold = validation_features_dfs_all_folds["Fold_" + str(fold)].iloc[indices_test]
        y_test_fold = [validation_labels_all_folds["Fold_" + str(fold)][index] for index in indices_test]

        X_train[fold] = X_train_fold
        y_train[fold] = y_train_fold
        X_test[fold] = X_test_fold
        y_test[fold] = y_test_fold

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds)

# Random Forest

In [127]:
def train_rf_cross_validation(X_train, y_train, X_test, y_test, n_estimators=500, random_state=5):
    """
    Train RandomForestClassifier using cross-validation and calculate mean accuracy.
    
    Args:
    - X_train (dict): Dictionary containing training features for each fold.
    - y_train (dict): Dictionary containing training labels for each fold.
    - X_test (dict): Dictionary containing validation features for each fold.
    - y_test (dict): Dictionary containing validation labels for each fold.
    - n_estimators (int): Number of trees in the forest (default=900).
    - random_state (int): Random seed (default=5).
    
    Returns:
    - float: Mean accuracy across all folds.
    """
    rf = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    all_accuracies = []

    for fold in range(len(X_train)):
        rf.fit(X_train[fold], y_train[fold])
        y_pred = rf.predict(X_test[fold])
        accuracy = accuracy_score(y_pred, y_test[fold])
        all_accuracies.append(accuracy)
        print("Accuracy for fold", fold, ":", accuracy)

    mean_accuracy = np.mean(all_accuracies)
    print("Mean Accuracy:", mean_accuracy)
    pass

train_rf_cross_validation(X_train, y_train, X_test, y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 0 : 0.8


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.7833333333333333


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.7833333333333333


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.85
Accuracy for fold 4 : 0.8909090909090909
Mean Accuracy: 0.8215151515151515


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


# XGBoost

In [110]:
def train_xgb_cross_validation(X_train, y_train, X_test, y_test, seed=41):
    """
    Train XGBoost Classifier using cross-validation and calculate mean accuracy.
    
    Args:
    - X_train (dict): Dictionary containing training features for each fold.
    - y_train (dict): Dictionary containing training labels for each fold.
    - X_test (dict): Dictionary containing validation features for each fold.
    - y_test (dict): Dictionary containing validation labels for each fold.
    - seed (int): Random seed (default=41).
    
    Returns:
    - float: Mean accuracy across all folds.
    """
    model = xgb.XGBClassifier(seed=seed)
    all_accuracies = []

    for fold in range(len(X_train)):
        # Remove duplicate columns
        X_train[fold] = X_train[fold].loc[:, ~X_train[fold].columns.duplicated()]

        model.fit(X_train[fold], y_train[fold])

        X_test[fold] = X_test[fold].loc[:, ~X_test[fold].columns.duplicated()]

        y_pred = model.predict(X_test[fold])
        accuracy = accuracy_score(y_pred, y_test[fold])
        all_accuracies.append(accuracy)
        print("Accuracy for fold", fold, ":", accuracy)

    mean_accuracy = np.mean(all_accuracies)
    print("Mean Accuracy:", mean_accuracy)
    return mean_accuracy

mean_accuracy_xgb = train_xgb_cross_validation(X_train, y_train, X_test, y_test)

Accuracy for fold 0 : 0.85
Accuracy for fold 1 : 0.8
Accuracy for fold 2 : 0.8333333333333334
Accuracy for fold 3 : 0.8
Accuracy for fold 4 : 0.8363636363636363
Mean Accuracy: 0.8239393939393939
