In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import os

import sys
sys.path.append('../') 

import Utils.Classification_Helpers as helpers
import Utils.Brain_Imaging_Classification_Helpers as bi_helpers

In [4]:
# Subject identifiers handed to the data loaders and fold helpers below.
subject_list = ["m292", "m294", "m298", "m299", "m300"]
# Class labels; a later cell builds the fold-index keys "Label_<n>" from these values.
label_list  = [0, 1, 2, 3, 4]

# Import and Concatenate Data

## Dataframes that do not depend on folds

In [5]:
# Feature files that do not depend on the cross-validation fold.
list_of_filenames = ["Topological_Summary_Statistics.csv", "Signature_Statistics.csv", "Advanced_Features.csv", "Direct_Coordinate_Features.csv"]

# The helper's second return value is not needed here and is discarded.
# NOTE(review): parent_directory="" presumably resolves paths relative to the notebook's working directory — confirm in the helper.
feature_df, _ = bi_helpers.import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory="")

## Dataframes that depend on folds (ATOL)

In [17]:
# ATOL features used during cross-validation (these depend on the fold).
# NOTE: `list_of_filenames` is re-bound twice in this cell, so it no longer holds the fold-independent file names afterwards.
list_of_filenames = ["ATOL_Vectorization_Features.csv"]

fold_dependant_feature_df, _ = bi_helpers.import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

# ATOL features for the final test run; appended to `feature_df` in the "Final Test Set" section.
list_of_filenames = ["ATOL_Vectorization_Features_for_Final_Test.csv"]

fold_dependant_final_test_feature_df, _ = bi_helpers.import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

# Preprocess

In [18]:
# Replace missing feature values with 0 (only the fold-independent features are filled here).
feature_df = feature_df.fillna(0)

# Save Concatenated Features for Data Exploration

In [19]:
# Save dataframe for data exploration.
# Create the output folder first so the write does not fail when "Features/" is missing.
os.makedirs("Features", exist_ok=True)
feature_df.to_csv("Features/All_Features.csv")

# Cross Validation

In [20]:
# TODO This can be in the helper file as well
# Load the pre-computed fold indices for every subject.
# Later cells index train/validation as indices[subject]["Label_<n>"]["Fold_<k>"].
# NOTE(review): the layout of `test_indices` is not visible in this notebook — confirm in helpers.load_folds.
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory = "../Time_Series")

## Features that do not depend on folds

In [21]:
# Split the fold-independent features using the train and validation indices.
# NOTE(review): judging by the names, each call returns per-fold feature frames plus labels — confirm in Utils.Classification_Helpers.
train_features_dfs_all_folds, train_labels_all_folds = helpers.filter_dataframe_with_indices(feature_df, train_indices, label_list)
validation_features_dfs_all_folds, validation_labels_all_folds = helpers.filter_dataframe_with_indices(feature_df, validation_indices, label_list)

## Fold-dependent features (ATOL)

In [22]:
# Same split for the fold-dependent (ATOL) features.
# The second return value of each call is discarded; labels come from the fold-independent split.
train_fold_dependant_features_dfs_all_folds, _ = helpers.filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, train_indices, label_list)
validation_fold_dependant_features_dfs_all_folds, _ = helpers.filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, validation_indices, label_list)

## Reformat Data

In [23]:
# Fold-dependant (ATOL) features are currently left out; uncomment the two lines below to include them
#train_features_dfs_all_folds = helpers.combine_all_features(train_features_dfs_all_folds, train_fold_dependant_features_dfs_all_folds)
#validation_features_dfs_all_folds = helpers.combine_all_features(validation_features_dfs_all_folds, validation_fold_dependant_features_dfs_all_folds)

In [24]:
# Build the per-fold inputs for cross-validation.
# NOTE: despite their names, X_test / y_test are built from the *validation* folds here, not from the final test set.
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds)

## Random Forest

In [46]:
# Random forest cross-validation; the call's return value is the cell's last expression, so the notebook displays it.
helpers.train_rf_cross_validation(X_train, y_train, X_test, y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.97


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.9866666666666667


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.9966666666666667


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.98
Accuracy for fold 5 : 0.9963636363636363
Average Accuracy: 0.9859393939393939


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


0.9859393939393939

## XGBoost

In [47]:
# XGBoost cross-validation on the same folds; the returned value is kept in mean_accuracy_xgb.
mean_accuracy_xgb = helpers.train_xgb_cross_validation(X_train, y_train, X_test, y_test)

Accuracy for fold 0 : 0.9833333333333333
Accuracy for fold 1 : 0.9933333333333333
Accuracy for fold 2 : 0.9933333333333333
Accuracy for fold 3 : 0.9866666666666667
Accuracy for fold 4 : 0.9963636363636363
Mean Accuracy: 0.9906060606060606


In [74]:
# Cross-validate one hand-picked XGBoost configuration and collect the
# feature importances of every fold.
params = {"seed": 1, "learning_rate": 0.14, "n_estimators": 50, "max_depth": 25, "min_child_weight": 0, "max_delta_step": 1, "subsample":0.5}

xb = xgb.XGBClassifier(**params)
all_accuracies = []
all_feature_importances = []

for fold in range(len(X_train)):
    # Remove duplicate columns; the de-duplicated frames are stored back into
    # the fold dicts, exactly as before, so later cells see the same data.
    # NOTE(review): train uses the project helper while validation uses pandas
    # directly — presumably equivalent ("keep first"); confirm in the helper.
    X_train[fold] = helpers.keep_first_duplicate_columns(X_train[fold])
    X_test[fold] = X_test[fold].loc[:, ~X_test[fold].columns.duplicated()]

    xb.fit(X_train[fold], y_train[fold])

    y_pred = xb.predict(X_test[fold])
    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    accuracy = accuracy_score(y_test[fold], y_pred)
    all_accuracies.append(accuracy)
    # Include the fold number; the old message printed the accuracy where the fold belongs.
    print("Accuracy for fold", fold, ":", accuracy)

    # Get feature importances for the current fold
    all_feature_importances.append(xb.feature_importances_)

average_accuracy = np.mean(all_accuracies)
print("Mean Accuracy:", average_accuracy)

Accuracy for fold 0.99
Accuracy for fold 0.9933333333333333
Accuracy for fold 0.9966666666666667
Accuracy for fold 0.99
Accuracy for fold 1.0
Mean Accuracy: 0.994


# Final Test Set

In [25]:
def concatenate_final_training_and_test_indices(train_indices, validation_indices, subject_list, label_list):
    """
    Build the index sets used for the final training run.

    For every subject and every label, the fold-0 training indices are
    followed by the fold-0 validation indices.

    Returns a nested dict of the form
    {subject: {"Label_<label>": concatenated indices}}.
    """

    merged_indices = {}

    for subject in subject_list:
        subject_train = train_indices[subject]
        subject_validation = validation_indices[subject]

        # The original author notes that the choice of fold does not matter
        # here, so fold 0 is used for both parts.
        merged_indices[subject] = {
            key: np.concatenate((subject_train[key]["Fold_0"], subject_validation[key]["Fold_0"]))
            for key in ("Label_" + str(label) for label in label_list)
        }

    return merged_indices
      
    

In [26]:
# Final training indices: train and validation indices merged per subject and label
final_train_indices = concatenate_final_training_and_test_indices(
    train_indices, validation_indices, subject_list, label_list
)

In [27]:
# Concatenate ATOL features with all other features.
# Written so that re-running the cell is harmless: the metadata columns are
# dropped only if they are still present (previously `pop` raised KeyError on
# a second run), and the ATOL columns are appended to `feature_df` only once.
fold_dependant_final_test_feature_df = fold_dependant_final_test_feature_df.drop(
    columns=["Label", "Fold", "Subject"], errors="ignore"
)

# If every ATOL column already exists in feature_df, the cell has run before.
atol_already_appended = fold_dependant_final_test_feature_df.columns.isin(feature_df.columns).all()
if not atol_already_appended:
    feature_df = pd.concat([feature_df, fold_dependant_final_test_feature_df], axis=1)

In [28]:
# Build the final train / test inputs from the combined feature table
(
    X_train_final,
    y_train_final,
    X_test_final,
    y_test_final,
) = helpers.create_final_input_data_dicts(feature_df, final_train_indices, test_indices, label_list)

## Random Forest

In [29]:
# Fit a random forest on the final training data with ten different seeds and
# score each one on the final test data.
final_accuracies = []

for seed in range(10):

    # n_jobs=-1 lets scikit-learn fit and predict the trees in parallel on all
    # cores (this cell was previously interrupted while fitting 1700 trees
    # serially); random_state still controls the randomness of each run.
    params = {"random_state": seed, "n_estimators": 1700, "min_samples_split": 6, "max_depth": 15, "n_jobs": -1}

    rf = RandomForestClassifier(**params)

    rf.fit(X_train_final, y_train_final)
    y_pred = rf.predict(X_test_final)
    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    accuracy = accuracy_score(y_test_final, y_pred)
    print("Accuracy:", accuracy)

    final_accuracies.append(accuracy)


print("Mean accuracy " + str(np.mean(final_accuracies)) + ", with standard deviation "+str(np.std(final_accuracies)) + ".")

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


KeyboardInterrupt: 

## XGBoost

In [30]:
# Fit XGBoost on the final training data with ten different seeds and score
# each one on the final test data.
final_accuracies = []

for seed in range(10):

    params = {"seed": seed, "learning_rate": 0.19, "n_estimators": 250, "max_depth": 4, "min_child_weight": 0, "max_delta_step": 1, "subsample":0.5}

    xb = xgb.XGBClassifier(**params)

    xb.fit(X_train_final, y_train_final)
    y_pred = xb.predict(X_test_final)
    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    accuracy = accuracy_score(y_test_final, y_pred)
    print("Accuracy:", accuracy)

    final_accuracies.append(accuracy)


print("Mean accuracy " + str(np.mean(final_accuracies)) + ", with standard deviation "+str(np.std(final_accuracies)) + ".")

Accuracy: 1.0
Accuracy: 0.9973333333333333
Accuracy: 0.9946666666666667
Accuracy: 0.9973333333333333
Accuracy: 0.9973333333333333



KeyboardInterrupt

