In [21]:
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import mlflow
from mlflow.models import infer_signature

import sys
sys.path.append('../')

import Utils.Time_Series_Classification_Helpers as ts_helpers
import Utils.Classification_Helpers as helpers

# Set up MLflow

In [22]:
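# Start the MLflow tracking server before running the logging cells below (run this command in a separate terminal):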
#!mlflow server --host 127.0.0.1 --port 8080

In [23]:
# Point the MLflow client at the local tracking server; the server must be
# running at this address before any run is started.
tracking_uri = "http://127.0.0.1:8080"
mlflow.set_tracking_uri(uri=tracking_uri)

# Import and Concatenate Data

In [36]:
# Subject identifiers whose feature files and fold indices are loaded below.
subject_list = [f"m{subject_id}" for subject_id in (292, 294, 298, 299, 300)]
# Integer class labels handed to the fold-filtering helpers.
label_list = list(range(5))

## Dataframes that do not depend on folds

In [37]:
# Feature files that are the same regardless of the cross-validation fold.
list_of_filenames = ["Topological_Summary_Statistics.csv", "Signature_Statistics.csv", "Advanced_Features.csv"]
    
# Load the listed files for every subject into a single dataframe.
# NOTE(review): how rows/columns are aligned across files and subjects is
# decided inside ts_helpers -- confirm there.
feature_df = ts_helpers.import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

# Keep the "Label" column as a separate Series.
# NOTE(review): all_labels is not referenced by any later cell visible in this review.
all_labels = feature_df["Label"]

## Dataframes that DO depend on folds

In [38]:
# Feature file that is kept separate because its contents depend on the fold.
# Note: this rebinds list_of_filenames, replacing the fold-independent list.
list_of_filenames = ["Vectorization_Features.csv"]

# Load the fold-dependent features for every subject with the same helper
# used for the fold-independent dataframe.
fold_dependant_feature_df = ts_helpers.import_and_concatenate_datasets(subject_list, list_of_filenames, parent_directory = "")

# Experiments with Single Features

In [39]:
# Report how many columns the fold-independent dataframe has before filtering.
print(f"There are {len(feature_df.columns)} features in the main dataframe.")

There are 1919 features in the main dataframe.


In [40]:
# Display the column index to inspect the available feature names.
feature_df.columns

Index(['EEG_Persistence Entropy_Dim_0', 'EEG_Amplitude_Dim_0',
       'EEG_No_Points_Dim_0', 'EEG_Largest_Persistence_Dim_0',
       'EEG_Persistence Entropy_Dim_1', 'EEG_Amplitude_Dim_1',
       'EEG_No_Points_Dim_1', 'EEG_Largest_Persistence_Dim_1',
       'EEG_Persistence Entropy_Dim_2', 'EEG_Amplitude_Dim_2',
       ...
       'EMG_SH_Vectorization_Coordinate_94_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_95_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_96_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_97_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_98_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_99_Homology_Dim_2',
       'EMG_AC_coordinate_cc1', 'EMG_AC_coordinate_cc2',
       'EMG_AC_coordinate_cc3', 'EMG_AC_coordinate_cc4'],
      dtype='object', length=1919)

## Save features for Data Exploration

In [64]:
# Save the feature table to disk for data exploration.
# NOTE(review): the recorded execution count of this cell (64) is later than
# the filter and fillna cells further down, so the file last written may hold
# the filtered frame rather than all features -- re-run in order to confirm.
os.makedirs("Features", exist_ok=True)  # to_csv does not create missing directories
feature_df.to_csv("Features/All_Features.csv")

## Filter

In [41]:
# Substrings identifying the columns to discard; the helper presumably drops
# every column whose name contains one of them (confirm in Classification_Helpers).
list_of_strings_in_column_name = ["_Vectorization_Coordinate_"]
feature_df = helpers.remove_columns_with_str(feature_df, list_of_strings_in_column_name)

# Report the number of columns left after filtering.
print(f"There now are {feature_df.shape[1]} features in the main dataframe.")

There now are 119 features in the main dataframe.


# Replace NaNs with 0

In [54]:
# Replace missing values with 0. Rebinding the result (instead of inplace=True)
# avoids mutating a frame that may be a slice or alias of an earlier object
# and keeps the cell safe to re-run.
feature_df = feature_df.fillna(0)

# Cross Validation

In [55]:
# Load the train / validation / test index sets for the listed subjects.
# NOTE(review): test_indices is not used in the cells visible in this review;
# the evaluation below is done on the validation indices only.
train_indices, validation_indices, test_indices = helpers.load_folds(subject_list, parent_directory="")

## Features that do not depend on folds

In [56]:
# Select the training and validation rows of the fold-independent features.
# Each call returns the features and the labels separately; judging by the
# names they are grouped per fold -- confirm the container type in helpers.
train_features_dfs_all_folds, train_labels_all_folds = helpers.filter_dataframe_with_indices(feature_df, train_indices, label_list)
validation_features_dfs_all_folds, validation_labels_all_folds = helpers.filter_dataframe_with_indices(feature_df, validation_indices, label_list)

## Fold-dependant Features

In [57]:
# Same selection for the fold-dependent features. The second return value is
# discarded here (the labels are already kept from the fold-independent split).
train_fold_dependant_features_dfs_all_folds, _ = helpers.filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, train_indices, label_list)
validation_fold_dependant_features_dfs_all_folds, _ = helpers.filter_fold_dependant_dataframe_with_indices(fold_dependant_feature_df, validation_indices, label_list)

## Reformat data

In [58]:
# Uncomment the two lines below to add the fold-dependent features; leave them commented out to exclude them.
#train_features_dfs_all_folds = helpers.combine_all_features(train_features_dfs_all_folds, train_fold_dependant_features_dfs_all_folds)
#validation_features_dfs_all_folds = helpers.combine_all_features(validation_features_dfs_all_folds, validation_fold_dependant_features_dfs_all_folds)

In [59]:
# Build the per-fold containers used by the training loop (indexed by fold number).
# NOTE: X_test / y_test are built from the *validation* features and labels,
# not from the held-out test indices.
X_train, y_train, X_test, y_test = helpers.initialize_fold_dicts(train_features_dfs_all_folds, train_labels_all_folds, validation_features_dfs_all_folds, validation_labels_all_folds)

# MLflow

## Random Forest

In [60]:
# Random-forest hyperparameters; the same dict is reused later when logging
# the run, so it must contain only valid RandomForestClassifier arguments.
params = {"random_state": 42, "n_estimators": 900, "min_samples_split": 2}
rf = RandomForestClassifier(**params)
all_accuracies = []

n_folds = 5  # number of cross-validation folds evaluated below

# Refit the same estimator on each fold's training split and score it on that
# fold's validation split (held under the X_test / y_test names).
# After the loop, `rf` is the model fitted on the last fold and `fold` keeps
# its final value; the MLflow logging cell relies on both.
for fold in range(n_folds):
    rf.fit(X_train[fold], y_train[fold])
    y_pred = rf.predict(X_test[fold])
    # scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, but
    # keeping the documented order prevents silent errors if this is swapped
    # for an asymmetric metric such as precision or recall.
    accuracy = accuracy_score(y_test[fold], y_pred)
    all_accuracies.append(accuracy)
    print("Accuracy for fold", fold + 1, ":", accuracy)

average_accuracy = np.mean(all_accuracies)
print("Average Accuracy:", average_accuracy)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 1 : 0.72


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 2 : 0.71


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 3 : 0.72


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy for fold 4 : 0.6666666666666666
Accuracy for fold 5 : 0.730909090909091
Average Accuracy: 0.7095151515151514


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [61]:
# Parameters to log with the run: the model hyperparameters plus the feature
# names used for training.
# Build a new dict instead of aliasing `params`: adding "features" to `params`
# itself would make RandomForestClassifier(**params) fail with a TypeError
# the next time the training cell is run.
features = X_train[0].columns
mlflow_params = {**params, "features": features}

In [62]:
# Start an MLflow run
# Log the cross-validation results and the fitted model to MLflow.
# The tracking server set via mlflow.set_tracking_uri must be reachable,
# otherwise start_run raises MlflowException (connection refused).
# NOTE: `rf` and `fold` come from the training loop: `rf` is the model fitted
# on the last fold and `fold` is that fold's index, so the example rows below
# are taken from the data this model was trained on.
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(mlflow_params)

    # Log the mean, minimum and maximum validation accuracy over the folds
    mlflow.log_metric("accuracy", average_accuracy)
    mlflow.log_metric("minimal accuracy",  np.min(all_accuracies))
    mlflow.log_metric("maximal accuracy",  np.max(all_accuracies))

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Random Forest")

    # A few rows are enough to document the expected input; logging the whole
    # training fold as the input example only bloats the stored artifact.
    example_rows = X_train[fold].head(5)

    # Infer the model signature. The input side is still inferred from the
    # raw array (same signature as before), but predictions are made on the
    # dataframe so scikit-learn does not warn about missing feature names.
    signature = infer_signature(example_rows.values, rf.predict(example_rows))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="random_forest_time_series",
        signature=signature,
        input_example=example_rows,
        registered_model_name="random-forest-timeseries",
    )

MlflowException: API request to http://127.0.0.1:8080/api/2.0/mlflow/runs/create failed with exception HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: /api/2.0/mlflow/runs/create (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x291cb68f0>: Failed to establish a new connection: [Errno 61] Connection refused'))