In [169]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils import shuffle

# Import and Concatenate Data

In [170]:
train_feature_df = pd.read_csv("Features/Train_Topological_Summary_Statistics.csv")
train_advanced_feature_df = pd.read_csv("Features/Train_Advanced_Features.csv")
train_statistical_features = pd.read_csv("Features/Train_Statistical_Features.csv")
train_wasserstein_features = pd.read_csv("Features/Train_Wasserstein_Features.csv")
train_landscape_features = pd.read_csv("Features/Train_Landscape_Features.csv")
train_bottleneck_features = pd.read_csv("Features/Train_bottleneck_Features.csv")
train_signature_features = pd.read_csv("Features/Train_Signature_Statistics.csv")
train_direct_features = pd.read_csv("Features/Train_Direct_Persistence_Diagrams.csv")


test_feature_df = pd.read_csv("Features/Test_Topological_Summary_Statistics.csv")
test_advanced_feature_df = pd.read_csv("Features/Test_Advanced_Features.csv")
test_statistical_features = pd.read_csv("Features/Test_Statistical_Features.csv")
test_wasserstein_features = pd.read_csv("Features/Test_Wasserstein_Features.csv")
test_landscape_features = pd.read_csv("Features/Test_Landscape_Features.csv")
test_bottleneck_features = pd.read_csv("Features/Test_bottleneck_Features.csv")
test_signature_features = pd.read_csv("Features/Test_Signature_Statistics.csv")
test_direct_features = pd.read_csv("Features/Test_Direct_Persistence_Diagrams.csv")


# Concatenate train and test sets (and split again later) to not get a problem with indices
summary_statistics = pd.concat([train_feature_df, test_feature_df], ignore_index=True)
advanced_features = pd.concat([train_advanced_feature_df, test_advanced_feature_df], ignore_index=True)
statistical_features = pd.concat([train_statistical_features, test_statistical_features], ignore_index=True)
wasserstein_features =  pd.concat([train_wasserstein_features, test_wasserstein_features], ignore_index=True)
landscape_features =  pd.concat([train_landscape_features, test_landscape_features], ignore_index=True)
bottleneck_features =  pd.concat([train_bottleneck_features, test_bottleneck_features], ignore_index=True)
signature_features =  pd.concat([train_signature_features, test_signature_features], ignore_index=True)
direct_features =  pd.concat([train_direct_features, test_direct_features], ignore_index=True)


# Save labels for later
labels = summary_statistics["Label"]

list_of_dataframes =  [summary_statistics, advanced_features, statistical_features, wasserstein_features, landscape_features, bottleneck_features, signature_features, direct_features]

for df in list_of_dataframes:
    # We do not want the labels in the training dat
    df.drop(columns = ["Label"], inplace = True)
    # All dataframes come with unnamed columns TODO
    df.drop(df.columns[df.columns.str.contains('unnamed',case = False)], axis = 1, inplace = True)


feature_df = pd.DataFrame(index=summary_statistics.index)

# Choose one, several or all of the following dataframes
feature_df = feature_df.join(summary_statistics) # improves result slightly
feature_df = feature_df.join(statistical_features) # improves result slightly, but not with all random_states
#feature_df = feature_df.join(wasserstein_features) # only improves result if advanced_features is not used
feature_df = feature_df.join(advanced_features) # improves result by a lot, also works well alone
feature_df = feature_df.join(bottleneck_features) # good standalone accuracy
feature_df = feature_df.join(signature_features) # improves result compared to only using statistical features

# Very bad accuracies
#feature_df = feature_df.join(landscape_features) # makes result worse, bad standalone accuracy
#feature_df = feature_df.join(direct_features)

# Preprocess Data

In [171]:
# Split dataframe

X_train = feature_df[int(len(train_advanced_feature_df)):]
y_train = labels[int(len(train_advanced_feature_df)):]

X_test = feature_df[int(len(test_advanced_feature_df)):]
y_test = labels[int(len(test_advanced_feature_df)):]

In [172]:
rf = RandomForestClassifier(random_state=8)
rf.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


RandomForestClassifier(random_state=8)

In [173]:
y_pred = rf.predict(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [174]:
accuracy = accuracy_score(y_pred, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9800148038490007


In [175]:
np.unique(y_pred, return_counts=True)

(array([1, 3, 5, 7]), array([ 271, 1052,   23,    5]))

In [176]:
np.unique(y_test, return_counts=True)

(array([1, 3, 5, 7]), array([ 250, 1050,   45,    6]))