In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Sequential
import pandas as pd
import numpy as np
import model_training_utils as mtu

# svDB prep

In [2]:
# Load svDB and keep only the H1 Betti feature column plus the annotation label.
# The Betti values arrive from the CSV as strings: mtu.fix_betti_string_svdb
# turns each string into a list, and every entry of that list is then cast to
# float so each row ends up holding a np.array of floats.
svdb_data = (
    pd.read_csv("data_sliced_tda/svDB.csv")[['betti_H1', 'ann']]
    .assign(
        betti_H1=lambda frame: frame['betti_H1'].apply(
            lambda raw: np.array([float(value) for value in mtu.fix_betti_string_svdb(raw)])
        )
    )
)

# mitDB prep

In [6]:
# Load mitDB. Its Betti data lives in the 'betti' column; each entry is parsed
# with mtu.fix_betti_string_mitdb and element [1] of the parsed result is stored
# as a np.array under 'betti_H1'.
# NOTE(review): element [1] is presumably the H1 entry — confirm against the helper.
# Only 'betti_H1' and 'ann' are kept, matching the columns of the svDB frame.
mitdb_data = (
    pd.read_csv("data_sliced_tda/mitDB.csv")
    .assign(
        betti_H1=lambda frame: frame['betti'].apply(
            lambda raw: np.array(mtu.fix_betti_string_mitdb(raw)[1])
        )
    )
    .loc[:, ['betti_H1', 'ann']]
)

# Exploring the class balance of the mitDB labels

In [23]:
# Number of mitDB rows per annotation class (shows how balanced the labels are).
mitdb_label_counts = mitdb_data['ann'].value_counts()
mitdb_label_counts

ann
0    82877
1    26347
Name: count, dtype: int64

# Merging data and doing rest of the prep

In [8]:
# Stack both datasets row-wise. ignore_index=True renumbers the rows 0..n-1,
# so the two frames' original (overlapping) index labels are discarded.
concatenated_data = pd.concat([svdb_data, mitdb_data], ignore_index=True)

In [11]:
# Features are the per-row Betti arrays; targets are the annotation labels.
X = concatenated_data['betti_H1']
y = concatenated_data['ann']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=420,
)

# One-hot encode the training labels.
y_train = to_categorical(np.asarray(y_train.tolist()))

# Stack the per-row arrays into a single array, then add a trailing axis of
# size 1 so the training data is 3-D: (n_samples, n_values, 1).
# NOTE(review): presumably the loaded model expects this
# (samples, steps, channels) layout — confirm against the model definition.
X_train = np.array(X_train.to_list())
n_samples = X_train.shape[0]
n_values = X_train.shape[1]
X_train = X_train.reshape(n_samples, n_values, 1)

# Model training

In [None]:
# Load the stored model (presumably the untrained architecture, given the
# models_raw directory — confirm), fit it on the prepared training data, and
# write the fitted model back to disk.
raw_model_path = 'models_raw/4conv_1dens32.hdf5'
trained_model_path = "models_trained/both_svdb_mitdb_trained.hdf5"

model = load_model(raw_model_path)

# validation_split=0.2 makes Keras set aside part of the training data for
# per-epoch validation metrics; the separate test set is not touched here.
fit_options = dict(batch_size=128, epochs=10, validation_split=0.2, verbose=1)
history = model.fit(X_train, y_train, **fit_options)

model.save(trained_model_path)

# Model testing and evaluating

In [None]:
# Reload the saved model from disk; this lets the evaluation cells below run
# without re-executing the training cell.
saved_model_path = "models_trained/both_svdb_mitdb_trained.hdf5"
model = load_model(saved_model_path)

In [12]:
# Encode the test classes and shape the test features for the model.
#
# This cell overwrites its own inputs (y_test, X_test). Without the guards
# below, running it a second time fails with AttributeError, because the
# arrays produced by the first run have no `.to_list()`. The isinstance checks
# make the cell safe to re-run: already-converted data is left untouched.

# One-hot encode the test labels while y_test is still the pandas Series
# returned by train_test_split.
if isinstance(y_test, pd.Series):
    y_test = to_categorical(np.array(y_test.to_list()))

# Stack the per-row test arrays into one array and add a trailing axis of
# size 1, giving 3-D test data: (n_samples, n_values, 1).
if isinstance(X_test, pd.Series):
    X_test = np.array(X_test.to_list())
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [13]:
# Score the model on the held-out test set; evaluate() is unpacked into the
# loss value and the accuracy metric.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics

for label, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(label, value)

Test Loss: 0.34327471256256104
Test Accuracy: 0.8591911792755127


In [20]:
# Collapse the one-hot test labels and the model's per-class outputs to class
# indices by taking the position of the largest value in each row.
class_scores = model.predict(X_test)
y_true = y_test.argmax(axis=1)
y_pred = class_scores.argmax(axis=1)

# scikit-learn convention: rows are true classes, columns are predicted classes.
confusion_matrix(y_true, y_pred)



array([[47920,  1040],
       [ 7219,  2475]])

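**Accuracy** is a measure of how well the model predicts the correct class labels for the given data. It is defined as the ratio of the number of correct predictions to the total number of predictions. It provides an overall view of the model's performance, but it can be misleading when the classes are imbalanced: in this test set about 83% of the samples belong to class 0 (48960 of 58654), so a model that always predicted class 0 would already reach roughly 83% accuracy.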

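**Test loss**, also known as evaluation loss, is the value of the loss function computed on held-out test data that the model never saw during training. It quantifies the difference between the predicted outputs of the model and the true labels in the test set. It should not be confused with the validation loss, which Keras reports during `fit` and which is computed on the `validation_split` portion of the training data.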

The **precision** represents the proportion of correctly predicted positive instances out of all instances predicted as positive. It measures the model's ability to avoid false positives.

The **recall**, also known as sensitivity or true positive rate, represents the proportion of correctly predicted positive instances out of all actual positive instances. It measures the model's ability to identify all positive instances.

The **F1-score** is the harmonic mean of precision and recall, providing a single metric that balances both measures. It is a useful metric when you want to consider both precision and recall simultaneously.

In [22]:
# Per-class precision, recall, F1-score and support for the test predictions.
report_text = classification_report(y_true, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.87      0.98      0.92     48960
           1       0.70      0.26      0.37      9694

    accuracy                           0.86     58654
   macro avg       0.79      0.62      0.65     58654
weighted avg       0.84      0.86      0.83     58654

