In [1]:
from discovery_utils.getters.horizon_scout import get_training_data

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
import pandas as pd
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix

In [81]:
def make_train_val_datasets(
    dataset: pd.DataFrame,
    random_state: int,
    training_frac: float,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the input DataFrame into non-overlapping training and validation datasets.

    The rows are first shuffled using the given random state. The first
    ``int(len(dataset) * training_frac)`` shuffled rows become the training set
    and all remaining rows become the validation set.

    Args:
        dataset (pd.DataFrame): The complete dataset to be split.
        random_state (int): Seed passed to ``DataFrame.sample`` for shuffling the data.
        training_frac (float): The fraction of the dataset (between 0 and 1
            inclusive) to allocate to the training set.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames,
        ``(train_dataset, val_dataset)``, each with a fresh 0-based index.

    Raises:
        ValueError: If ``training_frac`` is not between 0 and 1.
    """
    # A fraction outside [0, 1] would silently give an empty or mis-sized split.
    if not 0 <= training_frac <= 1:
        raise ValueError(
            f"training_frac must be between 0 and 1, got {training_frac!r}"
        )

    shuffled_dataset = dataset.sample(frac=1, random_state=random_state)
    train_size = int(len(shuffled_dataset) * training_frac)

    # Use .iloc so the split is always positional, whatever the index type is.
    train_dataset = shuffled_dataset.iloc[:train_size]
    val_dataset = shuffled_dataset.iloc[train_size:]
    return (train_dataset.reset_index(drop=True), val_dataset.reset_index(drop=True))


def make_train_X_train_y_val_X_val_y(
    train_dataset: pd.DataFrame,
    val_dataset: pd.DataFrame
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Build feature matrices and label vectors from the training and validation datasets.

    For each dataset, the per-row values of the ``embedding`` column are stacked
    vertically into a single 2-D array (one row per record), and the ``relevant``
    column is returned as a 1-D array of labels.

    Args:
        train_dataset (pd.DataFrame): Training rows with ``embedding`` and
            ``relevant`` columns.
        val_dataset (pd.DataFrame): Validation rows with ``embedding`` and
            ``relevant`` columns.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        ``(train_X, train_y, val_X, val_y)``.

    Raises:
        KeyError: If either dataset lacks the ``embedding`` or ``relevant`` column.
        ValueError: If a dataset is empty (``np.vstack`` needs at least one row)
            or its embeddings have inconsistent lengths.
    """
    # Explicit column lookup; .to_numpy() guarantees real ndarrays for the labels.
    train_X = np.vstack(train_dataset["embedding"].to_numpy())
    train_y = train_dataset["relevant"].to_numpy()
    val_X = np.vstack(val_dataset["embedding"].to_numpy())
    val_y = val_dataset["relevant"].to_numpy()
    return (train_X, train_y, val_X, val_y)

    
def print_confusion_matrix(actual, predictions):
    """
    Print a labelled confusion matrix followed by precision, recall and accuracy.

    Rows of the printed table are the actual classes and columns are the
    predicted classes, labelled 'Neg' and 'Pos'. The layout has exactly two
    columns and precision/recall use ``average='binary'``, so this is intended
    for two-class problems.

    Args:
        actual: The true class labels.
        predictions: The predicted class labels, aligned with ``actual``.
    """
    class_names = ['Neg', 'Pos']
    header_template = '            | {:<4} | {:<4} |'
    row_template = 'Actual {:<4} | {:<4} | {:<4} |'

    matrix = confusion_matrix(actual, predictions)

    # Table: header, separator, then one line per actual class
    print('            Predicted:')
    print(header_template.format(*class_names))
    print('-----------------------------')
    for row_idx in range(len(matrix)):
        print(row_template.format(class_names[row_idx], *matrix[row_idx]))
    print()  # Blank line between the table and the metrics

    # Summary metrics are all computed before any of them is printed
    precision = precision_score(actual, predictions, average='binary')
    recall = recall_score(actual, predictions, average='binary')
    accuracy = accuracy_score(actual, predictions)

    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Accuracy: {accuracy:.3f}")
    

In [82]:
# AHL: fetch the training data, split it into train/val, then extract X/y arrays
ahl_training_data = get_training_data("AHL")
ahl_train, ahl_val = make_train_val_datasets(
    ahl_training_data,
    random_state=13,
    training_frac=0.85,
)
(
    ahl_train_X,
    ahl_train_y,
    ahl_val_X,
    ahl_val_y,
) = make_train_X_train_y_val_X_val_y(train_dataset=ahl_train, val_dataset=ahl_val)

In [83]:
# AHL: fit a linear-kernel SVM on the training split and report validation metrics
from sklearn.svm import SVC
linear_svm_classifier = SVC(C=2, kernel='linear').fit(ahl_train_X, ahl_train_y)
linear_svm_ahl_preds = linear_svm_classifier.predict(ahl_val_X)
print_confusion_matrix(actual=ahl_val_y, predictions=linear_svm_ahl_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 229  | 22   |
Actual Pos  | 13   | 252  |

Precision: 0.920
Recall: 0.951
Accuracy: 0.932


In [84]:
# ASF: fetch the training data, split it into train/val, then extract X/y arrays
asf_training_data = get_training_data("ASF")
asf_train, asf_val = make_train_val_datasets(
    asf_training_data,
    random_state=13,
    training_frac=0.85,
)
(
    asf_train_X,
    asf_train_y,
    asf_val_X,
    asf_val_y,
) = make_train_X_train_y_val_X_val_y(train_dataset=asf_train, val_dataset=asf_val)

In [85]:
# ASF: fit a linear-kernel SVM on the training split and report validation metrics
from sklearn.svm import SVC
linear_svm_classifier = SVC(C=0.5, kernel='linear').fit(asf_train_X, asf_train_y)
linear_svm_asf_preds = linear_svm_classifier.predict(asf_val_X)
print_confusion_matrix(actual=asf_val_y, predictions=linear_svm_asf_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 218  | 15   |
Actual Pos  | 16   | 225  |

Precision: 0.938
Recall: 0.934
Accuracy: 0.935


In [86]:
# AFS: fetch the training data, split it into train/val, then extract X/y arrays
afs_training_data = get_training_data("AFS")
afs_train, afs_val = make_train_val_datasets(
    afs_training_data,
    random_state=13,
    training_frac=0.85,
)
(
    afs_train_X,
    afs_train_y,
    afs_val_X,
    afs_val_y,
) = make_train_X_train_y_val_X_val_y(train_dataset=afs_train, val_dataset=afs_val)

In [97]:
# AFS: fit a linear-kernel SVM on the training split and report validation metrics
from sklearn.svm import SVC
linear_svm_classifier = SVC(C=0.2, kernel='linear').fit(afs_train_X, afs_train_y)
linear_svm_afs_preds = linear_svm_classifier.predict(afs_val_X)
print_confusion_matrix(actual=afs_val_y, predictions=linear_svm_afs_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 209  | 8    |
Actual Pos  | 5    | 224  |

Precision: 0.966
Recall: 0.978
Accuracy: 0.971
