In [56]:
from discovery_utils.enrichment.crunchbase import _enrich_keyword_labels
from discovery_utils.getters.horizon_scout import get_training_data

In [57]:
import pandas as pd
from typing import Tuple

In [60]:
def classify_by_keywords(dataset: pd.DataFrame, mission: str) -> pd.DataFrame:
    """Label each row of `dataset` as relevant or not using keyword matching.

    Rows whose "id" appears in the output of `_enrich_keyword_labels` get
    `prediction = 1`; all remaining rows get `prediction = 0`. A `correct`
    column records whether the prediction equals the `relevant` label.

    Args:
        dataset (pd.DataFrame): Dataset to be classified. Must have the columns
            "id" (merge key) and "relevant" (label compared against the
            prediction). The "text" column is presumably what
            `_enrich_keyword_labels` matches keywords against (confirm there).
        mission (str): Mission ('AHL', 'ASF' or 'AFS').

    Returns:
        pd.DataFrame: The left-merge of `dataset` with the keyword labels, plus
        integer columns "prediction" and "correct".
    """
    # Every row returned by the keyword enrichment counts as a positive prediction.
    keyword_hits = _enrich_keyword_labels(dataset, mission).assign(prediction=1)

    # The left join keeps every input row; rows without a keyword hit have NaN predictions.
    merged = dataset.merge(keyword_hits, how="left", on="id")
    with_predictions = merged.fillna({"prediction": 0})

    scored = with_predictions.assign(
        correct=with_predictions['relevant'] == with_predictions['prediction']
    )
    return scored.astype({"correct": int, "prediction": int})
    

def make_train_val_test_datasets(
    dataset: pd.DataFrame,
    random_state: int,
    training_frac: float,
    val_frac: float,
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the input DataFrame into non-overlapping training, validation, and test datasets.

    First shuffle the dataset using a specified random state.
    Then partitions the dataset into training, validation, and test sets.
    The sum of `training_frac` and `val_frac` must be less than or equal to 1.
    If the sum is less than 1, the remaining portion of the dataset becomes the test set.
    Split sizes are truncated to whole rows, so any rounding remainder also
    ends up in the test set.

    Args:
        dataset (pd.DataFrame): The complete dataset to be split.
        random_state (int): A seed used by the random number generator for shuffling the data.
        training_frac (float): The fraction of the dataset to allocate to the training set.
        val_frac (float): The fraction of the dataset to allocate to the validation set.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames:
        (training_dataset, val_dataset, test_dataset).

    Raises:
        ValueError: If either fraction is negative or the fractions sum to more than 1.
    """
    # Negative fractions would turn into negative slice bounds and silently
    # produce overlapping splits, so reject them up front.
    if training_frac < 0 or val_frac < 0:
        raise ValueError(
            f"training_frac and val_frac must be non-negative, got {training_frac} and {val_frac}"
        )
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected because
    # of floating point representation error.
    if training_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            f"training_frac + val_frac must not exceed 1, got {training_frac + val_frac}"
        )

    shuffled_dataset = dataset.sample(frac=1, random_state=random_state)
    training_size = int(len(shuffled_dataset) * training_frac)
    val_size = int(len(shuffled_dataset) * val_frac)
    val_end = training_size + val_size

    # .iloc makes the positional slicing explicit regardless of the index type.
    training_dataset = shuffled_dataset.iloc[:training_size]
    val_dataset = shuffled_dataset.iloc[training_size:val_end]
    test_dataset = shuffled_dataset.iloc[val_end:]
    return (training_dataset, val_dataset, test_dataset)

def performance(dataset: pd.DataFrame, dataset_label:str) -> None:
    """Calculate and print Accuracy, TPR and TNR for dataset.

    Each metric is printed as a percentage rounded to two decimals. A metric
    whose denominator is zero (empty dataset, or no rows of the relevant /
    non-relevant class) is reported as NaN instead of dividing by zero.

    Args:
        dataset (pd.DataFrame): Must have the columns "relevant" (1 for relevant
            rows, 0 otherwise) and "correct" (1 where the prediction matched
            the label, 0 otherwise).
        dataset_label (str): Prefix printed in front of every metric line.
    """
    def correct_rate(correct: pd.Series) -> float:
        """Return the share of correct rows, or NaN when there are no rows."""
        if len(correct) == 0:
            return float("nan")
        return correct.sum() / len(correct)

    def float_to_percent(number: float) -> float:
        return round(number * 100, 2)

    accuracy = correct_rate(dataset["correct"])
    # TPR: share of truly relevant rows that were classified correctly.
    tpr = correct_rate(dataset.loc[dataset["relevant"] == 1, "correct"])
    # TNR: share of truly non-relevant rows that were classified correctly.
    tnr = correct_rate(dataset.loc[dataset["relevant"] == 0, "correct"])

    print(f"{dataset_label} -- Accuracy: {float_to_percent(accuracy)}")
    print(f"{dataset_label} -- TPR: {float_to_percent(tpr)}")
    print(f"{dataset_label} -- TNR: {float_to_percent(tnr)}")
    

In [71]:
# Evaluate the keyword classifier for the "AHL" mission.
ahl_training_data = get_training_data("AHL")
# Shuffle with seed 13; 80% training, 10% validation, the remainder is the test split.
ahl_train, ahl_val, ahl_test = make_train_val_test_datasets(ahl_training_data, 13, 0.8, 0.1)
# Only the test split is classified and scored here; performance() prints Accuracy, TPR and TNR.
ahl_test_with_preds = classify_by_keywords(ahl_test, "AHL")
performance(ahl_test_with_preds, "AHL Test Set")

AHL Test Set -- Accuracy: 69.23
AHL Test Set -- TPR: 40.11
AHL Test Set -- TNR: 98.85


In [70]:
# Evaluate the keyword classifier for the "ASF" mission.
asf_training_data = get_training_data("ASF")
# Shuffle with seed 13; 80% training, 10% validation, the remainder is the test split.
asf_train, asf_val, asf_test = make_train_val_test_datasets(asf_training_data, 13, 0.8, 0.1)
# Only the test split is classified and scored here; performance() prints Accuracy, TPR and TNR.
asf_test_with_preds = classify_by_keywords(asf_test, "ASF")
performance(asf_test_with_preds, "ASF Test Set")

ASF Test Set -- Accuracy: 79.15
ASF Test Set -- TPR: 64.97
ASF Test Set -- TNR: 96.84


In [72]:
# Evaluate the keyword classifier for the "AFS" mission.
afs_training_data = get_training_data("AFS")
# Shuffle with seed 13; 80% training, 10% validation, the remainder is the test split.
afs_train, afs_val, afs_test = make_train_val_test_datasets(afs_training_data, 13, 0.8, 0.1)
# Only the test split is classified and scored here; performance() prints Accuracy, TPR and TNR.
afs_test_with_preds = classify_by_keywords(afs_test, "AFS")
performance(afs_test_with_preds, "AFS Test Set")

AFS Test Set -- Accuracy: 88.03
AFS Test Set -- TPR: 84.46
AFS Test Set -- TNR: 94.0
