In [2]:
import pandas as pd
import json

def get_positive_training_data(positive_path: str) -> pd.DataFrame:
    """
    Loads and preprocesses positive training data from a CSV file.
    
    Args:
        positive_path (str): The file path to the CSV containing positive samples.
        
    Returns:
        pd.DataFrame: A DataFrame containing the processed positive
            training data, with duplicates and NA values removed,
            and a 'relevant' column set to 1.
    """
    return (
        pd.read_csv(positive_path)
        [["id", "text"]]
        .dropna(subset=["id"])
        .drop_duplicates(subset=["id"])
        .assign(relevant=1)
        .reset_index(drop=True)
    )

def make_negative_training_data(
    data: pd.DataFrame,
    random_state: int,
    n_samples: int) -> pd.DataFrame:
    """
    Samples and labels negative training data from a given DataFrame.
    
    Args:
        data (pd.DataFrame): The DataFrame to sample from.
        random_state (int): A seed for the random number generator to ensure reproducibility.
        n_samples (int): The number of samples to draw.
        
    Returns:
        pd.DataFrame: A DataFrame of sampled negative data, with a 'relevant' column set to 0.
    """
    return (
        data
        .sample(n_samples, random_state=random_state)
        .assign(relevant=0)
        .reset_index(drop=True)
    )

def make_training_data(
    positive_training_data: pd.DataFrame,
    negative_data_list: list,
    negative_data_frac_list: list,
    random_state: int
    ) -> pd.DataFrame:
    """
    Combines positive and negative training data into a single dataset.
    
    Args:
        positive_training_data (pd.DataFrame): DataFrame containing the positive samples.
        negative_data_list (list): A list of DataFrames from which negative samples will be drawn.
        negative_data_frac_list (list): A list of fractions dictating the proportion of negative samples
                                        to draw relative to the number of positive samples.
        random_state (int): Seed for the random number generator used in sampling.
        
    Returns:
        pd.DataFrame: A DataFrame containing both positive and negative training data.
    """
    training_data = [positive_training_data]
    positive_training_data_len = len(positive_training_data)
    negative_data_n_samples_list = [int(positive_training_data_len * frac) for frac in negative_data_frac_list]
    for negative_data, negative_data_n_samples in zip(negative_data_list, negative_data_n_samples_list):
        negative_training_data = make_negative_training_data(negative_data, random_state, negative_data_n_samples)
        training_data.append(negative_training_data)
    return pd.concat(training_data).reset_index(drop=True)
    

In [8]:
# Set paths (upload this data to S3 and write getters for it)
afs_pos_path = "training_data_inputs/Training data for mission classifier (positive samples only) - AFS.csv"
ahl_pos_path = "training_data_inputs/Training data for mission classifier (positive samples only) - AHL.csv"
asf_pos_path = "training_data_inputs/Training data for mission classifier (positive samples only) - ASF.csv"
cb_orgs_path = "training_data_inputs/cb_organizations_2024-04-17.parquet"
afs_extra_path = "training_data_inputs/relevance_labels_eval_annotated_afs.jsonl"

In [3]:
# Load positive training data
afs_positive_training_data = get_positive_training_data(afs_pos_path)
ahl_positive_training_data = get_positive_training_data(ahl_pos_path)
asf_positive_training_data = get_positive_training_data(asf_pos_path)
# Load crunchbase orgs data which will used to sample negative training data
cb_orgs = (
    pd.read_parquet("training_data_inputs/cb_organizations_2024-04-17.parquet")
    .query("founded_on >= '2021-01-01'")
    .query("country_code == 'GBR'")
    .query("short_description.notna()")
    [["id", "short_description"]]
    .rename(columns={"short_description": "text"})
)

In [4]:
# Create training data for each mission
afs_training_data = make_training_data(
    positive_training_data=afs_positive_training_data,
    negative_data_list=[ahl_positive_training_data, asf_positive_training_data, cb_orgs],
    negative_data_frac_list=[0.15, 0.15, 0.7],
    random_state=1
    )

ahl_training_data = make_training_data(
    positive_training_data=ahl_positive_training_data,
    negative_data_list=[afs_positive_training_data, asf_positive_training_data, cb_orgs],
    negative_data_frac_list=[0.15, 0.15, 0.7],
    random_state=2
    )

asf_training_data = make_training_data(
    positive_training_data=asf_positive_training_data,
    negative_data_list=[afs_positive_training_data, ahl_positive_training_data, cb_orgs],
    negative_data_frac_list=[0.15, 0.15, 0.7],
    random_state=3
    )

In [22]:
# Create extra training data for AFS from ISS 3
with open(afs_extra_path, 'r') as file:
    lines = file.readlines()

parsed_data = [json.loads(line) for line in lines]
afs_extra_training_data = (
    pd.DataFrame(parsed_data)
    [["id", "text", "prediction"]]
    .query("prediction != 'Not-specified'")
    .replace({'prediction': {'Not-relevant': 0, 'Relevant': 1}})
    .rename(columns={"prediction": "relevant"})
)

  .replace({'prediction': {'Not-relevant': 0, 'Relevant': 1}})


In [30]:
# Add extra training data to AFS
afs_training_data = pd.concat([afs_training_data, afs_extra_training_data])

In [5]:
# Save training datasets for each mission to csv (change this to save to S3 / write getters to get training data)
afs_training_data.to_csv("training_data_outputs/afs_training_data.csv", index=False)
ahl_training_data.to_csv("training_data_outputs/ahl_training_data.csv", index=False)
asf_training_data.to_csv("training_data_outputs/asf_training_data.csv", index=False)

In [6]:
# from discovery_utils.utils import s3

# bucket_name = s3.BUCKET_NAME_RAW  # or directly "your-bucket-name"
# file_path = "data/crunchbase/Crunchbase_2024-04-17/organizations.parquet"
# client = s3.s3_client()

# df = s3._download_obj(
#     s3_client=client,
#     bucket=bucket_name,
#     path_from=file_path,
#     download_as="dataframe"
# )