# Create a dataset for evaluating GPT labelling quality

- Get labels
- Sample 50 random examples per category
- Output these examples to a Google Sheet
- Output these examples in a format that is usable by Prodigy

In [2]:
from discovery_child_development.getters.labels import (
    get_relevance_labels,
    get_taxonomy_labels,
    get_detection_management_labels
)
import pandas as pd
import numpy as np

2024-01-24 17:54:17,170 - botocore.credentials - INFO - Found credentials in environment variables.
2024-01-24 17:54:18,752 - datasets - INFO - PyTorch version 2.1.2 available.


  from .autonotebook import tqdm as notebook_tqdm


In [31]:
from nesta_ds_utils.loading_saving import S3
from discovery_child_development import PROJECT_DIR, S3_BUCKET
# Local folder where the sampled evaluation datasets are written
OUTPUT_PATH = PROJECT_DIR / "outputs/labels/evals_data"
# Create the folder (and any missing parents); no error if it already exists
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)
# Key prefix used when uploading the evaluation datasets to S3
S3_PATH = "data/labels/child_development/evals_data/"

In [36]:
# sample n for each unique value of column prediction
def sample_n_predictions(
    df: pd.DataFrame,
    n: int,
    column: str = 'prediction',
    seed: int = 42
) -> pd.DataFrame:
    """
    Sample n rows, without replacement, for each unique value of a column.

    Randomness comes from a local ``np.random.RandomState`` seeded with
    ``seed``, so calling this function does not reseed NumPy's global random
    number generator.

    Args:
        df (pd.DataFrame): dataframe with predictions
        n (int): number of rows to sample from each group
        column (str, optional): column to group by. Defaults to 'prediction'.
        seed (int, optional): random seed. Defaults to 42.

    Returns:
        pd.DataFrame: dataframe with n rows for each unique value of ``column``,
            with a fresh 0..N-1 index

    Raises:
        ValueError: if any group contains fewer than n rows
    """
    # One generator shared across all groups, so every group draws from the
    # same seeded stream without touching the global np.random state
    rng = np.random.RandomState(seed)
    return (
        df
        .groupby(column)
        # GroupBy.sample keeps all original columns, including the grouping
        # column, which groupby.apply no longer guarantees in recent pandas
        .sample(n=n, random_state=rng)
        .reset_index(drop=True)
    )

def sample_n_predictions_stratified(
    df: pd.DataFrame,
    n: int,
    column_A: str = 'prediction',
    column_B: str= 'source',
    seed: int = 42
) -> pd.DataFrame:
    """
    Produce a sample with n elements for each label in column_A, uniformly distributed by column_B labels

    Each (column_A, column_B) group contributes ``n // k`` rows, where k is the
    number of distinct values in column_B across the whole dataframe. With two
    column_B values this is ``n // 2`` per group. Because of the integer
    division, a label in column_A can end up with slightly fewer than n rows
    when n is not divisible by k, and with fewer still if that label does not
    occur with every column_B value.

    Randomness comes from a local ``np.random.RandomState`` seeded with
    ``seed``, so calling this function does not reseed NumPy's global random
    number generator.

    Args:
        df (pd.DataFrame): dataframe with predictions
        n (int): target number of samples per column_A label
        column_A (str, optional): column to groupby. Defaults to 'prediction'.
        column_B (str, optional): column to stratify. Defaults to 'source'.
        seed (int, optional): random seed. Defaults to 42.

    Returns:
        pd.DataFrame: sampled rows for each (column_A, column_B) pair, with a
            fresh 0..N-1 index

    Raises:
        ValueError: if any (column_A, column_B) group contains fewer rows than
            the per-group sample size
    """
    # One generator shared across all groups; the global np.random state is untouched
    rng = np.random.RandomState(seed)
    # Number of strata to split n across; max(..., 1) avoids dividing by zero
    # when the dataframe has no (non-missing) column_B values
    n_strata = max(df[column_B].nunique(), 1)
    return (
        df
        .groupby([column_A, column_B])
        # GroupBy.sample keeps all original columns, including the grouping columns
        .sample(n=n // n_strata, random_state=rng)
        .reset_index(drop=True)
    )


def output_eval_data(df: pd.DataFrame, filename: str) -> None:
    """
    Write a dataframe as JSON lines under OUTPUT_PATH and upload the file to S3.

    Args:
        df (pd.DataFrame): dataframe to export (one JSON record per line)
        filename (str): file name used both locally and as the suffix of the S3 key
    """
    local_file = str(OUTPUT_PATH / filename)
    # One record per row, newline-delimited
    df.to_json(path_or_buf=local_file, orient='records', lines=True)
    remote_key = S3_PATH + filename
    S3.upload_file(path_from=local_file, bucket=S3_BUCKET, path_to=remote_key)


## Relevance labels

In [37]:
# Load the relevance labels and draw the evaluation sample,
# split across the 'source' values within each 'prediction' label
lab = get_relevance_labels()
eval_data = sample_n_predictions_stratified(
    df=lab,
    n=50,
    column_A='prediction',
    column_B='source',
    seed=100,
)

2024-01-25 16:19:40,789 - root - INFO - File data/labels/child_development/relevance_labels.jsonl downloaded from discovery-iss to /Users/karlis.kanders/Documents/code/discovery_child_development/outputs/labels/relevance/relevance_labels.jsonl


In [38]:
# Sanity check: rows sampled per (prediction, source) pair.
# count() reports non-null values per column, so a missing value shows up as a lower count.
eval_data.groupby(['prediction','source']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,model,timestamp,text
prediction,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Not-relevant,openalex,25,25,25,25
Not-relevant,patents,25,25,25,25
Not-specified,openalex,25,25,25,24
Not-specified,patents,25,25,25,25
Relevant,openalex,25,25,25,25
Relevant,patents,25,25,25,25


In [39]:
# Save the relevance evaluation sample locally and upload it to S3
filename = "relevance_labels_eval.jsonl"
output_eval_data(df=eval_data, filename=filename)


## Detection management labels

In [40]:
# Load the detection/management labels and draw the evaluation sample,
# split across the 'source' values within each 'prediction' label
lab = get_detection_management_labels()
eval_data = sample_n_predictions_stratified(
    df=lab,
    n=50,
    column_A='prediction',
    column_B='source',
    seed=100,
)

2024-01-25 16:20:15,081 - root - INFO - File data/labels/child_development/detection_management_labels.jsonl downloaded from discovery-iss to /Users/karlis.kanders/Documents/code/discovery_child_development/outputs/labels/detection_management/detection_management_labels.jsonl


In [42]:
# Rows available per (prediction, source) pair in the full label set, before sampling
lab.groupby(['prediction','source']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text,model,timestamp
prediction,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Both,openalex,117,117,117,117
Both,patents,52,52,52,52
Detection,openalex,283,283,283,283
Detection,patents,157,157,157,157
Management,openalex,530,530,530,530
Management,patents,651,651,651,651
,openalex,43,43,43,43
,patents,42,42,42,42


In [43]:
# Sanity check: rows sampled per (prediction, source) pair
eval_data.groupby(['prediction','source']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text,model,timestamp
prediction,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Both,openalex,25,25,25,25
Both,patents,25,25,25,25
Detection,openalex,25,25,25,25
Detection,patents,25,25,25,25
Management,openalex,25,25,25,25
Management,patents,25,25,25,25
,openalex,25,25,25,25
,patents,25,25,25,25


In [44]:
# Save the detection/management evaluation sample locally and upload it to S3
filename = "detection_management_labels_eval.jsonl"
output_eval_data(df=eval_data, filename=filename)


## Taxonomy labels

In [47]:
# Load the taxonomy labels (reuses the name `lab` from the sections above)
lab = get_taxonomy_labels()

In [50]:
# Peek at the first row to see which columns the taxonomy labels have
lab.head(1)

Unnamed: 0,id,text,source,prediction_raw,prediction
0,AR-117118-A1,N-ACILATED FATTY AMINO ACIDS TO REDUCE THE ABS...,patents,"[Nutrition and weight, Technology (general), D...","[Nutrition and weight, Technology (general), D..."


In [55]:
# Count how many times each taxonomy category appears in 'prediction'.
# NOTE(review): 'prediction' looks like a list of categories per row (see the
# head() output), so explode() gives one row per (row, category) pair — confirm.
category_counts = (
    lab
    .explode("prediction")
    .groupby(['prediction'])
    # Number of non-null ids per category
    .agg(counts=('id', 'count'))
    .reset_index()
    # Ascending order: least frequent categories first
    .sort_values(by='counts')
)

In [57]:
# Display the per-category counts, least frequent first
category_counts

Unnamed: 0,prediction,counts
25,Non-tech assessments,66
42,Wearables,111
14,Income,123
28,Oral health,136
0,AR VR,137
17,Internet,156
40,Statistical methods,157
37,Social media,159
36,Sleep,191
18,Labour market,208
