In [3]:
from discovery_child_development.utils.jsonl_utils import load_jsonl
from discovery_child_development import PROJECT_DIR

import pandas as pd

# Folder (under PROJECT_DIR) that holds the annotated evaluation label files
EVALS_DIR = PROJECT_DIR / 'outputs/labels/evals_data'
# One annotated JSONL file per labelling task reviewed in this notebook
LABELS_RELEVANCE = EVALS_DIR / "relevance_labels_eval_annotated.jsonl"
LABELS_DETECTION_MANAGEMENT = EVALS_DIR / "detection_labels_eval_annotated.jsonl"
LABELS_TAXONOMY = EVALS_DIR / "taxonomy_labels_eval_annotated.jsonl"

In [5]:
def get_data(path):
    """Load annotated labels and flag whether each prediction matched the annotation.

    Every record's ``accept`` value is reduced to its first element, which is
    then compared with ``prediction`` to build the boolean ``correct`` column.

    Args:
        path: Location of the annotated JSONL file; handed to ``load_jsonl``.

    Returns:
        pandas.DataFrame with the columns ``id``, ``prediction``, ``accept``,
        ``_annotator_id`` and ``correct``, one row per loaded record. Records
        whose ``accept`` is empty or missing get a missing ``accept`` value and
        ``correct=False`` instead of aborting the load.
    """
    columns = ['id', 'prediction', 'accept', '_annotator_id', 'correct']

    def _first_accepted(selected):
        # An empty or missing selection has no first element; report "no label"
        # rather than raising, so one such record does not break the whole load.
        try:
            return selected[0]
        except (IndexError, TypeError, KeyError):
            return None

    return (
        pd.DataFrame(load_jsonl(path))
        # annotations by annotator: keep the first accepted option only
        .assign(accept = lambda df: df['accept'].apply(_first_accepted))
        # compare prediction vs accept label
        .assign(correct = lambda df: df['prediction'] == df['accept'])
        )[columns]

def get_taxonomy_data(path):
    """Load annotated taxonomy labels and mark the accepted predictions.

    A row counts as ``correct`` only when its ``answer`` equals ``'accept'``;
    every other answer value yields ``False``.

    Args:
        path: Location of the annotated JSONL file; handed to ``load_jsonl``.

    Returns:
        pandas.DataFrame with the columns ``id``, ``prediction``, ``answer``,
        ``_annotator_id`` and ``correct``.
    """
    annotations = pd.DataFrame(load_jsonl(path))
    # Boolean flag: True where the annotator's answer was 'accept'
    flagged = annotations.assign(correct=annotations['answer'] == 'accept')
    return flagged[['id', 'prediction', 'answer', '_annotator_id', 'correct']]


In [6]:
def get_annotator_counts(data_df):
    """Count how many annotations each annotator contributed.

    Args:
        data_df: DataFrame containing ``_annotator_id`` and ``id`` columns.

    Returns:
        pandas.DataFrame with one row per ``_annotator_id`` and a ``counts``
        column holding the number of non-null ``id`` values in that group.
    """
    ids_by_annotator = data_df.groupby('_annotator_id')['id']
    # count() tallies the non-null ids in each group
    return ids_by_annotator.count().rename('counts').reset_index()

In [7]:
def get_accuracy(data_df):
    """Compute the share of correct rows for every predicted label.

    Args:
        data_df: DataFrame containing ``prediction`` and ``correct`` columns.

    Returns:
        pandas.DataFrame with one row per ``prediction`` value and an
        ``accuracy`` column holding the mean of ``correct`` within that group.
    """
    correct_by_prediction = data_df.groupby('prediction')['correct']
    # The mean of the correctness flags is the fraction of correct rows
    return correct_by_prediction.mean().to_frame('accuracy').reset_index()

In [8]:
def _count_labels_for_prediction(data_df, category, label_column):
    """Count rows predicted as ``category``, grouped by ``label_column``."""
    return (
        data_df
        .query("prediction == @category")
        .groupby(label_column)
        .agg(counts = ('id', 'count'))
        .reset_index()
    )


def check_labels(data_df, category):
    """Break down the annotators' ``accept`` labels for one predicted category.

    Args:
        data_df: DataFrame containing ``prediction``, ``accept`` and ``id`` columns.
        category: Value of ``prediction`` to inspect.

    Returns:
        pandas.DataFrame with one row per ``accept`` value seen among the rows
        predicted as ``category`` and a ``counts`` column with the row count.
    """
    return _count_labels_for_prediction(data_df, category, 'accept')


def check_taxonomy_labels(data_df, category):
    """Break down the annotators' ``answer`` values for one predicted category.

    Args:
        data_df: DataFrame containing ``prediction``, ``answer`` and ``id`` columns.
        category: Value of ``prediction`` to inspect.

    Returns:
        pandas.DataFrame with one row per ``answer`` value seen among the rows
        predicted as ``category`` and a ``counts`` column with the row count.
    """
    return _count_labels_for_prediction(data_df, category, 'answer')

## Relevance labels

In [54]:
# Load the relevance annotations and show how many rows each annotator labelled.
# NOTE(review): `data_df` is rebound in every section of this notebook and the
# recorded execution counts are out of order (In [54] here, In [9] for the
# taxonomy section) — re-run the section's load cell before trusting outputs.
data_df = get_data(LABELS_RELEVANCE)
get_annotator_counts(data_df)

Unnamed: 0,_annotator_id,counts
0,relevance_data-karlis,73
1,relevance_data-laurie,23
2,relevance_data-rosie,50


In [55]:
# Share of rows where the annotator's label matched, per predicted relevance label
get_accuracy(data_df)

Unnamed: 0,prediction,accuracy
0,Not-relevant,0.895833
1,Not-specified,0.62
2,Relevant,0.895833


In [56]:
# Which labels annotators chose for rows predicted as 'Not-specified'
check_labels(data_df, 'Not-specified')

Unnamed: 0,accept,counts
0,???,2
1,Not-relevant,1
2,Not-specified,31
3,Relevant,16


## Detection and management labels

In [57]:
# Load the detection/management annotations (rebinds `data_df`) and count rows per annotator
data_df = get_data(LABELS_DETECTION_MANAGEMENT)
get_annotator_counts(data_df)

Unnamed: 0,_annotator_id,counts
0,detection_data-karlis,34
1,detection_data-laurie,80
2,detection_data-natalie,32
3,detection_data-rosie,54


In [58]:
# Share of rows where the annotator's label matched, per predicted detection/management label
get_accuracy(data_df)

Unnamed: 0,prediction,accuracy
0,Both,0.64
1,Detection,0.72
2,Management,0.74
3,,0.7


In [59]:
# Which labels annotators chose for rows predicted as 'Both'
check_labels(data_df, 'Both')

Unnamed: 0,accept,counts
0,???,2
1,Both,32
2,Detection,4
3,Management,10
4,,2


In [60]:
# Which labels annotators chose for rows predicted as 'Detection'
check_labels(data_df, 'Detection')

Unnamed: 0,accept,counts
0,???,2
1,Both,5
2,Detection,36
3,Management,5
4,,2


In [61]:
# Which labels annotators chose for rows predicted as 'Management'
check_labels(data_df, 'Management')

Unnamed: 0,accept,counts
0,???,6
1,Both,2
2,Management,37
3,,5


## Taxonomy labels

In [9]:
# Load the taxonomy annotations (rebinds `data_df`) and rank predicted
# categories from highest to lowest accuracy
data_df = get_taxonomy_data(LABELS_TAXONOMY)
get_accuracy(data_df).sort_values('accuracy', ascending=False)

Unnamed: 0,prediction,accuracy
28,Oral health,0.969697
36,Sleep,0.960784
34,RCTs,0.92
14,Income,0.92
26,Nutrition and weight,0.9
22,Mental health,0.896552
12,Health,0.88
10,Games,0.86
18,Labour market,0.846154
20,Mathematics,0.84


In [10]:
# Total number of taxonomy annotation rows loaded
len(data_df)

1806

In [11]:
# Number of taxonomy annotation rows contributed by each annotator
get_annotator_counts(data_df)

Unnamed: 0,_annotator_id,counts
0,taxonomy_data-jess,25
1,taxonomy_data-karlis,1472
2,taxonomy_data-laurie,250
3,taxonomy_data-natalie,59


In [12]:
# Distribution of annotator answers for rows predicted as 'Communication and language'
check_taxonomy_labels(data_df, 'Communication and language')

Unnamed: 0,answer,counts
0,accept,20
1,ignore,2
2,reject,28


In [83]:
# Number of annotated rows per predicted taxonomy category, largest first
data_df.groupby('prediction').agg(counts = ('id', 'count')).sort_values('counts', ascending=False)

Unnamed: 0_level_0,counts
prediction,Unnamed: 1_level_1
Sleep,51
Technology (general),50
Health,50
Nutrition and weight,50
Prenatal,50
Preschool,50
Assessment (general),50
Mathematics,50
Literacy,50
Infancy,50
