In [33]:
from discovery_child_development.getters import get_dataset
import pandas as pd
from nesta_ds_utils.loading_saving import S3
from discovery_child_development import PROJECT_DIR, S3_BUCKET
# Local folder that evaluation samples are written to before being uploaded
OUTPUT_PATH = PROJECT_DIR / "outputs/labels/evals_data"
# Create the folder (and any missing parents) up front; no error if it already exists
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)
# Key prefix inside S3_BUCKET under which output_eval_data places uploaded files
S3_PATH = "data/labels/child_development/evals_data/"

In [34]:
def output_eval_data(df: pd.DataFrame, filename: str) -> None:
    """
    Write a dataframe to a local JSON-lines file and upload that file to S3.

    Args:
        df (pd.DataFrame): table to export, one JSON record per row
        filename (str): file name used both under OUTPUT_PATH locally and
            under the S3_PATH prefix in S3_BUCKET
    """
    # Resolve the local destination once so the write and the upload share it
    local_file = str(OUTPUT_PATH / filename)
    df.to_json(path_or_buf=local_file, orient='records', lines=True)
    S3.upload_file(
        path_from=local_file,
        bucket=S3_BUCKET,
        path_to=S3_PATH + filename,
    )

In [15]:
# Load the sample of texts to be labelled through the project getter;
# the displayed frame below has id, text and source columns
data = get_dataset("taxonomy_labelling_sample")
data

2024-01-31 12:27:55,227 - root - INFO - File data/labels/taxonomy_classifier/training_validation_data_patents_openalex.jsonl downloaded from discovery-iss to /Users/karlis.kanders/Documents/code/discovery_child_development/inputs/data/labelling/taxonomy/input/training_validation_data_patents_openalex.jsonl


Unnamed: 0,id,text,source
0,W1519846160,Intricate Identities: Cochlear Implant Users N...,openalex
1,W1523874198,Early Bilingual Education. In the coming years...,openalex
2,W1535586210,The Use of Music Therapy Strategies to Enhance...,openalex
3,W1538577929,Experience-based language acquisition: a compu...,openalex
4,W1556618260,"A new old age?: Exploring the values, attitude...",openalex
...,...,...,...
8833,CN-215456599-U,Environment-friendly children learn drawer for...,patents
8834,US-2021022682-A1,Advanced play environment for screening and ea...,patents
8835,CN-113254836-B,Intelligent child-care knowledge point informa...,patents
8836,CN-110122881-A,A nutritional composition for promoting height...,patents


In [16]:
from discovery_child_development import (
    PROJECT_DIR,
    binary_config,
    labelling_config,
    logging,
)
from discovery_child_development.utils.huggingface_pipeline import (
    load_model,
    load_training_args,
    load_trained_model,
)
from discovery_child_development.getters.binary_classifier.binary_classifier_model import (
    get_binary_classifier_models,
)
from discovery_child_development.utils.huggingface_pipeline import (
    predictions_huggingface,
)

# Paths
S3_MODELS_PATH = "models/binary_classifier/"
PATH_TO = f"{PROJECT_DIR}/outputs/data/models/"
# Base name shared by the model archive and the local model folder, kept in one
# place so the two cannot drift apart
MODEL_NAME = "gpt_labelled_binary_classifier_distilbert_production_True"
MODEL_FILENAME = f"{MODEL_NAME}.tar.gz"
# Only referenced by the commented-out upload step at the end of this cell
OUTPUT_FILENAME = labelling_config["OUTPUT_FILENAME"]

if __name__ == "__main__":
    # Save the model locally
    logging.info("Downloading the model...")
    get_binary_classifier_models(
        filename=MODEL_FILENAME, s3_path=S3_MODELS_PATH, path_to=PATH_TO
    )
    # Local model folder: PATH_TO plus the archive name without ".tar.gz"
    model_folder = f"{PATH_TO}{MODEL_NAME}"

    # Load the model
    logging.info("Loading the model...")
    model = load_model(model_path=model_folder, config=binary_config, num_labels=2)

    # Build the trainer object around the already-trained model.
    # NOTE(review): no training is started in this cell; the trainer is only
    # passed to predictions_huggingface below — confirm load_trained_model
    # does not retrain.
    training_args = load_training_args(**binary_config["training_args"])
    trainer = load_trained_model(
        model=model,
        args=training_args,
        config=binary_config,
    )

    # Select the columns of the (still unlabelled) sample that are passed to the classifier
    logging.info("Getting the labelled data...")
    data_for_labelling = data[['id', 'text']]

    # Get the predictions
    logging.info("Getting the predictions...")
    predictions = predictions_huggingface(
        trainer=trainer, text_data=data_for_labelling, config=binary_config
    )

    # # Save the predictions
    # logging.info("Saving the predictions...")
    # S3.upload_obj(
    #     predictions,
    #     S3_BUCKET,
    #     f"data/outputs/binary_classifier/{OUTPUT_FILENAME}.csv",
    # )


2024-01-31 12:33:35,087 - root - INFO - Downloading the model...
2024-01-31 12:33:56,953 - root - INFO - Loading the model...
2024-01-31 12:33:57,608 - root - INFO - Getting the labelled data...
2024-01-31 12:33:57,617 - root - INFO - Getting the predictions...


Map: 100%|██████████| 8838/8838 [00:01<00:00, 5174.13 examples/s]
Map: 100%|██████████| 8838/8838 [00:03<00:00, 2514.20 examples/s]
100%|██████████| 553/553 [03:31<00:00,  2.61it/s]


In [19]:
from discovery_child_development.getters.labels import get_taxonomy_labels

In [20]:
# Taxonomy labels; the merge further down reads its id, source and prediction columns
tax_labels = get_taxonomy_labels()

In [35]:
# Keep only the texts the binary classifier flagged with 1, attach their
# taxonomy labels by id, and unpack the prediction column into one row per label
labelled_data = (
    predictions
    .loc[lambda frame: frame["predictions"] == 1]
    .drop(columns="predictions")
    .merge(tax_labels[["id", "source", "prediction"]], on="id", how="left")
    .explode("prediction")
)

In [37]:
labelled_data.sample()

Unnamed: 0,id,text,source,prediction
822,W3118471363,SOCIAL LEARNING IN THE CHILDHOOD: INFLUENCE OF...,openalex,Family and home


In [31]:
labelled_data.groupby("prediction").count()

Unnamed: 0_level_0,id,text
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
AR VR,37,37
Assessment (general),1085,1085
Child protection,620,620
Cognitive development,1239,1239
Communication and language,478,478
Community,741,741
Data,615,615
Data science and AI,209,209
Expressive arts and design,402,402
Family and home,290,290


In [39]:
import numpy as np

def sample_n_predictions_stratified(
    df: pd.DataFrame,
    n: int,
    column_A: str = 'prediction',
    column_B: str = 'source',
    seed: int = 42
) -> pd.DataFrame:
    """
    Sample about n rows for each label in column_A, split evenly across the
    distinct values of column_B.

    Every (column_A, column_B) group contributes ``n // k`` rows, where ``k`` is
    the number of distinct column_B values in ``df`` (so ``n // 2`` for two
    sources).

    Args:
        df (pd.DataFrame): dataframe with predictions
        n (int): target number of samples per column_A label
        column_A (str, optional): column to group by. Defaults to 'prediction'.
        column_B (str, optional): column to stratify by. Defaults to 'source'.
        seed (int, optional): random seed. Defaults to 42.

    Returns:
        pd.DataFrame: sampled rows with all original columns and a fresh
            0..m-1 index, ordered by group

    Raises:
        ValueError: raised by pandas when a group has fewer rows than the
            per-group sample size (sampling is without replacement)
    """
    n_strata = df[column_B].nunique()
    if n_strata == 0:
        # Nothing to stratify over (empty frame or only missing values)
        return df.iloc[0:0].reset_index(drop=True)
    per_group = n // n_strata

    # Local generator: same legacy algorithm as np.random.seed(seed), but the
    # process-wide random state is left untouched
    rng = np.random.RandomState(seed)

    # Iterating over the groups (rather than groupby.apply) keeps the grouping
    # columns in every piece regardless of pandas version
    pieces = [
        group.sample(n=per_group, random_state=rng)
        for _, group in df.groupby([column_A, column_B])
    ]
    if not pieces:
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces).reset_index(drop=True)

In [44]:
unique_labels = labelled_data['prediction'].unique()

# For every label, take n examples from each source, or all of them when a
# source has fewer than n. A shortfall in one source is NOT compensated by
# drawing extra rows from the other sources.
def sample_n_or_all(
    df: pd.DataFrame,
    n: int,
    column_A: str = 'prediction',
    column_B: str = 'source',
    seed: int = 42
) -> pd.DataFrame:
    """
    Sample up to n rows for every (column_A, column_B) combination.

    Labels and sources are taken from ``df`` itself, in order of first
    appearance. A combination with at least n rows contributes a random sample
    of n rows; a smaller one contributes all of its rows.

    Args:
        df (pd.DataFrame): dataframe with predictions
        n (int): maximum number of rows per (column_A, column_B) combination
        column_A (str, optional): label column. Defaults to 'prediction'.
        column_B (str, optional): column to stratify by. Defaults to 'source'.
        seed (int, optional): random seed. Defaults to 42.

    Returns:
        pd.DataFrame: the selected rows, keeping their original index values,
            ordered by label and then by source
    """
    # Local generator: same legacy algorithm as np.random.seed(seed), so the
    # same seed yields the same draws, without reseeding the global state
    rng = np.random.RandomState(seed)

    labels = df[column_A].unique()
    sources = df[column_B].unique()

    pieces = []
    for label in labels:
        # Boolean masks instead of string-built queries: safe for labels with
        # quotes, non-string labels and column names containing spaces
        label_df = df[df[column_A] == label]
        for source in sources:
            subset = label_df[label_df[column_B] == source]
            if subset.empty:
                continue
            if len(subset) >= n:
                subset = subset.sample(n=n, random_state=rng)
            pieces.append(subset)

    if not pieces:
        return df.iloc[0:0].copy()
    # Single concat at the end instead of growing a frame inside the loop
    return pd.concat(pieces)

def sample_data(
    df: pd.DataFrame, column_A: str, column_B: str, n: int, seed: int
) -> pd.DataFrame:
    """
    Sample up to n rows from every (column_A, column_B) group.

    Groups with fewer than n rows are returned in full (shuffled).

    Args:
        df (pd.DataFrame): dataframe to sample from
        column_A (str): first grouping column
        column_B (str): second grouping column
        n (int): maximum number of rows per group
        seed (int): random seed

    Returns:
        pd.DataFrame: sampled rows with all original columns and a fresh
            0..m-1 index, ordered by group
    """
    # Local generator: same legacy algorithm as np.random.seed(seed), but the
    # process-wide random state is left untouched
    rng = np.random.RandomState(seed)

    # Iterating over the groups (rather than groupby.apply) keeps the grouping
    # columns in every piece regardless of pandas version
    pieces = [
        group.sample(n=min(len(group), n), random_state=rng)
        for _, group in df.groupby([column_A, column_B])
    ]
    if not pieces:
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces).reset_index(drop=True)

In [80]:
# Draw up to 25 rows per (prediction, source) pair; this is the sample exported further down
sample = sample_n_or_all(labelled_data, 25, 'prediction', 'source', seed=100) 
# Same draw through the groupby-based helper.
# NOTE(review): sample2 is not referenced again in the visible cells
sample2 = sample_data(labelled_data, 'prediction', 'source', 25, seed=100)

In [81]:
# replace the label "Family and home" to "Parenting" in prediction column
# sample['prediction'] = sample['prediction'].replace('Family and home', 'Parenting')

In [82]:
sample.groupby("prediction").count().sort_values(by="prediction")

Unnamed: 0_level_0,id,text,source
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR VR,36,36,36
Assessment (general),50,50,50
Child protection,50,50,50
Cognitive development,50,50,50
Communication and language,50,50,50
Community,39,39,39
Data,50,50,50
Data science and AI,50,50,50
Expressive arts and design,50,50,50
Family and home,50,50,50


In [85]:
# Show full cell contents instead of truncating long text when frames are displayed
pd.set_option('display.max_colwidth', None)
# Order rows by label so that each category's examples sit together
sample = sample.sort_values("prediction")

In [None]:
# TODO: prefix the OpenAlex IDs with "https://openalex.org/works/" (not implemented yet)

In [86]:
sample

Unnamed: 0,id,text,source,prediction
1686,W4285741356,"Mobile Augmented Reality applied as a learning strategy for early childhood education students.. The purpose of this research is to determine the effect of the use of mobile augmented reality applied as a learning strategy for early childhood education students. The research is applied, pre-experimental and quantitative design. The population consisted of children in the initial grade of an educational institution in Peru. The sample consisted of 20 students of 5 years old as experimental and control group. As a result of the intervention, it was possible to improve learning through competencies in 80% of an expected level of achievement and 20% of an outstanding level of achievement, concluding that the use of mobile augmented reality improved learning in early education. The use of this technology is recommended as a pedagogical tool for the early education sector in Peru.",openalex,AR VR
1524,W4241367487,"IPads in The Early Childhood Science Classroom: An Exploration of IPads Use in Hands - On Science Activities. This qualitative case study explored the affordances and uses of the iPad in Prep classrooms when integrated into hands-on science activities about movement. Using a hybrid inductive and deductive thematic analysis approach, this study identified and described the ways that virtual and augmented reality apps on the iPad can both enhance and distract from hands-on science activities about movement in Prep. The findings suggest ways that educators may achieve greater integration of hands-on science and technology learning experiences in Prep classrooms.",openalex,AR VR
3874,KR-102520287-B1,"Kids care platform service system using metaverse and method for controlling the same. A child care service system and control method using the metaverse are disclosed. The present invention synchronizes information of daycare centers or kindergarten children and provides them in real time, thereby assisting with rapid confirmation of children&#39;s safety and developmental status, safety status of facilities, etc., and childcare tasks in the real world.",patents,AR VR
1737,W4293868984,"Research on the Design and Effect of Early Childhood Education Games Supported by AR Technology. The main content of this study is the research on the design and effect of early childhood educational games supported by augmented reality (hereinafter referred to as AR) technology. The research starts with the investigation and literature review of practical problems in early childhood education, aiming at the practical problems, combined with the characteristics of AR technology, and systematically proposes a preschool education communication process model supported by technology to solve the practical problems of early childhood education. Using the characteristics of technological multimedia integration, establish an all-media integrated resource package for early childhood education to solve practical problems such as scattered educational resources, duplication of construction, and resource waste technology.",openalex,AR VR
2517,W4320159718,"Establishment of Childhood's Discipline Characters in a Clean and Healthy Life Behavior in The Pandemic Covid-19. Clean and healthy living behavior is a big challenge in early childhood education, especially during the Clean and healthy living behavior is a big challenge in early childhood education, especially during the covid 19 pandemic. One of the primary keys, in this case, is the disciplined character of children. This study aims to analyze the character of children's discipline in implementing Clean and Healthy Life Behavior during the Covid-19 Pandemic. This research is a type of qualitative research with a descriptive approach. Data analysis was carried out by systematically analyzing data obtained from interviews, field notes, and other materials so that they were quickly accessible and informed to others. The results showed that children's discipline in implementing Clean and Healthy Behavior (PHBS) during the Covid-19 pandemic was optimal. It showed the character of the following actions: using masks or face shields, hand washing discipline, and discipline in the application. From social distancing, discipline in activities, discipline in consuming healthy food, and discipline in maintaining environmental cleanliness. Then the factors that influence the disciplined character of children in implementing clean and healthy living behavior during the Covid-19 pandemic consist of internal factors in the form of habits and heredity and external factors in the form of education and environmental factors.",openalex,AR VR
...,...,...,...,...
1312,W4200399823,"Use of Wearable Devices to Study Physical Activity in Early Childhood Education. Physical activity recommendations for early childhood are gradually being met to a lesser extent today. The objectives of the study were: (i) to assess the degree of compliance with physical activity (PA) recommendations by gender; and (ii) to analyze the level of PA and steps in different periods over a week. This study was an observational cross-sectional study. Sixty-three young children (33 boys and 30 girls) aged two years (2.15 ± 0.35) were recruited for this study. Participants wore a “Garmin vivofit® jr.” activity tracker for seven days, collecting minutes of moderate-to-vigorous physical activity (MVPA) and step volume (Out-of-School Time and School Time). The results show a 50% compliance of the 120 min/day MVPA and 13,000 steps per day. No gender differences were detected. The findings in the analysis indicated a trend towards higher PA and steps at the Weekend. Moreover, participants reached higher PA and steps values “Out-of-School Time” than “School Time” (MVPA) (min/day). This study can provide strategies and motivational PA guidelines at school to enhance well-being at an early age. These activity trackers could stimulate more sustainable forms of urban mobility, such as walking, as the environment would accompany the child to meet daily PA recommendations.",openalex,Wearables
429,W3003525908,"Behavior Profiles at 2 Years for Children Born Extremely Preterm with Bronchopulmonary Dysplasia. Objective To characterize behavior of 2-year-old children based on the severity of bronchopulmonary dysplasia (BPD). Study design We studied children born at 22-26 weeks of gestation and assessed at 22-26 months of corrected age with the Child Behavior Checklist (CBCL). BPD was classified by the level of respiratory support at 36 weeks of postmenstrual age. CBCL syndrome scales were the primary outcomes. The relationship between BPD grade and behavior was evaluated, adjusting for perinatal confounders. Mediation analysis was performed to evaluate whether cognitive, language, or motor skills mediated the effect of BPD grade on behavior. Results Of 2310 children, 1208 (52%) had no BPD, 806 (35%) had grade 1 BPD, 177 (8%) had grade 2 BPD, and 119 (5%) had grade 3 BPD. Withdrawn behavior (P < .001) and pervasive developmental problems (P < .001) increased with worsening BPD grade. Sleep problems (P = .008) and aggressive behavior (P = .023) decreased with worsening BPD grade. Children with grade 3 BPD scored 2 points worse for withdrawn behavior and pervasive developmental problems and 2 points better for externalizing problems, sleep problems, and aggressive behavior than children without BPD. Cognitive, language, and motor skills mediated the effect of BPD grade on the attention problems, emotionally reactive, somatic complaints, and withdrawn CBCL syndrome scales (P values < .05). Conclusions BPD grade was associated with increased risk of withdrawn behavior and pervasive developmental problems but with decreased risk of sleep problems and aggressive behavior. The relationship between BPD and behavior is complex. Cognitive, language, and motor skills mediate the effects of BPD grade on some problem behaviors. To characterize behavior of 2-year-old children based on the severity of bronchopulmonary dysplasia (BPD). 
We studied children born at 22-26 weeks of gestation and assessed at 22-26 months of corrected age with the Child Behavior Checklist (CBCL). BPD was classified by the level of respiratory support at 36 weeks of postmenstrual age. CBCL syndrome scales were the primary outcomes. The relationship between BPD grade and behavior was evaluated, adjusting for perinatal confounders. Mediation analysis was performed to evaluate whether cognitive, language, or motor skills mediated the effect of BPD grade on behavior. Of 2310 children, 1208 (52%) had no BPD, 806 (35%) had grade 1 BPD, 177 (8%) had grade 2 BPD, and 119 (5%) had grade 3 BPD. Withdrawn behavior (P < .001) and pervasive developmental problems (P < .001) increased with worsening BPD grade. Sleep problems (P = .008) and aggressive behavior (P = .023) decreased with worsening BPD grade. Children with grade 3 BPD scored 2 points worse for withdrawn behavior and pervasive developmental problems and 2 points better for externalizing problems, sleep problems, and aggressive behavior than children without BPD. Cognitive, language, and motor skills mediated the effect of BPD grade on the attention problems, emotionally reactive, somatic complaints, and withdrawn CBCL syndrome scales (P values < .05). BPD grade was associated with increased risk of withdrawn behavior and pervasive developmental problems but with decreased risk of sleep problems and aggressive behavior. The relationship between BPD and behavior is complex. Cognitive, language, and motor skills mediate the effects of BPD grade on some problem behaviors.",openalex,Wearables
2939,KR-102193190-B1,"System for monitoring integrational safety of children using Beacon signal and Driving method thereof. Regarding the integrated child safety monitoring system and its driving method using a beacon signal, it is a wearable device that can be worn on a part of the body of a child and a teacher who wishes to attend childcare and educational institutions, and detects the heart rate of a child or teacher, and the wearable device. By receiving the heart rate detected in, it checks whether the heart rate of the child or teacher is within the normal range, and when the heart rate of the child or teacher is out of the normal range, the device is used by the parent or senior teacher of the child. A safety monitoring server that transmits an abnormality notification signal and a parent of a child who wants to go to the child care and educational institution owns it, and an app dedicated to safety management in parent mode is installed and driven, and the child&#39;s real-time heart rate information and information from the safety monitoring server High-efficiency children that can ensure the safety of children by minimizing the addition or modification of existing systems or equipment by a child safety integrated monitoring system including a parent terminal that receives an abnormality notification signal and displays it through the safety management app. An integrated safety monitoring system and its driving method can be provided.",patents,Wearables
3904,CN-112107432-A,"Information processing device, information processing method, information processing system, and storage medium. The present invention provides an information processing apparatus, an information processing method, an information processing system and a storage medium for acquiring useful information related to child rearing. The information processing device according to the present application is characterized by having an acquisition unit that acquires body motion information indicating the body motion of the infant detected by a detection device attached to an article worn by the infant, and information indicating the infant&#39;s body motion. Body temperature information of body temperature; a determination unit that determines an event related to the infant based on the body motion information and body temperature information acquired by the acquisition unit; The event information is registered in a predetermined recording device.",patents,Wearables


In [87]:
# Write the sorted sample as a JSON-lines file locally and upload it to S3
filename = "taxonomy_labels_eval.jsonl"
output_eval_data(sample, filename)

In [88]:
sample.groupby("source").count()

Unnamed: 0_level_0,id,text,prediction
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
openalex,1050,1050,1050
patents,759,759,759


In [90]:
# Import categories and export as a table
from discovery_child_development.utils import taxonomy_labelling_utils as tlu
# categories.json sits in the prompts folder of the taxonomy labelling pipeline
PATH_TO_CATEGORIES = PROJECT_DIR / "discovery_child_development/pipeline/labelling/taxonomy/prompts/categories.json"
# The result is handled as a dict in the next cell (passed to DataFrame.from_dict)
categories = tlu.load_categories(PATH_TO_CATEGORIES)

In [98]:
# Turn the categories dict into a two-column table (category, description)
# and export it as CSV.
# NOTE(review): the rename of column 0 assumes each dict value is a single
# description string — confirm against categories.json.
CATEGORIES_CSV_PATH = PROJECT_DIR / "outputs/labels/taxonomy/categories.csv"
# to_csv does not create missing folders, so make sure the target folder exists
CATEGORIES_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
categories_table = (
    pd.DataFrame.from_dict(categories, orient="index")
    .reset_index()
    .rename(columns={"index": "category", 0: "description"})
)
# Written as a separate statement: to_csv returns None when given a path, so
# chaining it would leave categories_table bound to None instead of the table
categories_table.to_csv(str(CATEGORIES_CSV_PATH), index=False)
