In [1]:
import pandas as pd

from discovery_child_development import PROJECT_DIR
from discovery_child_development.getters import openalex as oa
from discovery_child_development.utils import classification_utils
from discovery_child_development.utils import taxonomy_labelling_utils as tlu

pd.set_option('max_colwidth', 400)

INPUT_FILE = PROJECT_DIR / "inputs/data/labelling/taxonomy/output/training_validation_data_LABELLED.jsonl"

2023-12-13 14:58:55,240 - botocore.credentials - INFO - Found credentials in environment variables.


In [2]:
def clean_openalex_id(df, column_name='id'):
  """Cleans the OpenAlex ID to remove the prefix"""
  df[column_name] = df[column_name].str.extract(r'/(W\d+)$')
  return df

In [3]:
labelled_df = pd.read_json(INPUT_FILE, lines=True)

labelled_df.head()

Unnamed: 0,id,text,tokens_input,tokens_output,cost,options,accept,model_output,_input_hash,_task_hash,_view_id,config,answer,_timestamp,_annotator_id,_session_id
0,W1201555323,"Planning and Optimization During the Life-Cycle of Service Level Agreements for Cloud Computing. A Service Level Agreement (SLA) is an electronic contract between the consumer and the provider of a service. It governs their business relationship by clarifying expectations and obligations of participating entities, with regard to the service and its quality. SLAs are already the prime paradigm ...",1472,11,0.001494,"[{'id': 'Cognitive development', 'text': 'Cognitive development'}, {'id': 'SEND', 'text': 'SEND'}, {'id': 'Communication and language', 'text': 'Communication and language'}, {'id': 'Physical', 'text': 'Physical'}, {'id': 'Expressive arts and design', 'text': 'Expressive arts and design'}, {'id': 'Prenatal', 'text': 'Prenatal'}, {'id': 'Literacy', 'text': 'Literacy'}, {'id': 'Infancy', 'text':...","[Data, Technology (general), Operations]","[Data, Technology (general), Operations]",207436002,277803126,choice,{'choice_style': 'multiple'},ignore,1702056370,2023-12-08_17-25-55,2023-12-08_17-25-55
1,W133750416,"Practitioners' perspectives on the preschool curriculum. This study examines perspectives of the preschool curriculum held by selected teachers in Queensland State preschools.A review of the philosophical, theoretical and research literature on preschool curriculum was conducted to identify current conceptions and expectations of the preschool curriculum. A substantial body of evidence confirm...",1363,5,0.001373,"[{'id': 'Cognitive development', 'text': 'Cognitive development'}, {'id': 'SEND', 'text': 'SEND'}, {'id': 'Communication and language', 'text': 'Communication and language'}, {'id': 'Physical', 'text': 'Physical'}, {'id': 'Expressive arts and design', 'text': 'Expressive arts and design'}, {'id': 'Prenatal', 'text': 'Prenatal'}, {'id': 'Literacy', 'text': 'Literacy'}, {'id': 'Infancy', 'text':...",[Education],[Education],-1803592620,973967713,choice,{'choice_style': 'multiple'},accept,1702056376,2023-12-08_17-25-55,2023-12-08_17-25-55
2,W1509365752,"We Belong to No Soil: Nation and Narration in the Work of Emily Perkins. &lt;p&gt;Emily Perkins' work exemplifies a shift in the way the nation is represented in New Zealand fiction. In place of the cultural nationalist acceptance that the writer should attend faithfully to the New Zealand referent and seek to define the nation we find doubt, uncertainty and resistance. This shift has been obs...",1628,7,0.001642,"[{'id': 'Cognitive development', 'text': 'Cognitive development'}, {'id': 'SEND', 'text': 'SEND'}, {'id': 'Communication and language', 'text': 'Communication and language'}, {'id': 'Physical', 'text': 'Physical'}, {'id': 'Expressive arts and design', 'text': 'Expressive arts and design'}, {'id': 'Prenatal', 'text': 'Prenatal'}, {'id': 'Literacy', 'text': 'Literacy'}, {'id': 'Infancy', 'text':...",[Culture and communities],[Culture and communities],-1175684480,1081805866,choice,{'choice_style': 'multiple'},ignore,1702056385,2023-12-08_17-25-55,2023-12-08_17-25-55
3,W1522685060,"Towards an integrated methodology : C4, Sherr and Dream provings of Protea cynaroides. Homoeopathic provings form the experimental base of clinical homoeopathy. Provings are conducted through the administration of homoeopathically prepared medicine to healthy volunteers in order to elicit disease symptoms. The symptoms are collated to formulate the materia medica of the substance. AIM The aim ...",2138,8,0.002154,"[{'id': 'Cognitive development', 'text': 'Cognitive development'}, {'id': 'SEND', 'text': 'SEND'}, {'id': 'Communication and language', 'text': 'Communication and language'}, {'id': 'Physical', 'text': 'Physical'}, {'id': 'Expressive arts and design', 'text': 'Expressive arts and design'}, {'id': 'Prenatal', 'text': 'Prenatal'}, {'id': 'Literacy', 'text': 'Literacy'}, {'id': 'Infancy', 'text':...",[Data Science and AI],[Data Science and AI],284323623,-1462596629,choice,{'choice_style': 'multiple'},ignore,1702056415,2023-12-08_17-25-55,2023-12-08_17-25-55
4,W1525264554,"Living with Tensions: Stories of Chinese Early Childhood Teachers’ Teaching and Learning Experiences in the Contemporary Urban Chinese Context. &lt;p&gt;This narrative inquiry explores 6 Chinese early childhood teachers’ teaching and learning experiences in Shanghai and Beijing, where Chinese and Western educational ideas and practices co-exist. Interviews with teachers, kindergarten directors...",1373,5,0.001383,"[{'id': 'Cognitive development', 'text': 'Cognitive development'}, {'id': 'SEND', 'text': 'SEND'}, {'id': 'Communication and language', 'text': 'Communication and language'}, {'id': 'Physical', 'text': 'Physical'}, {'id': 'Expressive arts and design', 'text': 'Expressive arts and design'}, {'id': 'Prenatal', 'text': 'Prenatal'}, {'id': 'Literacy', 'text': 'Literacy'}, {'id': 'Infancy', 'text':...",[Education],[Education],-1103914168,909385015,choice,{'choice_style': 'multiple'},accept,1702056442,2023-12-08_17-25-55,2023-12-08_17-25-55


In [4]:
for x in labelled_df[labelled_df['answer']=='ignore']['text']:
    print(x)

Planning and Optimization During the Life-Cycle of Service Level Agreements for Cloud Computing. A Service Level Agreement (SLA) is an electronic contract between the consumer and the provider of a service. It governs their business relationship by clarifying expectations and obligations of participating entities, with regard to the service and its quality. SLAs are already the prime paradigm for the description of cloud computing services. Once an SLA is established, the provider has to ensure that service quality remains within certain acceptable levels; and comply with the customer's demands until the end of the service life time. However, managing the SLAs is still a technical challenge that requires signi cant e ort to achieve autonomy, economy and e ciency. Current state-of-the-art in SLA management faces challenges such as SLA representation for cloud services; business-related SLA optimizations; service outsourcing and resource management. These areas constitute, as one would e

In [5]:
categories_flat = tlu.load_categories()

categories_list = sorted(list(categories_flat.keys()))

In [6]:
categories_list

['AR VR',
 'Assessment (general)',
 'Child protection',
 'Cognitive development',
 'Communication and language',
 'Culture and communities',
 'Data',
 'Data Science and AI',
 'Early childhood development (general)',
 'Education',
 'Expressive arts and design',
 'Family environment',
 'Games',
 'Genetics',
 'Health',
 'Income',
 'Inequalities',
 'Infancy',
 'Internet',
 'Labour market',
 'Literacy',
 'Mathematics',
 'Media',
 'Mental health',
 'Mobile',
 'Neuroscience',
 'Non-tech assessments',
 'Nutrition and weight',
 'Operations',
 'Oral health',
 'Personal social emotional',
 'Physical',
 'Policy',
 'Prenatal',
 'RCTs',
 'Robotics',
 'SEND',
 'Sleep',
 'Social environment',
 'Social media',
 'Social services',
 'Statistical methods',
 'Technology (general)']

In [7]:
print(f"Examples reviewed so far: {len(labelled_df)}")

Examples reviewed so far: 373


In [8]:
print(f"Total cost so far: ${sum(labelled_df['cost'])}")

Total cost so far: $0.5188929999999999


In [9]:
accepted_labels = labelled_df[labelled_df['answer']=='accept']

prop_accepted = len(accepted_labels) / len(labelled_df)

print(f"Proportion of papers that were relevant to (human) early years development: {prop_accepted}")

Proportion of papers that were relevant to (human) early years development: 0.6327077747989276


In [10]:
accepted_ids = accepted_labels['id'].unique().tolist()

In [11]:
pre_labelled_data, _ = oa.get_labelled_data(score_threshold=0.5)
pre_labelled_data.head()

Unnamed: 0,openalex_id,concept_id,sub_category,display_name,level,score,text
0,https://openalex.org/W4249228678,https://openalex.org/C2992354236,child protection,Sexual abuse,4.0,0.583067,"REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study. Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been describ..."
10,https://openalex.org/W2996407267,https://openalex.org/C2779144063,neuroscience,Amygdala,2.0,0.802183,Childhood Adversity and Neural Development: A Systematic Review. An extensive literature on childhood adversity and neurodevelopment has emerged over the past decade. We evaluate two conceptual models of adversity and neurodevelopment—the dimensional model of adversity and stress acceleration model—in a systematic review of 109 studies using MRI-based measures of neural structure and function ...
11,https://openalex.org/W2996407267,https://openalex.org/C15744967,"personal, social, emotional",Psychology,0.0,0.684793,Childhood Adversity and Neural Development: A Systematic Review. An extensive literature on childhood adversity and neurodevelopment has emerged over the past decade. We evaluate two conceptual models of adversity and neurodevelopment—the dimensional model of adversity and stress acceleration model—in a systematic review of 109 studies using MRI-based measures of neural structure and function ...
12,https://openalex.org/W2996407267,https://openalex.org/C169760540,neuroscience,Neuroscience,1.0,0.547728,Childhood Adversity and Neural Development: A Systematic Review. An extensive literature on childhood adversity and neurodevelopment has emerged over the past decade. We evaluate two conceptual models of adversity and neurodevelopment—the dimensional model of adversity and stress acceleration model—in a systematic review of 109 studies using MRI-based measures of neural structure and function ...
13,https://openalex.org/W2996407267,https://openalex.org/C138496976,development (general),Developmental psychology,1.0,0.528793,Childhood Adversity and Neural Development: A Systematic Review. An extensive literature on childhood adversity and neurodevelopment has emerged over the past decade. We evaluate two conceptual models of adversity and neurodevelopment—the dimensional model of adversity and stress acceleration model—in a systematic review of 109 studies using MRI-based measures of neural structure and function ...


In [11]:
all_abstracts = oa.get_abstracts()
all_abstracts = clean_openalex_id(all_abstracts, 'id')

all_concepts_metadata = oa.get_concepts_metadata()
all_concepts_metadata = clean_openalex_id(all_concepts_metadata, 'openalex_id')

all_concepts_metadata[['openalex_id', "display_name"]].head()

Unnamed: 0,openalex_id,display_name
0,W4249228678,Medicine
1,W4249228678,Sexual abuse
2,W4249228678,Psychiatry
3,W4249228678,Injury prevention
4,W4249228678,Occupational safety and health


In [12]:
all_data = pd.merge(all_abstracts, all_concepts_metadata[['openalex_id', "display_name"]], how='outer', left_on='id', right_on='openalex_id')
all_data.head()

Unnamed: 0,id,title,abstract,text,openalex_id,display_name
0,W4249228678,REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study,"Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been described. Methods A questionnaire about adverse childhood experiences was mailed to 13,494 adults who had completed a standardized medical evaluation at a large HMO; 9,508 (...","REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study. Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been describ...",W4249228678,Medicine
1,W4249228678,REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study,"Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been described. Methods A questionnaire about adverse childhood experiences was mailed to 13,494 adults who had completed a standardized medical evaluation at a large HMO; 9,508 (...","REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study. Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been describ...",W4249228678,Sexual abuse
2,W4249228678,REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study,"Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been described. Methods A questionnaire about adverse childhood experiences was mailed to 13,494 adults who had completed a standardized medical evaluation at a large HMO; 9,508 (...","REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study. Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been describ...",W4249228678,Psychiatry
3,W4249228678,REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study,"Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been described. Methods A questionnaire about adverse childhood experiences was mailed to 13,494 adults who had completed a standardized medical evaluation at a large HMO; 9,508 (...","REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study. Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been describ...",W4249228678,Injury prevention
4,W4249228678,REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study,"Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been described. Methods A questionnaire about adverse childhood experiences was mailed to 13,494 adults who had completed a standardized medical evaluation at a large HMO; 9,508 (...","REPRINT OF: Relationship of Childhood Abuse and Household Dysfunction to Many of the Leading Causes of Death in Adults: The Adverse Childhood Experiences (ACE) Study. Background The relationship of health risk behavior and disease in adulthood to the breadth of exposure to childhood emotional, physical, or sexual abuse, and household dysfunction during childhood has not previously been describ...",W4249228678,Occupational safety and health


In [13]:
all_data = all_data[all_data['id'].isin(accepted_ids)]

all_data = (
        all_data[['id', 'text', 'display_name']]
        .groupby(['id', 'text'])['display_name']
        .agg(lambda x: list(set(x)))
        .reset_index()
    )

all_data.head()

Unnamed: 0,id,text,display_name
0,W133750416,"Practitioners' perspectives on the preschool curriculum. This study examines perspectives of the preschool curriculum held by selected teachers in Queensland State preschools.A review of the philosophical, theoretical and research literature on preschool curriculum was conducted to identify current conceptions and expectations of the preschool curriculum. A substantial body of evidence confirm...","[Psychology, Developmental psychology, Pedagogy, Curriculum, Curriculum mapping, Preschool education, Curriculum development, Emergent curriculum, Early childhood, Quality (philosophy), Philosophy, Epistemology, Early childhood education, Mathematics education]"
1,W1525264554,"Living with Tensions: Stories of Chinese Early Childhood Teachers’ Teaching and Learning Experiences in the Contemporary Urban Chinese Context. &lt;p&gt;This narrative inquiry explores 6 Chinese early childhood teachers’ teaching and learning experiences in Shanghai and Beijing, where Chinese and Western educational ideas and practices co-exist. Interviews with teachers, kindergarten directors...","[Psychology, Curriculum, Geography, Mathematics education, Context (archaeology), Citizen journalism, Early childhood, Archaeology, Sociology, Early childhood education, Literature, Teaching method, Developmental psychology, Art, Narrative inquiry, Agency (philosophy), Law, Social science, Pedagogy, Political science, Professional development, Professional learning community, Narrative]"
2,W210033724,"Catering for the needs of pre-school age children in rural areas: A case study. In 1985 the then Minister for Community Services (Victoria), the Hon. Caroline Hogg, announced a change of policy in relation to preschool services in Victoria; all children were to have a year of ""kindergarten type experiences"" in the year before school, and existing playgroups for two year olds were to be phased ...","[Psychology, Law, Equity (law), Economic growth, Political science, Hindsight bias, Social policy, Pedagogy, Social psychology, Project commissioning, Publishing, Poverty, Philosophy, Sociology, Early childhood education, Linguistics, Government (linguistics), Economics]"
3,W2107566735,"Computer-based support for science education materials developers in Africa : exploring potentials. CASCADE-SEA stands for Computer Assisted Curriculum Analysis, Design and Evaluation for Science (and mathematics) Education in Africa; and is the name of a computer program that was developed during the course of the study described in this book. This research was initiated to explore the potent...","[Psychology, Curriculum, Geography, Mathematics education, Operating system, Computer science, Context (archaeology), Curriculum development, Epistemology, Archaeology, Politics, Engineering, Law, Political science, Pedagogy, Quality (philosophy), Philosophy, Process (computing), Articulation (sociology)]"
4,W2154198213,"Contributions of School-Based Parenting and Family Literacy Centres in an Early Childhood Service System. &lt;p&gt;Increasingly, governments are seeking ways to integrate early childhood education and care services as a social policy strategy to maximize child and family outcomes. This study examines the role of a school-based parenting and family literacy program to a system of services in on...","[Family literacy, Psychology, Developmental psychology, Service (business), Appreciative inquiry, Community service, Pedagogy, Political science, Public relations, Business, Early childhood, Literacy, Early childhood education, Marketing]"


In [14]:
merged_data = pd.merge(accepted_labels[['id', 'accept', 'model_output']], all_data, how='outer', on='id').head()

In [15]:
merged_data.to_csv("test.csv")

In [None]:
totally_agreed = sum(accepted_labels['model_output'] == accepted_labels['accept'])

print(f"Human agreed with GPT labels in {totally_agreed} out of {len(accepted_labels)} examples, ie in {round((totally_agreed/len(accepted_labels))*100, 2)}% of examples.")

In [None]:
human_labels = accepted_labels[['id', 'accept']].set_index('id')

In [None]:
human_labels['accept'].unique

In [None]:
human_labels_df, _ = classification_utils.add_binarise_labels(human_labels, 'accept')
gpt_labels_df, _ = classification_utils.add_binarise_labels(human_labels, 'model_output')

for category in categories_list:
    if category not in human_labels_df.columns:
        human_labels_df[category] = 0
        
human_labels_df = human_labels_df[categories_list]

In [None]:
human_labels_df.sum()