# 1. Import Packages

In [3]:
import pandas as pd
import numpy as np
from discovery_child_development.utils.huggingface_pipeline import (
    load_model,
    load_training_args,
    load_trained_model)
from discovery_child_development import PROJECT_DIR, binary_config, config
from discovery_child_development.getters.binary_classifier.binary_classifier_model import get_binary_classifier_models
from discovery_child_development.getters.openalex import get_abstracts
from discovery_child_development.getters.openalex_broad_concepts import get_abstracts_broad
from discovery_child_development.getters.binary_classifier.gpt_labelled_datasets import get_labelled_data_for_classifier

# 2. Setting Parameters

In [4]:
# Flag interpolated into the model archive filename
production = True

# Storage locations and the archive name for the selected model
S3_PATH = "models/binary_classifier/"
PATH_TO = f"{PROJECT_DIR}/outputs/data/models/"
OUTPUT_FILENAME = f"gpt_labelled_binary_classifier_distilbert_production_{production}.tar.gz"

# Seed NumPy's global random state with the seed from the project config
SEED = config["seed"]
np.random.seed(SEED)

# 3. Loading model

In [14]:
# Retrieve the model archive (presumably copies OUTPUT_FILENAME from the
# S3_PATH prefix into PATH_TO — confirm against the getter's implementation)
get_binary_classifier_models(
    filename=OUTPUT_FILENAME,
    s3_path=S3_PATH,
    path_to=PATH_TO,
)

In [5]:
# Local folder holding the model files for the chosen `production` setting
model_folder = f"{PATH_TO}gpt_labelled_binary_classifier_distilbert_production_{production}"

# Load the two-label classifier from that folder
model = load_model(
    model_path=model_folder,
    config=binary_config,
    problem_type=False,
    num_labels=2,
)

# Wrap the loaded model in a trainer object so it can be used for prediction.
# Nothing in this notebook runs a training step.
# NOTE(review): output_dir is set to the S3 prefix string rather than a path
# under PATH_TO — confirm this is the intended local output location.
training_args = load_training_args(output_dir=S3_PATH, config=binary_config)
trainer = load_trained_model(
    model=model,
    args=training_args,
    config=binary_config,
    problem_type=False,
)

# 4. Testing edge cases

In [6]:
from discovery_child_development.getters.binary_classifier.prompts_edge_cases import (
    get_examples,
)
from discovery_child_development.utils.testing_examples_utils import (
    testing_examples_huggingface,
)

# Load the edge-case examples used to probe the classifier
examples = get_examples()

2023-12-15 14:33:25,604 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [7]:
# Drop the edge cases labelled 'Not specified' so only examples with a
# definite label remain (the name `examples` is rebound to the filtered frame)
examples = examples.query("labels!='Not specified'")
# Preview the last rows of the filtered examples
examples.tail()

Unnamed: 0,labels,text
12,Relevant,"""Systems and methods for assessing infant and ..."
13,Relevant,"""Non-invasive nerve stimulation to treat or pr..."
14,Not relevant,"""Manual feeding method for panda young. The in..."
15,Not relevant,"""Functional Ontogeny of Hypothalamic Agrp Neur..."
17,Relevant,"""Time Orientation Technologies in Special Educ..."


In [8]:
# Evaluate the classifier on the edge cases; the cell displays the returned
# (predictions, metrics) pair
testing_examples_huggingface(trainer, examples, binary_config)

Map: 100%|██████████| 15/15 [00:00<00:00, 590.48 examples/s]
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1/1 [00:03<00:00,  3.45s/it]


(array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0]),
 {'test_loss': 0.9433050155639648,
  'test_accuracy': 0.6666666666666666,
  'test_recall': 0.7,
  'test_precision': 0.7777777777777778,
  'test_f1': 0.7368421052631577,
  'test_runtime': 4.5042,
  'test_samples_per_second': 3.33,
  'test_steps_per_second': 0.222})

# 5. Trialling the model on the OpenAlex concepts

In [None]:
# Labelled training split; its unique ids are used below to exclude
# already-labelled abstracts from the trial sample
labelled_data = get_labelled_data_for_classifier(set_type="train")
labelled_data_ids = labelled_data["id"].unique()

In [None]:
# Keep only abstracts whose id does not appear in the labelled training data
not_labelled = "id not in @labelled_data_ids"
abstracts = get_abstracts().query(not_labelled)
abstracts_broad = get_abstracts_broad().query(not_labelled)

In [None]:
# Draw an equal-sized, seeded sample from each source and tag it with the
# label this notebook assumes for that source:
#   abstracts       -> labels=1 (treated as relevant)
#   abstracts_broad -> labels=0 (treated as not relevant)
SAMPLE_SIZE_PER_CLASS = 500
relevant = abstracts.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=SEED).assign(labels=1)
not_relevant = abstracts_broad.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=SEED).assign(labels=0)

# Stack both samples into one evaluation frame (original row indices are kept)
test_set = pd.concat([relevant, not_relevant])

In [None]:
# Run the classifier on the sampled test set, passing only the label and
# text columns
results = testing_examples_huggingface(
    trainer,
    test_set[["labels", "text"]],
    binary_config,
)

In [None]:
# Second element of the returned pair — the metrics dictionary, going by the
# output of the same helper in the edge-case run above
results[1]

In [None]:
# First element of the returned pair — the array of predicted labels, going by
# the output of the same helper in the edge-case run above
results[0]

In [None]:
# `testing_examples_huggingface` returns a `(predictions, metrics)` pair (see
# the edge-case output earlier in the notebook). The per-row predicted labels
# are the first element; the second is the metrics dict, which is not a
# per-row value and must not be stored as the predictions column.
test_set['predictions'] = results[0]

In [None]:
# Show the rows where the predicted label disagrees with the assigned label
test_set[test_set["predictions"] != test_set["labels"]]