# 1. Import Packages

In [1]:
import pandas as pd
import numpy as np
from discovery_child_development.utils.huggingface_pipeline import (
    load_model,
    load_training_args,
    load_trained_model)
from discovery_child_development import PROJECT_DIR, binary_config, config
from discovery_child_development.getters.binary_classifier.binary_classifier_model import get_binary_classifier_models
from discovery_child_development.getters.openalex import get_abstracts
from discovery_child_development.getters.openalex_broad_concepts import get_abstracts_broad
from discovery_child_development.getters.binary_classifier.gpt_labelled_datasets import get_labelled_data_for_classifier

  from .autonotebook import tqdm as notebook_tqdm


2023-12-18 11:59:00,310 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


# 2. Setting Parameters

In [2]:
# Model vars
production = True

# Set the seed
SEED = config["seed"]
np.random.seed(SEED)

#Paths
S3_PATH = "models/binary_classifier/"
PATH_TO = f"{PROJECT_DIR}/outputs/data/models/"
OUTPUT_FILENAME = f"gpt_labelled_binary_classifier_distilbert_production_{production}.tar.gz"

# 3.1 Loading model

In [3]:
get_binary_classifier_models(filename=OUTPUT_FILENAME, s3_path=S3_PATH, path_to=PATH_TO)

In [4]:
model_folder = f"{PATH_TO}gpt_labelled_binary_classifier_distilbert_production_{production}"
# Load the model
model = load_model(model_path=model_folder,config=binary_config, num_labels=2)

# Train model with early stopping
training_args = load_training_args(output_dir=S3_PATH, config=binary_config)
trainer = load_trained_model(
    model=model,
    args=training_args,
    config=binary_config,
)

# 4. Testing edge cases

In [5]:
from discovery_child_development.getters.binary_classifier.prompts_edge_cases import get_examples
from discovery_child_development.utils.testing_examples_utils import testing_examples_huggingface
examples = get_examples()

In [None]:
# Removing Not-specified
examples = examples.query("labels!='Not specified'")
examples.tail()

In [None]:
testing_examples_huggingface(trainer,examples, binary_config)

# 5. Trialling the model on the openalex concepts

In [10]:
# Get labelled training data
labelled_data = get_labelled_data_for_classifier(set_type="train")
labelled_data_ids = labelled_data.id.unique()

In [None]:
# Get abstracts
abstracts = get_abstracts().query("id not in @labelled_data_ids")
abstracts_broad = get_abstracts_broad().query("id not in @labelled_data_ids")

In [None]:
# Collecting sample of results
relevant = abstracts.sample(500,random_state=SEED).assign(labels=1)
not_relevant = abstracts_broad.sample(500,random_state=SEED).assign(labels=0)
test_set = pd.concat([relevant,not_relevant])

In [None]:
results = testing_examples_huggingface(trainer,test_set[['labels','text']], binary_config)

In [None]:
results[1]

In [None]:
results[0]

In [None]:
test_set['predictions'] = results[0]

In [None]:
test_set[test_set.predictions!=test_set.labels]

# 6. Trialling the model on the crunchbase concepts

In [19]:
from discovery_child_development.getters.labels import get_relevance_labels

crunchbase_relevant = get_relevance_labels("relevant_crunchbase_investments_20230623").reset_index().dropna()

In [20]:
results_crunchbase = testing_examples_huggingface(trainer,crunchbase_relevant[['labels','text']], binary_config)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[label_col] = df[label_col].replace({replace_cat[0]: 1, replace_cat[1]: 0})
Map: 100%|██████████| 1621/1621 [00:00<00:00, 7570.84 examples/s]
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 102/102 [01:31<00:00,  1.11it/s]


In [18]:
crunchbase_relevant[['labels','text']].dropna()

Unnamed: 0,labels,text
0,1,Care.com improves the lives of families and ca...
1,1,Maya's Mom is a social network and advice site...
2,1,Maya's Mom is a social network and advice site...
3,1,"BabyCenter, a subsidiary of Johnson & Johnson,..."
4,1,"CafeMom is an online media company for moms, r..."
...,...,...
1651,1,EdTech startup using AI to accomplish the comp...
1652,1,EdTech
1653,1,eTalk - Online English learning platform for a...
1654,1,Kingsley House offers nationally-accredited & ...
