# Evaluating the Distilbert model

The goal of this notebook is to evaluate the DistilBERT model on various datasets. The model is trained in pipeline/models/binary_classifier/04b_train_distilbert_classifier.py, or it can be tested interactively in 04b_train_distilbert_classifier.ipynb.

It also tests whether we can download the model from S3, save it locally, and then load it.

The model is trained on the gpt-labelled openalex/patents dataset. The model is then tested on the following datasets:

- A sample of interesting examples discussed by the team
- A sample from the openalex dataset gathered in pipeline/openalex.
- A sample of potentially relevant crunchbase descriptions.

The equivalent refactored files for the openalex/crunchbase data reside in pipeline/models/binary_classifier (05 and 06). This notebook is for testing purposes only.


# 1. Import Packages

In [None]:
import pandas as pd
import numpy as np
from discovery_child_development.utils.huggingface_pipeline import (
    load_model,
    load_training_args,
    load_trained_model)
from discovery_child_development import PROJECT_DIR, binary_config, config
from discovery_child_development.getters.binary_classifier.binary_classifier_model import get_binary_classifier_models
from discovery_child_development.getters.openalex import get_abstracts
from discovery_child_development.getters.openalex_broad_concepts import get_abstracts_broad
from discovery_child_development.getters.binary_classifier.gpt_labelled_datasets import get_labelled_data_for_classifier

# 2. Setting Parameters

In [None]:
# Flag interpolated into the model archive / folder name
production = True

# Locations handed to get_binary_classifier_models: the S3 prefix, the local
# target directory, and the archive file name.
S3_PATH = "models/binary_classifier/"
PATH_TO = f"{PROJECT_DIR}/outputs/data/models/"
MODEL_FILENAME = f"gpt_labelled_binary_classifier_distilbert_production_{production}.tar.gz"

# Seed NumPy's global RNG with the project-wide seed; SEED is also reused
# as random_state when sampling the test sets.
SEED = config["seed"]
np.random.seed(SEED)

# 3. Loading the model

In [None]:
# Fetch the model archive named MODEL_FILENAME from S3_PATH into PATH_TO
# (presumably downloads and unpacks it; confirm in the getter).
get_binary_classifier_models(
    filename=MODEL_FILENAME,
    s3_path=S3_PATH,
    path_to=PATH_TO,
)

In [None]:
# Local folder for the model: same stem as MODEL_FILENAME, without ".tar.gz"
model_folder = f"{PATH_TO}gpt_labelled_binary_classifier_distilbert_production_{production}"

# Load the model from that folder with two output labels
model = load_model(
    model_path=model_folder,
    config=binary_config,
    num_labels=2,
)

# Nothing in this cell runs a training step: the training arguments are built
# from the config and passed, with the loaded model, to load_trained_model.
# NOTE(review): `trainer` is presumably a prediction-ready wrapper; confirm
# in huggingface_pipeline.
training_args = load_training_args(**binary_config["training_args"])
trainer = load_trained_model(model=model, args=training_args, config=binary_config)

# 4. Testing edge cases

In [None]:
from discovery_child_development.getters.binary_classifier.prompts_edge_cases import get_examples
from discovery_child_development.utils.testing_examples_utils import testing_examples_huggingface
# Load the edge-case examples evaluated in this section (columns are defined by get_examples)
examples = get_examples()

In [None]:
# Removing Not-specified
examples = examples.query("labels!='Not specified'")
examples.tail()

In [None]:
from discovery_child_development.utils.huggingface_pipeline import predictions_huggingface

In [None]:
# Predict on the edge cases using only their text column
edge_case_text = examples[["text"]]
predictions_huggingface(trainer, edge_case_text, binary_config)

In [None]:
# Pass the full edge-case frame (not just the text column) to the testing helper
testing_examples_huggingface(
    trainer,
    examples,
    binary_config,
)

# 5. Trialling the model on the openalex concepts

In [None]:
# Get labelled training data
labelled_data = get_labelled_data_for_classifier(set_type="train")
labelled_data_ids = labelled_data.id.unique()

In [None]:
# Get abstracts
abstracts = get_abstracts().query("id not in @labelled_data_ids")
abstracts_broad = get_abstracts_broad().query("id not in @labelled_data_ids")

In [None]:
# Collecting sample of results
relevant = abstracts.sample(500,random_state=SEED).assign(labels=1)
not_relevant = abstracts_broad.sample(500,random_state=SEED).assign(labels=0)
test_set = pd.concat([relevant,not_relevant])

In [None]:
# Only the text column of the test set is handed to the testing helper
test_set_text = test_set[["text"]]
results = testing_examples_huggingface(trainer, test_set_text, binary_config)

In [None]:
# Display the second element returned by testing_examples_huggingface
# NOTE(review): presumably evaluation metrics; confirm against the helper
results[1]

In [None]:
# Display the first element returned by testing_examples_huggingface
# (the value stored as `predictions` on the test set in the next cell)
results[0]

In [None]:
# Store the first element of `results` alongside the known labels
test_set['predictions'] = results[0]

In [None]:
# Show the rows where the prediction disagrees with the assigned label
disagrees = test_set["predictions"] != test_set["labels"]
test_set.loc[disagrees]

# 6. Trialling the model on the crunchbase concepts

In [None]:
from discovery_child_development.getters.labels import get_labelled_data

# Load the labelled crunchbase investments, move the index into a regular
# column, and drop every row that has a missing value.
crunchbase_relevant = (
    get_labelled_data("relevant_crunchbase_investments_20230623")
    .reset_index()
    .dropna()
)

In [None]:
# Run the crunchbase rows (labels and text columns only) through the testing helper
crunchbase_inputs = crunchbase_relevant[["labels", "text"]]
results_crunchbase = testing_examples_huggingface(trainer, crunchbase_inputs, binary_config)

In [None]:
# Display the labels/text columns. crunchbase_relevant already went through
# dropna() when it was loaded, so this dropna() is not expected to remove rows.
crunchbase_relevant[['labels','text']].dropna()