# Evaluating the Simple models

The goal of this notebook is to evaluate the simple models ("log_regression", "knn", "random_forest", "sgd", "svm") on various datasets. The models are trained in pipeline/models/binary_classifier/04a_train_simple_classifiers.py; alternatively, you can test the training in 04a_train_simple_classifiers.ipynb.

It also tests whether we can download the model from S3, save it locally, and then load it.

The models are trained on the GPT-labelled OpenAlex/patents dataset and are then tested on the following datasets:

- A sample of interesting examples discussed by the team
- A sample from the openalex dataset gathered in pipeline/openalex.
- A sample of potentially relevant crunchbase descriptions.

The equivalent refactored file for the openalex/crunchbase data resides in pipeline/models/binary_classifier (05 and 06). This notebook is for testing purposes only.


# 1. Import Packages

In [None]:
import pandas as pd
import numpy as np
from discovery_child_development import PROJECT_DIR, binary_config, config, S3_BUCKET
from nesta_ds_utils.loading_saving import S3
from discovery_child_development.getters.openalex import get_abstracts
from discovery_child_development.getters.openalex_broad_concepts import get_abstracts_broad
from discovery_child_development.getters.binary_classifier.gpt_labelled_datasets import get_labelled_data_for_classifier

# 2. Setting Parameters

In [None]:
# Locations: S3 key prefix of the stored classifiers and a local models folder.
# NOTE(review): PATH_TO and `production` are not referenced by the cells visible
# in this notebook — confirm whether they are still needed.
S3_PATH = "models/binary_classifier/"
PATH_TO = f"{PROJECT_DIR}/outputs/data/models/"

# Run-mode flag
production = True

# Reproducibility: take the seed from the project config and seed NumPy's
# global generator with it; SEED is also passed to the sampling further down.
SEED = config["seed"]
np.random.seed(SEED)

# 3. Loading the models

In [None]:
# Download every trained simple classifier from S3 into memory, keyed by model name.
# NOTE(review): the objects are .pkl files, presumably unpickled by
# S3.download_obj — only point this at a trusted bucket.
models_simple = ["log_regression", "knn", "random_forest", "sgd", "svm"]
models_all = {}
for model in models_simple:
    # One pickle per model name under the shared S3 prefix
    s3_key = f"{S3_PATH}gpt_labelled_binary_classifier_{model}.pkl"
    models_all[model] = S3.download_obj(bucket=S3_BUCKET, path_from=s3_key)

# 4. Testing edge cases

In [None]:
from discovery_child_development.getters.binary_classifier.prompts_edge_cases import get_examples
from discovery_child_development.utils.general_utils import replace_binary_labels
from discovery_child_development.utils.testing_examples_utils import testing_examples_simple
# Fetch the examples provided by the prompts_edge_cases getter; the following
# cells filter them on `labels` and feed `text`/`labels` to the classifiers.
examples = get_examples()

In [None]:
# Keep only rows whose label is not 'Not specified', then hand the remainder to
# replace_binary_labels for the "Relevant"/"Not relevant" categories
# (presumably recoding them to binary values — confirm in general_utils).
examples = replace_binary_labels(
    examples.query("labels!='Not specified'"),
    "labels",
    replace_cat=["Relevant", "Not relevant"],
)
# Peek at the last rows to check the result
examples.tail()

In [None]:
# Run each loaded classifier over the edge-case texts and print whatever
# testing_examples_simple returns for it (results appear in models_all order).
for model, classifier in models_all.items():
    edge_texts = list(examples.text)
    edge_labels = list(examples.labels)
    print(testing_examples_simple(edge_texts, edge_labels, classifier))

# 5. Trialling the model on the openalex concepts

In [None]:
# Get labelled training data
labelled_data = get_labelled_data_for_classifier(set_type="train")
labelled_data_ids = labelled_data.id.unique()

In [None]:
# Get abstracts
abstracts = get_abstracts().query("id not in @labelled_data_ids")
abstracts_broad = get_abstracts_broad().query("id not in @labelled_data_ids")

In [None]:
# Build the evaluation sample: the same number of rows from each source, drawn
# with the shared seed. Rows from `abstracts` are labelled 1 and rows from
# `abstracts_broad` are labelled 0.
N_PER_CLASS = 500
relevant = abstracts.sample(n=N_PER_CLASS, random_state=SEED).assign(labels=1)
not_relevant = abstracts_broad.sample(n=N_PER_CLASS, random_state=SEED).assign(labels=0)
test_set = pd.concat([relevant, not_relevant])

In [None]:
# Show the combined evaluation sample for a visual sanity check.
test_set

In [None]:
# Run each loaded classifier over the sampled abstracts and print whatever
# testing_examples_simple returns for it (results appear in models_all order).
for model, classifier in models_all.items():
    sample_texts = list(test_set.text)
    sample_labels = list(test_set.labels)
    print(testing_examples_simple(sample_texts, sample_labels, classifier))

# 6. Trialling the model on the crunchbase concepts

In [None]:
from discovery_child_development.getters.labels import get_labelled_data

# Load the labelled Crunchbase dataset, move its index into regular columns
# and discard any row that contains a missing value.
crunchbase_relevant = (
    get_labelled_data("relevant_crunchbase_investments_20230623")
    .reset_index()
    .dropna()
)

In [None]:
# Run each loaded classifier over the Crunchbase texts and print whatever
# testing_examples_simple returns for it (results appear in models_all order).
for model, classifier in models_all.items():
    crunchbase_texts = list(crunchbase_relevant.text)
    crunchbase_labels = list(crunchbase_relevant.labels)
    print(testing_examples_simple(crunchbase_texts, crunchbase_labels, classifier))