# Inference Pipeline

The goal of this notebook is to apply the simple models ("log_regression", "knn", "random_forest", "sgd", "svm") and the distilbert-base-uncased model to new data (unlabelled data that was not involved in the training process) and to inspect their predictions.

This notebook reproduces the process in
`pipeline/models/binary_classifier/05_inference_pipeline.py` for the chosen model, distilbert-base-uncased.

This notebook also runs the same process for the simple models, to explore the data.


# 1. Import Packages

In [None]:
import pandas as pd
import numpy as np
from discovery_child_development import PROJECT_DIR, binary_config, config, S3_BUCKET, labelling_config, logging
from nesta_ds_utils.loading_saving import S3
from discovery_child_development.utils.huggingface_pipeline import (
    load_model,
    load_training_args,
    load_trained_model)
from discovery_child_development.getters.binary_classifier.binary_classifier_model import get_binary_classifier_models
from discovery_child_development.utils.huggingface_pipeline import predictions_huggingface
from discovery_child_development.getters.unlabelled_data import get_data_for_relevance_classifier
from discovery_child_development.utils.classification_utils import prediction_simple

# 2. Setting Parameters

In [None]:
# Paths: S3 prefix used for the classifier files and the local models directory
S3_PATH = "models/binary_classifier/"
PATH_TO = f"{PROJECT_DIR}/outputs/data/models/"

# Model variables: `production` is interpolated into the DistilBERT archive and folder names
production = True
MODEL_FILENAME = f"gpt_labelled_binary_classifier_distilbert_production_{production}.tar.gz"
OUTPUT_FILENAME = labelling_config["OUTPUT_FILENAME"]

# Reproducibility: seed numpy's global random state with the seed from the project config
SEED = config["seed"]
np.random.seed(SEED)

# 3. Loading simple models

In [None]:
# Names of the simple classifiers to load
models_simple = ["log_regression", "knn", "random_forest", "sgd", "svm"]

# Download each stored classifier object from S3, keyed by its name.
# A comprehension is used so that no loop variable called `model` is left in the
# namespace, where it could be confused with the DistilBERT `model` loaded later.
# NOTE(review): the objects are .pkl files, presumably unpickled by `S3.download_obj`;
# only point S3_BUCKET / S3_PATH at a trusted location.
models_all = {
    model_name: S3.download_obj(
        bucket=S3_BUCKET,
        path_from=f"{S3_PATH}gpt_labelled_binary_classifier_{model_name}.pkl",
    )
    for model_name in models_simple
}

# 4. Loading Distilbert model

In [None]:
# Make the DistilBERT artefact named MODEL_FILENAME available under PATH_TO.
# NOTE(review): presumably the getter downloads the .tar.gz from S3_PATH and extracts it,
# since the model is later loaded from a folder without the .tar.gz suffix; confirm in the getter.
get_binary_classifier_models(
    filename=MODEL_FILENAME,
    s3_path=S3_PATH,
    path_to=PATH_TO,
)

In [None]:
# Local folder the DistilBERT model is loaded from (archive name without the .tar.gz suffix)
model_folder = f"{PATH_TO}gpt_labelled_binary_classifier_distilbert_production_{production}"

# Load the two-label model from that folder using the binary classifier config
model = load_model(model_path=model_folder, config=binary_config, num_labels=2)

# Build the training arguments from the config and wrap the loaded model in a trainer.
# The trainer is only used to generate predictions further down in this notebook.
training_args = load_training_args(**binary_config["training_args"])
trainer = load_trained_model(model=model, args=training_args, config=binary_config)

# 5. Loading Data

In [None]:
# Data to run the classifiers on, obtained according to the labelling config
data_for_labelling = get_data_for_relevance_classifier(
    config=labelling_config,
)

# 5. Simple classifiers

In [None]:
# Run every simple classifier over the "text" field of the data and keep each result
# under the classifier's name.
# A comprehension is used instead of `for model in ...` so the global `model`
# (the DistilBERT model loaded earlier) is not overwritten with a string and no
# temporary result variable is left behind in the namespace.
results_dict = {
    model_name: prediction_simple(classifier, data_for_labelling["text"])
    for model_name, classifier in models_all.items()
}

In [None]:
# Preview the first rows of the logistic regression results
results_dict["log_regression"].head()

# 6. Distilbert-base-uncased model

In [None]:
# Generate the DistilBERT predictions for the data using the loaded trainer
predictions = predictions_huggingface(
    trainer=trainer,
    text_data=data_for_labelling,
    config=binary_config,
)

In [None]:
# Preview the first rows of the DistilBERT predictions
predictions.head()