In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
# Put the parent of this notebook's folder first on sys.path so that the
# `src.*` imports in the next cell can be resolved.
# `__file__` is not defined inside a Jupyter kernel (it raises NameError), so
# fall back to the current working directory, which is normally the folder
# the notebook lives in.
try:
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    notebook_dir = os.getcwd()
root_path = os.path.abspath(os.path.join(notebook_dir, ".."))
sys.path.insert(0, root_path)

# Embeddings Multiclass Example

In [2]:
from src.embeddings_approach import embeddings_approach
from src.azure_config import azure_config
from src.data_ingestion import data_ingestion
from src.model_assessment import model_assessment

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# data_ingestion.add_train_test_val_labels_to_df('safeguarding_472_Sept22_DanFinola')

Let's just check the labels are there in the expected numbers:

In [None]:
# Load the safeguarding dataset and print the sum of each split column, one
# per line (presumably 0/1 flags, so each sum is a split size - TODO confirm).
df = data_ingestion.DataRetrieverDatastore("safeguarding_472_Sept22_DanFinola").dataset
print(df["train"].sum(), df["test"].sum(), df["val"].sum(), sep="\n")
del df  # the frame is only needed for this check

We need to make sure that the target columns exist in both the datasets we're using

In [None]:
# Start the run object that the final cell logs to and completes.
# NOTE(review): the keyword is spelled `expeiment_name` - this looks like a
# typo for `experiment_name`. Confirm against the signature of
# azure_config.start_run before renaming; changing it only here could raise
# a TypeError.
run = azure_config.start_run(expeiment_name="embeddings_multiclass_example")

In [None]:
# Print the column index of each dataset in turn so the two can be compared.
for dataset_name in ("safeguarding_472_Sept22_DanFinola", "published_3k_DG_devset"):
    print(data_ingestion.DataRetrieverDatastore(dataset_name).dataset.columns)

They do now - but only because we've already run the cell below, which copies the `text` column into `Comment Text` and re-registers the dataset.

In [None]:
# Copy the "text" column into "Comment Text" (the column name passed as
# `name_of_column_to_embed` further down) and register the modified frame
# again under the same dataset name.
safeguarding_dataset_name = "safeguarding_472_Sept22_DanFinola"
df = data_ingestion.DataRetrieverDatastore(safeguarding_dataset_name).dataset
df["Comment Text"] = df["text"]
data_ingestion.register_dataframe(dataset_name=safeguarding_dataset_name, df=df)

And let's make sure we've split our data into train/test/val. Again, you'll only need to do this once!

In [None]:
# Add the train/test/val labels to the safeguarding dataset.
# NOTE(review): presumably the labels are persisted with the registered
# dataset, which would be why this only needs to run once - confirm.
data_ingestion.add_train_test_val_labels_to_df(
    dataset_name="safeguarding_472_Sept22_DanFinola"
)

We also have to make sure that the `y` column (`label_multi`) exists in the published data.

In [None]:
# Give every row of the published dataset a "label_multi" value of 0, register
# the modified frame again under the same name, then drop the local reference.
published_dataset_name = "published_3k_DG_devset"
df = data_ingestion.DataRetrieverDatastore(published_dataset_name).dataset
df["label_multi"] = 0
data_ingestion.register_dataframe(dataset_name=published_dataset_name, df=df)
del df

OK and now onto the actual thing. 

Notice that, in contrast to the other example, I've used an SVM here. You have to make sure that you're using a classifier which is capable of doing multiclass. 
You also have to make sure that you're supplying the hyperopt dictionary which goes with that classifier. 

In [None]:
# `import sklearn` on its own does not load the `svm` submodule, so
# `sklearn.svm.SVC` would only resolve if some other import happened to load
# it. Importing the submodule explicitly binds `sklearn` and guarantees
# `sklearn.svm` is available.
import sklearn.svm
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Configure the multiclass embeddings experiment: an SVC classifier with the
# matching SVM default arguments, embedding the "Comment Text" column and
# predicting "label_multi".
emb_1 = embeddings_approach.EmbeddingsApproach(
    classifier_class=sklearn.svm.SVC,
    augmented_dataset_name_list=[],
    # Must be the argument set that goes with `classifier_class`.
    default_classifier_arguments=embeddings_approach.svm_space_arguments_default,
    model_for_embeddings_name="all-MiniLM-L6-v2",
    name_of_column_to_embed="Comment Text",
    name_of_y_column="label_multi",
    positive_label_dataset_name_list=[
        "safeguarding_472_Sept22_DanFinola",
    ],
    negative_label_dataset_name_list=["published_3k_DG_devset"],
    max_evals=5,  # presumably the number of hyperopt evaluations - TODO confirm
    multiclass=True,
    balance_train_test_val=False,
)

In [None]:
# Search for the optimised classifier, then build and fit it.
# NOTE(review): the second call presumably relies on the result of the first,
# so keep this order - confirm against EmbeddingsApproach.
emb_1.find_optimised_classifier()
emb_1.make_and_fit_optimal_classifier()

Now that we've got the optimised model made, we can go ahead and compute some metrics and log the results. 

In [None]:
# Display the confusion matrix via the assessor attached to `emb_1`
# (presumably for the fitted optimal classifier - TODO confirm which data split it uses).
emb_1.assessor.get_and_display_confusion_matrix()

Lol, perfect! Let's log that!

In [None]:
# Register the optimal model, then log the approach's attributes and the
# multiclass metrics against `run` (the object returned by
# azure_config.start_run in an earlier cell).
emb_1.register_optimal_model()
emb_1.log_all_attributes(run=run)
emb_1.assessor.log_all_multiclass_metrics(run=run)
# Complete the run last, after everything has been logged to it.
run.complete()

If there are any metrics missing from the assessor class which you'd like to see registered, just add them to the class as methods!