In [None]:
import os
import mlflow
import numpy as np
from mlflow.models import infer_signature
from bioext.doccano_utils import DoccanoSession
from bioext.hfpipeline import GlobalConfig, DataSource, TaskType, DataHandler, HFSequenceClassificationTrainer
from transformers import AutoTokenizer
from dotenv import load_dotenv

In [None]:
# This notebook is for testing the functionality of hfpipeline.py
# Before using, start your local Doccano instance, create a project, and import data into it
# Sample pre-labelled data is provided in ./imports for binary classification, multiclass (3 label) classification, and multilabel (4 label) classification
# A pre-labelled NER dataset is also provided, but NER is not yet supported by hfpipeline

In [None]:
# Load variables from a local .env file into the process environment.
# This must run before the session is created.
# NOTE(review): DoccanoSession presumably reads its connection settings
# (URL / credentials) from those environment variables — confirm in
# bioext.doccano_utils.
load_dotenv()
# Session object for the Doccano instance; later cells use docsesh.client.
docsesh = DoccanoSession()

In [None]:
# Ask Doccano for the projects it exposes to this session and print one summary
# line per project; the printed ID is what a config's doc_project_id refers to
# (presumably — verify against GlobalConfig).
projects = docsesh.client.list_projects()

for project in projects:
    summary_line = f"Project ID: {project.id}, Name: {project.name}, Type: {project.project_type}"
    print(summary_line)

In [None]:
# Every tunable setting for this run, collected in one mapping so it can be
# inspected or adjusted before the config object is built.
config_settings = {
    "doc_project_id": 1,  # NOTE(review): presumably one of the project IDs listed above — confirm
    "source": DataSource.DOCCANO,
    "task": TaskType.MULTILABEL,
    "num_labels": 4,
    "model_name": "distilbert-base-uncased",
    "max_length": 256,
    "batch_size": 16,
    "learning_rate": 3e-5,
    "num_train_epochs": 3,
    "output_dir": "./model_output",
}
config = GlobalConfig(**config_settings)

# The data handler is built from the config and exposes the train/test datasets
# (and a tokenizer, used when the trainer is created).
data_handler = DataHandler(config=config)

# Quick sanity check on the split sizes.
print(f"Training samples: {len(data_handler.train_dataset)}")
print(f"Testing samples: {len(data_handler.test_dataset)}")

# Keep the first training record around and show it for inspection.
sample = data_handler.train_dataset[0]
print(sample)

In [None]:
# initialise trainer
trainer = HFSequenceClassificationTrainer(
    config=config,
    tokenizer=data_handler.tokenizer
)

In [None]:
# Hand the train and test datasets to the trainer; the test dataset is used as
# the evaluation set.
trainer.setup_trainer(train_dataset=data_handler.train_dataset, eval_dataset=data_handler.test_dataset)

In [None]:
# Name of the MLflow experiment under which runs from this notebook are grouped.
# NOTE(review): the name says "binary", but the config cell selects
# TaskType.MULTILABEL with 4 labels — consider renaming so runs are easy to find.
experiment_name = "bert-binary-classification"

# Tracking server address. MLFLOW_TRACKING_URI (set in the shell or in the .env
# file loaded earlier) takes precedence; an unset or empty value falls back to
# the local development server.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or "http://localhost:5001")

# set_experiment activates the named experiment and creates it first if it does
# not exist yet, so no separate create_experiment call is required.
mlflow.set_experiment(experiment_name)
print("Set up Experiment on MLflow")


Set up Experiment on MLflow


In [None]:
def _print_metrics(title, metrics):
    """Print ``title`` followed by each numeric metric rounded to two decimals.

    Args:
        title: Heading line printed before the metrics.
        metrics: Mapping of metric name to value.

    Entries whose value is not an int/float are skipped, so a non-numeric
    value cannot raise a format error.
    """
    print(title)
    for key, value in metrics.items():
        if isinstance(value, (int, float)):
            print(f"{key}: {value:.2f}")


# Train, log the model, and log train/eval metrics inside a single MLflow run.
with mlflow.start_run():
    # trainer.train() returns a mapping of training metrics (logged below).
    training_metrics = trainer.train()

    # log model to mlflow
    print("Log training params")
    mlflow.transformers.log_model(
        transformers_model={
            "model": trainer.model,
            "tokenizer": trainer.tokenizer,
        },
        artifact_path="bert_model",
        task="text-classification",
        # No explicit signature on purpose. The previous one was inferred from
        # `sample` (a training-dataset record, not the text the pipeline takes
        # as input) and a hard-coded two-column output, which disagrees with
        # config.num_labels. Without signature/input_example MLflow derives the
        # signature for the text-classification pipeline itself.
    )

    # log training metrics to mlflow
    print("Training metrics")
    mlflow.log_metrics(training_metrics)

    # evaluate on the eval dataset and log those metrics to mlflow
    print("Evaluation metrics")
    eval_metrics = trainer.trainer.evaluate()
    mlflow.log_metrics(eval_metrics)


# Summary after the run has been closed.
# NOTE(review): this prints config.output_dir as an absolute path; it assumes
# the trainer wrote its output there — confirm in hfpipeline.
print(f"Model saved to: {os.path.abspath(config.output_dir)}")
_print_metrics("Training metrics:", training_metrics)
_print_metrics("Evaluation metrics:", eval_metrics)

Epoch,Training Loss,Validation Loss,Accuracy,Micro Precision,Micro Recall,Micro F1,Macro Precision,Macro Recall,Macro F1
1,No log,0.520203,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,No log,0.499661,0.076923,1.0,0.076923,0.142857,0.25,0.0625,0.1
3,No log,0.493799,0.230769,0.75,0.230769,0.352941,0.1875,0.1875,0.1875


Device set to use mps:0


Log training params


README.md:   0%|          | 0.00/8.58k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training metrics
Evaluation metrics


🏃 View run blushing-cod-734 at: http://localhost:5001/#/experiments/1/runs/3f32033803b94ec0b27965880d00fe6d
🧪 View experiment at: http://localhost:5001/#/experiments/1
Model saved to: /Users/sratkai/Documents/projects/bio-ext/projects/pipeline_development/model_output
Training metrics:
train_runtime: 7.81
train_samples_per_second: 19.21
train_steps_per_second: 1.54
total_flos: 9935409254400.00
train_loss: 0.52
epoch: 3.00
Evaluation metrics:


NameError: name 'eval_results' is not defined