In [None]:
# Reload imported modules automatically before running code, so edits to local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# MLflow Classification Recipe Notebook

This notebook runs the MLflow Classification Recipe on Databricks and inspects its results. For more information about the MLflow Classification Recipe, including usage examples, see the [Classification Recipe overview documentation](https://mlflow.org/docs/latest/recipes.html#classification-recipe) and the [Classification Recipe API documentation](https://mlflow.org/docs/latest/python_api/mlflow.recipes.html#module-mlflow.recipes.classification.v1.recipe).

In [None]:
from mlflow.recipes import Recipe

# Instantiate the recipe using the "local" profile; every later cell drives this object.
r = Recipe(profile="local")


In [None]:
# Call clean() with no step name.
# NOTE(review): MLflow documents this as removing the cached outputs of every
# step — confirm for the installed version.
r.clean()

In [None]:
# Inspect the recipe as a whole (no step name given).
r.inspect()

In [None]:
# Run the recipe's "ingest" step.
r.run("ingest")

In [None]:
# Run the recipe's "split" step.
r.run("split")

In [None]:
# Run the recipe's "transform" step.
r.run("transform")

In [None]:
# Run the recipe's "train" step.
r.run("train")

In [None]:
# Run the recipe's "evaluate" step.
r.run("evaluate")

In [None]:
# Run the recipe's "register" step.
r.run("register")

In [None]:
# Inspect the "train" step specifically.
r.inspect("train")

In [None]:
# Retrieve the recipe artifact named "training_data" and display its summary statistics.
# NOTE(review): presumably a pandas DataFrame — it is later passed to
# Dataset.from_pandas; confirm against the recipe's artifact type.
training_data = r.get_artifact("training_data")
training_data.describe()

In [None]:
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    TrainingArguments,
)        

def trainer_fn(estimator_params: Dict[str, Any]) -> "Seq2SeqTrainer":
    """
    Build and return an *untrained* Hugging Face ``Seq2SeqTrainer``.

    The function loads a pretrained config, tokenizer and model, tokenizes the
    training dataset, and wires everything into a trainer. It does not call
    ``train()``; that is left to the caller.

    Args:
        estimator_params: Dictionary of parameters passed to the estimator.
            Required keys:
              'train_dataset': A ``datasets.Dataset`` object for training.
              'cache_dir': A string containing the path to the cache directory.
                It is used as the trainer's ``output_dir`` and as the
                ``cache_dir`` for the pretrained config/tokenizer/model.
            Optional keys (defaults reproduce the previously hard-coded values):
              'model_name': Pretrained checkpoint name
                (default ``"distilbert-base-uncased"``).
              'question_column': Name of the first text column fed to the
                tokenizer (default ``"character"``).
              'answer_column': Name of the second text column fed to the
                tokenizer (default ``"speech"``).
              'max_seq_length': ``max_length`` passed to the tokenizer, used
                together with ``padding="max_length"`` and ``truncation=True``
                (default ``384``).

    Returns:
        A ``Seq2SeqTrainer`` holding the model, training arguments, tokenized
        training dataset, tokenizer and data collator.

    Raises:
        KeyError: If 'cache_dir' or 'train_dataset' is missing. Raised before
            any pretrained artifact is loaded.
    """
    # Read the required keys up front so a malformed params dict fails fast,
    # before any pretrained artifact is fetched.
    cache_dir = estimator_params["cache_dir"]
    train_dataset = estimator_params["train_dataset"]

    # Optional tuning knobs; the defaults are the values this function used to
    # hard-code, so existing callers see no change.
    model_name = estimator_params.get("model_name", "distilbert-base-uncased")
    question_column = estimator_params.get("question_column", "character")
    answer_column = estimator_params.get("answer_column", "speech")
    max_seq_length = estimator_params.get("max_seq_length", 384)

    training_args = TrainingArguments(output_dir=cache_dir)
    # Use the directory as stored on the arguments object (rather than the raw
    # input string) so all three loaders share exactly the same location.
    artifact_dir = training_args.output_dir

    config = AutoConfig.from_pretrained(
        model_name,
        cache_dir=artifact_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=artifact_dir,
        use_fast=True,
    )
    # NOTE(review): the model is loaded through the generic ``AutoModel`` while
    # the collator and trainer below are the Seq2Seq variants, and the
    # tokenization step does not add a ``labels`` field — confirm this pairing
    # is intended before relying on ``train()``.
    model = AutoModel.from_pretrained(
        model_name,
        config=config,
        cache_dir=artifact_dir,
    )

    # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
    # on a small vocab and want a smaller embedding size, remove this test.
    embedding_size = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_size:
        model.resize_token_embeddings(len(tokenizer))

    def preprocess_examples(examples):
        """Tokenize one batch, passing the two text columns to the tokenizer together."""
        return tokenizer(
            examples[question_column],
            examples[answer_column],
            max_length=max_seq_length,
            padding="max_length",
            truncation=True,
        )

    # Create train feature from dataset
    train_dataset = train_dataset.map(
        preprocess_examples,
        batched=True,
        num_proc=1,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=tokenizer.pad_token_id,
        pad_to_multiple_of=8,
    )
    return Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

In [None]:
from datasets import Dataset

# Wrap the training frame in a Hugging Face ``Dataset``, build the trainer, and start training.
train_dataset = Dataset.from_pandas(training_data)
# The current directory is passed as the cache directory; trainer_fn also uses
# it as the trainer's output_dir.
cache_dir = "./"
trainer = trainer_fn(estimator_params={"train_dataset": train_dataset, "cache_dir": cache_dir})
trainer.train()