# Cuisine Classification - Simple Training Pipeline
**Reference-style simple approach for ResNet-50 cuisine classification**


In [None]:
# Install required packages - matching reference versions
# Versions are pinned so that a fresh cluster resolves the same libraries on every run.
# NOTE(review): torch is the only package without an explicit pin; presumably pip
# selects the build that torchvision==0.20.1 depends on - confirm if exact
# reproducibility of the torch version matters.
%pip install datasets==2.20.0 transformers==4.49.0 accelerate==1.4.0 mlflow==2.20.2 torchvision==0.20.1 torch
# Restart the Python process so the imports in the following cells pick up the
# packages installed above (this also clears any variables defined before this cell).
dbutils.library.restartPython()

In [None]:
# Pipeline configuration - every value a reader may want to tune lives in this cell.

# Reproducibility and training length
RANDOM_SEED = 42
NUM_EPOCHS = 5  # Start small like reference

# Unity Catalog location of the gold-layer training table
CATALOG = "cuisine_vision_catalog"
SCHEMA = "gold"
TABLE_NAME = "ml_dataset"

# Hugging Face checkpoint to fine-tune.
# Alternatives: google/vit-base-patch16-224, microsoft/swin-tiny-patch4-window7-224
MODEL_NAME = "microsoft/resnet-50"

# MLflow experiment path and the three-level Unity Catalog name of the registered model
EXPERIMENT_NAME = "/cuisine_classifier"
REGISTERED_MODEL_NAME = ".".join([CATALOG, "ml_models", "cuisine_classifier"])

print(f"üçΩÔ∏è Training {MODEL_NAME} on {CATALOG}.{SCHEMA}.{TABLE_NAME}")
print(f"üìä Will register as: {REGISTERED_MODEL_NAME}")

In [None]:
# Load dataset from gold layer - simple and direct
from datasets import Dataset
import mlflow

# Route every run started from this notebook to a single MLflow experiment
mlflow.set_experiment(EXPERIMENT_NAME)

# Rows tagged dataset_split = 'test' are scored by the inference / accuracy cells at the
# end of this notebook. They are excluded here so the reported test accuracy is measured
# on images the model has never been trained or validated on (no train/test leakage).
# The split filter runs before select() because select() drops the dataset_split column.
training_rows = (
    spark.table(f"{CATALOG}.{SCHEMA}.{TABLE_NAME}")
    .filter("processed_image_data IS NOT NULL AND cuisine_category IS NOT NULL")
    .filter("dataset_split IS NULL OR dataset_split <> 'test'")
    .select("processed_image_data", "cuisine_category")
)

# Materialise as a Hugging Face dataset and rename columns to the names the
# preprocessing and collate code expects ("image" holds raw bytes, "label" the cuisine name)
dataset = (
    Dataset.from_spark(training_rows, cache_dir="/tmp/hf_cache/cuisine_train")
    .rename_column("processed_image_data", "image")
    .rename_column("cuisine_category", "label")
)

# Seeded random 80/20 train/validation split of the non-test rows
splits = dataset.train_test_split(test_size=0.2, seed=RANDOM_SEED)
train_ds = splits['train']
val_ds = splits['test']

print(f"üìä Dataset loaded: {len(train_ds)} train, {len(val_ds)} validation")
print(f"üè∑Ô∏è Classes: {set(dataset['label'])}")

In [None]:
# Image preprocessing - exact same pattern as reference
import torch
from transformers import AutoFeatureExtractor, AutoImageProcessor
from PIL import Image
import io
from torchvision.transforms import CenterCrop, Compose, Normalize, RandomResizedCrop, Resize, ToTensor, Lambda

# Load the checkpoint's image processor. AutoImageProcessor replaces the deprecated
# AutoFeatureExtractor for vision models and exposes the same image_mean / image_std.
model_def = AutoImageProcessor.from_pretrained(MODEL_NAME)

# Per-image transformation: raw bytes -> RGB PIL image -> float tensor -> normalised tensor.
# No resize/crop is applied here, so every stored image must already share one size
# (the collate function stacks the tensors into a single batch).
transforms = Compose([
    Lambda(lambda b: Image.open(io.BytesIO(b)).convert("RGB")),  # byte to PIL
    ToTensor(),  # convert PIL to tensor
    Normalize(mean=model_def.image_mean, std=model_def.image_std)
])


def preprocess(batch):
    """Apply `transforms` to every image in a batch.

    Args:
        batch: dict of columns as passed by `Dataset.set_transform`; its "image"
            entry is a list of encoded image bytes.

    Returns:
        The same dict with "image" replaced by a list of normalised tensors.
    """
    batch["image"] = [transforms(image) for image in batch["image"]]
    return batch


# Transforms are applied lazily, each time rows are read from the datasets
train_ds.set_transform(preprocess)
val_ds.set_transform(preprocess)

print(f"üñºÔ∏è Image preprocessing configured for {MODEL_NAME}")

In [None]:
# Model setup - identical pattern to reference
from transformers import AutoModelForImageClassification

# Class names in alphabetical order define the integer ids, so the mapping is
# stable from run to run for the same set of cuisines.
class_names = sorted(set(dataset['label']))
label2id = {name: idx for idx, name in enumerate(class_names)}
id2label = {idx: name for name, idx in label2id.items()}

print(f"üè∑Ô∏è Label mapping created: {label2id}")

# Load the pretrained backbone with a classification head sized for our classes.
# ignore_mismatched_sizes lets the head be re-initialised when the checkpoint's
# original head has a different number of outputs.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

print(f"ü§ñ Model loaded: {MODEL_NAME} with {len(label2id)} classes")

In [None]:
# Training arguments - simplified from reference
from transformers import TrainingArguments

# Checkpoint directory is named after the model, e.g. ".../resnet-50-finetuned"
model_short_name = MODEL_NAME.split("/")[-1]

# Single source of truth for the batch size so the summary below cannot drift
BATCH_SIZE = 8  # Small batch size for stability

args = TrainingArguments(
    output_dir=f"/tmp/huggingface/cuisine/{model_short_name}-finetuned",
    # Keep the "image"/"label" columns: the custom collate function reads them directly
    remove_unused_columns=False,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    eval_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work
    save_strategy="epoch",
    num_train_epochs=NUM_EPOCHS,
    load_best_model_at_end=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    logging_steps=10,
    # Tie training randomness to the notebook-wide seed
    seed=RANDOM_SEED,
)

print(f"üèãÔ∏è Training configured: {NUM_EPOCHS} epochs, batch size {BATCH_SIZE}")

In [None]:
# Model wrapper - simplified from reference
import io

import mlflow
import pandas as pd
import torch


class CuisineModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a Hugging Face image-classification pipeline.

    Accepts a pandas DataFrame of encoded image bytes and returns, for each row,
    the single highest-scoring prediction produced by the pipeline.
    """

    def __init__(self, pipeline):
        """Store the pipeline and switch its model to evaluation mode.

        Args:
            pipeline: an image-classification pipeline exposing `.model` and `.predict`.
        """
        self.pipeline = pipeline
        self.pipeline.model.eval()

    def predict(self, context, images):
        """Classify every image in `images`.

        Args:
            context: MLflow model context (unused).
            images: pandas DataFrame whose 'processed_image_data' column - or, when
                that column is absent, its first column - holds encoded image bytes.

        Returns:
            pandas DataFrame with one row per input image, built from the
            top-scoring prediction dict the pipeline returned for that image.
        """
        # Local imports keep the method self-contained when the model is loaded
        # outside this notebook.
        import io
        from PIL import Image

        # Inference only: no gradients are needed
        with torch.no_grad():
            if 'processed_image_data' in images.columns:
                image_column = 'processed_image_data'
            else:
                image_column = images.columns[0]  # Fallback to first column

            # Decode bytes to RGB PIL images, mirroring the training-time transform
            images_list = (
                images[image_column]
                .apply(lambda b: Image.open(io.BytesIO(b)).convert("RGB"))
                .to_list()
            )
            # Get predictions: one list of candidate dicts per image
            predictions = self.pipeline.predict(images_list)
            # Return best prediction for each image
            return pd.DataFrame([max(r, key=lambda x: x['score']) for r in predictions])


print("üéØ Model wrapper defined")

In [None]:
# Training and MLflow logging - exact reference pattern
from transformers import pipeline, Trainer
from mlflow.models import infer_signature

with mlflow.start_run(run_name=f"cuisine_classifier_{model_short_name}") as run:
    # Log training dataset
    mlflow.log_input(mlflow.data.from_huggingface(train_ds, "training"))

    # Log parameters
    mlflow.log_params({
        "model_name": MODEL_NAME,
        "num_epochs": NUM_EPOCHS,
        "num_classes": len(label2id),
        "train_size": len(train_ds),
        "val_size": len(val_ds)
    })

    def collate_fn(examples):
        """Stack preprocessed examples into one model-ready batch.

        Args:
            examples: list of dicts with an "image" tensor and a "label" class name.

        Returns:
            dict with "pixel_values" (stacked image tensors) and "labels"
            (1-D int64 tensor of class ids).
        """
        import torch
        pixel_values = torch.stack([e["image"] for e in examples])
        # Integer class ids make the classification head use cross-entropy, the
        # correct loss when each image has exactly one cuisine. One-hot float
        # labels would silently switch the model to multi-label BCE loss instead.
        labels = torch.tensor([label2id[e["label"]] for e in examples], dtype=torch.long)
        return {"pixel_values": pixel_values, "labels": labels}

    # Train model. `processing_class` replaces the deprecated `tokenizer` argument;
    # the image processor is saved next to each checkpoint.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        processing_class=model_def,
        data_collator=collate_fn,
    )

    print("üöÄ Starting training...")
    train_results = trainer.train()
    print("‚úÖ Training completed!")

    # Build an inference pipeline from the best checkpoint. The image processor is
    # passed under its own keyword rather than as a tokenizer.
    classifier = pipeline(
        "image-classification",
        model=trainer.state.best_model_checkpoint,
        image_processor=model_def,
    )

    # Smoke-test the wrapper on a few real rows and derive the model signature from them.
    # NULL images are skipped because they cannot be decoded.
    wrapped_model = CuisineModelWrapper(classifier)
    test_df = (
        spark.table(f"{CATALOG}.{SCHEMA}.{TABLE_NAME}")
        .filter("processed_image_data IS NOT NULL")
        .select('processed_image_data')
        .limit(5)
        .toPandas()
    )
    predictions = wrapped_model.predict(None, test_df)
    signature = infer_signature(test_df, predictions)

    # Log model together with the pip requirements MLflow derives for this transformers model
    reqs = mlflow.transformers.get_default_pip_requirements(model)

    logged = mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=wrapped_model,
        pip_requirements=reqs,
        signature=signature,
    )

    print(f"üì¶ Model logged: {logged.model_uri}")

In [None]:
# Model registration - same as reference
from mlflow.tracking import MlflowClient

# Point the model registry at Unity Catalog before registering anything
mlflow.set_registry_uri("databricks-uc")

# Create a new version of the registered model from the artifact logged during training
registered = mlflow.register_model(model_uri=logged.model_uri, name=REGISTERED_MODEL_NAME)

# Move the "prod" alias to the version just created so downstream code can
# load the model by alias instead of by version number
registry_client = MlflowClient()
registry_client.set_registered_model_alias(
    name=REGISTERED_MODEL_NAME,
    alias="prod",
    version=registered.version,
)

print(f"üéâ Registered {REGISTERED_MODEL_NAME} v{registered.version} and set alias 'prod'.")
print(f"üîó Model URI: models:/{REGISTERED_MODEL_NAME}@prod")

In [None]:
# Quick test inference - same pattern as reference
# Wrap the registered "prod" model as a Spark UDF; its input column names come
# from the signature stored with the model.
predict_cuisine_udf = mlflow.pyfunc.spark_udf(spark, model_uri=f"models:/{REGISTERED_MODEL_NAME}@prod")
columns = predict_cuisine_udf.metadata.get_input_schema().input_names()

# Run inference on the held-out test split.
# selectExpr (not select) is required: "prediction.label as predicted_cuisine" is a
# SQL expression with an alias, which select() would treat as a literal column name.
predictions_df = (
    spark.table(f"{CATALOG}.{SCHEMA}.{TABLE_NAME}")
    .filter("dataset_split = 'test'")
    .withColumn("prediction", predict_cuisine_udf(*columns))
    .selectExpr(
        "image_id",
        "cuisine_category",
        "prediction.label as predicted_cuisine",
        "prediction.score as score",
    )
)

display(predictions_df.limit(20))
print("üß™ Test predictions completed!")

In [None]:
# Simple accuracy check - optional but useful
# Pull actual vs. predicted labels to the driver as a pandas DataFrame
results = (
    predictions_df
    .selectExpr("cuisine_category as actual", "predicted_cuisine as predicted", "score")
    .toPandas()
)

# Boolean Series: True where the prediction matches the ground truth
is_correct = results['actual'] == results['predicted']

accuracy = is_correct.mean()
print(f"üéØ Test Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")

# Show per-class results, in alphabetical order of the actual cuisine
print("\nüìä Per-class results:")
for cuisine in sorted(results['actual'].unique()):
    in_class = results['actual'] == cuisine
    acc = is_correct[in_class].mean()
    print(f"   {cuisine}: {acc:.3f} ({int(in_class.sum())} samples)")

print("\nüéâ Simple training pipeline completed successfully!")