# Cuisine Classification ML Training Pipeline

A straightforward ML training pipeline for cuisine classification using ResNet-50.

In [0]:
# installation - only what we need
%pip install torch torchvision transformers datasets mlflow scikit-learn

Collecting mlflow
  Using cached mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.6.0 (from mlflow)
  Using cached mlflow_skinny-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.6.0 (from mlflow)
  Using cached mlflow_tracing-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Using cached flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Using cached huey-2.5.4-py3-none-any.whl.metadata (4.6 kB)
Collecting opentelemetry-proto<3,>=1.9.0 (from mlflow-skinny==3.6.0->mlflow)
  Using cached opentelemetry_proto-1.38.0-py3-none-any.whl.metadata (2.3 kB)
Collecting python-dotenv<2,>=0.19.0 (from mlflow-skinny==3.6.0->mlflow)
  Using cached python_dotenv-1.2.1-py3-none-an

In [0]:
dbutils.library.restartPython()

In [0]:
# imports - clean and minimal
import mlflow
import torch
import pandas as pd
import numpy as np
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
from PIL import Image
import io
from torchvision.transforms import Compose, Normalize, ToTensor, Lambda
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
# from sklearn.preprocessing import LabelEncoder

print("✅ imports loaded successfully")

2025-11-10 11:31:03.680355: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-10 11:31:03.864055: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-10 11:31:04.041331: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762774264.189205   23896 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762774264.240006   23896 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762774264.631665   23896 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

[2025-11-10 11:31:10,803] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cpu (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


✅ imports loaded successfully


In [0]:
# configuration - no complex widgets
CATALOG = "cuisine_vision_catalog"
MODEL_CHECKPOINT = "microsoft/resnet-50"
EXPERIMENT_NAME = "/cuisine_classifier"
NUM_EPOCHS = 5 # 3
BATCH_SIZE = 12 # 8
LEARNING_RATE = 2e-4 # 5e-5

print(f"🔧 Configuration:")
print(f"   📊 Catalog: {CATALOG}")
print(f"   🧠 Model: {MODEL_CHECKPOINT}")
print(f"   🔄 Epochs: {NUM_EPOCHS}")
print(f"   📦 Batch Size: {BATCH_SIZE}")
print(f"   📈 Learning Rate: {LEARNING_RATE}")

🔧 Configuration:
   📊 Catalog: cuisine_vision_catalog
   🧠 Model: microsoft/resnet-50
   🔄 Epochs: 5
   📦 Batch Size: 12
   📈 Learning Rate: 0.0002


In [0]:
# data loading - direct from gold table
print("📊 Loading data from gold layer...")

# Load data directly - no complex joins
dataset_df = (
    spark.table(f"{CATALOG}.gold.ml_dataset")
    .select("processed_image_data", "cuisine_category")
    .filter("processed_image_data IS NOT NULL")
    .toPandas()
)

print(f"✅ Loaded {len(dataset_df)} samples")
print(f"   🍽️ Cuisines: {sorted(dataset_df['cuisine_category'].unique())}")

# Create HuggingFace dataset - rename
dataset = Dataset.from_pandas(
    dataset_df.rename(columns={
        "processed_image_data": "image", 
        "cuisine_category": "label"
    })
)

# train/test split
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_ds = splits['train']
val_ds = splits['test']

print(f"✅ Data splits:")
print(f"   🏋️ Training: {len(train_ds)} samples")
print(f"   ✅ Validation: {len(val_ds)} samples")

📊 Loading data from gold layer...
✅ Loaded 8875 samples
   🍽️ Cuisines: ['american', 'chinese', 'french', 'international', 'italian', 'japanese', 'mediterranean', 'mexican']
✅ Data splits:
   🏋️ Training: 7100 samples
   ✅ Validation: 1775 samples


In [0]:
# preprocessing - exactly like reference notebook
print("🔄 Setting up preprocessing...")

# Load image processor
image_processor = AutoImageProcessor.from_pretrained(MODEL_CHECKPOINT)

# transform pipeline
transforms = Compose([
    Lambda(lambda b: Image.open(io.BytesIO(b)).convert("RGB")),
    ToTensor(),
    Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
])

def preprocess(batch):
    """preprocessing function"""
    batch["image"] = [transforms(image) for image in batch["image"]]
    return batch

# Apply transforms
train_ds.set_transform(preprocess)
val_ds.set_transform(preprocess)

print("✅ preprocessing setup complete")

🔄 Setting up preprocessing...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ preprocessing setup complete


In [0]:
# model setup - no complex wrappers
print("🧠 Setting up model...")

# Create label mappings
unique_labels = sorted(set(dataset['label']))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(unique_labels)

print(f"✅ Labels: {id2label}")

# Load model
model = AutoModelForImageClassification.from_pretrained(
    MODEL_CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
    ignore_mismatched_sizes=True
)

print(f"✅ Model loaded with {num_labels} classes")

🧠 Setting up model...
✅ Labels: {0: 'american', 1: 'chinese', 2: 'french', 3: 'international', 4: 'italian', 5: 'japanese', 6: 'mediterranean', 7: 'mexican'}


Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([8, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded with 8 classes


In [0]:
# Optimize training performance and eliminate warnings
import os

print("🔧 Optimizing training performance...")

# Set threading for better CPU utilization
os.environ['OMP_NUM_THREADS'] = '8'
os.environ['MKL_NUM_THREADS'] = '8'

# Configure PyTorch for optimal performance
torch.set_num_threads(8)

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"   🖥️ Training device: {device}")
print(f"   🧵 CPU threads: 8")
print("✅ Performance optimizations applied")

🔧 Optimizing training performance...
   🖥️ Training device: cpu
   🧵 CPU threads: 8
✅ Performance optimizations applied


In [0]:
# training - no complex custom trainers
print("🏋️ Starting training...")

# Setup MLflow
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run() as run:
    print(f"🔄 MLflow run: {run.info.run_id}")
    

    # Training arguments

    args = TrainingArguments(
            output_dir=f"/dbfs/tmp/cuisine-classifier",
            remove_unused_columns=False,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=NUM_EPOCHS,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            logging_steps=10,
            report_to=[],
            # PERFORMANCE OPTIMIZATIONS:
            dataloader_pin_memory=False,  # Fix pin_memory warning
            ddp_find_unused_parameters=False,  # Fix DDP warning
            use_cpu=not torch.cuda.is_available(),  # Optimize for CPU if no GPU
            )
    
    # args = TrainingArguments(
    #     output_dir=f"/dbfs/tmp/cuisine-classifier",
    #     remove_unused_columns=False,
    #     eval_strategy="epoch",  # Fixed: was evaluation_strategy
    #     save_strategy="epoch",
    #     learning_rate=LEARNING_RATE,
    #     per_device_train_batch_size=BATCH_SIZE,
    #     per_device_eval_batch_size=BATCH_SIZE,
    #     num_train_epochs=NUM_EPOCHS,
    #     weight_decay=0.01,
    #     load_best_model_at_end=True,
    #     metric_for_best_model="eval_loss",
    #     logging_steps=10,
    #     report_to=[]
    # )
    
    # data collator - like reference
    def collate_fn(examples):
        pixel_values = torch.stack([e["image"] for e in examples])
        labels = torch.tensor([label2id[e["label"]] for e in examples], dtype=torch.long)
        return {"pixel_values": pixel_values, "labels": labels}
    
    # metrics
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = predictions.argmax(axis=-1)
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='weighted')
        return {'accuracy': accuracy, 'f1': f1}

    # Trainer - standard Transformers - FIXED VERSION
    trainer = Trainer(
        model=model, 
        args=args, 
        train_dataset=train_ds, 
        eval_dataset=val_ds, 
        processing_class=image_processor,  # Fixed: use processing_class instead of tokenizer
        data_collator=collate_fn,
        compute_metrics=compute_metrics
    )
    # trainer = Trainer(
    #     model=model, 
    #     args=args, 
    #     train_dataset=train_ds, 
    #     eval_dataset=val_ds, 
    #     tokenizer=image_processor, 
    #     data_collator=collate_fn,
    #     compute_metrics=compute_metrics
    # )
    
    # Train the model
    print("🚀 Training started...")
    trainer.train()
    print("✅ Training completed!")
    
    # Evaluate
    print("📊 Evaluating model...")
    eval_results = trainer.evaluate()
    print(f"✅ Final metrics: {eval_results}")
    
    # Log parameters
    mlflow.log_param("model_checkpoint", MODEL_CHECKPOINT)
    mlflow.log_param("num_epochs", NUM_EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("num_labels", num_labels)
    
    # Log metrics
    for key, value in eval_results.items():
        if isinstance(value, (int, float)):
            mlflow.log_metric(key, value)

🏋️ Starting training...


2025/11/10 11:31:25 INFO mlflow.tracking.fluent: Experiment with name '/cuisine_classifier' does not exist. Creating a new experiment.


🔄 MLflow run: bfc6bb0f29964caabefca309111faa45
🚀 Training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5987,1.503981,0.468169,0.444937
2,1.3104,1.269656,0.549296,0.531314
3,0.8601,1.199798,0.585915,0.574303
4,0.7167,1.143764,0.609577,0.60184
5,0.6515,1.156782,0.607324,0.602023


✅ Training completed!
📊 Evaluating model...


✅ Final metrics: {'eval_loss': 1.1437638998031616, 'eval_accuracy': 0.6095774647887324, 'eval_f1': 0.6018402716095184, 'eval_runtime': 94.5246, 'eval_samples_per_second': 18.778, 'eval_steps_per_second': 1.566, 'epoch': 5.0}


In [0]:
# model wrapper for MLflow - like reference
print("📦 Creating model wrapper...")

from transformers import pipeline

# Create pipeline from trained model
classifier = pipeline(
    "image-classification", 
    model=trainer.model, 
    feature_extractor=image_processor
)

class CuisineClassifier(mlflow.pyfunc.PythonModel):
    """wrapper for cuisine classification - like reference notebook"""
    
    def __init__(self, pipeline):
        self.pipeline = pipeline
        self.pipeline.model.eval()
    
    def predict(self, context, model_input):
        """prediction method"""
        # Handle DataFrame input
        if isinstance(model_input, pd.DataFrame):
            # Convert bytes to PIL images
            images = model_input['processed_image_data'].apply(
                lambda b: Image.open(io.BytesIO(b)).convert("RGB")
            ).tolist()
            
            # Get predictions
            with torch.no_grad():
                predictions = self.pipeline(images)
            
            # Return top prediction for each image
            return pd.DataFrame([
                max(pred, key=lambda x: x['score']) 
                for pred in predictions
            ])
        
        # Handle single image bytes
        else:
            image = Image.open(io.BytesIO(model_input)).convert("RGB")
            with torch.no_grad():
                prediction = self.pipeline(image)
            return max(prediction, key=lambda x: x['score'])

# Create wrapped model
wrapped_model = CuisineClassifier(classifier)
print("✅ model wrapper created")

📦 Creating model wrapper...


Device set to use cpu


✅ model wrapper created




In [0]:
# MLflow logging and registration
print("📊 Logging model to MLflow...")

# Import signature utilities
from mlflow.models.signature import infer_signature

with mlflow.start_run(run_id=run.info.run_id):
    # Test model with sample data and create signature
    test_df = dataset_df[['processed_image_data']].head(3)
    test_predictions = wrapped_model.predict(None, test_df)
    print(f"✅ Test predictions: {test_predictions}")
    
    # Create model signature - required for Unity Catalog
    signature = infer_signature(test_df, test_predictions)
    print(f"✅ Model signature created: {signature}")
    
    # Log model with signature - required for Unity Catalog
    model_info = mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=wrapped_model,
        signature=signature,  # Added signature for Unity Catalog
        pip_requirements=[
            "torch", 
            "transformers", 
            "pillow", 
            "pandas",
            "numpy"
        ]
    )
    
    print(f"✅ Model logged with signature: {model_info.model_uri}")

# Register to Unity Catalog - registration
full_model_name = f"{CATALOG}.ml_models.cuisine_classifier"
registered_model = mlflow.register_model(
    model_uri=model_info.model_uri, 
    name=full_model_name,
    tags={
        "stage": "development",
        "task": "image_classification",
        "architecture": "ResNet-50",
        "approach": "simple"
    }
)



from mlflow.tracking import MlflowClient

MlflowClient().set_registered_model_alias(
    name=full_model_name,
    alias="complex",
    version=registered_model.version,
)

print(f"🎉 Model registered successfully!")
print(f"   📦 Model: {full_model_name}")
print(f"   🏷️ Version: {registered_model.version}")

📊 Logging model to MLflow...




✅ Test predictions:       label     score
0  american  0.959751
1  american  0.797454
2  american  0.973709
✅ Model signature created: inputs: 
  ['processed_image_data': binary (required)]
outputs: 
  ['label': string (required), 'score': double (required)]
params: 
  None



🔗 View Logged Model at: https://adb-2867553723712000.0.azuredatabricks.net/ml/experiments/2328462332528308/models/m-e990b0dff9114f1db39cf293835fd7a0?o=2867553723712000


✅ Model logged with signature: models:/m-e990b0dff9114f1db39cf293835fd7a0


Successfully registered model 'cuisine_vision_catalog.ml_models.cuisine_classifier'.


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

🔗 Created version '1' of model 'cuisine_vision_catalog.ml_models.cuisine_classifier': https://adb-2867553723712000.0.azuredatabricks.net/explore/data/models/cuisine_vision_catalog/ml_models/cuisine_classifier/version/1?o=2867553723712000


🎉 Model registered successfully!
   📦 Model: cuisine_vision_catalog.ml_models.cuisine_classifier
   🏷️ Version: 1


In [0]:
# testing - verify everything works
print("🧪 Final testing...")

# Test with a few samples
test_samples = dataset_df.sample(n=4)
for idx, row in test_samples.iterrows():
    true_label = row['cuisine_category']
    image_bytes = row['processed_image_data']
    
    # Make prediction
    prediction = wrapped_model.predict(None, image_bytes)
    
    print(f"Sample {idx}:")
    print(f"   ✅ True: {true_label}")
    print(f"   🎯 Predicted: {prediction['label']} (score: {prediction['score']:.3f})")
    print()

# print("🎉 pipeline completed successfully!")
# print("\n📋 Summary:")
# print(f"   📊 Total samples: {len(dataset_df)}")
# print(f"   🏷️ Classes: {num_labels}")
# print(f"   🔄 Epochs: {NUM_EPOCHS}")
# print(f"   📦 Model: {full_model_name} v{registered_model.version}")

🧪 Final testing...
Sample 5299:
   ✅ True: italian
   🎯 Predicted: italian (score: 0.968)

Sample 5155:
   ✅ True: international
   🎯 Predicted: international (score: 0.676)

Sample 8227:
   ✅ True: mexican
   🎯 Predicted: mexican (score: 0.877)

Sample 3175:
   ✅ True: french
   🎯 Predicted: french (score: 0.421)



## 📊 Model Performance Diagnostics

Let's analyze why the model might not be predicting accurately by examining the dataset and training results.

In [0]:
# Dataset Analysis - Check for common issues
print("🔍 Dataset Analysis:")
print(f"📊 Total samples: {len(dataset_df)}")

# Check class distribution
class_counts = dataset_df['cuisine_category'].value_counts()
print(f"\n🍽️ Class Distribution:")
for cuisine, count in class_counts.items():
    percentage = (count / len(dataset_df)) * 100
    print(f"   {cuisine}: {count} samples ({percentage:.1f}%)")

# Check for class imbalance
min_samples = class_counts.min()
max_samples = class_counts.max()
imbalance_ratio = max_samples / min_samples
print(f"\n⚖️ Class Imbalance Analysis:")
print(f"   Min class size: {min_samples} samples")
print(f"   Max class size: {max_samples} samples") 
print(f"   Imbalance ratio: {imbalance_ratio:.2f}x")

# Identify potential issues
print(f"\n⚠️ Potential Issues Detected:")
if imbalance_ratio > 3:
    print("   🚨 SIGNIFICANT CLASS IMBALANCE! Some classes have 3x+ more samples than others")
    print("      → Solution: Use class weights or data augmentation")

if min_samples < 50:
    print("   🚨 VERY SMALL DATASET! Some classes have <50 samples")
    print("      → Solution: Collect more data or use data augmentation")

if len(dataset_df) < 500:
    print("   🚨 SMALL TOTAL DATASET! Less than 500 samples for deep learning")
    print("      → Solution: Collect significantly more data")

if max_samples > 5 * min_samples:
    print("   🚨 EXTREME IMBALANCE! Majority class dominates")
    print("      → Solution: Balance dataset or use stratified sampling")

print(f"\n📈 Recommendations:")
print(f"   • Ideal dataset size: 1000+ samples per class")
print(f"   • Current average: {len(dataset_df) / num_labels:.0f} samples per class")
print(f"   • Minimum recommended: 200+ samples per class")

🔍 Dataset Analysis:
📊 Total samples: 8875

🍽️ Class Distribution:
   american: 2250 samples (25.4%)
   italian: 1375 samples (15.5%)
   french: 1250 samples (14.1%)
   international: 1125 samples (12.7%)
   mexican: 875 samples (9.9%)
   japanese: 875 samples (9.9%)
   chinese: 625 samples (7.0%)
   mediterranean: 500 samples (5.6%)

⚖️ Class Imbalance Analysis:
   Min class size: 500 samples
   Max class size: 2250 samples
   Imbalance ratio: 4.50x

⚠️ Potential Issues Detected:
   🚨 SIGNIFICANT CLASS IMBALANCE! Some classes have 3x+ more samples than others
      → Solution: Use class weights or data augmentation

📈 Recommendations:
   • Ideal dataset size: 1000+ samples per class
   • Current average: 1109 samples per class
   • Minimum recommended: 200+ samples per class


In [0]:
# Training Performance Analysis
print("📊 Training Performance Analysis:")

# Analyze final training metrics
if 'eval_results' in locals():
    print("\n✅ Final Evaluation Metrics:")
    for metric, value in eval_results.items():
        if isinstance(value, (int, float)):
            print(f"   {metric}: {value:.4f}")
    
    # Interpret the metrics
    eval_acc = eval_results.get('eval_accuracy', 0)
    eval_loss = eval_results.get('eval_loss', float('inf'))
    
    print(f"\n🎯 Performance Interpretation:")
    if eval_acc < 0.3:
        print("   🔴 CRITICAL: Very low accuracy (<30%) - model is barely learning")
        print("      → Likely causes: insufficient data, too few epochs, or data quality issues")
    elif eval_acc < 0.5:
        print("   🟡 POOR: Low accuracy (<50%) - significant improvement needed")
        print("      → Likely causes: class imbalance, insufficient training, or weak features")
    elif eval_acc < 0.7:
        print("   🟠 FAIR: Moderate accuracy (<70%) - room for improvement")
        print("      → Solutions: more training, data augmentation, or hyperparameter tuning")
    elif eval_acc < 0.85:
        print("   🟢 GOOD: Solid accuracy (70-85%) - decent performance")
        print("      → Can improve with more data or fine-tuning")
    else:
        print("   🟢 EXCELLENT: High accuracy (>85%) - great performance!")
        
    if eval_loss > 2.0:
        print("   ⚠️ High validation loss - model may be underfitting")
    elif eval_loss < 0.1:
        print("   ⚠️ Very low validation loss - check for overfitting")

# Extended prediction accuracy test
print(f"\n🎯 Extended Prediction Accuracy Test:")
test_size = min(50, len(dataset_df))  # Test on up to 50 samples
test_larger = dataset_df.sample(n=test_size, random_state=42)
correct = 0
total = len(test_larger)
cuisine_correct = {cuisine: 0 for cuisine in dataset_df['cuisine_category'].unique()}
cuisine_total = {cuisine: 0 for cuisine in dataset_df['cuisine_category'].unique()}

print(f"Testing on {total} random samples...")

for idx, row in test_larger.iterrows():
    true_label = row['cuisine_category']
    prediction = wrapped_model.predict(None, row['processed_image_data'])
    predicted_label = prediction['label']
    confidence = prediction['score']
    
    cuisine_total[true_label] += 1
    
    if true_label == predicted_label:
        correct += 1
        cuisine_correct[true_label] += 1
        status = "✅"
    else:
        status = "❌"
    
    if idx < 10:  # Show first 10 predictions
        print(f"   {status} True: {true_label:<15} | Predicted: {predicted_label:<15} | Confidence: {confidence:.3f}")

# Overall accuracy
overall_accuracy = correct / total
print(f"\n📈 Overall Test Accuracy: {overall_accuracy:.1%} ({correct}/{total})")

# Per-class accuracy
print(f"\n📊 Per-Class Accuracy:")
for cuisine in sorted(cuisine_total.keys()):
    if cuisine_total[cuisine] > 0:
        class_acc = cuisine_correct[cuisine] / cuisine_total[cuisine]
        print(f"   {cuisine:<15}: {class_acc:.1%} ({cuisine_correct[cuisine]}/{cuisine_total[cuisine]})")
    else:
        print(f"   {cuisine:<15}: No samples in test set")

# Identify problematic classes
print(f"\n🚨 Classes with Low Accuracy (<50%):")
problem_classes = []
for cuisine in cuisine_total.keys():
    if cuisine_total[cuisine] > 0:
        class_acc = cuisine_correct[cuisine] / cuisine_total[cuisine]
        if class_acc < 0.5:
            problem_classes.append(f"{cuisine} ({class_acc:.1%})")

if problem_classes:
    for problem in problem_classes:
        print(f"   • {problem}")
    print(f"\n💡 Focus improvement efforts on these classes!")
else:
    print("   🎉 All classes performing reasonably well!")

📊 Training Performance Analysis:

✅ Final Evaluation Metrics:
   eval_loss: 1.1438
   eval_accuracy: 0.6096
   eval_f1: 0.6018
   eval_runtime: 94.5246
   eval_samples_per_second: 18.7780
   eval_steps_per_second: 1.5660
   epoch: 5.0000

🎯 Performance Interpretation:
   🟠 FAIR: Moderate accuracy (<70%) - room for improvement
      → Solutions: more training, data augmentation, or hyperparameter tuning

🎯 Extended Prediction Accuracy Test:
Testing on 50 random samples...

📈 Overall Test Accuracy: 74.0% (37/50)

📊 Per-Class Accuracy:
   american       : 78.6% (11/14)
   chinese        : 0.0% (0/1)
   french         : 28.6% (2/7)
   international  : 100.0% (7/7)
   italian        : 100.0% (9/9)
   japanese       : 50.0% (1/2)
   mediterranean  : 60.0% (3/5)
   mexican        : 80.0% (4/5)

🚨 Classes with Low Accuracy (<50%):
   • chinese (0.0%)
   • french (28.6%)

💡 Focus improvement efforts on these classes!


In [0]:
# Improvement Recommendations Based on Analysis
print("🚀 Improvement Recommendations:")

# Get current metrics for recommendations
current_accuracy = eval_results.get('eval_accuracy', 0) if 'eval_results' in locals() else 0
dataset_size = len(dataset_df)
min_class_size = class_counts.min()
max_class_size = class_counts.max()

print(f"\n📋 Priority Actions (implement in order):")

# Priority 1: Data quantity issues
if dataset_size < 1000:
    print(f"   🔴 CRITICAL - Collect more data:")
    print(f"      Current: {dataset_size} samples | Target: 1000+ samples")
    print(f"      Need: {1000 - dataset_size} more samples")

if min_class_size < 100:
    print(f"   🔴 CRITICAL - Balance dataset:")
    print(f"      Smallest class: {min_class_size} samples | Target: 100+ per class")
    print(f"      Focus on collecting data for: {class_counts.idxmin()}")

# Priority 2: Training configuration
if current_accuracy < 0.6:
    print(f"   🟡 HIGH - Improve training:")
    print(f"      • Increase epochs: {NUM_EPOCHS} → 10-15 epochs")
    print(f"      • Increase learning rate: {LEARNING_RATE} → 2e-4")
    print(f"      • Add data augmentation")
    
# Priority 3: Model improvements    
if imbalance_ratio > 3:
    print(f"   🟠 MEDIUM - Address class imbalance:")
    print(f"      • Use class weights during training")
    print(f"      • Apply stratified sampling")
    print(f"      • Generate synthetic data for minority classes")

print(f"\n🔧 Quick Fixes to Try Next:")
print(f"   1. Update configuration in cell 5:")
print(f"      NUM_EPOCHS = 10")
print(f"      BATCH_SIZE = 16  # if memory allows")
print(f"      LEARNING_RATE = 2e-4")

print(f"\n   2. Add data augmentation in cell 7:")
print(f"      from torchvision.transforms import RandomHorizontalFlip, ColorJitter")
print(f"      # Add to transforms: RandomHorizontalFlip(p=0.5), ColorJitter(...)")

print(f"\n   3. Consider using a different model:")
print(f"      MODEL_CHECKPOINT = 'google/vit-base-patch16-224'  # Vision Transformer")
print(f"      # or")
print(f"      MODEL_CHECKPOINT = 'microsoft/swin-tiny-patch4-window7-224'  # Swin Transformer")

# Expected improvement
print(f"\n📈 Expected Improvements:")
if dataset_size < 500:
    print(f"   • With 2-3x more data: +15-25% accuracy")
if NUM_EPOCHS == 3:
    print(f"   • With 10 epochs: +5-15% accuracy") 
if min_class_size < 50:
    print(f"   • With balanced classes: +10-20% accuracy")

print(f"\n🎯 Realistic Targets:")
if dataset_size < 500:
    print(f"   • Short term: 50-60% accuracy (with current data + better training)")
    print(f"   • Long term: 75-85% accuracy (with more balanced data)")
else:
    print(f"   • Short term: 65-75% accuracy (with better training)")
    print(f"   • Long term: 80-90% accuracy (with data augmentation and tuning)")

🚀 Improvement Recommendations:

📋 Priority Actions (implement in order):
   🟠 MEDIUM - Address class imbalance:
      • Use class weights during training
      • Apply stratified sampling
      • Generate synthetic data for minority classes

🔧 Quick Fixes to Try Next:
   1. Update configuration in cell 5:
      NUM_EPOCHS = 10
      BATCH_SIZE = 16  # if memory allows
      LEARNING_RATE = 2e-4

   2. Add data augmentation in cell 7:
      from torchvision.transforms import RandomHorizontalFlip, ColorJitter
      # Add to transforms: RandomHorizontalFlip(p=0.5), ColorJitter(...)

   3. Consider using a different model:
      MODEL_CHECKPOINT = 'google/vit-base-patch16-224'  # Vision Transformer
      # or
      MODEL_CHECKPOINT = 'microsoft/swin-tiny-patch4-window7-224'  # Swin Transformer

📈 Expected Improvements:

🎯 Realistic Targets:
   • Short term: 65-75% accuracy (with better training)
   • Long term: 80-90% accuracy (with data augmentation and tuning)
