In [1]:
!pip install -q transformers trl datasets bitsandbytes peft accelerate num2words

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.0/348.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.1 MB/s[0m 

In [2]:
!pip install -q flash-attn --no-build-isolation

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [4]:
import os
import pandas as pd
import torch
from PIL import Image
from datasets import Dataset
from transformers import Idefics3ForConditionalGeneration, AutoProcessor
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer


# 1. Function to create text descriptions from binary labels
def create_descriptive_labels(df):
    """Turn each row of binary component flags into an English sentence.

    Args:
        df: DataFrame with the columns ``front_left_door``,
            ``front_right_door``, ``rear_left_door``, ``rear_right_door``
            and ``hood``; each value is read with ``int()`` and compared
            against 1 (open) and 0 (closed).

    Returns:
        A list with one description string per row, in row order.
    """
    # (column name in the frame, wording used in the sentence)
    parts = (
        ("front_left_door", "front left door"),
        ("front_right_door", "front right door"),
        ("rear_left_door", "rear left door"),
        ("rear_right_door", "rear right door"),
        ("hood", "hood"),
    )

    def enumerate_items(items):
        # "a" / "a and b" / "a, b, and c" (comma before the final "and")
        if len(items) == 1:
            return items[0]
        if len(items) == 2:
            return f"{items[0]} and {items[1]}"
        return ", ".join(items[:-1]) + f", and {items[-1]}"

    sentences = []
    for _, record in df.iterrows():
        flags = [int(record[column]) for column, _ in parts]
        opened = [wording for (_, wording), flag in zip(parts, flags) if flag == 1]
        shut = [wording for (_, wording), flag in zip(parts, flags) if flag == 0]

        if not opened:
            # Nothing is open
            sentences.append("All doors and the hood of the car are closed.")
            continue
        if not shut:
            # Nothing is closed
            sentences.append("All doors and the hood of the car are open.")
            continue

        # Mixed state: one sentence for the open parts, one for the closed parts
        open_verb = "is" if len(opened) == 1 else "are"
        open_sentence = f"The car's {enumerate_items(opened)} {open_verb} open."
        closed_verb = "remains" if len(shut) == 1 else "remain"
        closed_sentence = f"The {enumerate_items(shut)} {closed_verb} closed."
        sentences.append(f"{open_sentence} {closed_sentence}")

    return sentences

# 2. Format data for the VLM training
def format_data(sample):
    """Wrap one dataset sample as a two-turn chat conversation.

    Args:
        sample: Mapping providing ``"image"`` and ``"text_description"``.

    Returns:
        A list ``[user_turn, assistant_turn]``: the user turn carries the
        image followed by a fixed text prompt, the assistant turn carries
        the sample's text description.
    """
    picture_part = {"type": "image", "image": sample["image"]}
    question_part = {
        "type": "text",
        "text": "Describe the current state of the car doors and hood.",
    }
    answer_part = {"type": "text", "text": sample["text_description"]}

    user_turn = {"role": "user", "content": [picture_part, question_part]}
    assistant_turn = {"role": "assistant", "content": [answer_part]}
    return [user_turn, assistant_turn]

# 3. Data loading and preparation
def prepare_car_dataset(dataset_path, seed=42):
    """Load the car dataset and return chat-formatted train/eval/test lists.

    Reads ``labels.csv`` and the ``images`` directory under ``dataset_path``,
    generates a text description per row, and splits the samples 80/10/10.

    Args:
        dataset_path: Directory containing ``labels.csv`` and ``images/``.
        seed: Seed used for the initial shuffle and for both splits, so the
            same partition is produced on every run. Defaults to 42.

    Returns:
        A tuple ``(train_dataset, eval_dataset, test_dataset)``; each element
        is a list of conversations as produced by ``format_data``.
    """
    label_columns = [
        "front_left_door",
        "front_right_door",
        "rear_left_door",
        "rear_right_door",
        "hood",
    ]

    # Load labels
    labels_path = os.path.join(dataset_path, "labels.csv")
    labels_df = pd.read_csv(labels_path)

    # Generate text descriptions
    text_descriptions = create_descriptive_labels(labels_df)

    # Create dataset dictionary (same column order as before)
    dataset_dict = {"filename": labels_df["filename"].tolist()}
    for column in label_columns:
        dataset_dict[column] = labels_df[column].tolist()
    dataset_dict["text_description"] = text_descriptions
    dataset_dict["image"] = []

    # Load images
    images_dir = os.path.join(dataset_path, "images")
    failed_files = []
    for filename in dataset_dict["filename"]:
        image_path = os.path.join(images_dir, filename)
        try:
            # Image.open is lazy and keeps the file handle open; read the
            # pixel data inside the context manager so the handle is released
            # per image instead of accumulating one open file per sample.
            with Image.open(image_path) as opened:
                opened.load()
                image = opened if opened.mode == 'RGB' else opened.convert('RGB')
        except Exception as e:
            print(f"Error loading image {filename}: {e}")
            failed_files.append(filename)
            # Use a placeholder image if the actual image can't be loaded
            image = Image.new('RGB', (224, 224), color='gray')
        dataset_dict["image"].append(image)

    if failed_files:
        # Placeholders keep their real labels, so make the count hard to miss.
        print(
            f"Warning: {len(failed_files)} image(s) could not be loaded and "
            "were replaced by gray placeholders."
        )

    # Create HuggingFace Dataset
    dataset = Dataset.from_dict(dataset_dict)

    # Split into train/eval/test (80/10/10 split). train_test_split shuffles
    # again by default, so it needs the seed too for a reproducible partition.
    dataset = dataset.shuffle(seed=seed)
    splits = dataset.train_test_split(test_size=0.2, seed=seed)
    train_dataset = splits["train"]
    test_valid = splits["test"].train_test_split(test_size=0.5, seed=seed)
    eval_dataset = test_valid["train"]  # first half of the held-out 20%
    test_dataset = test_valid["test"]

    # Format the data for VLM training
    train_dataset = [format_data(sample) for sample in train_dataset]
    eval_dataset = [format_data(sample) for sample in eval_dataset]
    test_dataset = [format_data(sample) for sample in test_dataset]

    return train_dataset, eval_dataset, test_dataset

# 4. Main execution flow (disabled: kept for reference only; the live version is in the next cell)
# if __name__ == "__main__":
#     # Dataset path
#     dataset_path = "/kaggle/input/3dcardata/car_state_dataset_preprocessed"  # Update to your dataset path
    
#     # Prepare the datasets
#     print("Preparing car dataset...")
#     train_dataset, eval_dataset, test_dataset = prepare_car_dataset(dataset_path)
#     print(f"Dataset prepared: {len(train_dataset)} training, {len(eval_dataset)} validation, {len(test_dataset)} test samples")
    
#     # Model configuration
#     model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
#     print(f"Loading model: {model_id}")
    
#     # BitsAndBytesConfig for quantization
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16
#     )
    
#     # Load model and processor
#     processor = AutoProcessor.from_pretrained(model_id)
#     model = Idefics3ForConditionalGeneration.from_pretrained(
#         model_id,
#         device_map="auto",
#         torch_dtype=torch.bfloat16,
#         quantization_config=bnb_config,
#     )
    
#     # Configure LoRA
#     peft_config = LoraConfig(
#         r=8,
#         lora_alpha=8,
#         lora_dropout=0.1,
#         target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
#         use_dora=True,
#         init_lora_weights="gaussian"
#     )
    
#     # Apply PEFT model adaptation
#     peft_model = get_peft_model(model, peft_config)
#     peft_model.print_trainable_parameters()
    
#     # Configure training
#     training_args = SFTConfig(
#         output_dir="smolvlm-instruct-car-component-detection",
#         num_train_epochs=3,  # Increased epochs for better learning on this task
#         per_device_train_batch_size=1,
#         gradient_accumulation_steps=16,
#         warmup_steps=50,
#         learning_rate=1e-4,
#         weight_decay=0.01,
#         logging_steps=5,
#         save_strategy="steps",
#         save_steps=25,
#         save_total_limit=2,
#         optim="adamw_torch_fused",
#         bf16=True,
#         push_to_hub=False,
#         report_to="none",
#         remove_unused_columns=False,
#         gradient_checkpointing=False,
#         dataset_text_field="",
#         dataset_kwargs={"skip_prepare_dataset": True},
#     )
    
#     # Collate function for batching
#     image_token_id = processor.tokenizer.additional_special_tokens_ids[
#                 processor.tokenizer.additional_special_tokens.index("<image>")]
    
#     def collate_fn(examples):
#         texts = [processor.apply_chat_template(example, tokenize=False) for example in examples]
    
#         image_inputs = []
#         for example in examples:
#             # Access image from the first item (user message), first content element
#             image = example[0]['content'][0]['image']
            
#             if image.mode != 'RGB':
#                 image = image.convert('RGB')
#             image_inputs.append([image])
    
#         batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
#         labels = batch["input_ids"].clone()
#         labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
#         labels[labels == image_token_id] = -100  # Mask image token IDs in labels
    
#         batch["labels"] = labels
    
#         return batch
    
#     # Initialize trainer
#     trainer = SFTTrainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=eval_dataset,
#         data_collator=collate_fn,
#         peft_config=peft_config,
#         processing_class=processor.tokenizer,
#     )
    
#     # Start training
#     print("Starting training...")
#     trainer.train()
    
#     # Save the model
#     print(f"Training complete! Saving model to {training_args.output_dir}")
#     trainer.save_model(training_args.output_dir)
#     print("Model saved successfully.")

2025-05-12 14:31:00.885084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747060261.115731      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747060261.183847      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Top-level setup for the fine-tuning run: build the datasets, load the
# 4-bit quantized model and its processor, attach LoRA adapters, and define
# the SFT training arguments used by the trainer created further down.

# Dataset path
dataset_path = "/kaggle/input/3dcardata/car_state_dataset_multilabel"  # Update to your dataset path

# Prepare the datasets (each is a list of chat-formatted conversations)
print("Preparing car dataset...")
train_dataset, eval_dataset, test_dataset = prepare_car_dataset(dataset_path)
print(f"Dataset prepared: {len(train_dataset)} training, {len(eval_dataset)} validation, {len(test_dataset)} test samples")

# Model configuration
# NOTE(review): the recorded cell output reports a different model id
# (SmolVLM2-256M-Video-Instruct) than the one set here, so the saved output
# appears to come from an earlier revision of this cell — re-run to confirm.
model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
print(f"Loading model: {model_id}")

# BitsAndBytesConfig for quantization: 4-bit NF4 weights with double
# quantization, computing in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and processor
processor = AutoProcessor.from_pretrained(model_id)
model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

# Configure LoRA: rank-16 adapters on the attention and MLP projection
# layers listed in target_modules (lora_alpha=8 is half the rank)
peft_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
    use_dora=False,
    init_lora_weights="gaussian"
)

# Apply PEFT model adaptation
# NOTE(review): `peft_model` is only used here to print the parameter count;
# the trainer below receives `model` together with the same `peft_config`.
# Confirm the adapters are not being set up twice.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# Configure training
# Effective batch size per device is 1 x 16 accumulation steps = 16.
# A checkpoint is written every 32 steps and only the latest 2 are kept.
# report_to="wandb" sends metrics to Weights & Biases.
# dataset_text_field="" together with skip_prepare_dataset presumably bypasses
# the trainer's own text preprocessing because batches are built by
# collate_fn — confirm against the TRL documentation.
training_args = SFTConfig(
    output_dir="smolvlm-256M-car-component-detection",
    num_train_epochs=1,  
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=32,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=1,
    save_strategy="steps",
    save_steps=32,
    save_total_limit=2,
    optim="adamw_torch_fused",
    bf16=True,
    push_to_hub=False,
    report_to="wandb",
    remove_unused_columns=False,
    gradient_checkpointing=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
)

# Collate function for batching
# Look up the id of the "<image>" special token: find its position in the
# tokenizer's additional special tokens and read the id at the same position.
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

def collate_fn(examples):
    """Turn a list of conversations into one padded training batch.

    Args:
        examples: Conversations shaped like the output of ``format_data``;
            the image is read from the first content element of the first
            (user) message.

    Returns:
        The processor output for the batch with an added ``"labels"`` entry:
        a copy of ``input_ids`` in which padding tokens and ``<image>``
        tokens are replaced by -100.
    """
    def _rgb_image(conversation):
        # First message (user), first content element holds the image
        picture = conversation[0]['content'][0]['image']
        return picture if picture.mode == 'RGB' else picture.convert('RGB')

    # Render every conversation to text first, then collect the images
    rendered = []
    for conversation in examples:
        rendered.append(processor.apply_chat_template(conversation, tokenize=False))

    # One single-image list per conversation
    pictures = [[_rgb_image(conversation)] for conversation in examples]

    batch = processor(text=rendered, images=pictures, return_tensors="pt", padding=True)

    # Labels mirror the inputs, except positions that must not be learned
    targets = batch["input_ids"].clone()
    for ignored_id in (processor.tokenizer.pad_token_id, image_token_id):
        targets[targets == ignored_id] = -100
    batch["labels"] = targets

    return batch

# Initialize trainer
# Batches are built by collate_fn (defined above) from the chat-formatted
# lists returned by prepare_car_dataset.
# NOTE(review): `model` was already passed through get_peft_model earlier in
# this cell and `peft_config` is supplied again here — confirm the intended
# hand-off (either the adapted model or the config).
# NOTE(review): eval_dataset is supplied, but training_args sets no
# evaluation schedule in this cell — confirm whether evaluation actually runs.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    processing_class=processor.tokenizer,
)

# Start training
# Only the message is printed here; trainer.train() is called in the next cell.
print("Starting training...")
    
    


Preparing car dataset...
Dataset prepared: 3840 training, 480 validation, 480 test samples
Loading model: HuggingFaceTB/SmolVLM2-256M-Video-Instruct


processor_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

trainable params: 5,769,216 || all params: 262,254,144 || trainable%: 2.1999


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


In [6]:
# Run fine-tuning. The recorded run finished at global_step=240, which matches
# 3840 training samples / 16 (batch size 1 x 16 accumulation steps) for one epoch.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33m23523016[0m ([33mmosesananta_itb_s2[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Tracking run with wandb version 0.19.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250512_143210-511ssb4g[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msmolvlm2-256M-car-component-detection[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/mosesananta_itb_s2/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/mosesananta_itb_s2/huggingface/runs/511ssb4g[0m


Step,Training Loss
1,3.5244
2,3.4855
3,3.4944
4,3.4988
5,3.4374
6,3.3851
7,3.4184
8,3.3336
9,3.2654
10,3.1189


You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
You are using a model of type smolvlm to instantiate a model of 

TrainOutput(global_step=240, training_loss=0.5978163712347547, metrics={'train_runtime': 22342.8385, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.011, 'total_flos': 6304570252130304.0, 'train_loss': 0.5978163712347547})

In [7]:
# Save the trained model into the run's output directory.
# NOTE(review): the processor is not saved here — confirm whether downstream
# inference code expects it in the same directory.
trainer.save_model(training_args.output_dir)

You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.
