1. Install exact versions 

In [None]:
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 \
    transformers==4.40.2 huggingface_hub==0.23.0 tqdm==4.66.4 \
    pandas scikit-learn --quiet

2. Imports and configuration

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

from transformers import CLIPModel, CLIPTokenizerFast

# Configuration
class CFG:
    """Central configuration shared by the training and inference cells."""
    # Seed applied to torch and numpy right after this class is defined.
    seed = 42
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Hugging Face checkpoint used for both the CLIP model and its tokenizer.
    # The commented-out name is an alternative checkpoint kept for reference.
    model_name = "openai/clip-vit-large-patch14" #"openai/clip-vit-base-patch32"

    # Paths
    # Training CSV, precomputed image embeddings, and the per-row flag array
    # that marks which embeddings are usable.
    ORIGINAL_TRAIN_PATH = "/kaggle/input/train-old/train.csv"
    IMAGE_EMB_PATH = "/kaggle/input/new-img-embed/image_embeddings.npy"
    VALID_MASK_PATH = "/kaggle/input/new-img-embed/valid_mask.npy"

    # Training
    epochs = 10
    batch_size = 64
    # Peak learning rate for the CLIP text encoder parameter group.
    encoder_lr = 2e-5
    # Peak learning rate for the projection/regression heads parameter group.
    decoder_lr = 1e-3
    weight_decay = 1e-5

    # Save path
    model_save_path = "/kaggle/working/clip_img_proj_best.pth"

# Make sure the checkpoint directory exists before anything tries to write to it.
os.makedirs(os.path.dirname(CFG.model_save_path), exist_ok=True)
# Seed the torch and numpy global RNGs for repeatable runs.
torch.manual_seed(CFG.seed)
np.random.seed(CFG.seed)
print(f"Using device: {CFG.device}")

3. Dataset

In [None]:
class ContentOnlyDataset(Dataset):
    """Training dataset pairing catalog text with a precomputed image embedding.

    Every item holds the tokenized ``catalog_content`` text, the image embedding
    and validity flag stored at the same position, and the ``log1p``-transformed
    ``price`` as the regression target.

    Args:
        df: DataFrame with ``catalog_content`` and ``price`` columns.
        tokenizer: Callable invoked with one string plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments; must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors that carry a leading batch axis of size 1.
        image_embeddings: Array indexed positionally in step with the rows of ``df``.
        valid_mask: Array indexed like ``image_embeddings``; a truthy entry marks
            an embedding that should be used as-is.
        indices: Positional row indices selecting the subset exposed by this dataset.
    """

    def __init__(self, df, tokenizer, image_embeddings, valid_mask, indices):
        # Subset all three sources with the same indices so position i always
        # refers to the same sample in the frame, the embeddings and the mask.
        self.df = df.iloc[indices].reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_embeddings = image_embeddings[indices]
        self.valid_mask = valid_mask[indices]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        text = row['catalog_content']
        if not isinstance(text, str):
            # Empty CSV cells are read back as NaN (a float), which the tokenizer
            # cannot encode; treat them as empty text instead of crashing a worker.
            text = "" if pd.isna(text) else str(text)

        text_inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=77,
            return_tensors="pt"
        )

        image_emb = self.image_embeddings[idx]
        is_valid_image = self.valid_mask[idx]
        price = row['price']
        # The model is trained on log1p(price); predictions are mapped back with expm1.
        log_price = np.log1p(price)

        return {
            # squeeze(0) removes only the batch axis added by return_tensors="pt".
            'input_ids': text_inputs['input_ids'].squeeze(0),
            'attention_mask': text_inputs['attention_mask'].squeeze(0),
            'image_emb': torch.tensor(image_emb, dtype=torch.float32),
            'valid_mask': torch.tensor(is_valid_image, dtype=torch.float32),
            'target': torch.tensor(log_price, dtype=torch.float32)
        }

In [None]:
class ContentOnlyFinetuneModel(nn.Module):
    """Price regressor combining CLIP text features with precomputed image embeddings.

    The text goes through the CLIP text tower (``get_text_features``). The image
    side receives an already-computed embedding per sample, replaces it with a
    learned placeholder when the sample has no usable image, and passes it
    through a small trainable projection head. Both vectors are concatenated
    and mapped to one scalar per sample.
    """

    def __init__(self):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(CFG.model_name)
        embed_dim = self.clip.config.projection_dim

        # Placeholder embedding for missing images
        self.missing_image_embedding = nn.Parameter(torch.randn(1, embed_dim))

        # Trainable projection head for the image embeddings
        self.image_projection_head = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, embed_dim),
            nn.GELU(),
            nn.Linear(embed_dim, embed_dim)
        )

        # The regression head consumes [image_features, text_features] side by side.
        final_input_size = embed_dim * 2
        self.head = nn.Sequential(
            nn.LayerNorm(final_input_size),
            nn.Linear(final_input_size, embed_dim), # Intermediate layer
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(embed_dim, 1) # Final output layer
        )

    def forward(self, batch):
        """Predict one value per sample.

        Args:
            batch: Mapping with ``input_ids``, ``attention_mask``, ``image_emb``
                (one embedding row per sample) and ``valid_mask`` (one flag per
                sample; any non-zero value means "use the provided embedding").

        Returns:
            Tensor with the trailing singleton dimension of the head output removed.
        """
        text_features = self.clip.get_text_features(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask']
        )

        image_emb_raw = batch['image_emb']
        has_image = batch['valid_mask'].unsqueeze(-1).bool()
        # Select instead of blending arithmetically: with `emb * mask + ...` a NaN/Inf
        # stored in the slot of a missing image would survive (0 * NaN is NaN) and
        # poison the whole batch loss. torch.where ignores the unselected value.
        image_features_raw = torch.where(has_image, image_emb_raw, self.missing_image_embedding)

        image_features = self.image_projection_head(image_features_raw)

        final_vector = torch.cat([image_features, text_features], dim=1)
        prediction = self.head(final_vector)
        return prediction.squeeze(-1)

5. Training Utils

In [None]:
def calculate_smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each absolute error is divided by the mean of the absolute true and
    predicted values; a tiny constant in the divisor avoids division by zero
    when both are zero.
    """
    abs_error = np.abs(y_pred - y_true)
    mean_magnitude = (np.abs(y_true) + np.abs(y_pred)) / 2
    per_sample_ratio = abs_error / (mean_magnitude + 1e-8)
    return 100 * np.mean(per_sample_ratio)

def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Batches whose loss is NaN or infinite are skipped: no backward pass, no
    optimizer step and no scheduler step is taken for them. The returned mean
    covers only the batches that were actually used, so skipped batches do not
    deflate the reported loss.

    Args:
        model: Module called with the whole batch mapping.
        loader: Iterable of mappings from name to tensor; must contain ``'target'``.
        optimizer: Optimizer stepped once per used batch.
        scheduler: Learning-rate scheduler stepped once per used batch.
        criterion: Loss called as ``criterion(predictions, batch['target'])``.
        device: Device every tensor in the batch is moved to.

    Returns:
        Mean loss over the used batches, or ``nan`` when no batch was used
        (empty loader, or every batch produced a non-finite loss).
    """
    model.train()
    total_loss = 0.0
    used_batches = 0
    skipped_batches = 0
    for batch in tqdm(loader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        predictions = model(batch)
        loss = criterion(predictions, batch['target'])
        # isfinite rejects both NaN and +/-Inf; stepping on either would corrupt the weights.
        if not torch.isfinite(loss):
            skipped_batches += 1
            continue
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        used_batches += 1

    if skipped_batches:
        # Make the skips visible; silently dropping data hides upstream problems.
        print(f"Warning: skipped {skipped_batches} batch(es) with a non-finite loss.")
    if used_batches == 0:
        return float("nan")
    return total_loss / used_batches

def validate_one_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Returns a pair: the mean of the per-batch losses, and the SMAPE computed
    after mapping targets and predictions back with ``expm1``.
    """
    model.eval()
    running_loss = 0
    pred_chunks = []
    target_chunks = []
    with torch.no_grad():
        for raw_batch in tqdm(loader, desc="Validating"):
            on_device = {}
            for key, tensor in raw_batch.items():
                on_device[key] = tensor.to(device)
            outputs = model(on_device)
            targets = on_device['target']
            running_loss += criterion(outputs, targets).item()
            pred_chunks.append(outputs.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    log_preds = np.concatenate(pred_chunks)
    log_targets = np.concatenate(target_chunks)
    # Undo the log1p transform so the metric is computed on the original scale.
    val_smape = calculate_smape(np.expm1(log_targets), np.expm1(log_preds))
    mean_loss = running_loss / len(loader)
    return mean_loss, val_smape

6. Load data and tokenizer

In [None]:
print("Loading data and pre-trained tokenizer...")

df_main = pd.read_csv(CFG.ORIGINAL_TRAIN_PATH)
image_embeddings = np.load(CFG.IMAGE_EMB_PATH)
valid_mask = np.load(CFG.VALID_MASK_PATH)

# The dataset pairs row i of the CSV with entry i of both arrays, so a length
# mismatch means samples would be matched with the wrong embedding (or fail
# later inside a DataLoader worker). Fail fast with a readable message instead.
if not (len(df_main) == len(image_embeddings) == len(valid_mask)):
    raise ValueError(
        "Training inputs are not aligned: "
        f"{len(df_main)} CSV rows, {len(image_embeddings)} image embeddings, "
        f"{len(valid_mask)} valid-mask entries."
    )

tokenizer = CLIPTokenizerFast.from_pretrained(CFG.model_name)
print("Data and tokenizer loaded successfully.")

# We create a list of indices covering the entire dataframe
all_indices = range(len(df_main))
print(f"Using all {len(all_indices)} samples for final training.")

# Create a single dataset and dataloader with all the data
full_train_dataset = ContentOnlyDataset(df_main, tokenizer, image_embeddings, valid_mask, all_indices)
full_train_loader = DataLoader(full_train_dataset, batch_size=CFG.batch_size, shuffle=True, num_workers=2, pin_memory=True)

7. Training Loop

In [None]:
model = ContentOnlyFinetuneModel().to(CFG.device)
criterion = nn.MSELoss()

# Two parameter groups: the CLIP text encoder is fine-tuned gently, while the
# projection layers, regression head and missing-image placeholder learn faster.
# The groups are built before any DataParallel wrapping so the attributes are
# reachable directly on the model.
param_groups = [
    {'params': model.clip.text_model.parameters(), 'lr': CFG.encoder_lr},
    {'params': list(model.head.parameters()) +
               list(model.clip.text_projection.parameters()) +
               list(model.image_projection_head.parameters()) +
               [model.missing_image_embedding], 'lr': CFG.decoder_lr}
]
optimizer = AdamW(param_groups, weight_decay=CFG.weight_decay)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Adjusting scheduler and loader for full training
# max_lr is given per parameter group, in the same order as param_groups.
scheduler = OneCycleLR(
    optimizer,
    max_lr=[CFG.encoder_lr, CFG.decoder_lr],
    epochs=CFG.epochs,
    steps_per_epoch=len(full_train_loader), # Use the new loader
    anneal_strategy='linear',
    pct_start=0.3
)

print("\n Starting training on all data ")
for epoch in range(CFG.epochs):
    print(f"\nEpoch {epoch+1}/{CFG.epochs}")
    # We only run the training step, no validation
    train_loss = train_one_epoch(model, full_train_loader, optimizer, scheduler, criterion, CFG.device)
    print(f"Train Loss = {train_loss:.4f}")

# Save the final model after the last epoch is complete.
# The model is only wrapped in nn.DataParallel when more than one GPU is present,
# so unwrap conditionally; calling model.module unconditionally would raise
# AttributeError on single-GPU/CPU sessions after all the training work is done.
model_to_save = model.module if isinstance(model, nn.DataParallel) else model
torch.save(model_to_save.state_dict(), CFG.model_save_path)
print(f"\n Final model saved successfully to: {CFG.model_save_path}")

print("\n Final Training Complete ")

8. Inference Setup

In [None]:
# Define the paths for the test data
class TestCFG:
    """File locations used by the inference cells."""
    # Test CSV; read with pandas and expected to provide the `catalog_content`
    # and `sample_id` columns used further down.
    TEST_CSV_PATH = "/kaggle/input/test-set/test.csv"
    # Precomputed image embeddings and their validity flags for the test rows.
    TEST_IMAGE_EMB_PATH = "/kaggle/input/test-img-embeds/image_embeddings.npy"
    TEST_VALID_MASK_PATH = "/kaggle/input/test-img-embeds/valid_mask.npy"
    # Where the final submission CSV is written.
    SUBMISSION_PATH = "/kaggle/working/submission1.csv"

# A dataset for the test data (no price/target column)
class TestDataset(Dataset):
    """Inference dataset: tokenized catalog text plus a precomputed image embedding.

    Produces the same keys as the training items except that there is no
    ``'target'``, because the test data has no price column.

    Args:
        df: DataFrame with a ``catalog_content`` column.
        tokenizer: Callable invoked with one string plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments; must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors that carry a leading batch axis of size 1.
        image_embeddings: Array indexed positionally in step with the rows of ``df``.
        valid_mask: Array indexed like ``image_embeddings``; a truthy entry marks
            an embedding that should be used as-is.
    """

    def __init__(self, df, tokenizer, image_embeddings, valid_mask):
        self.df = df
        self.tokenizer = tokenizer
        self.image_embeddings = image_embeddings
        self.valid_mask = valid_mask

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row['catalog_content']
        if not isinstance(text, str):
            # Empty CSV cells are read back as NaN (a float), which the tokenizer
            # cannot encode; treat them as empty text instead of crashing a worker.
            text = "" if pd.isna(text) else str(text)
        text_inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=77,
            return_tensors="pt"
        )
        image_emb = self.image_embeddings[idx]
        is_valid_image = self.valid_mask[idx]
        return {
            # squeeze(0) removes only the batch axis added by return_tensors="pt".
            'input_ids': text_inputs['input_ids'].squeeze(0),
            'attention_mask': text_inputs['attention_mask'].squeeze(0),
            'image_emb': torch.tensor(image_emb, dtype=torch.float32),
            'valid_mask': torch.tensor(is_valid_image, dtype=torch.float32),
        }

# A function to run inference and get predictions
def get_predictions(model, loader, device):
    """Run ``model`` over every batch of ``loader`` and return all outputs as one array.

    The model is switched to eval mode and gradients are disabled. Each batch
    mapping is moved to ``device`` before the forward pass, and the per-batch
    outputs are concatenated in loader order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for raw_batch in tqdm(loader, desc="Predicting"):
            on_device = {}
            for key, tensor in raw_batch.items():
                on_device[key] = tensor.to(device)
            outputs = model(on_device)
            collected.append(outputs.cpu().numpy())
    stacked = np.concatenate(collected)
    return stacked

print("Inference setup is ready.")

9. Generate Predictions and Submission File

In [None]:
print("Loading test data...")
df_test = pd.read_csv(TestCFG.TEST_CSV_PATH)
test_image_embeddings = np.load(TestCFG.TEST_IMAGE_EMB_PATH)
test_valid_mask = np.load(TestCFG.TEST_VALID_MASK_PATH)

# Row i of the CSV is paired with entry i of both arrays; check the alignment up
# front rather than failing (or mis-pairing) after the full inference pass.
if not (len(df_test) == len(test_image_embeddings) == len(test_valid_mask)):
    raise ValueError(
        "Test inputs are not aligned: "
        f"{len(df_test)} CSV rows, {len(test_image_embeddings)} image embeddings, "
        f"{len(test_valid_mask)} valid-mask entries."
    )

# Creating the test dataset and dataloader
# shuffle=False keeps predictions in the same order as df_test['sample_id'].
test_dataset = TestDataset(df_test, tokenizer, test_image_embeddings, test_valid_mask)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=2, pin_memory=True)

# Load trained model
print("Loading the final trained model weights...")
# Creating a new model instance with the same architecture
inference_model = ContentOnlyFinetuneModel().to(CFG.device)

# Load the state_dict we saved from the final training run.
# map_location lets a checkpoint written on a GPU load on whatever device this
# session has; weights_only restricts unpickling to tensors/containers, which is
# all a state_dict needs.
state_dict = torch.load(CFG.model_save_path, map_location=CFG.device, weights_only=True)
inference_model.load_state_dict(state_dict)

# Used multiple GPUs for training, wrap the inference model as well
if torch.cuda.device_count() > 1:
    inference_model = nn.DataParallel(inference_model)

print("Model loaded successfully.")

# Get Predictions
log_predictions = get_predictions(inference_model, test_loader, CFG.device)

# Convert predictions from the log scale back to actual prices.
# The regressor is unconstrained, so a negative log-scale output would turn into
# a negative price; floor at zero because a price cannot be negative.
final_predictions = np.maximum(np.expm1(log_predictions), 0.0)

# Create Submission File
submission_df = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': final_predictions
})

# Save to csv in the desired format
submission_df.to_csv(TestCFG.SUBMISSION_PATH, index=False)

print(f"\n Submission file created successfully at: {TestCFG.SUBMISSION_PATH}")
print("Submission file head:")
print(submission_df.head())