In [10]:
import bson
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import io
import os
from tqdm import tqdm
import pandas as pd

In [11]:
# Training hyper-parameters, kept in one dict so later cells read them by key.
hyper_params = {
    "batch_size": 128,
    "num_classes": 5270,
    "epochs": 10,
    "lr": 0.001,
}

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Per-image preprocessing: resize to 128x128, convert to a float tensor, then
# normalise each channel with the mean/std values conventionally used with
# ImageNet-pretrained torchvision models.
transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [12]:
def create_category_mapping(train_bson_file):
    """Build a ``category_id -> class index`` dict from a BSON file.

    The file is streamed document by document. Each distinct ``category_id``
    is given the next free integer, starting at 0, in order of first
    appearance in the file.

    Args:
        train_bson_file: Path of the BSON file to scan.

    Returns:
        dict mapping every ``category_id`` seen to its integer index.
    """
    category_mapping = {}
    with open(train_bson_file, "rb") as bson_stream:
        records = bson.decode_file_iter(bson_stream)
        for record in tqdm(records, desc="Creating Category Mapping"):
            # setdefault only inserts unseen ids; the dict's current size is
            # exactly the next unused index at that moment.
            category_mapping.setdefault(record["category_id"], len(category_mapping))
    return category_mapping

In [13]:
class CDiscountDataset(Dataset):
    """Map-style dataset yielding one sample per product image in a BSON file.

    The whole file is decoded once at construction time and the encoded bytes
    of every picture are kept in ``self.data``, so memory use grows with the
    total size of the pictures in the file.

    Args:
        bson_file: Path of the BSON file to read.
        category_mapping: Mapping from each raw ``category_id`` to the integer
            class index returned as the label.
        transform: Optional callable applied to the decoded RGB PIL image.
    """

    def __init__(self, bson_file, category_mapping, transform=None):
        self.data = []
        self.category_mapping = category_mapping
        self.transform = transform
        self.bson_file = bson_file
        self.load_bson_data()

    def load_bson_data(self):
        """Fill ``self.data`` with ``(product_id, class_index, picture_bytes)`` tuples.

        One tuple is appended per entry of each document's ``imgs`` list.
        """
        with open(self.bson_file, "rb") as bson_stream:
            records = bson.decode_file_iter(bson_stream)
            for record in tqdm(records, desc="Loading BSON Data"):
                product_id = record["_id"]
                # A category_id absent from the mapping raises KeyError here.
                class_index = self.category_mapping[record["category_id"]]
                self.data.extend(
                    (product_id, class_index, picture["picture"])
                    for picture in record["imgs"]
                )

    def __len__(self):
        """Return the number of stored images (not the number of products)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Decode sample ``idx`` and return ``(image, class_index)``.

        The image is converted to RGB and then passed through ``transform``
        when one was supplied.
        """
        _, class_index, encoded = self.data[idx]
        image = Image.open(io.BytesIO(encoded)).convert("RGB")
        if not self.transform:
            return image, class_index
        return self.transform(image), class_index
        
        
        

In [14]:
# Training BSON as mounted by the Kaggle competition dataset.
train_bson_file = "/kaggle/input/cdiscount-image-classification-challenge/train.bson"

# One full pass over the file; the run recorded in this cell's output took
# roughly ten minutes.
category_mappings = create_category_mapping(train_bson_file=train_bson_file)

Creating Category Mapping: 7069896it [10:11, 11565.89it/s]


In [None]:
# Decode the training file into an in-memory dataset, labelling each image
# with the class index from category_mappings.
train_data = CDiscountDataset(
    bson_file=train_bson_file,
    category_mapping=category_mappings,
    transform=transform,
)

Loading BSON Data: 3426953it [04:58, 9335.58it/s] 

In [None]:
# Model: ImageNet-pretrained ResNet-18 whose final fully connected layer is
# replaced so that it emits one logit per category found in the training file.
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(category_mappings))
model = model.to(DEVICE)

# Batched, shuffled view of the dataset. Iterating the Dataset directly would
# yield single unbatched (tensor, int) pairs, and an int label has no .to();
# the DataLoader stacks images and collates labels into a tensor.
train_loader = DataLoader(
    train_data,
    batch_size=hyper_params["batch_size"],
    shuffle=True,
)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=hyper_params["lr"])
epochs = hyper_params["epochs"]

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        predicted_labels = model(images)
        loss = criterion(predicted_labels, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean of the per-batch losses; max(..., 1) avoids dividing by
    # zero if the loader happens to be empty.
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / max(len(train_loader), 1)}")

# Persist only the learned weights (state dict), not the full model object.
torch.save(model.state_dict(), "resnet18_cdiscount.pth")