In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, AutoModel
from transformers import CLIPProcessor, CLIPModel
from sklearn.preprocessing import LabelEncoder
import os

# --- Input locations ---
image_dir = '/kaggle/input/mesho-chll/MESHO/train_images/'  # directory with the training images
csv_file = '/kaggle/input/mesho-chll/MESHO/train_MESH.csv'       # table with id, Category, attr_1..attr_10

# --- Run configuration ---
batch_size = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 1. Read the training table and prepare the label columns.
attr_columns = ['attr_' + str(i) for i in range(1, 11)]
df = pd.read_csv(csv_file)

# A missing attribute becomes its own class, 'no'.
df[attr_columns] = df[attr_columns].fillna('no')

# Turn 'Category' into integer codes; the fitted encoder is reused at inference time.
category_encoder = LabelEncoder()
df['Category'] = category_encoder.fit_transform(df['Category'])

# One independent encoder per attribute column, kept so predictions can be decoded later.
attr_encoders = {col: LabelEncoder() for col in attr_columns}
for col, encoder in attr_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Image files are named after the id, zero-padded to six digits.
df['file_path'] = [f'{image_dir}{str(image_id).zfill(6)}.jpg' for image_id in df['id']]

# 2. Load the SigLIP processor and model (variables keep their historical `clip_` names).
clip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-512")
clip_model = AutoModel.from_pretrained("google/siglip-base-patch16-512")

# The encoder is only used for feature extraction, so move it to the device in eval mode.
clip_model = clip_model.to(device).eval()

# Custom dataset class
class CustomDataset(Dataset):
    """Dataset that yields one processed image, its extra features and its attribute labels.

    Args:
        df: DataFrame with a 'file_path' column, a numeric 'Category' column and the
            label columns named in the module-level ``attr_columns`` list.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``.
    """

    def __init__(self, df, processor):
        self.df = df
        self.processor = processor

    def __len__(self):
        """Number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``((inputs, features), labels)`` for the row at position ``idx``.

        ``inputs`` is whatever the processor returns for the image, ``features`` is a
        float32 array holding the encoded 'Category', and ``labels`` is an int64 tensor
        with one class index per attribute column.
        """
        # Look the row up once instead of once per field.
        row = self.df.iloc[idx]

        # Close the file handle deterministically; convert() yields a detached RGB copy.
        with Image.open(row['file_path']) as img:
            image = img.convert('RGB')

        # Image preprocessing is delegated entirely to the processor.
        # NOTE(review): the result presumably carries a leading batch axis of size 1,
        # which the model's forward() squeezes away — confirm against the processor.
        inputs = self.processor(images=image, return_tensors="pt")

        # Extra (non-image) feature: only the encoded 'Category' value.
        features = row[['Category']].values.astype(np.float32)

        # One integer class index per attribute column.
        labels = row[attr_columns].values.astype(np.int64)

        return (inputs, features), torch.tensor(labels)

# 3. DataLoader over the training table
def custom_dataloader(df, processor, batch_size):
    """Wrap ``df`` in a CustomDataset and return a shuffling DataLoader over it."""
    return DataLoader(CustomDataset(df, processor), shuffle=True, batch_size=batch_size)

train_loader = custom_dataloader(df=df, processor=clip_processor, batch_size=batch_size)

In [None]:
# 4. Define the multi-output model using CLIP encodings
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast
class MultiOutputModel(nn.Module):
    """Frozen image encoder followed by a small MLP trunk and one classifier per attribute.

    Args:
        clip_model: Image encoder exposing ``get_image_features(pixel_values)``.
            It is only ever run under ``torch.no_grad()`` and is kept in eval mode.
        num_features: Number of extra (non-image) features concatenated to the embedding.
        num_outputs_per_attr: Iterable with the number of classes of each attribute;
            one linear head is created per entry, in order.
        clip_output_dim: Width of the vectors returned by ``get_image_features``.
            Defaults to 768, the value this notebook hard-coded for the
            "google/siglip-base-patch16-512" checkpoint — verify when swapping encoders.
    """

    def __init__(self, clip_model, num_features, num_outputs_per_attr, clip_output_dim=768):
        super(MultiOutputModel, self).__init__()

        self.clip_model = clip_model
        self.clip_model.eval()  # feature extractor only

        # Trunk shared by every attribute head.
        self.fc_features = nn.Linear(clip_output_dim + num_features, 512)
        self.fc1 = nn.Linear(512, 256)

        # One independent classification head per attribute.
        self.attr_outputs = nn.ModuleList(
            [nn.Linear(256, num_outputs) for num_outputs in num_outputs_per_attr]
        )

    def train(self, mode=True):
        """Switch the trainable layers to ``mode`` while pinning the encoder to eval mode.

        ``nn.Module.train`` recurses into every submodule, so without this override a
        plain ``model.train()`` would also put the frozen encoder into training mode.
        """
        super().train(mode)
        self.clip_model.eval()
        return self

    def forward(self, inputs, features):
        """Return a list with one logits tensor per attribute head.

        Args:
            inputs: Mapping with a 'pixel_values' tensor; a singleton axis at position 1
                (added when per-sample processor outputs are batched) is squeezed away.
            features: Tensor of extra features, one row per image.
        """
        # Follow the device the trainable layers live on rather than a module-level global.
        target_device = self.fc_features.weight.device
        image_inputs = inputs['pixel_values'].squeeze(1).to(target_device)

        # The encoder is frozen: no gradients are tracked through it.
        with torch.no_grad():
            image_embeddings = self.clip_model.get_image_features(image_inputs)

        # Align device and dtype so integer-valued features concatenate cleanly.
        features = features.to(device=target_device, dtype=image_embeddings.dtype)
        x = torch.cat([image_embeddings, features], dim=1)

        # Shared trunk.
        x = torch.relu(self.fc_features(x))
        x = torch.relu(self.fc1(x))

        # Attribute-specific logits, in the order the heads were created.
        outputs = [attr_output(x) for attr_output in self.attr_outputs]

        return outputs

# 5. Build the model: one output head per attribute, sized by that attribute's class count
num_features = 1  # only the encoded 'Category' value is appended to the image embedding
num_classes_list = [len(attr_encoders[col].classes_) for col in attr_columns]
model = MultiOutputModel(
    clip_model,
    num_features=num_features,
    num_outputs_per_attr=num_classes_list,
)
model = model.to(device)

# 6. Loss (applied per attribute head) and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:

def load_model_checkpoint(model, checkpoint_path):
    """Load a state dict into ``model``, reconciling the DataParallel 'module.' prefix.

    The checkpoint keys are adapted to whatever ``model`` expects: the 'module.' prefix
    is added when the model's own keys carry it and the checkpoint's do not, and removed
    in the opposite case. Previously the prefix was always added, which only worked for
    a wrapped model fed an unwrapped checkpoint.

    NOTE: a later cell defines another function with this same name, which replaces
    this one once that cell has run.

    Args:
        model: Module (optionally wrapped in DataParallel) to load the weights into.
        checkpoint_path: Path to a file written with ``torch.save(state_dict, path)``.

    Returns:
        None. Prints whether the checkpoint was loaded or not found.
    """
    if not os.path.exists(checkpoint_path):
        print(f"Checkpoint not found at {checkpoint_path}")
        return

    # Deserialize onto the CPU so a GPU-saved checkpoint also loads on a CPU-only run;
    # load_state_dict() copies the values onto the model's own device.
    state_dict = torch.load(checkpoint_path, map_location='cpu')

    prefix = 'module.'
    model_prefixed = all(key.startswith(prefix) for key in model.state_dict())
    ckpt_prefixed = all(key.startswith(prefix) for key in state_dict)

    if model_prefixed and not ckpt_prefixed:
        # Wrapped model, plain checkpoint: add the prefix.
        new_state_dict = {f'{prefix}{k}': v for k, v in state_dict.items()}
    elif ckpt_prefixed and not model_prefixed:
        # Plain model, wrapped checkpoint: strip the prefix.
        new_state_dict = {k[len(prefix):]: v for k, v in state_dict.items()}
    else:
        new_state_dict = state_dict

    model.load_state_dict(new_state_dict)
    print(f"Model loaded from {checkpoint_path}")

In [None]:
import os
import torch
from tqdm import tqdm

def load_model_checkpoint(model, checkpoint_path):
    """Load the state dict stored at ``checkpoint_path`` into ``model`` if the file exists.

    Args:
        model: Module whose parameter names match the keys stored in the checkpoint.
        checkpoint_path: Path to a file written with ``torch.save(state_dict, path)``.

    Returns:
        None. Prints whether the checkpoint was loaded or not found.
    """
    if not os.path.exists(checkpoint_path):
        print(f"Checkpoint not found at {checkpoint_path}")
        return

    # Deserialize onto the CPU so a GPU-saved checkpoint also loads on a CPU-only run;
    # load_state_dict() copies the values onto the model's own device.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    print(f"Model loaded from {checkpoint_path}")

def train_model(model, train_loader, criterion, optimizer, epochs=15, save_dir='model_checkpoints',
                checkpoint_prefix='model_epoch_siglip_large'):
    """Train ``model`` and write a checkpoint of its state dict after every epoch.

    The loss of a batch is the sum of ``criterion`` over all attribute heads.

    Args:
        model: Module called as ``model(inputs, features)`` and returning one logits
            tensor per attribute.
        train_loader: Iterable of ``((inputs, features), labels)`` batches, where
            ``labels`` has one column per attribute.
        criterion: Loss called as ``criterion(logits, target)`` for each attribute.
        optimizer: Optimizer updating the model's trainable parameters.
        epochs: Number of passes over ``train_loader``.
        save_dir: Directory for the checkpoints (created if missing).
        checkpoint_prefix: File-name stem of the checkpoints; files are written as
            ``<checkpoint_prefix>_<epoch>.pth``. The default keeps the historical name.
    """
    os.makedirs(save_dir, exist_ok=True)
    model.train()

    # model.train() recurses into every submodule; put the frozen image encoder back
    # into eval mode so it keeps behaving as a fixed feature extractor.
    frozen_encoder = getattr(model, 'clip_model', None)
    if frozen_encoder is not None:
        frozen_encoder.eval()

    num_batches = len(train_loader)

    for epoch in range(epochs):
        running_loss = 0.0
        with tqdm(train_loader, total=num_batches, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            for (inputs, features), labels in pbar:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                features, labels = features.to(device), labels.to(device)
                optimizer.zero_grad()

                # Forward pass: one logits tensor per attribute.
                outputs = model(inputs, features)

                # Pair every head with its label column and add up the per-head losses.
                total_loss = sum(
                    criterion(output, label)
                    for output, label in zip(outputs, labels.unbind(dim=1))
                )

                total_loss.backward()
                optimizer.step()

                batch_loss = total_loss.item()
                running_loss += batch_loss
                pbar.set_postfix({'Batch Loss': batch_loss})

        # Guard against an empty loader instead of dividing by zero.
        avg_epoch_loss = running_loss / max(num_batches, 1)
        print(f'Epoch {epoch + 1} completed. Average Loss: {avg_epoch_loss:.4f}')

        # Save the model after each epoch
        checkpoint_path = os.path.join(save_dir, f'{checkpoint_prefix}_{epoch + 1}.pth')
        torch.save(model.state_dict(), checkpoint_path)
        print(f'Model saved at {checkpoint_path}')

# Resume from a previously saved checkpoint (a message is printed if the file is absent)
checkpoint_path = '/kaggle/input/newmods/model_checkpoints/model_epoch_siglip_base_12.pth'
load_model_checkpoint(model=model, checkpoint_path=checkpoint_path)

# Run the training loop
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=15,
)


inference

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import pandas as pd
import numpy as np
import os
from tqdm import tqdm  # Import tqdm for progress bar

def load_trained_model(model_path, clip_model, num_features, num_classes_list):
    """Rebuild a MultiOutputModel and load trained weights into it for inference.

    Args:
        model_path: Path to a state dict saved with ``torch.save(model.state_dict(), ...)``.
        clip_model: Image encoder instance handed to MultiOutputModel.
        num_features: Number of extra (non-image) features the model was trained with.
        num_classes_list: Number of classes per attribute head, in head order.

    Returns:
        The model, moved to ``device`` and switched to eval mode.

    Raises:
        RuntimeError: If the checkpoint keys or shapes do not match the model
            (loading is strict).
    """
    # Deserialize onto the CPU so a GPU-saved checkpoint also loads on a CPU-only run.
    state_dict = torch.load(model_path, map_location='cpu')

    # The architecture must exist before the weights can be copied into it.
    model = MultiOutputModel(clip_model, num_features=num_features, num_outputs_per_attr=num_classes_list)

    # Strict load: any missing or unexpected key raises instead of being skipped.
    model.load_state_dict(state_dict)

    model.to(device)
    model.eval()  # Set to evaluation mode
    return model


# Inference configuration
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_path = '/kaggle/input/siglip/model_epoch_siglip_base_3.pth'  # trained weights to evaluate
image_dir = '/kaggle/input/mesho-chll/MESHO/test_images/'  # directory with the test images
csv_file = '/kaggle/input/mesho-chll/MESHO/test.csv'  # test table

# Fresh SigLIP processor and encoder (variables keep their historical `clip_` names)
clip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-512")
clip_model = AutoModel.from_pretrained("google/siglip-base-patch16-512")

# Head sizes come from the encoders fitted in the training section, so that section
# must have been run in this kernel beforehand.
num_features = 1  # 'Category'
num_classes_list = [len(attr_encoders[f'attr_{i}'].classes_) for i in range(1, 11)]

# Rebuild the architecture and load the trained weights
model = load_trained_model(
    model_path,
    clip_model,
    num_features,
    num_classes_list,
)

# Test table; the code below reads its 'id' and 'Category' columns
test_df = pd.read_csv(csv_file)

# Preprocess a single image for inference
def preprocess_image(image_path, processor):
    """Open the image at ``image_path`` as RGB and run it through ``processor``.

    Args:
        image_path: Path of the image file to read.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``.

    Returns:
        Whatever the processor returns for the image.
    """
    # Close the file handle deterministically; convert() yields a detached RGB copy.
    with Image.open(image_path) as img:
        image = img.convert('RGB')
    inputs = processor(images=image, return_tensors="pt")
    return inputs

# Inference function
def predict(model, image_path, processor, features):
    """Predict one class index per attribute for a single image.

    Args:
        model: Trained model called as ``model(inputs, features)`` and returning one
            logits tensor per attribute.
        image_path: Path of the image to classify.
        processor: Image processor handed to ``preprocess_image``.
        features: Sequence with the extra numeric features of the image (the caller
            passes the encoded 'Category').

    Returns:
        List of ints: the arg-max class index of every attribute head, in head order.
    """
    # Preprocess the image and move the tensors to the inference device
    inputs = preprocess_image(image_path, processor)
    inputs = inputs.to(device)

    # Build the features as float32, the dtype the dataset feeds during training,
    # and add a batch axis: shape (1, num_features).
    features = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        attr_outputs = model(inputs, features)

    # Highest-scoring class of every head
    predicted_labels = [torch.argmax(output, dim=1).item() for output in attr_outputs]

    return predicted_labels

# Run inference over every test image and assemble the submission table
attribute_columns = [f'attr_{i}' for i in range(1, 11)]
predictions_list = []

# tqdm renders a progress bar over the rows of the test table
for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Images"):
    # Image files are named after the id, zero-padded to six digits (e.g. 000000.jpg)
    image_path = os.path.join(image_dir, str(row['id']).zfill(6) + '.jpg')

    # Reuse the encoder fitted on the training categories
    category_encoded_value = category_encoder.transform([row['Category']])[0]

    # Class index per attribute head
    predicted_attrs = predict(model, image_path, clip_processor, [category_encoded_value])

    # Map every index back to its original label with the matching attribute encoder
    record = {'id': row['id']}
    for col, pred in zip(attribute_columns, predicted_attrs):
        record[col] = attr_encoders[col].inverse_transform([pred])[0]
    predictions_list.append(record)

# One row per image, one column per attribute
predictions_df = pd.DataFrame(predictions_list)

# Bring back the original 'Category' by joining on 'id'
merged_df = test_df[['id', 'Category']].merge(predictions_df, on='id')

# 'len' = number of attributes whose predicted label is not 'no'
merged_df['len'] = merged_df[attribute_columns].ne('no').sum(axis=1)

# Column order: id, Category, len, then the attributes
cols = ['id', 'Category', 'len', *attribute_columns]
merged_df = merged_df[cols]

# Show the merged predictions
print(merged_df)

# Write the submission file
merged_df.to_csv('sub_siglip_ba.csv', index=False)
