In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("oumniyya/data-competition")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/data-competition


In [3]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from sklearn.decomposition import PCA
import torch.nn as nn

# =================CONFIG=================

IMG_DIR = "/kaggle/input/data-competition/data/item_images"
ITEM_INFO_PATH = "/kaggle/input/data-competition/data/MicroLens_1M_x1/item_info.parquet"
OUTPUT_PATH = "item_info_new_fusion.parquet" # fichier pour la soumission

BATCH_SIZE = 128
NUM_WORKERS = 0 # Mis √† 0 pour √©viter les erreurs Assertion Error
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TARGET_DIM = 128  # OBLIGATOIRE pour le mod√®le DIN
# ========================================\

print(f"Using Device: {DEVICE}")

# 1. Chargement des donn√©es
print("Chargement de item_info.parquet...")
df_items = pd.read_parquet(ITEM_INFO_PATH)
print(df_items.head(2))

# 2. Dataset Custom pour PyTorch
class MultimodalDataset(Dataset):
    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor
        self.item_ids = df['item_id'].tolist()
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        item_id = self.item_ids[idx]
        
        # Gestion du chemin image
        img_path = os.path.join(self.img_dir, f"{item_id}.jpg") 
        
        # Gestion Texte (Titre ou Description)
        text = "Item" # Fallback si pas de texte
        if 'title' in self.df.columns:
            text = str(self.df.iloc[idx]['title'])
            
        # Chargement Image (avec gestion d'erreur si image manquante)
        try:
            image = Image.open(img_path).convert("RGB")
        except:
            # Si image vide/manquante, on cr√©e une image noire
            image = Image.new('RGB', (224, 224), color='black')

        # Processing CLIP
        inputs = self.processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
        
        # On retire la dimension de batch ajout√©e par le processor
        return {k: v.squeeze(0) for k, v in inputs.items()}

# 3. Initialisation Mod√®le (CLIP)
print("Chargement du mod√®le CLIP...")
model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

model.to(DEVICE)
model.eval()

# 4. Extraction des Embeddings
dataset = MultimodalDataset(df_items, IMG_DIR, processor)
#  num_workers=0 pour la stabilit√©
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)

all_embeddings = []

print("D√©but de l'extraction des features (Fusion Concat√©nation)...")
with torch.no_grad():
    for batch in tqdm(dataloader):
        # Envoi sur GPU
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        
        # Passage dans le mod√®le
        outputs = model(**batch)
        
        img_emb = outputs.image_embeds # [Batch, 512]
        txt_emb = outputs.text_embeds  # [Batch, 512]
        
        #  Concat√©nation (512 + 512 = 1024)
        fused_emb = torch.cat([img_emb, txt_emb], dim=-1) # [Batch, 1024]
        
        all_embeddings.append(fused_emb.cpu().numpy())

# Concat√©ner tous les batchs -> Matrice (N_items, 1024)
full_embeddings = np.concatenate(all_embeddings, axis=0)
print(f"Features extraites. Shape: {full_embeddings.shape} (1024D)")

# 5. R√©duction de Dimension (1024 -> 128) via PCA
print("R√©duction de dimension vers 128 (PCA)...")
#  PCA est maintenant de 1024 -> 128
pca = PCA(n_components=TARGET_DIM)
reduced_embeddings = pca.fit_transform(full_embeddings)

# Normalisation finale pour aider le mod√®le DIN
norms = np.linalg.norm(reduced_embeddings, axis=1, keepdims=True)
reduced_embeddings = reduced_embeddings / (norms + 1e-8)

print(f"Shape finale : {reduced_embeddings.shape}")

# 6. Sauvegarde 
print("Mise √† jour du DataFrame...")

# 6a. Supprimer la/les ancienne(s) colonne(s) d'embedding si elle(s) existe(nt)
# Nous v√©rifions √† la fois l'ancien nom interm√©diaire ('item_emb') et le nom final ('item_emb_d128')
for col in ['item_emb', 'item_emb_d128']:
    if col in df_items.columns:
        print(f"üîß Suppression de l'ancienne colonne: {col}")
        df_items = df_items.drop(columns=[col])

# 6b. Cr√©er la nouvelle colonne avec le nom final
df_items['item_emb_d128'] = list(reduced_embeddings)

print(f"Sauvegarde dans {OUTPUT_PATH}...")
df_items.to_parquet(OUTPUT_PATH)

print("Termin√©! Le nouveau fichier d'embeddings est pr√™t pour la Task 2.")

Using Device: cuda
Chargement de item_info.parquet...
   item_id        item_tags                                      item_emb_d128
0        0  [0, 0, 0, 0, 0]  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1        1  [0, 0, 0, 0, 1]  [-0.587724506855011, -0.38462838530540466, 0.4...
Chargement du mod√®le CLIP...
D√©but de l'extraction des features (Fusion Concat√©nation)...


  0%|          | 0/717 [00:00<?, ?it/s]

Features extraites. Shape: (91718, 1024) (1024D)
R√©duction de dimension vers 128 (PCA)...
Shape finale : (91718, 128)
Mise √† jour du DataFrame...
üîß Suppression de l'ancienne colonne: item_emb_d128
Sauvegarde dans item_info_new_fusion.parquet...
Termin√©! Le nouveau fichier d'embeddings est pr√™t pour la Task 2.


In [4]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# ==================== CONFIGURATION (MOD√àLE CTR GAGNANT) ====================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üîß Device: {DEVICE}")

# Chemins des donn√©es

ITEM_INFO_PATH = "/kaggle/working/item_info_new_fusion.parquet"
TRAIN_PATH = "/kaggle/input/data-competition/data/MicroLens_1M_x1/train.parquet"
VALID_PATH = "/kaggle/input/data-competition/data/MicroLens_1M_x1/valid.parquet"
TEST_PATH = "/kaggle/input/data-competition/data/MicroLens_1M_x1/test.parquet"
ITEM_SEQ_PATH = "/kaggle/input/data-competition/data/item_seq.parquet"

# Hyperparam√®tres 
BATCH_SIZE = 4096
EPOCHS = 15
LR = 0.002
WEIGHT_DECAY = 5e-6 # Petite r√©gularisation pour la stabilit√©
DROPOUT = 0.3
PATIENCE = 4


EMB_DIM = 64
HIDDEN_DIMS = [512, 256, 128]
ATTENTION_DIM = 256

# ==================== DATASET ====================
class CTRDataset(Dataset):
    def __init__(self, df, item_info, item_seq, max_seq_len=50):
        self.df = df.reset_index(drop=True)
        self.item_info = item_info
        self.item_seq = item_seq
        self.max_seq_len = max_seq_len
        
        # Merge item features
        self.df = pd.merge(self.df, item_info, on='item_id', how='left')
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # Features de base
        user_id = row['user_id']
        item_id = row['item_id']
        likes_level = row.get('likes_level', 0)
        views_level = row.get('views_level', 0)
        
        # Embedding multimodal (Task 1)
        item_emb = np.array(row['item_emb_d128'])
        
        # Tags de l'item (gestion robuste)
        item_tags_val = row.get('item_tags', [])
        if isinstance(item_tags_val, np.ndarray):
            item_tags_val = item_tags_val.tolist()
        if not isinstance(item_tags_val, list):
            item_tags_val = []
            
        # Filtrer les valeurs non valides et tronquer/padder
        item_tags = [int(x) for x in item_tags_val if x is not None and x > 0]
        item_tags = item_tags[:5] + [0] * (5 - len(item_tags))
        
        # S√©quence historique
        hist_seq = self.item_seq.get(user_id, [])
        
        # Conversion en liste si c'est un numpy array
        if isinstance(hist_seq, np.ndarray):
            hist_seq = hist_seq.tolist()
        elif not isinstance(hist_seq, list):
            hist_seq = []
        
        if len(hist_seq) > self.max_seq_len:
            hist_seq = hist_seq[-self.max_seq_len:]
        
        hist_len = len(hist_seq)
        hist_seq = hist_seq + [0] * (self.max_seq_len - len(hist_seq))
        
        # Label
        label = row.get('label', 0)
        
        return {
            'user_id': user_id,
            'item_id': item_id,
            'item_emb': torch.FloatTensor(item_emb),
            'item_tags': torch.LongTensor(item_tags),
            'likes_level': likes_level,
            'views_level': views_level,
            'hist_seq': torch.LongTensor(hist_seq),
            'hist_len': torch.LongTensor([hist_len]),
            'label': torch.FloatTensor([label])
        }

# ==================== MOD√àLE CTR (Attention/DNN) ====================
class MultimodalAttentionCTR(nn.Module):
    def __init__(self, n_users, n_items, n_tags, emb_dim=64, 
                 mm_dim=128, hidden_dims=HIDDEN_DIMS, dropout=DROPOUT):
        super().__init__()
        
        self.user_emb = nn.Embedding(n_users + 1, emb_dim, padding_idx=0)
        self.item_emb = nn.Embedding(n_items + 1, emb_dim, padding_idx=0)
        self.tag_emb = nn.Embedding(n_tags + 1, emb_dim, padding_idx=0)
        
        self.mm_proj = nn.Sequential(
            nn.Linear(mm_dim, emb_dim),
            nn.LayerNorm(emb_dim),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5)
        )
        
        self.attention = nn.MultiheadAttention(
            embed_dim=emb_dim,
            num_heads=4,
            dropout=dropout * 0.5,
            batch_first=True
        )
        
        input_dim = emb_dim * 5 + 2 
        
        layers = []
        prev_dim = input_dim
        for hdim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hdim),
                nn.BatchNorm1d(hdim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hdim
        
        layers.append(nn.Linear(prev_dim, 1))
        self.dnn = nn.Sequential(*layers)
        
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.01)
    
    def forward(self, user_id, item_id, item_emb, item_tags, 
                likes_level, views_level, hist_seq, hist_len):
        
        user_vec = self.user_emb(user_id)
        item_vec = self.item_emb(item_id)
        mm_vec = self.mm_proj(item_emb)
        tags_vec = self.tag_emb(item_tags).mean(dim=1)
        
        hist_emb = self.item_emb(hist_seq)
        
        # Masque pour l'attention
        mask = torch.arange(hist_seq.size(1), device=hist_seq.device)[None, :] >= hist_len[:, None]
        
        # Self-attention sur l'historique
        hist_att, _ = self.attention(
            hist_emb, hist_emb, hist_emb,
            key_padding_mask=mask
        )
        
        hist_vec = hist_att.sum(dim=1) / (hist_len.unsqueeze(1) + 1e-8)
        
        likes_vec = likes_level.float().unsqueeze(1) / 10.0
        views_vec = views_level.float().unsqueeze(1) / 10.0
        
        combined = torch.cat([
            user_vec, item_vec, mm_vec, tags_vec, hist_vec,
            likes_vec, views_vec
        ], dim=1)
        
        logits = self.dnn(combined)
        return torch.sigmoid(logits)


# ==================== CHARGEMENT DES DONN√âES ====================
print("üìÇ Chargement des donn√©es...")

df_train = pd.read_parquet(TRAIN_PATH)
df_valid = pd.read_parquet(VALID_PATH)
df_test = pd.read_parquet(TEST_PATH)
df_item_info = pd.read_parquet(ITEM_INFO_PATH)

# V√©rification de la colonne d'embeddings
if 'item_emb_d128' not in df_item_info.columns:
    if 'item_emb' in df_item_info.columns:
        df_item_info['item_emb_d128'] = df_item_info['item_emb']
    else:
        raise ValueError("‚ùå Colonne 'item_emb_d128' introuvable!")

# Chargement des s√©quences
df_seq = pd.read_parquet(ITEM_SEQ_PATH)
item_seq_dict = dict(zip(df_seq['user_id'], df_seq['item_seq']))

# Statistiques
n_users = max(df_train['user_id'].max(), df_valid['user_id'].max()) + 1
n_items = df_item_info['item_id'].max() + 1
n_tags = 11740 

print(f"‚úÖ Users: {n_users:,} | Items: {n_items:,} | Train: {len(df_train):,}")

# Datasets
train_dataset = CTRDataset(df_train, df_item_info, item_seq_dict)
valid_dataset = CTRDataset(df_valid, df_item_info, item_seq_dict)

# num_workers=0 pour la stabilit√©
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# ==================== ENTRA√éNEMENT ====================
model = MultimodalAttentionCTR(
    n_users=n_users,
    n_items=n_items,
    n_tags=n_tags,
    emb_dim=EMB_DIM,
    mm_dim=128,
    hidden_dims=HIDDEN_DIMS,
    dropout=DROPOUT
).to(DEVICE)

criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True
)

print(f"\nüöÄ D√©but de l'entra√Ænement (Task 2 - VRAI FINETUNING) | Epochs: {EPOCHS} | Batch Size: {BATCH_SIZE}")
print("=" * 70)

best_auc = 0.0
patience_counter = 0

for epoch in range(EPOCHS):
    # Train
    model.train()
    train_loss = 0.0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        
        hist_len = batch['hist_len'].to(DEVICE)
        
        preds = model(
            batch['user_id'].to(DEVICE),
            batch['item_id'].to(DEVICE),
            batch['item_emb'].to(DEVICE),
            batch['item_tags'].to(DEVICE),
            batch['likes_level'].to(DEVICE),
            batch['views_level'].to(DEVICE),
            batch['hist_seq'].to(DEVICE),
            hist_len.squeeze(1)
        )
        
        loss = criterion(preds, batch['label'].to(DEVICE))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        train_loss += loss.item()
    
    avg_train_loss = train_loss / len(train_loader)
    
    # Validation
    model.eval()
    val_preds, val_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Validation"):
            hist_len = batch['hist_len'].to(DEVICE)
            
            preds = model(
                batch['user_id'].to(DEVICE),
                batch['item_id'].to(DEVICE),
                batch['item_emb'].to(DEVICE),
                batch['item_tags'].to(DEVICE),
                batch['likes_level'].to(DEVICE),
                batch['views_level'].to(DEVICE),
                batch['hist_seq'].to(DEVICE),
                hist_len.squeeze(1)
            )
            
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(batch['label'].numpy())
    
    auc = roc_auc_score(val_labels, val_preds)
    scheduler.step(auc)
    
    print(f"üìä Epoch {epoch+1} | Loss: {avg_train_loss:.4f} | AUC: {auc:.4f}")
    
    # Early stopping
    if auc > best_auc:
        best_auc = auc
        patience_counter = 0
        torch.save(model.state_dict(), '/kaggle/working/best_model_new_emb.pt')
        print(f"‚úÖ Nouveau meilleur mod√®le sauvegard√©! AUC: {best_auc:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"‚èπÔ∏è Early stopping apr√®s {epoch+1} epochs")
            break

print(f"\nüéâ Entra√Ænement termin√©! Meilleur AUC: {best_auc:.4f}")

# ==================== PR√âDICTION ====================
print("\nüîÆ G√©n√©ration des pr√©dictions...")

# Charger le mod√®le CTR sauvegard√©
model.load_state_dict(torch.load('/kaggle/working/best_model_new_emb.pt'))
model.eval()

test_dataset = CTRDataset(df_test, df_item_info, item_seq_dict)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

test_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test"):
        hist_len = batch['hist_len'].to(DEVICE)

        preds = model(
            batch['user_id'].to(DEVICE),
            batch['item_id'].to(DEVICE),
            batch['item_emb'].to(DEVICE),
            batch['item_tags'].to(DEVICE),
            batch['likes_level'].to(DEVICE),
            batch['views_level'].to(DEVICE),
            batch['hist_seq'].to(DEVICE),
            hist_len.squeeze(1)
        )
        test_preds.extend(preds.cpu().numpy().flatten())

# Sauvegarde
submission = pd.DataFrame({
    'ID': range(len(test_preds)),
    'Task1': 0,
    'Task2': 0,
    'Task1&2': test_preds
})

submission.to_csv('/kaggle/working/submission_task1_2_new_emb.csv', index=False)
print(f"‚úÖ Soumission sauvegard√©e: submission_task1_2_new_emb.csv")
print(f"üìà Meilleur AUC validation: {best_auc:.4f}")
print(submission.head())

üîß Device: cuda
üìÇ Chargement des donn√©es...
‚úÖ Users: 1,000,001 | Items: 91,718 | Train: 3,600,000

üöÄ D√©but de l'entra√Ænement (Task 2 - VRAI FINETUNING) | Epochs: 15 | Batch Size: 4096


Epoch 1/15:   0%|          | 0/879 [00:00<?, ?it/s]

Validation:   0%|          | 0/3 [00:00<?, ?it/s]

üìä Epoch 1 | Loss: 0.2055 | AUC: 0.7167
‚úÖ Nouveau meilleur mod√®le sauvegard√©! AUC: 0.7167


Epoch 2/15:   0%|          | 0/879 [00:00<?, ?it/s]

Validation:   0%|          | 0/3 [00:00<?, ?it/s]

üìä Epoch 2 | Loss: 0.0653 | AUC: 0.7752
‚úÖ Nouveau meilleur mod√®le sauvegard√©! AUC: 0.7752


Epoch 3/15:   0%|          | 0/879 [00:00<?, ?it/s]

Validation:   0%|          | 0/3 [00:00<?, ?it/s]

üìä Epoch 3 | Loss: 0.0186 | AUC: 0.7530


Epoch 4/15:   0%|          | 0/879 [00:00<?, ?it/s]

Validation:   0%|          | 0/3 [00:00<?, ?it/s]

üìä Epoch 4 | Loss: 0.0071 | AUC: 0.7271


Epoch 5/15:   0%|          | 0/879 [00:00<?, ?it/s]

Validation:   0%|          | 0/3 [00:00<?, ?it/s]

üìä Epoch 5 | Loss: 0.0040 | AUC: 0.7376


Epoch 6/15:   0%|          | 0/879 [00:00<?, ?it/s]

Validation:   0%|          | 0/3 [00:00<?, ?it/s]

üìä Epoch 6 | Loss: 0.0016 | AUC: 0.7470
‚èπÔ∏è Early stopping apr√®s 6 epochs

üéâ Entra√Ænement termin√©! Meilleur AUC: 0.7752

üîÆ G√©n√©ration des pr√©dictions...


Test:   0%|          | 0/93 [00:00<?, ?it/s]

‚úÖ Soumission sauvegard√©e: submission_task1_2_new_emb.csv
üìà Meilleur AUC validation: 0.7752
   ID  Task1  Task2   Task1&2
0   0      0      0  0.993067
1   1      0      0  0.999798
2   2      0      0  0.901648
3   3      0      0  0.005161
4   4      0      0  0.002700
