In [1]:
# ==================================================================================
# MM-CTR TASK 1&2: EMBEDDING GENERATION + CTR TRAINING + SUBMISSION
# ==================================================================================

import os
import sys
import gc
import zipfile
import subprocess
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from google.colab import drive


In [2]:
print("üì¶ Installing Dependencies...")


def _pip_install(*packages):
    """Install ``packages`` with pip into the interpreter running this kernel."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])


# CLIP is not on PyPI under this name; it is installed from the GitHub
# repository together with the helper packages listed alongside it.
try:
    import clip
except ImportError:
    _pip_install("ftfy", "regex", "tqdm", "git+https://github.com/openai/CLIP.git")
    import clip

# polars is only installed when it is not importable yet.
try:
    import polars as pl
except ImportError:
    _pip_install("polars")
    import polars as pl

# Mount Google Drive unless the mount point already exists.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')


üì¶ Installing Dependencies...
Mounted at /content/drive


In [3]:
class Config:
    """Single place for every path and hyper-parameter used by the notebook."""

    # --- Inputs (all located under BASE_PATH) ---
    BASE_PATH = "/content/drive/MyDrive/compet/MicroLens_1M_MMCTR"  # UPDATE if needed
    DATA_DIR = os.path.join(BASE_PATH, "MicroLens_1M_x1")
    FEATURE_PATH = os.path.join(BASE_PATH, "item_feature.parquet")
    RAR_PATH = os.path.join(BASE_PATH, "item_images_2.rar")

    # --- Artefacts written by the notebook ---
    GENERATED_EMB_PATH = os.path.join(BASE_PATH, "item_emb_task1_clip.parquet")
    MODEL_SAVE_DIR = os.path.join(BASE_PATH, "models_task1and2")
    PRED_SAVE_DIR = os.path.join(BASE_PATH, "predictions_task1and2")

    # Local directory the image archive is unpacked into.
    IMG_EXTRACT_PATH = "/content/item_images"

    # --- Batch sizes ---
    BATCH_SIZE_CLIP = 128
    BATCH_SIZE_TRAIN = 2048

    # --- Model and optimisation settings ---
    EMBED_DIM = 128
    SIDE_EMBED_DIM = 16
    LR = 5e-4
    EPOCHS = 30

    # Use the GPU whenever torch reports one.
    if torch.cuda.is_available():
        DEVICE = "cuda"
    else:
        DEVICE = "cpu"

config = Config()

# Both output directories must exist before training or prediction writes to them.
for _out_dir in (config.MODEL_SAVE_DIR, config.PRED_SAVE_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print("Device:", config.DEVICE)


Device: cuda


In [4]:
def setup_images():
    """Unpack the item image archive into ``config.IMG_EXTRACT_PATH`` if needed.

    The archive at ``config.RAR_PATH`` is extracted with ``unrar``. Extraction
    is skipped only when the target directory already exists *and* contains
    something; an empty directory is treated as the leftover of a failed
    earlier attempt and extraction is retried.

    Raises:
        RuntimeError: if ``unrar`` cannot be started, or if the extraction
            leaves the target directory empty.
    """
    target_dir = config.IMG_EXTRACT_PATH

    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print("‚úÖ Images already ready.")
        return

    print(f"üìÇ Extracting images from {config.RAR_PATH}...")

    # Installing unrar is best effort: it may already be present, and a real
    # failure is caught below when the extraction itself is attempted.
    subprocess.run(["apt-get", "update", "-y"], check=False)
    subprocess.run(["apt-get", "install", "-y", "unrar"], check=False)

    os.makedirs(target_dir, exist_ok=True)

    # Argument list instead of a shell string: paths containing quotes or
    # spaces cannot break (or inject into) the command line.
    try:
        result = subprocess.run(
            ["unrar", "x", "-inul", config.RAR_PATH, target_dir + "/"],
            check=False,
        )
    except FileNotFoundError as exc:
        # Remove the empty directory so the next run does not mistake it for
        # a completed extraction.
        if not os.listdir(target_dir):
            os.rmdir(target_dir)
        raise RuntimeError("unrar is not available; cannot extract item images") from exc

    if not os.listdir(target_dir):
        os.rmdir(target_dir)
        raise RuntimeError(
            f"Extracting {config.RAR_PATH} produced no files "
            f"(unrar exit code {result.returncode})"
        )

    if result.returncode != 0:
        # Files were extracted, but unrar reported a problem; surface it
        # rather than claiming a clean success.
        print(f"WARNING: unrar exited with code {result.returncode}; some images may be missing.")

    print("‚úÖ Images extracted.")


In [5]:
class CLIPDataset(Dataset):
    """Dataset yielding ``(item_id, token_tensor, image_tensor)`` for CLIP encoding.

    Args:
        df: frame providing an ``item_id`` and an ``item_title`` column.
            Missing titles are replaced by the empty string.
        img_dir: directory holding ``<item_id>.jpg`` files. If it contains an
            ``item_images`` sub-directory, that sub-directory is used instead.
        preprocess: callable applied to an RGB PIL image to produce the image
            tensor (presumably the transform returned by ``clip.load``).
    """

    def __init__(self, df, img_dir, preprocess):
        self.item_ids = df['item_id'].values
        self.titles = df['item_title'].fillna("").astype(str).values
        self.img_dir = img_dir
        # The archive may unpack into a nested ``item_images`` folder.
        nested_dir = os.path.join(img_dir, 'item_images')
        if os.path.exists(nested_dir):
            self.img_dir = nested_dir
        self.preprocess = preprocess

    def __len__(self):
        """Number of items in the dataset."""
        return len(self.item_ids)

    def __getitem__(self, idx):
        """Return ``(item_id, token_tensor, image_tensor)`` for row ``idx``.

        A missing or unreadable image yields an all-zero ``(3, 224, 224)`` tensor.
        """
        item_id = self.item_ids[idx]

        # Pass the full title: ``truncate=True`` already limits the result to
        # the tokenizer's context length in *tokens*. Slicing the string to
        # 77 characters beforehand only threw away usable text.
        text = str(self.titles[idx])
        text_tensor = clip.tokenize([text], truncate=True).squeeze(0)

        img_tensor = None
        img_path = os.path.join(self.img_dir, f"{item_id}.jpg")
        if os.path.exists(img_path):
            try:
                # Context manager so the file handle is released promptly.
                with Image.open(img_path) as raw_image:
                    img_tensor = self.preprocess(raw_image.convert("RGB"))
            except Exception:
                # Deliberate best effort: a corrupt image is handled exactly
                # like a missing one instead of aborting the whole run.
                img_tensor = None

        if img_tensor is None:
            img_tensor = torch.zeros(3, 224, 224)

        return item_id, text_tensor, img_tensor


In [6]:
def generate_embeddings():
    """Build the item embedding table and cache it at ``config.GENERATED_EMB_PATH``.

    Steps:
        1. Return immediately if the cache file already exists.
        2. Extract the images and encode each item's title and image with CLIP.
        3. L2-normalise both feature sets and average them.
        4. Reduce the result to ``config.EMBED_DIM`` dimensions with PCA.
        5. Save ``item_id`` / ``item_emb_d128`` as parquet.

    Raises:
        ValueError: if the item feature table is empty.
    """
    print("\n--- STEP 1: GENERATING EMBEDDINGS ---")

    if os.path.exists(config.GENERATED_EMB_PATH):
        print(f"‚úÖ Found existing embeddings at {config.GENERATED_EMB_PATH}. Skipping generation.")
        return

    setup_images()

    print("üß† Loading CLIP...")
    model, preprocess = clip.load("ViT-B/32", device=config.DEVICE)
    model.eval()

    print("üìÑ Reading item features...")
    df = pd.read_parquet(config.FEATURE_PATH)
    if len(df) == 0:
        # Fail with a clear message instead of an obscure np.vstack error later.
        raise ValueError(f"No items found in {config.FEATURE_PATH}")

    ds = CLIPDataset(df, config.IMG_EXTRACT_PATH, preprocess)
    dl = DataLoader(ds, batch_size=config.BATCH_SIZE_CLIP, shuffle=False, num_workers=2)

    all_emb = []
    all_ids = []

    print("‚ö° Extracting CLIP embeddings...")
    with torch.no_grad():
        for ids, text, imgs in tqdm(dl):
            # Cast to float32 before normalising: CLIP may return
            # half-precision features on GPU, where the 1e-12 epsilon would
            # underflow to zero and the division would lose precision.
            txt_feat = model.encode_text(text.to(config.DEVICE)).float()
            img_feat = model.encode_image(imgs.to(config.DEVICE)).float()

            txt_feat = txt_feat / (txt_feat.norm(dim=-1, keepdim=True) + 1e-12)
            img_feat = img_feat / (img_feat.norm(dim=-1, keepdim=True) + 1e-12)

            # Equal-weight fusion of the two unit-length modalities.
            combined = (txt_feat + img_feat) / 2.0
            all_emb.append(combined.cpu().numpy())
            all_ids.extend(ids.numpy().tolist())

    raw_matrix = np.vstack(all_emb)

    # Drop every reference to GPU tensors before releasing cached GPU memory.
    # The loop ran at least once (the table is non-empty), so the names exist.
    del model, txt_feat, img_feat, combined
    torch.cuda.empty_cache()

    # The PCA width follows config.EMBED_DIM, the same value the training
    # loader uses to size its padding row, so the two cannot drift apart.
    print(f"üìâ Reducing Dimensions ({raw_matrix.shape} -> {config.EMBED_DIM})...")
    pca = PCA(n_components=config.EMBED_DIM, random_state=42)
    reduced_matrix = pca.fit_transform(raw_matrix)
    print(f"   Explained Variance: {float(np.sum(pca.explained_variance_ratio_)):.2f}")

    # The column name is kept as-is because the loader reads 'item_emb_d128'.
    df_out = pd.DataFrame({
        'item_id': all_ids,
        'item_emb_d128': list(reduced_matrix.astype(np.float32))
    })

    # Write to a temporary file and rename: the existence check at the top
    # would otherwise accept a half-written file left by an interrupted run.
    tmp_path = config.GENERATED_EMB_PATH + ".tmp"
    df_out.to_parquet(tmp_path, index=False)
    os.replace(tmp_path, config.GENERATED_EMB_PATH)
    print(f"üíæ Saved embeddings: {config.GENERATED_EMB_PATH}")

    del raw_matrix, reduced_matrix, all_emb, df, df_out
    gc.collect()

generate_embeddings()



--- STEP 1: GENERATING EMBEDDINGS ---
‚úÖ Found existing embeddings at /content/drive/MyDrive/compet/MicroLens_1M_MMCTR/item_emb_task1_clip.parquet. Skipping generation.


In [7]:
def load_assets_task1and2():
    """Load the cached item embeddings and prepare them for the CTR model.

    Returns:
        tuple: ``(weights, id_to_idx)`` where ``weights`` is a float32 tensor
        whose row 0 is an all-zero padding vector followed by one row per
        item, and ``id_to_idx`` maps each raw ``item_id`` to its row index
        (starting at 1).
    """
    print("\n--- STEP 2: PREPARING TRAINING ASSETS ---")
    print(f"üõ†Ô∏è  Loading embeddings from: {config.GENERATED_EMB_PATH}")

    emb_frame = pl.read_parquet(config.GENERATED_EMB_PATH)

    # Index 0 is reserved for padding, so real items are numbered from 1.
    raw_ids = emb_frame['item_id'].to_list()
    index_of = dict(zip(raw_ids, range(1, len(raw_ids) + 1)))

    item_vectors = np.array(emb_frame['item_emb_d128'].to_list(), dtype=np.float32)
    pad_row = np.zeros((1, config.EMBED_DIM), dtype=np.float32)
    matrix = np.vstack([pad_row, item_vectors])

    print(f"‚úÖ Matrix shape: {matrix.shape} (padding included)")

    # Release the intermediate copies before returning.
    del emb_frame, item_vectors
    gc.collect()

    return torch.tensor(matrix), index_of

PRETRAINED_WEIGHTS, ID_MAP = load_assets_task1and2()



--- STEP 2: PREPARING TRAINING ASSETS ---
üõ†Ô∏è  Loading embeddings from: /content/drive/MyDrive/compet/MicroLens_1M_MMCTR/item_emb_task1_clip.parquet
‚úÖ Matrix shape: (91718, 128) (padding included)


In [8]:
class RichDataset(Dataset):
    """Interaction rows of one parquet split, held in memory as numpy arrays.

    Each sample is ``(history, target, likes, views, label)``. Item ids are
    translated through ``id_map``; ids missing from the map become 0.

    Args:
        parquet_path: parquet file with ``item_id``, ``item_seq``,
            ``likes_level`` and ``views_level`` columns, plus ``label``
            (train/valid) or ``ID`` (test).
        id_map: dict from raw item id to embedding row index.
        is_test: when true, labels are filled with zeros and the ``ID``
            column is kept in ``self.ids``.
    """

    def __init__(self, parquet_path, id_map, is_test=False):
        frame = pl.read_parquet(parquet_path)

        def to_index(raw_ids):
            # Ids absent from the map fall back to index 0.
            return np.array(
                list(map(lambda raw: id_map.get(raw, 0), raw_ids)),
                dtype=np.int32,
            )

        self.target = to_index(frame['item_id'].to_numpy())

        # Translate the history in one flat pass, then restore its 2-D shape.
        seq_matrix = np.stack(frame['item_seq'].to_numpy())
        self.history = to_index(seq_matrix.ravel()).reshape(seq_matrix.shape)

        self.likes = frame['likes_level'].to_numpy().astype(np.int32)
        self.views = frame['views_level'].to_numpy().astype(np.int32)

        if is_test:
            # No ground truth for the test split: dummy labels, real row IDs.
            self.label = np.zeros(len(frame), dtype=np.float32)
            self.ids = frame['ID'].to_numpy().astype(np.int32)
        else:
            self.label = frame['label'].to_numpy().astype(np.float32)
            self.ids = None

        del frame, seq_matrix
        gc.collect()

    def __len__(self):
        """Number of samples (one per label entry)."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``(history, target, likes, views, label)`` for row ``idx``."""
        sample = (
            self.history[idx],
            self.target[idx],
            self.likes[idx],
            self.views[idx],
            self.label[idx],
        )
        return sample


In [9]:
class Dice(nn.Module):
    """Dice activation: a learned, data-dependent blend of ``x`` and ``alpha * x``.

    A sigmoid over the batch-normalised input produces a gate ``p``; the
    output is ``p * x + (1 - p) * alpha * x`` with one learnable ``alpha``
    per feature (initialised to zero).

    Args:
        num_features: size of the feature dimension being activated.
    """

    def __init__(self, num_features):
        super().__init__()
        self.bn = nn.BatchNorm1d(num_features, eps=1e-9)
        self.sig = nn.Sigmoid()
        self.alpha = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        """Apply the gated activation to ``x``."""
        gate = self.sig(self.bn(x))
        passed = gate * x
        leaked = (1 - gate) * self.alpha * x
        return passed + leaked


class DIN_Task1and2(nn.Module):
    """Attention-based CTR model over a user's item history and a target item.

    Item embeddings are initialised from ``weights`` (index 0 is the padding
    row) and remain trainable. The ``likes`` and ``views`` levels get small
    embeddings of size ``config.SIDE_EMBED_DIM``.

    Args:
        weights: ``(num_items, dim)`` tensor of initial item embeddings.
    """

    def __init__(self, weights):
        super().__init__()
        num_items, dim = weights.shape
        side_dim = config.SIDE_EMBED_DIM

        # Item table: copied from the given weights, fine-tuned during training.
        self.item_emb = nn.Embedding(num_items, dim, padding_idx=0)
        self.item_emb.weight.data.copy_(weights)
        self.item_emb.weight.requires_grad = True

        # Side features; the tables hold 20 distinct level values each.
        self.likes_emb = nn.Embedding(20, side_dim)
        self.views_emb = nn.Embedding(20, side_dim)

        # Scores one (target, history item) pair from four concatenated views.
        attention_layers = [
            nn.Linear(dim * 4, 80), nn.Sigmoid(),
            nn.Linear(80, 40), nn.Sigmoid(),
            nn.Linear(40, 1),
        ]
        self.att_mlp = nn.Sequential(*attention_layers)

        # Prediction tower over [target, pooled history, likes, views].
        tower_layers = []
        fan_in = dim * 2 + side_dim * 2
        for fan_out in (512, 256):
            tower_layers += [nn.Linear(fan_in, fan_out), Dice(fan_out), nn.Dropout(0.3)]
            fan_in = fan_out
        tower_layers.append(nn.Linear(fan_in, 1))
        self.mlp = nn.Sequential(*tower_layers)

    def attention(self, target, history, mask):
        """Pool ``history`` with softmax weights conditioned on ``target``.

        Args:
            target: ``(batch, 1, dim)`` target embedding.
            history: ``(batch, seq_len, dim)`` history embeddings.
            mask: ``(batch, seq_len)``; positions equal to 0 are excluded.

        Returns:
            ``(batch, dim)`` weighted sum of the history embeddings.
        """
        steps = history.size(1)
        query = target.expand(-1, steps, -1)
        pair_feats = torch.cat(
            [query, history, query - history, query * history], dim=-1
        )
        scores = self.att_mlp(pair_feats)
        # Masked positions get a very negative score so softmax ignores them.
        scores = scores.masked_fill(mask.unsqueeze(-1).logical_not(), -1e9)
        attn = torch.softmax(scores, dim=1)
        return (attn * history).sum(dim=1)

    def forward(self, history, target, likes, views):
        """Return one logit per sample (no sigmoid applied)."""
        hist_vecs = self.item_emb(history)
        tgt_vec = self.item_emb(target)

        # Index 0 marks padding in the history.
        interest = self.attention(tgt_vec.unsqueeze(1), hist_vecs, history != 0)

        joined = torch.cat(
            [tgt_vec, interest, self.likes_emb(likes), self.views_emb(views)],
            dim=1,
        )
        return self.mlp(joined).squeeze(-1)


In [10]:
def train_model():
    """Train the CTR model and keep the checkpoint with the best validation AUC.

    Trains for up to ``config.EPOCHS`` epochs with AdamW, gradient clipping,
    light label smoothing, LR reduction on plateau and early stopping.

    Returns:
        str: path of the best checkpoint (``task1and2_best.pt`` inside
        ``config.MODEL_SAVE_DIR``).
    """
    print("\n--- STEP 3: TRAINING CTR MODEL (Task1&2) ---")

    def make_loader(file_name, batch_size, shuffle):
        # One DataLoader per split; both splits share the same id mapping.
        split_path = os.path.join(config.DATA_DIR, file_name)
        return DataLoader(
            RichDataset(split_path, ID_MAP),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=2
        )

    train_dl = make_loader('train.parquet', config.BATCH_SIZE_TRAIN, True)
    valid_dl = make_loader('valid.parquet', config.BATCH_SIZE_TRAIN * 2, False)

    model = DIN_Task1and2(PRETRAINED_WEIGHTS).to(config.DEVICE)

    # Regularisation and stability: decoupled weight decay, and halving the
    # learning rate when the validation AUC stops improving.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=config.LR, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=1
    )

    def as_indices(*tensors):
        # Move the integer features to the device as int64 embedding indices.
        return tuple(t.to(config.DEVICE).long() for t in tensors)

    def current_lr():
        return optimizer.param_groups[0]['lr']

    def validation_auc():
        # Score the whole validation split and return its ROC AUC.
        model.eval()
        preds, labels = [], []
        with torch.no_grad():
            for hist, tgt, lk, vw, lbl in valid_dl:
                hist, tgt, lk, vw = as_indices(hist, tgt, lk, vw)
                logits = model(hist, tgt, lk, vw)
                preds.extend(torch.sigmoid(logits).cpu().numpy().tolist())
                labels.extend(lbl.numpy().tolist())
        return roc_auc_score(labels, preds)

    best_auc = 0.0
    best_path = os.path.join(config.MODEL_SAVE_DIR, 'task1and2_best.pt')

    # Early-stopping state.
    patience, min_delta = 3, 1e-4
    no_improve = 0

    for epoch in range(config.EPOCHS):
        model.train()
        total_loss = 0.0

        for hist, tgt, lk, vw, lbl in tqdm(train_dl, desc=f"Epoch {epoch+1}/{config.EPOCHS}"):
            hist, tgt, lk, vw = as_indices(hist, tgt, lk, vw)
            lbl = lbl.to(config.DEVICE).float()

            # Light label smoothing: targets become 0.01 / 0.99.
            lbl = lbl * 0.98 + 0.01

            optimizer.zero_grad()
            loss = criterion(model(hist, tgt, lk, vw), lbl)
            loss.backward()

            # Clip the gradient norm for stability.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

            optimizer.step()
            total_loss += float(loss.item())

        auc = validation_auc()
        lr = current_lr()
        print(f"üìä Epoch {epoch+1}: Loss={total_loss/len(train_dl):.4f} | Val AUC={auc:.4f} | LR={lr:.1e}")

        scheduler.step(auc)
        new_lr = current_lr()
        if new_lr != lr:
            print(f"üîª LR reduced to {new_lr:.1e}")

        # Save on a real improvement; otherwise count towards early stopping.
        if auc > best_auc + min_delta:
            best_auc = auc
            torch.save(model.state_dict(), best_path)
            print(f"üèÜ New best model saved: AUC={auc:.4f}")
            no_improve = 0
            continue

        no_improve += 1
        if no_improve >= patience:
            print(f"‚õî Early stopping: no improvement for {patience} epochs.")
            break

    print(f"\n‚úÖ Best Val AUC: {best_auc:.4f}")
    return best_path

best_model_path = train_model()




--- STEP 3: TRAINING CTR MODEL (Task1&2) ---


Epoch 1/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:11<00:00, 24.54it/s]


üìä Epoch 1: Loss=0.2158 | Val AUC=0.8756 | LR=5.0e-04
üèÜ New best model saved: AUC=0.8756


Epoch 2/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.39it/s]


üìä Epoch 2: Loss=0.0980 | Val AUC=0.9280 | LR=5.0e-04
üèÜ New best model saved: AUC=0.9280


Epoch 3/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.45it/s]


üìä Epoch 3: Loss=0.0833 | Val AUC=0.9384 | LR=5.0e-04
üèÜ New best model saved: AUC=0.9384


Epoch 4/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.37it/s]


üìä Epoch 4: Loss=0.0761 | Val AUC=0.9461 | LR=5.0e-04
üèÜ New best model saved: AUC=0.9461


Epoch 5/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.35it/s]


üìä Epoch 5: Loss=0.0689 | Val AUC=0.9494 | LR=5.0e-04
üèÜ New best model saved: AUC=0.9494


Epoch 6/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.41it/s]


üìä Epoch 6: Loss=0.0636 | Val AUC=0.9481 | LR=5.0e-04


Epoch 7/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.19it/s]


üìä Epoch 7: Loss=0.0607 | Val AUC=0.9491 | LR=5.0e-04
üîª LR reduced to 2.5e-04


Epoch 8/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.23it/s]


üìä Epoch 8: Loss=0.0585 | Val AUC=0.9526 | LR=2.5e-04
üèÜ New best model saved: AUC=0.9526


Epoch 9/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:10<00:00, 24.95it/s]


üìä Epoch 9: Loss=0.0576 | Val AUC=0.9536 | LR=2.5e-04
üèÜ New best model saved: AUC=0.9536


Epoch 10/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.27it/s]


üìä Epoch 10: Loss=0.0573 | Val AUC=0.9542 | LR=2.5e-04
üèÜ New best model saved: AUC=0.9542


Epoch 11/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.18it/s]


üìä Epoch 11: Loss=0.0572 | Val AUC=0.9554 | LR=2.5e-04
üèÜ New best model saved: AUC=0.9554


Epoch 12/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.31it/s]


üìä Epoch 12: Loss=0.0571 | Val AUC=0.9546 | LR=2.5e-04


Epoch 13/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.32it/s]


üìä Epoch 13: Loss=0.0570 | Val AUC=0.9540 | LR=2.5e-04
üîª LR reduced to 1.3e-04


Epoch 14/30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1758/1758 [01:09<00:00, 25.13it/s]


üìä Epoch 14: Loss=0.0568 | Val AUC=0.9550 | LR=1.3e-04
‚õî Early stopping: no improvement for 3 epochs.

‚úÖ Best Val AUC: 0.9554


In [11]:
def predict_and_submit(best_path):
    """Score the test split with the saved model and package the submission.

    Args:
        best_path: path of the checkpoint (a state dict) to load.

    Returns:
        str: path of the written ``prediction.zip``, which contains
        ``prediction.csv`` with the columns ``ID``, ``Task1``, ``Task2``
        and ``Task1&2``.
    """
    print("\n--- STEP 4: PREDICT + SUBMISSION (Task1&2) ---")

    test_ds = RichDataset(
        os.path.join(config.DATA_DIR, 'test.parquet'), ID_MAP, is_test=True
    )
    test_dl = DataLoader(
        test_ds,
        batch_size=config.BATCH_SIZE_TRAIN * 2,
        shuffle=False,
        num_workers=2
    )

    # Rebuild the architecture, then overwrite its weights with the checkpoint.
    model = DIN_Task1and2(PRETRAINED_WEIGHTS).to(config.DEVICE)
    state = torch.load(best_path, map_location=config.DEVICE)
    model.load_state_dict(state)
    model.eval()

    all_preds = []
    with torch.no_grad():
        # The last element of each batch is the dummy test label; ignore it.
        for *features, _ in tqdm(test_dl):
            hist, tgt, lk, vw = (f.to(config.DEVICE).long() for f in features)
            probs = torch.sigmoid(model(hist, tgt, lk, vw))
            all_preds.extend(probs.cpu().numpy().tolist())

    # Predictions go into the 'Task1&2' column; the other task columns stay 0.
    submission = pd.DataFrame({
        'ID': test_ds.ids,
        'Task1': 0,
        'Task2': 0,
        'Task1&2': all_preds
    })

    out_dir = config.PRED_SAVE_DIR
    csv_path = os.path.join(out_dir, 'prediction.csv')
    submission.to_csv(csv_path, index=False)

    zip_path = os.path.join(out_dir, 'prediction.zip')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.write(csv_path, 'prediction.csv')

    print(f"‚úÖ Submission ready: {zip_path}")
    return zip_path

zip_path = predict_and_submit(best_model_path)
zip_path



--- STEP 4: PREDICT + SUBMISSION (Task1&2) ---


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 93/93 [00:02<00:00, 31.20it/s]


‚úÖ Submission ready: /content/drive/MyDrive/compet/MicroLens_1M_MMCTR/predictions_task1and2/prediction.zip


'/content/drive/MyDrive/compet/MicroLens_1M_MMCTR/predictions_task1and2/prediction.zip'