In [4]:

import os


In [5]:
import pandas as pd

In [6]:
def load_posts_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    posts = []
    for _, row in df.iterrows():
        post_id = str(row["org_index"])
        image_path = f"/kaggle/input/nckh123/IMG_DATA/IMG_DATA/{post_id}.png"

        post = {
            "id": post_id,
            "title": str(row["title"]),
            "content": str(row["content"]),
            "label": str(row["Final_label"]).lower(),
            "image": image_path,
            "explanation": str(row["Claude_reason"])  # hoặc "claude_explanation"
        }
        posts.append(post)
    return posts

In [7]:
def load_metadata_only(metadata_root):
    metadata_all = {}

    for pid in sorted(os.listdir(metadata_root)):
        path = os.path.join(metadata_root, pid)
        if not os.path.isdir(path):
            continue

        related_articles = []

        for sub_idx in sorted(os.listdir(path)):
            sub_path = os.path.join(path, sub_idx)
            if not os.path.isdir(sub_path):
                continue

            try:
                with open(os.path.join(sub_path, "title.txt"), encoding="utf-8") as f:
                    title = f.read().strip()
                with open(os.path.join(sub_path, "content.txt"), encoding="utf-8") as f:
                    content = f.read().strip()
            except:
                continue

            image_paths = [
                os.path.join(sub_path, img)
                for img in sorted(os.listdir(sub_path))
                if img.lower().endswith((".jpg", ".jpeg", ".png"))
            ]

            related_articles.append({
                "title": title,
                "content": content,
                "images": image_paths
            })

        # ✅ Chỉ thêm vào dict nếu có ít nhất 1 bài báo phụ
        if related_articles:
            metadata_all[pid] = related_articles

    return metadata_all

In [8]:
#Load data 
train_set = load_posts_from_csv('/kaggle/input/nckh123/train.csv')
dev_set = load_posts_from_csv('/kaggle/input/nckh123/val.csv')
test_set = load_posts_from_csv('/kaggle/input/nckh123/test.csv')
metadata = load_metadata_only('/kaggle/input/envide/crawl_articles')


In [14]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load model text embedding (BERT, PhoBERT,...)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "uitnlp/visobert"  # thay bằng model tiếng Việt nếu cần
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name).to(device)
text_model.eval()

def get_text_embedding(text):
    """Trả về embedding mean pooling cho một đoạn văn bản"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = text_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()  # (hidden_size,)
    
def text_emb(maindata, metadata, tokenizer, model):
    org_embeddings = {}

    for idx, row in maindata.iterrows():
        pid = str(row["id"])

        # Tập hợp text chính và các evidence
        texts = [f"{row['title']} {row['content']}".strip()]
        if pid in metadata:
            for item in metadata[pid]:
                texts.append(f"{item['title']} {item['content']}".strip())
            org_embeddings[pid] = texts
        #Tokenize & lấy embedding
        inputs = tokenizer(texts, padding=True, max_length=512, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            emb = outputs.last_hidden_state.mean(dim=1)  # embedding từng văn bản

        # Gộp mean theo org_index
        org_embeddings[pid] = emb.mean(dim=0)  # (hidden_size,)

    return org_embeddings  # dict: {org_index: embedding}


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/390M [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

In [13]:
import torch
from torchvision import models, transforms
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load ResNet-50 pretrained
model_name = "facebook/deit-base-distilled-patch16-224"
feature_extractor = DeiTFeatureExtractor.from_pretrained(model_name)
img_model = DeiTModel.from_pretrained(model_name).to(device).eval()

# 2. Transform cho ảnh
img_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # resize chuẩn
    transforms.ToTensor(),          # thành tensor
    transforms.Normalize(           # normalize theo ImageNet
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
])

def get_image_embedding(img_path):
    """Trả về embedding (768,) của ảnh từ DeiT"""
    image = Image.open(img_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = img_model(**inputs)
        # outputs.last_hidden_state shape: [1, num_patches+1, hidden_size]
        emb = outputs.last_hidden_state[:, 0, :]  # lấy CLS token (1, 768)
    return emb.squeeze(0).cpu()  # (768,)

def img_emb(maindata, metadata):
    org_img_embeddings = {}

    for idx, row in maindata.iterrows():
        pid = str(row["id"])

        # --- Ảnh chính ---
        try:
            main_emb = get_image_embedding(row['image'])
        except Exception as e:
            print(f"⚠️ Lỗi ảnh chính {row['image']}: {e}")
            main_emb = None

        # --- Ảnh evidence ---
        evidence_embs = []
        if pid in metadata:
            for item in metadata[pid]:
                img_paths = item['images']  # list path ảnh evidence
                emb_list = []
                for path in img_paths:
                    try:
                        emb_list.append(get_image_embedding(path))
                    except Exception as e:
                        print(f"⚠️ Lỗi ảnh {path}: {e}")
                if emb_list:
                    # lấy trung bình trong từng evidence group
                    evidence_embs.append(torch.stack(emb_list).mean(dim=0))

        # --- Trung bình main + evidence ---
        all_embs = []
        if main_emb is not None:
            all_embs.append(main_emb)
        if evidence_embs:
            all_embs.extend(evidence_embs)

        if all_embs:
            org_img_embeddings[pid] = torch.stack(all_embs).mean(dim=0)
        else:
            org_img_embeddings[pid] = None

    return org_img_embeddings


Some weights of DeiTModel were not initialized from the model checkpoint at facebook/deit-base-distilled-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:



# text_emb = text_emb(main, metadata, tokenizer, text_model)
# img_emb = img_emb(main, metadata)

In [15]:
train_set = pd.DataFrame(train_set)
dev_set = pd.DataFrame(dev_set)
test_set = pd.DataFrame(test_set)


In [16]:
train_text = text_emb(train_set, metadata, tokenizer, text_model)
train_img = img_emb(train_set, metadata)



In [17]:
dev_text = text_emb(dev_set, metadata, tokenizer, text_model)
dev_img = img_emb(dev_set, metadata)

test_text = text_emb(test_set, metadata, tokenizer, text_model)
test_img = img_emb(test_set, metadata)



In [18]:
import torch
from torch.utils.data import Dataset

class FusionEmbeddingDataset(Dataset):
    def __init__(self, main_df, text_emb_dict, img_emb_dict, id_col="id", label_col="label"):
        self.ids = main_df[id_col].tolist()
        self.labels = main_df[label_col].tolist()
        self.text_emb_dict = text_emb_dict
        self.img_emb_dict = img_emb_dict

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        pid = str(self.ids[idx])
        label = torch.tensor(float(self.labels[idx]), dtype=torch.float)
    
        text_val = self.text_emb_dict[pid]
        img_val  = self.img_emb_dict[pid]
    
        # Đảm bảo trả ra tensor đúng dtype
        text_emb = torch.as_tensor(text_val, dtype=torch.float)
        img_emb  = torch.as_tensor(img_val, dtype=torch.float)
    
        return text_emb, img_emb, label


In [19]:
import torch
import torch.nn as nn

class EarlyFusionModel(nn.Module):
    def __init__(self, text_dim, img_dim, hidden_size=768, dropout_rate=0.2):
        """
        text_dim: chiều embedding text (VD: 768 cho ViSOBERT, ViT5)
        img_dim: chiều embedding ảnh (VD: 2048 cho ResNet50)
        hidden_size: chiều không gian chung sau khi chiếu
        """
        super(EarlyFusionModel, self).__init__()

        # Projection để đưa cả hai về cùng kích thước hidden_size
        self.text_proj = nn.Linear(text_dim, hidden_size)
        self.img_proj = nn.Linear(img_dim, hidden_size)

        # Shared Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=8,
            dim_feedforward=2048, dropout=dropout_rate,
            batch_first=False
        )
        self.shared_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 1)
        )

    def forward(self, text_emb, img_emb):
        """
        text_emb: [B, text_dim]
        img_emb: [B, img_dim]
        """
        # 1. Projection
        t_proj = self.text_proj(text_emb).unsqueeze(1)  # [B, 1, H]
        i_proj = self.img_proj(img_emb).unsqueeze(1)    # [B, 1, H]

        # 2. Ghép lại sequence: [B, 2, H] -> [2, B, H]
        fusion = torch.cat([i_proj, t_proj], dim=1).transpose(0, 1)

        # 3. Transformer encoder
        encoded = self.shared_encoder(fusion)  # [2, B, H]

        # 4. Lấy token ảnh (vị trí 0) làm đại diện
        cls_token = encoded[0]  # [B, H]

        # 5. Classifier
        output = self.classifier(cls_token)  # [B, 1]
        return output


In [20]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

# Tạo dataset
train = FusionEmbeddingDataset(train_set, train_text, train_img)
val = FusionEmbeddingDataset(dev_set, dev_text, dev_img)
test = FusionEmbeddingDataset(test_set, test_text, test_img)

# DataLoader
train_loader = DataLoader(train, batch_size=32, shuffle=True)
val_loader   = DataLoader(val, batch_size=32, shuffle=False)
test_loader  = DataLoader(test, batch_size=32, shuffle=False)


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EarlyFusionModel(text_dim=768, img_dim=768, hidden_size=768).to(device)
criterion = nn.BCEWithLogitsLoss()   # nhị phân
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)




In [22]:
for epoch in range(20):  # số epoch ví dụ
    # --- Training ---
    model.train()
    total_loss = 0
    for text_emb, img_emb, label in train_loader:
        text_emb, img_emb, label = text_emb.to(device), img_emb.to(device), label.to(device)

        optimizer.zero_grad()
        outputs = model(text_emb, img_emb).squeeze(1)  # [B]
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)

    # --- Validation ---
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for text_emb, img_emb, label in val_loader:
            text_emb, img_emb, label = text_emb.to(device), img_emb.to(device), label.to(device)
            outputs = model(text_emb, img_emb).squeeze(1)

            loss = criterion(outputs, label)
            val_loss += loss.item()

            # nếu là phân loại nhị phân (sigmoid đầu ra)
            preds = (outputs > 0.5).long()
            correct += (preds == label).sum().item()
            total += label.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")


Epoch 1 - Train Loss: 0.4695 | Val Loss: 0.3609 | Val Acc: 0.8160
Epoch 2 - Train Loss: 0.3932 | Val Loss: 0.3705 | Val Acc: 0.8400
Epoch 3 - Train Loss: 0.3620 | Val Loss: 0.3783 | Val Acc: 0.8420
Epoch 4 - Train Loss: 0.3144 | Val Loss: 0.3995 | Val Acc: 0.8500
Epoch 5 - Train Loss: 0.2792 | Val Loss: 0.3798 | Val Acc: 0.8460
Epoch 6 - Train Loss: 0.2478 | Val Loss: 0.5144 | Val Acc: 0.8280
Epoch 7 - Train Loss: 0.2133 | Val Loss: 0.4937 | Val Acc: 0.8400
Epoch 8 - Train Loss: 0.2290 | Val Loss: 0.4832 | Val Acc: 0.8520
Epoch 9 - Train Loss: 0.1860 | Val Loss: 0.5232 | Val Acc: 0.8440
Epoch 10 - Train Loss: 0.1807 | Val Loss: 0.5704 | Val Acc: 0.8420
Epoch 11 - Train Loss: 0.1584 | Val Loss: 0.4542 | Val Acc: 0.8260
Epoch 12 - Train Loss: 0.1563 | Val Loss: 0.4593 | Val Acc: 0.8400
Epoch 13 - Train Loss: 0.1545 | Val Loss: 0.6011 | Val Acc: 0.8480
Epoch 14 - Train Loss: 0.1375 | Val Loss: 0.5912 | Val Acc: 0.8440
Epoch 15 - Train Loss: 0.1353 | Val Loss: 0.5605 | Val Acc: 0.8400
Epoc

In [23]:
# for epoch in range(5):  # số epoch ví dụ
#     model.train()
#     total_loss = 0
#     for text_emb, img_emb, label in train_loader:
#         text_emb, img_emb, label = text_emb.to(device), img_emb.to(device), label.to(device)

#         optimizer.zero_grad()
#         outputs = model(text_emb, img_emb).squeeze(1)  # [B]
#         loss = criterion(outputs, label)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#     print(f"Epoch {epoch+1} - Train Loss: {total_loss/len(train_loader):.4f}")


In [24]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for text_emb, img_emb, label in val_loader:
        text_emb, img_emb, label = text_emb.to(device), img_emb.to(device), label.to(device)
        outputs = model(text_emb, img_emb).squeeze(1)
        preds = torch.sigmoid(outputs) > 0.5
        correct += (preds == label.int()).sum().item()
        total += label.size(0)

print(f"Validation Accuracy: {correct/total:.4f}")


Validation Accuracy: 0.8480


In [25]:
import random

# Lấy random index trong test set
idx = random.randint(0, len(test) - 1)

# Lấy sample từ dataset
text_emb, img_emb, label = test[idx]
pid = test.ids[idx]   # nếu FusionEmbeddingDataset có self.ids

print(f"Sample ID: {pid}")
print(f"Label: {label.item()}")
print(f"Text embedding shape: {text_emb.shape}")
print(f"Image embedding shape: {img_emb.shape}")


Sample ID: 4391
Label: 1.0
Text embedding shape: torch.Size([768])
Image embedding shape: torch.Size([768])


In [26]:
text_emb

tensor([ 2.9355e-01,  1.3075e-01, -1.9709e-02,  2.6250e-02,  1.0093e-03,
        -6.2689e-01,  2.4840e-01,  3.4099e-01, -1.4325e-01, -5.8854e-01,
        -1.9034e-01, -1.0828e-01, -1.1120e-01, -1.3024e+00,  3.5936e-01,
         3.7063e-01, -1.6561e-01, -4.7806e-01, -7.3034e-01, -6.7138e-01,
         5.5841e-02,  4.1499e-01, -5.6283e-01, -1.4150e-01,  2.7962e-01,
         2.3754e-01,  2.1334e-01,  9.0556e-02, -1.6093e-01,  7.3268e-01,
        -6.0794e-01,  2.7506e-01,  2.9857e-01,  6.1257e-01,  4.2190e-01,
         7.0229e-01, -5.9661e-01,  5.0705e-01, -7.5005e-02, -2.3987e-02,
         5.3443e-01, -9.8235e-01, -8.6952e-01,  2.1556e-01, -8.5297e-01,
         2.4150e-01, -2.3121e-01,  1.8493e-01,  6.7805e-01,  2.2387e-01,
        -3.5422e-01,  4.2971e-01, -2.6120e-01, -2.6230e-03, -7.5583e-01,
        -5.9578e-01, -8.9433e-02, -8.2594e-01,  2.4971e-01, -1.2534e+00,
        -4.0194e-03,  6.4654e-01,  6.1894e-02, -1.7787e-01, -1.2918e+00,
        -4.6381e-01, -4.8205e-01, -3.1871e-01,  6.6