In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# ==========================================
# 1. CHUẨN BỊ DỮ LIỆU (DATASET)
# ==========================================
class DisasterTweetDataset(Dataset):
    """Labelled tweets that are tokenized lazily, one sample per access.

    Args:
        texts: Indexable collection of raw tweet texts.
        targets: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable Hugging Face tokenizer (the training script
            passes a ``BertTokenizer``).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized sample stored at position ``item``.

        Returns:
            dict with the raw ``text``, the flattened ``input_ids`` and
            ``attention_mask`` tensors, and ``targets`` as a float tensor.
        """
        # str() guards against non-string entries in the text column.
        text = str(self.texts[item])
        target = self.targets[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # favour of ``__call__``, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,    # add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',       # pad short sentences up to max_len
            truncation=True,            # cut long sentences down to max_len
            return_attention_mask=True,
            return_tensors='pt',        # return PyTorch tensors
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis of 1; flatten it so
            # the DataLoader can stack samples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float dtype so the label can be fed to a BCE-style loss.
            'targets': torch.tensor(target, dtype=torch.float)
        }

# ==========================================
# 2. XÂY DỰNG MÔ HÌNH BERT (CLASSIFIER)
# ==========================================
class BertClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear + sigmoid head.

    Args:
        n_classes: Number of output units of the linear head.
    """

    def __init__(self, n_classes):
        super().__init__()
        # The encoder is configured to always produce attention maps so that
        # ``forward`` can hand them back on request.
        self.bert = BertModel.from_pretrained(
            'bert-base-uncased',
            return_dict=True,
            output_attentions=True,
        )
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, return_attentions=False):
        """Return sigmoid outputs, optionally with the encoder attentions.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder.
            return_attentions: When true, return ``(outputs, attentions)``
                instead of only the outputs.

        Returns:
            The sigmoid of the linear head applied to the dropped-out pooled
            encoder output; values are probabilities, not raw logits.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        head_input = self.drop(encoded.pooler_output)
        probabilities = self.sigmoid(self.out(head_input))

        if not return_attentions:
            return probabilities
        return probabilities, encoded.attentions


# ==========================================
# 3. QUÁ TRÌNH TRAIN & VALIDATE
# ==========================================
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one probability per sample.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``targets`` tensors.
        loss_fn: Loss applied to ``(probabilities, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Number of samples, used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a 0-d float64
        tensor; ``mean_loss`` is the average of the per-batch losses, or
        ``nan`` when the loader yields no batches.
    """
    model = model.train()
    losses = []
    # Start from a tensor (not the int 0) so ``.double()`` below also works
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Align the outputs with the targets explicitly. A bare ``squeeze()``
        # would also drop the batch axis of a size-1 batch, and the loss
        # would then reject the mismatching shapes.
        probs = outputs.reshape(targets.shape)

        preds = (probs > 0.5).float()
        loss = loss_fn(probs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the optimizer update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = sum(losses) / len(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one probability per sample.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``targets`` tensors.
        loss_fn: Loss applied to ``(probabilities, targets)``.
        device: Device the batch tensors are moved to.
        n_examples: Number of samples, used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a 0-d float64
        tensor; ``mean_loss`` is the average of the per-batch losses, or
        ``nan`` when the loader yields no batches.
    """
    model = model.eval()
    losses = []
    # Start from a tensor (not the int 0) so ``.double()`` below also works
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Align the outputs with the targets explicitly. A bare
            # ``squeeze()`` would also drop the batch axis of a size-1 batch,
            # and the loss would then reject the mismatching shapes.
            probs = outputs.reshape(targets.shape)

            preds = (probs > 0.5).float()
            loss = loss_fn(probs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = sum(losses) / len(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

# ==========================================
# 4. CHẠY CHƯƠNG TRÌNH (MAIN)
# ==========================================
# Training entry point for the BERT classifier. The names bound here
# (tokenizer, MAX_LEN, BATCH_SIZE, device, attentionModel) are reused by the
# test-prediction code further down in this file.
if __name__ == "__main__":
    # Load the data (assumes the file is named 'train.csv')
    df = pd.read_csv("train.csv") # Replace with the path to your file
    # Config
    MAX_LEN = 60
    BATCH_SIZE = 32
    EPOCHS = 3
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Split into train/validation sets (80/20, fixed seed)
    df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

    # Only the training loader shuffles; validation keeps the split order.
    train_data_loader = DataLoader(
        DisasterTweetDataset(df_train.text.to_numpy(), df_train.target.to_numpy(), tokenizer, MAX_LEN),
        batch_size=BATCH_SIZE,
        shuffle=True
    )
    val_data_loader = DataLoader(
        DisasterTweetDataset(df_val.text.to_numpy(), df_val.target.to_numpy(), tokenizer, MAX_LEN),
        batch_size=BATCH_SIZE
    )

    # Initialize the model (a single output unit for binary classification)
    attentionModel = BertClassifier(n_classes=1)
    attentionModel = attentionModel.to(device)

    optimizer = AdamW(attentionModel.parameters(), lr=1e-5)
    # train_epoch steps the scheduler once per batch, so the schedule spans
    # (batches per epoch) * EPOCHS steps.
    total_steps = len(train_data_loader) * EPOCHS

    # The scheduler decays the learning rate gradually, which makes training more stable
    from transformers import get_linear_schedule_with_warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    # BCELoss expects probabilities; BertClassifier.forward already applies a sigmoid.
    loss_fn = nn.BCELoss().to(device)

    # Training loop
    print("Bắt đầu training...")
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(
            attentionModel, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
        )
        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            attentionModel, val_data_loader, loss_fn, device, len(df_val)
        )
        print(f'Val   loss {val_loss} accuracy {val_acc}')
        print()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Bắt đầu training...
Epoch 1/3
----------


100%|██████████| 191/191 [00:57<00:00,  3.32it/s]


Train loss 0.49215746534432414 accuracy 0.780623973727422


100%|██████████| 48/48 [00:05<00:00,  9.26it/s]


Val   loss 0.42464531989147264 accuracy 0.830597504924491

Epoch 2/3
----------


100%|██████████| 191/191 [01:00<00:00,  3.15it/s]


Train loss 0.3760706630988895 accuracy 0.8525451559934318


100%|██████████| 48/48 [00:05<00:00,  8.79it/s]


Val   loss 0.42116114652405184 accuracy 0.829940906106369

Epoch 3/3
----------


100%|██████████| 191/191 [00:59<00:00,  3.19it/s]


Train loss 0.3310647333011577 accuracy 0.8771756978653531


100%|██████████| 48/48 [00:05<00:00,  9.06it/s]

Val   loss 0.4259406852846344 accuracy 0.8273145108338804






In [None]:
# ==========================================
# 5. DỰ ĐOÁN TEST & TẠO submission.csv
# ==========================================
class DisasterTestDataset(Dataset):
    """Unlabelled tweets that are tokenized lazily for inference.

    Args:
        texts: Indexable collection of raw tweet texts.
        tokenizer: Callable Hugging Face tokenizer (the script passes a
            ``BertTokenizer``).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized sample stored at position ``item``.

        Returns:
            dict with the raw ``text`` and the flattened ``input_ids`` and
            ``attention_mask`` tensors.
        """
        # str() guards against non-string entries in the text column.
        text = str(self.texts[item])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # favour of ``__call__``, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # Drop the leading batch axis of 1 returned by the tokenizer.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


def predict(model, data_loader, device):
    """Return thresholded 0/1 predictions for every sample in ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one probability per sample.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Flat list with one integer prediction per sample, in loader order
        (empty when the loader yields no batches).
    """
    model = model.eval()
    predictions = []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = (outputs > 0.5).int()
            # Flatten with reshape(-1) rather than squeeze(): squeeze() turns
            # a size-1 batch into a 0-d array, which ``extend`` cannot
            # iterate over.
            predictions.extend(preds.reshape(-1).cpu().numpy())

    return predictions


# ====== LOAD TEST FILE ======
# Relies on tokenizer, MAX_LEN, BATCH_SIZE, attentionModel and device bound
# by the BERT training code earlier in this file.
test_df = pd.read_csv("test.csv")  # path to test.csv

test_dataset = DisasterTestDataset(
    test_df.text.to_numpy(),
    tokenizer,
    MAX_LEN
)

# No shuffling, so predictions stay in the same order as the rows of test_df.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE
)

# ====== PREDICT ======
test_preds = predict(attentionModel, test_loader, device)

# ====== CREATE submissionAttention.csv ======
# Pair each test id with its prediction.
submission = pd.DataFrame({
    "id": test_df.id,
    "target": test_preds
})

submission.to_csv("submissionAttention.csv", index=False)

print("✅ Đã tạo file submissionAttention.csv")


100%|██████████| 102/102 [00:11<00:00,  8.84it/s]

✅ Đã tạo file submissionAttention.csv





In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# ==========================================
# 1. CLASS DATASET (Đơn giản hóa)
# ==========================================
class SimpleDataset(Dataset):
    """Labelled texts converted to token ids for the embedding baseline.

    Args:
        texts: Indexable collection of raw texts.
        targets: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable Hugging Face tokenizer; it is used only to map
            text to ids from its vocabulary.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the sample stored at position ``item``.

        Returns:
            dict with the flattened ``input_ids`` tensor and ``targets`` as
            a float tensor.
        """
        # str() guards against non-string entries in the text column.
        text = str(self.texts[item])
        target = self.targets[item]

        # Tokenize: turn text into ids using the tokenizer's vocabulary.
        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # favour of ``__call__``, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # pad so all sequences share one length and can be batched
            truncation=True,
            return_tensors='pt'
        )

        # The simple model only needs input_ids.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

# ==========================================
# 2. MODEL CỦA BẠN (SimpleEmbeddingModel)
# ==========================================
class SimpleEmbeddingModel(nn.Module):
    """Bag-of-embeddings baseline: embed, average, then a two-layer head.

    Args:
        vocab_size: Number of rows in the embedding table (V).
        embed_dim: Size of each embedding vector (D).
        output_dim: Number of output units.
    """

    def __init__(self, vocab_size, embed_dim, output_dim):
        super().__init__()
        # Lookup table of shape V x D.
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Classification head: D -> 64 -> output_dim, squashed by a sigmoid.
        self.fc1 = nn.Linear(embed_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Map token ids of shape [batch, seq_len] to sigmoid outputs.

        Every position along the sequence axis is averaged with equal
        weight, so word order is discarded entirely.
        """
        # [batch, seq_len] -> [batch, seq_len, embed_dim] -> [batch, embed_dim]
        sentence_vector = self.embedding(text).mean(dim=1)

        hidden = self.relu(self.fc1(sentence_vector))
        return self.sigmoid(self.fc2(hidden))

# ==========================================
# 3. HÀM TRAIN & EVAL
# ==========================================
def train(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Module called as ``model(input_ids)`` that returns one
            probability per sample.
        loader: Iterable of batches; each batch is a mapping with
            ``input_ids`` and ``targets`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to ``(probabilities, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)`` as Python floats. For a loader that
        yields no batches the result is ``(nan, 0.0)``.
    """
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    n_batches = 0

    for batch in tqdm(loader):
        # Move data to GPU/CPU
        input_ids = batch['input_ids'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()

        # Forward pass (the model only takes input_ids). Align the outputs
        # with the targets explicitly: a bare ``squeeze()`` would also drop
        # the batch axis of a size-1 batch and break the loss.
        probs = model(input_ids).reshape(targets.shape)
        loss = criterion(probs, targets)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = (probs > 0.5).float()
        correct += (predicted == targets).sum().item()
        total += targets.size(0)
        n_batches += 1

    # Guard the divisions so an empty loader does not raise.
    mean_loss = total_loss / n_batches if n_batches else float("nan")
    accuracy = correct / total if total else 0.0
    return mean_loss, accuracy

def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids)`` that returns one
            probability per sample.
        loader: Iterable of batches; each batch is a mapping with
            ``input_ids`` and ``targets`` tensors.
        criterion: Loss applied to ``(probabilities, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)`` as Python floats. For a loader that
        yields no batches the result is ``(nan, 0.0)``.
    """
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    n_batches = 0

    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            targets = batch['targets'].to(device)

            # Align the outputs with the targets explicitly: a bare
            # ``squeeze()`` would also drop the batch axis of a size-1 batch
            # and break the loss.
            probs = model(input_ids).reshape(targets.shape)
            loss = criterion(probs, targets)

            total_loss += loss.item()
            predicted = (probs > 0.5).float()
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
            n_batches += 1

    # Guard the divisions so an empty loader does not raise.
    mean_loss = total_loss / n_batches if n_batches else float("nan")
    accuracy = correct / total if total else 0.0
    return mean_loss, accuracy

# ==========================================
# 4. MAIN PROGRAM
# ==========================================
# Training entry point for the embedding baseline. The names bound here
# (tokenizer, MAX_LEN, BATCH_SIZE, model, device) are reused by the
# test-prediction code further down in this file.
if __name__ == "__main__":
    # --- Config ---
    CSV_PATH = 'train.csv' # Path to your CSV file
    MAX_LEN = 60
    BATCH_SIZE = 32
    EMBED_DIM = 100    # Embedding vector size (smaller than BERT's 768)
    EPOCHS = 7
    LR = 0.001         # Learning rate somewhat higher than for BERT because the model is simple

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # --- Load Data ---
    df = pd.read_csv(CSV_PATH)

    # Use BERT's tokenizer to get a standard vocabulary
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Split into train/validation sets (80/20, fixed seed)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    train_dataset = SimpleDataset(train_df.text.to_numpy(), train_df.target.to_numpy(), tokenizer, MAX_LEN)
    val_dataset = SimpleDataset(val_df.text.to_numpy(), val_df.target.to_numpy(), tokenizer, MAX_LEN)

    # Only the training loader shuffles; validation keeps the split order.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Init Model ---
    # Important: vocab_size must equal the size of the tokenizer's vocabulary
    model = SimpleEmbeddingModel(vocab_size=tokenizer.vocab_size, embed_dim=EMBED_DIM, output_dim=1)
    model = model.to(device)

    # --- Optimizer & Loss ---
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # BCELoss expects probabilities; SimpleEmbeddingModel.forward already applies a sigmoid.
    criterion = nn.BCELoss()

    # --- Training Loop ---
    print("\nBắt đầu training Simple Model (Baseline)...")
    for epoch in range(EPOCHS):
        # Both helpers return (loss, accuracy), in that order.
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")
        print("-" * 30)

Using device: cuda

Bắt đầu training Simple Model (Baseline)...


100%|██████████| 191/191 [00:04<00:00, 43.33it/s]
100%|██████████| 48/48 [00:00<00:00, 53.16it/s]


Epoch 1/7
Train Loss: 0.6595 | Train Acc: 0.5864
Val Loss:   0.6371 | Val Acc:   0.6402
------------------------------


100%|██████████| 191/191 [00:04<00:00, 38.86it/s]
100%|██████████| 48/48 [00:00<00:00, 53.78it/s]


Epoch 2/7
Train Loss: 0.6079 | Train Acc: 0.6732
Val Loss:   0.5848 | Val Acc:   0.6881
------------------------------


100%|██████████| 191/191 [00:04<00:00, 47.45it/s]
100%|██████████| 48/48 [00:00<00:00, 53.50it/s]


Epoch 3/7
Train Loss: 0.5160 | Train Acc: 0.7650
Val Loss:   0.5414 | Val Acc:   0.7479
------------------------------


100%|██████████| 191/191 [00:04<00:00, 39.66it/s]
100%|██████████| 48/48 [00:00<00:00, 54.31it/s]


Epoch 4/7
Train Loss: 0.4309 | Train Acc: 0.8113
Val Loss:   0.5060 | Val Acc:   0.7518
------------------------------


100%|██████████| 191/191 [00:03<00:00, 47.94it/s]
100%|██████████| 48/48 [00:00<00:00, 53.62it/s]


Epoch 5/7
Train Loss: 0.3604 | Train Acc: 0.8516
Val Loss:   0.4891 | Val Acc:   0.7814
------------------------------


100%|██████████| 191/191 [00:04<00:00, 45.31it/s]
100%|██████████| 48/48 [00:01<00:00, 31.25it/s]


Epoch 6/7
Train Loss: 0.3082 | Train Acc: 0.8813
Val Loss:   0.4845 | Val Acc:   0.7748
------------------------------


100%|██████████| 191/191 [00:04<00:00, 47.46it/s]
100%|██████████| 48/48 [00:00<00:00, 53.93it/s]

Epoch 7/7
Train Loss: 0.2701 | Train Acc: 0.8929
Val Loss:   0.5228 | Val Acc:   0.7643
------------------------------





In [None]:
# ==========================================
# 5. DATASET CHO TEST
# ==========================================
class SimpleTestDataset(Dataset):
    """Unlabelled texts converted to token ids for baseline inference.

    Args:
        texts: Indexable collection of raw texts.
        tokenizer: Callable Hugging Face tokenizer; it is used only to map
            text to ids from its vocabulary.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict holding the flattened ``input_ids`` for ``item``."""
        # str() guards against non-string entries in the text column.
        text = str(self.texts[item])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # favour of ``__call__``, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # Drop the leading batch axis of 1 returned by the tokenizer.
            'input_ids': encoding['input_ids'].flatten()
        }
def predict(model, loader, device):
    """Return thresholded 0/1 predictions for every sample in ``loader``.

    Args:
        model: Module called as ``model(input_ids)`` that returns one
            probability per sample.
        loader: Iterable of batches; each batch is a mapping with an
            ``input_ids`` tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Flat list with one integer prediction per sample, in loader order
        (empty when the loader yields no batches).
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)

            outputs = model(input_ids)
            # Flatten with reshape(-1) rather than squeeze(): squeeze() turns
            # a size-1 batch into a 0-d array, which ``extend`` cannot
            # iterate over.
            preds = (outputs.reshape(-1) > 0.5).int()

            predictions.extend(preds.cpu().numpy())

    return predictions
# ==========================================
# 6. RUN TEST & EXPORT submission.csv
# ==========================================
# Relies on tokenizer, MAX_LEN, BATCH_SIZE, model and device bound by the
# baseline training code earlier in this file.
TEST_CSV_PATH = "test.csv"

# Load test data
test_df = pd.read_csv(TEST_CSV_PATH)

test_dataset = SimpleTestDataset(
    test_df.text.to_numpy(),
    tokenizer,
    MAX_LEN
)

# shuffle=False keeps predictions in the same order as the rows of test_df.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

# Predict
test_preds = predict(model, test_loader, device)

# Create submission file: pair each test id with its prediction
submission = pd.DataFrame({
    "id": test_df.id,
    "target": test_preds
})

submission.to_csv("submission.csv", index=False)

print("✅ Đã tạo submission.csv")


100%|██████████| 102/102 [00:02<00:00, 39.37it/s]

✅ Đã tạo submission.csv



