In [2]:
# https://medium.com/@raoashish10/fine-tuning-a-pre-trained-bert-model-for-classification-using-native-pytorch-c5f33e87616e
# https://www.kaggle.com/datasets/gauravduttakiit/covid-19-tweet-classification
# https://zindi.africa/competitions/covid-19-tweet-classification/data
import pandas as pd

data_train = pd.read_csv('../data/COVID-19-Tweet/updated_train.csv')
data_test = pd.read_csv('../data/COVID-19-Tweet/updated_test.csv')
submission_example = pd.read_csv('../data/COVID-19-Tweet/updated_ss.csv')

data_train.head()

Unnamed: 0,ID,text,target
0,train_0,The bitcoin halving is cancelled due to,1
1,train_1,MercyOfAllah In good times wrapped in its gran...,0
2,train_2,266 Days No Digital India No Murder of e learn...,1
3,train_3,India is likely to run out of the remaining RN...,1
4,train_4,In these tough times the best way to grow is t...,0


In [3]:
data_test.head()

Unnamed: 0,ID,text
0,test_2,Why is explained in the video take a look
1,test_3,Ed Davey fasting for Ramadan No contest
2,test_4,Is Doja Cat good or do you just miss Nicki Minaj
3,test_8,How Boris Johnson s cheery wounded in action p...
4,test_9,Man it s terrible Not even a reason to get on ...


In [14]:
submission_example.head()

Unnamed: 0,ID,target
0,test_2,0
1,test_3,0
2,test_4,0
3,test_8,0
4,test_9,0


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    data_train['text'], data_train['target'],
    test_size=0.3, shuffle=True, random_state=42)

In [5]:
from transformers import AutoTokenizer, BertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
train_tokens = tokenizer(list(X_train), padding=True, truncation=True)
valid_tokens = tokenizer(list(X_valid), padding=True, truncation=True)
train_tokens.keys(), valid_tokens.keys()

(dict_keys(['input_ids', 'token_type_ids', 'attention_mask']),
 dict_keys(['input_ids', 'token_type_ids', 'attention_mask']))

In [6]:
tk0 = train_tokens['input_ids'][0]
print(tk0)
print(tokenizer.decode(tk0))

[101, 3725, 9216, 8653, 2740, 124, 5187, 1559, 5465, 1545, 1367, 5966, 1580, 9493, 2740, 122, 5539, 1571, 26516, 4859, 1545, 4735, 1116, 21606, 5117, 1477, 3993, 1604, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[CLS] Update Total cases 3 017 766 12 879 Current cases 1 915 580 856 Deaths 207 722 468 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [7]:
from torch.utils.data import DataLoader, Dataset
import torch

class TweetDataset(Dataset):
    def __init__(self, is_train=False):
        if is_train:
            self.text = X_train
            self.tokens = train_tokens
            self.labels = list(y_train)
        else:
            self.text = X_valid
            self.tokens = valid_tokens
            self.labels = list(y_valid)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

BATCH_SIZE = 40
train_dataset = TweetDataset(is_train=True)
valid_dataset = TweetDataset(is_train=False)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
verify_batch = next(iter(train_loader))
print(verify_batch['input_ids'].shape)
print(verify_batch['attention_mask'].shape)
print(verify_batch['labels'].shape)
print(verify_batch['input_ids'][0])
print(tokenizer.decode(verify_batch['input_ids'][0]))
print(verify_batch['labels'])

torch.Size([40, 97])
torch.Size([40, 97])
torch.Size([40])
tensor([  101, 14130,  2346,  2087,  1592, 17734, 15078,  6778,  1649,  1110,
         1750,   170,  1669,  1104,  1120,  4798,  1880,  1190,  1122,  1110,
          170,  1159,  1111,  6979,  1106,  2415,  2191, 21769,  1107,  3709,
         1114,   170,  2246,  1306,  4944,  1106, 24296,  1141,  1104,  1103,
        15592,  1104,  6489,  1103,  1421,  3501,  1995,  6248,  1104,  1103,
         4360,  4483,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])
[CLS] MercyOfAllah Ramadan however is less a period of atonement than it is a time for Muslims to practice self restraint in keeping with awm Arabic to refrain one of the pillars of

In [12]:
pretrained_model = BertForSequenceClassification.from_pretrained('bert-base-cased')
optimizer = torch.optim.Adam(pretrained_model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

NUM_EPOCHS = 30
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pretrained_model.to(DEVICE)

for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    pretrained_model.train()
    for idx, batch in enumerate(train_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = pretrained_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        y_hat = outputs.logits
        loss = loss_fn(y_hat, batch['labels'])
        loss.backward()
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / BATCH_SIZE
        print(f'Training batch {idx + 1}, last loss: {train_last_loss:.4f}')
    print(f'\nTraining epoch {epoch + 1} completed, last loss: {train_last_loss:.4f}\n')

    pretrained_model.eval()
    num_correct = 0
    y_pred = []
    for idx, batch in enumerate(valid_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = pretrained_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            y_hat = outputs.logits
            loss = loss_fn(y_hat, batch['labels'])
            valid_batch_loss = loss.item()
            valid_last_loss = valid_batch_loss / BATCH_SIZE
            print(f'Validation batch {idx + 1}, last loss: {valid_last_loss:.4f}')

            y_pred.extend(y_hat.argmax(dim=1).cpu().numpy())
            num_correct += (y_hat.argmax(dim=1) == batch['labels']).sum().item()
            print(f'Validation accuracy: {num_correct / ((idx + 1) * BATCH_SIZE):.4f}')
    print(f'\nValidation epoch {epoch + 1} completed, last loss: {valid_last_loss:.4f}, accuracy: {num_correct / len(valid_dataset):.4f}\n')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/30
Training batch 1, last loss: 0.0222
Training batch 2, last loss: 0.0210
Training batch 3, last loss: 0.0184
Training batch 4, last loss: 0.0217
Training batch 5, last loss: 0.0215
Training batch 6, last loss: 0.0217
Training batch 7, last loss: 0.0229
Training batch 8, last loss: 0.0183
Training batch 9, last loss: 0.0219
Training batch 10, last loss: 0.0174
Training batch 11, last loss: 0.0222
Training batch 12, last loss: 0.0188
Training batch 13, last loss: 0.0239
Training batch 14, last loss: 0.0202
Training batch 15, last loss: 0.0217
Training batch 16, last loss: 0.0245
Training batch 17, last loss: 0.0224
Training batch 18, last loss: 0.0204
Training batch 19, last loss: 0.0203
Training batch 20, last loss: 0.0243
Training batch 21, last loss: 0.0214
Training batch 22, last loss: 0.0185
Training batch 23, last loss: 0.0193
Training batch 24, last loss: 0.0256
Training batch 25, last loss: 0.0173
Training batch 26, last loss: 0.0218
Training batch 27, last loss: 0.0201

In [13]:
# 初版训练任务跑通
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, BertForSequenceClassification, AutoConfig
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# 加载数据
data_train = pd.read_csv('../data/COVID-19-Tweet/updated_train.csv')
data_test = pd.read_csv('../data/COVID-19-Tweet/updated_test.csv')
submission_example = pd.read_csv('../data/COVID-19-Tweet/updated_ss.csv')

# 检查数据分布
print("训练数据标签分布:")
print(data_train['target'].value_counts())
print(f"数据不平衡比例: {data_train['target'].value_counts()[0] / data_train['target'].value_counts()[1]:.2f}")

# 数据预处理 - 清理文本
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    # 移除特殊字符但保留基本标点
    import re
    text = re.sub(r'[^\w\s@#.,!?-]', ' ', text)
    # 移除多余空格
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data_train['text'] = data_train['text'].apply(clean_text)

# 分层采样确保训练集和验证集的标签分布一致
X_train, X_valid, y_train, y_valid = train_test_split(
    data_train['text'], data_train['target'],
    test_size=0.2, shuffle=True, random_state=42, stratify=data_train['target']
)

print(f"训练集大小: {len(X_train)}, 验证集大小: {len(X_valid)}")
print(f"训练集标签分布: {y_train.value_counts().to_dict()}")
print(f"验证集标签分布: {y_valid.value_counts().to_dict()}")

# 使用更适合这个任务的模型
MODEL_NAME = 'bert-base-uncased'  # 改用uncased版本，对社交媒体文本更友好
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 分析文本长度，优化截断长度
train_lengths = [len(tokenizer.encode(text, add_special_tokens=True)) for text in X_train[:1000]]
print(f"平均token长度: {np.mean(train_lengths):.2f}")
print(f"95%分位数token长度: {np.percentile(train_lengths, 95):.0f}")

MAX_LENGTH = min(128, int(np.percentile(train_lengths, 95)))  # 使用更合适的长度
print(f"使用最大长度: {MAX_LENGTH}")

# 改进的tokenization
train_tokens = tokenizer(
    list(X_train),
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)
valid_tokens = tokenizer(
    list(X_valid),
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)

class TweetDataset(Dataset):
    def __init__(self, texts, tokens, labels):
        self.texts = texts
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokens['input_ids'][idx],
            'attention_mask': self.tokens['attention_mask'][idx],
            'labels': torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        }

# 创建数据集
train_dataset = TweetDataset(X_train, train_tokens, y_train)
valid_dataset = TweetDataset(X_valid, valid_tokens, y_valid)

# 调整批次大小
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 使用更好的模型配置
config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 2
config.hidden_dropout_prob = 0.3  # 增加dropout防止过拟合
config.attention_probs_dropout_prob = 0.3

model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(DEVICE)
print(f"使用设备: {DEVICE}")

# 处理类别不平衡 - 计算类别权重
class_counts = y_train.value_counts().sort_index()
total_samples = len(y_train)
class_weights = total_samples / (2 * class_counts)
class_weights = torch.FloatTensor(class_weights.values).to(DEVICE)
print(f"类别权重: {class_weights}")

# 改进的优化器和损失函数
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# 学习率调度器
from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

# 早停机制
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_score):
        if self.best_score is None:
            self.best_score = val_score
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.counter = 0

early_stopping = EarlyStopping(patience=3)

# 训练函数
def train_epoch(model, train_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm(train_loader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )

        loss = outputs.loss
        loss.backward()

        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_predictions += (predictions == batch['labels']).sum().item()
        total_predictions += batch['labels'].size(0)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

# 验证函数
def validate_epoch(model, valid_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Validation"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    avg_loss = total_loss / len(valid_loader)
    accuracy = accuracy_score(all_labels, all_predictions)

    return avg_loss, accuracy, all_predictions, all_labels

# 训练循环
NUM_EPOCHS = 10
best_val_accuracy = 0

print("开始训练...")
for epoch in range(NUM_EPOCHS):
    print(f'\nEpoch {epoch + 1}/{NUM_EPOCHS}')
    print('-' * 50)

    # 训练
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, loss_fn, DEVICE)

    # 验证
    val_loss, val_acc, val_predictions, val_labels = validate_epoch(model, valid_loader, loss_fn, DEVICE)

    # 调整学习率
    scheduler.step()

    print(f'训练损失: {train_loss:.4f}, 训练准确率: {train_acc:.4f}')
    print(f'验证损失: {val_loss:.4f}, 验证准确率: {val_acc:.4f}')
    print(f'当前学习率: {scheduler.get_last_lr()[0]:.2e}')

    # 保存最佳模型
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        print(f'保存最佳模型，验证准确率: {val_acc:.4f}')

    # 早停检查
    early_stopping(val_acc)
    if early_stopping.early_stop:
        print("早停触发!")
        break

# 最终评估
print(f'\n最佳验证准确率: {best_val_accuracy:.4f}')
print("\n分类报告:")
print(classification_report(val_labels, val_predictions, target_names=['Non-COVID', 'COVID']))

训练数据标签分布:
target
0    2746
1    2541
Name: count, dtype: int64
数据不平衡比例: 1.08
训练集大小: 4229, 验证集大小: 1058
训练集标签分布: {0: 2196, 1: 2033}
验证集标签分布: {0: 550, 1: 508}


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

平均token长度: 24.88
95%分位数token长度: 48
使用最大长度: 48


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


使用设备: cuda
类别权重: tensor([0.9629, 1.0401], device='cuda:0')
开始训练...

Epoch 1/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:15<00:00, 17.66it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 79.23it/s]


训练损失: 0.4195, 训练准确率: 0.8078
验证损失: 0.2813, 验证准确率: 0.8932
当前学习率: 1.95e-05
保存最佳模型，验证准确率: 0.8932

Epoch 2/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:15<00:00, 17.64it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 73.54it/s]


训练损失: 0.2494, 训练准确率: 0.9092
验证损失: 0.2607, 验证准确率: 0.9074
当前学习率: 1.82e-05
保存最佳模型，验证准确率: 0.9074

Epoch 3/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:15<00:00, 17.23it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 76.81it/s]


训练损失: 0.1912, 训练准确率: 0.9371
验证损失: 0.3052, 验证准确率: 0.9093
当前学习率: 1.61e-05
保存最佳模型，验证准确率: 0.9093

Epoch 4/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:15<00:00, 17.53it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 74.78it/s]


训练损失: 0.1585, 训练准确率: 0.9492
验证损失: 0.3214, 验证准确率: 0.9187
当前学习率: 1.34e-05
保存最佳模型，验证准确率: 0.9187

Epoch 5/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:15<00:00, 17.53it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 77.80it/s]


训练损失: 0.1272, 训练准确率: 0.9648
验证损失: 0.3650, 验证准确率: 0.9121
当前学习率: 1.05e-05

Epoch 6/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:14<00:00, 17.88it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 78.37it/s]


训练损失: 0.1140, 训练准确率: 0.9704
验证损失: 0.3908, 验证准确率: 0.9083
当前学习率: 7.56e-06

Epoch 7/10
--------------------------------------------------


Training: 100%|██████████| 265/265 [00:14<00:00, 18.05it/s]
Validation: 100%|██████████| 67/67 [00:00<00:00, 80.23it/s]

训练损失: 0.0751, 训练准确率: 0.9823
验证损失: 0.4030, 验证准确率: 0.9159
当前学习率: 4.92e-06
早停触发!

最佳验证准确率: 0.9187

分类报告:
              precision    recall  f1-score   support

   Non-COVID       0.91      0.93      0.92       550
       COVID       0.92      0.90      0.91       508

    accuracy                           0.92      1058
   macro avg       0.92      0.92      0.92      1058
weighted avg       0.92      0.92      0.92      1058




