In [1]:
import pandas as pd
import re

In [2]:
##为训练集数据添加标签，把文本数据转化为csv
train_file_path = 'train.txt'
train_data = pd.read_csv(train_file_path, sep='\s+', header=None, engine='python')
train_data.columns = ['text', 'target']
train_data.to_csv('train_data.csv', index=False)
##为测试集数据添加标签，把文本数据转化为csv
dev_file_path = 'dev.txt'
dev_data = pd.read_csv(dev_file_path, sep='\s+', header=None, engine='python')
dev_data.columns = ['text', 'target']
dev_data.to_csv('dev_data.csv', index=False)
##为标签集数据添加标签，把文本数据转化为csv
class_file_path = 'class.txt'
data = []
with open(class_file_path, 'r') as file:
    for line in file:
        match = re.match(r'^(.*)\s+(\d+)$', line.strip())
        if match:
            category, label = match.groups()
            data.append((category, int(label)))
class_data = pd.DataFrame(data, columns=['category', 'label'])
class_data.to_csv('class_data.csv', index=False)

In [3]:
x_train = train_data['text'].tolist()
train_label = train_data['target'].tolist()

# 提取测试数据的文本和标签
x_test = dev_data['text'].tolist()
test_label = dev_data['target'].tolist()

In [4]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=64)
test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=64)


In [5]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # 读取单个样本
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encoding, train_label)
test_dataset = NewsDataset(test_encoding, test_label)

In [6]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [7]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=6)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 单个读取到批量读取
train_loader = DataLoader(train_dataset, batch_size = 8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size= 8,shuffle=True)

# 优化方法
optim = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# 训练函数
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # 正向传播
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        
        # 反向梯度信息
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # 参数更新
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # 正常传播
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()


------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.4045, 25.00%
epoth: 0, iter_num: 200, loss: 0.3224, 50.00%
epoth: 0, iter_num: 300, loss: 0.6282, 75.00%
epoth: 0, iter_num: 400, loss: 0.2371, 100.00%
Epoch: 0, Average training loss: 0.6618
Accuracy: 0.8502
Average testing loss: 0.4794
-------------------------------


In [35]:
len(train_loader)

400

In [9]:
# 训练函数
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # 正向传播
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        
        # 反向梯度信息
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # 参数更新
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    
    all_predictions = []
    all_true_labels = []
    
    for batch in test_dataloader:
        with torch.no_grad():
            # 正常传播
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        predictions = np.argmax(logits, axis=1).flatten()
        label_ids = labels.to('cpu').numpy()
        
        all_predictions.extend(predictions)
        all_true_labels.extend(label_ids)
        
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    
        # 创建一个DataFrame并保存为CSV
    df = pd.DataFrame({'True Labels': all_true_labels, 'Predictions': all_predictions})
    df.to_csv('predictions.csv', index=False)
    print("Predictions saved to predictions.csv")

for epoch in range(3):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()


------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.2239, 13.33%
epoth: 0, iter_num: 200, loss: 0.0564, 26.67%
epoth: 0, iter_num: 300, loss: 0.2672, 40.00%
epoth: 0, iter_num: 400, loss: 0.7337, 53.33%
epoth: 0, iter_num: 500, loss: 0.0628, 66.67%
epoth: 0, iter_num: 600, loss: 0.2807, 80.00%
epoth: 0, iter_num: 700, loss: 0.4916, 93.33%
Epoch: 0, Average training loss: 0.3284
Accuracy: 0.8660
Average testing loss: 0.4582
-------------------------------
Predictions saved to predictions.csv
------------Epoch: 1 ----------------
epoth: 1, iter_num: 100, loss: 0.0808, 13.33%
epoth: 1, iter_num: 200, loss: 0.5397, 26.67%
epoth: 1, iter_num: 300, loss: 1.1698, 40.00%
epoth: 1, iter_num: 400, loss: 0.1138, 53.33%
epoth: 1, iter_num: 500, loss: 0.1091, 66.67%
epoth: 1, iter_num: 600, loss: 0.1232, 80.00%
epoth: 1, iter_num: 700, loss: 0.4343, 93.33%
Epoch: 1, Average training loss: 0.3290
Accuracy: 0.8660
Average testing loss: 0.4582
-------------------------------
Predict