# In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import drive

# Mount Google Drive so the dataset and output folder under MyDrive are reachable.
drive.mount("/content/drive")

# Seed the torch and NumPy RNGs for more repeatable runs.
torch.manual_seed(42)
np.random.seed(42)

# Locations and the pretrained checkpoint name used throughout the script.
base_dir = "/content/drive/MyDrive/your_folder"
data_path = f"{base_dir}/dataset.csv"
model_name = "bert-base-chinese"

# The file is read as tab-separated with no header row; the two columns are
# named "text" and "label" here.
dataset = pd.read_csv(data_path, sep="\t", header=None, names=["text", "label"])
# Drop rows with a missing value in either column before casting to str,
# otherwise NaN would be turned into the literal string "nan".
dataset = dataset.dropna()
dataset["text"] = dataset["text"].astype(str)
dataset["label"] = dataset["label"].astype(str)

# Map the string labels to integer class ids; `lbl` is kept so predictions
# can later be mapped back to the original label strings.
lbl = LabelEncoder()
y_all = lbl.fit_transform(dataset["label"].values)
x_all = dataset["text"].values

# 80/20 train/test split, stratified on the encoded labels with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize each split up front. The NumPy arrays are converted to plain lists
# before being handed to the tokenizer; sequences are truncated to at most
# 64 tokens and padded within each call.
train_enc = tokenizer(list(x_train), truncation=True, padding=True, max_length=64)
test_enc = tokenizer(list(x_test), truncation=True, padding=True, max_length=64)

class NewsDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with integer labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection of class ids of
    the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one tensor per encoding field for the requested example.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are stored under "labels" as a scalar int64 tensor.
        sample["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return sample

# Wrap the tokenized splits and their encoded labels as torch datasets.
train_dataset = NewsDataset(train_enc, y_train)
test_dataset = NewsDataset(test_enc, y_test)

# Batches of 16; only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Size the classification head to the number of distinct labels seen by the encoder.
num_labels = len(lbl.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# AdamW over all model parameters with a learning rate of 2e-5.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5)

def accuracy_from_logits(logits, labels):
    """Return the fraction of rows whose arg-max column equals the label.

    ``logits`` is a 2-D array-like of per-class scores (one row per example)
    and ``labels`` holds the expected class index for each row.
    """
    predicted = np.argmax(logits, axis=1)
    hits = predicted == labels
    return np.mean(hits)

def train_one_epoch(epoch_idx):
    """Run one optimisation pass over ``train_loader`` and print the mean loss.

    Uses the module-level ``model``, ``optim``, ``train_loader`` and
    ``device``. ``epoch_idx`` is only used to label the printed line.
    """
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optim.zero_grad()
        # Move the fields the model consumes onto the training device and
        # let the model compute the loss from the supplied labels.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        batch_loss = outputs.loss
        batch_loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        running_loss += batch_loss.item()
    # Mean of the per-batch losses.
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch_idx} | train_loss={avg_loss:.4f}")

@torch.no_grad()
def evaluate(epoch_idx):
    """Evaluate the model on ``test_loader`` and print loss and accuracy.

    Uses the module-level ``model``, ``test_loader`` and ``device``.
    Metrics are accumulated per sample rather than per batch, so a smaller
    final batch does not get the same weight as a full one. If the loader
    yields no samples, both metrics are reported as ``nan`` instead of
    raising ``ZeroDivisionError``.

    Args:
        epoch_idx: Epoch number, used only to label the printed line.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        batch_size = labels.size(0)
        # The model's loss is treated as a mean over the batch (the default
        # reduction for the classification loss), so scale it back up by the
        # batch size before summing across batches.
        loss_sum += outputs.loss.item() * batch_size
        n_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
        n_seen += batch_size
    avg_loss = loss_sum / n_seen if n_seen else float("nan")
    avg_acc = n_correct / n_seen if n_seen else float("nan")
    print(f"Epoch {epoch_idx} | val_loss={avg_loss:.4f} | val_acc={avg_acc:.4f}")

@torch.no_grad()
def predict(texts):
    """Classify one or more texts with the fine-tuned model.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``lbl``.

    Args:
        texts: A single string, or any iterable of strings (list, tuple,
            NumPy array, pandas Series, generator, ...).

    Returns:
        A list of ``(label, confidence)`` tuples, one per input text, where
        ``label`` is the decoded class label and ``confidence`` is the
        softmax probability of the predicted class. An empty input yields
        an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialize to a plain list, mirroring the list(...) conversion
        # applied to the train/test arrays before they are tokenized.
        texts = list(texts)
    if not texts:
        # Nothing to classify; skip the tokenizer and the model entirely.
        return []
    enc = tokenizer(texts, truncation=True, padding=True, max_length=64, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}
    model.eval()
    logits = model(**enc).logits
    pred_ids = torch.argmax(logits, dim=1).detach().cpu().numpy()
    pred_labels = lbl.inverse_transform(pred_ids)
    probs = torch.softmax(logits, dim=1).detach().cpu().numpy()
    # Pick, for each row, the probability assigned to its predicted class.
    conf = probs[np.arange(len(pred_ids)), pred_ids]
    return list(zip(pred_labels.tolist(), conf.tolist()))

# Train for a fixed number of epochs, evaluating on the test split after each one.
epochs = 4
for ep in range(epochs):
    train_one_epoch(ep)
    evaluate(ep)

# Show the label strings in the order of their encoded class ids.
print("labels:", list(lbl.classes_))

# Quick smoke test of the prediction helper on a sample sentence.
examples = [
    "三星手机和华为手机哪个好？"
]

preds = predict(examples)
for text, (label, conf) in zip(examples, preds):
    print(f"text={text}\tpred={label}\tconfidence={conf:.4f}")

# Persist the fine-tuned model and its tokenizer side by side on Drive.
save_dir = f"{base_dir}/finetuned_bert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Also store the label strings, one per line in encoder order, so that a
# line's position matches the class id the model predicts.
with open(f"{save_dir}/label_classes.txt", "w", encoding="utf-8") as f:
    for c in lbl.classes_:
        f.write(str(c) + "\n")

print("saved_to:", save_dir)
