# 利用中文微博評價資料進行Bert微調


In [20]:
! pip install transformers datasets
! pip install evaluate



## 下載微博評價資料

In [22]:
!wget https://github.com/shhuangmust/AI/raw/refs/heads/113-1/weibo_senti_100k.csv

--2025-04-10 06:51:04--  https://github.com/shhuangmust/AI/raw/refs/heads/113-1/weibo_senti_100k.csv
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/shhuangmust/AI/refs/heads/113-1/weibo_senti_100k.csv [following]
--2025-04-10 06:51:04--  https://raw.githubusercontent.com/shhuangmust/AI/refs/heads/113-1/weibo_senti_100k.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19699818 (19M) [application/octet-stream]
Saving to: ‘weibo_senti_100k.csv.2’


2025-04-10 06:51:04 (196 MB/s) - ‘weibo_senti_100k.csv.2’ saved [19699818/19699818]



## 讀取Weibo資料集
- 共有119988筆資料

In [24]:
from datasets import load_dataset, DatasetDict

ds = load_dataset("csv", data_files="weibo_senti_100k.csv")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 119988
    })
})


## 分割資料集
- 80%訓練(train)資料
- 10%測試(test)資料
- 10%驗證(valid)資料


In [26]:
train_testvalid = ds['train'].train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})


## 進行分詞

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/11999 [00:00<?, ? examples/s]

## 為簡化訓練，挑選10000筆作為訓練與測試資料

In [30]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10000))
print(small_train_dataset)
print(small_eval_dataset)

Dataset({
    features: ['label', 'review', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})
Dataset({
    features: ['label', 'review', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})


## 列印一筆資料出來看

In [31]:
tokenized_datasets["train"][100]

{'label': 0,
 'review': '回复@流云的影子:哎，事实证明，我滴智商确实不再适合从事脑力劳动了[泪][泪][泪] //@流云的影子:为啥叫杏仁麦片，不是用的开心果吗？why why why？[挖鼻屎]',
 'input_ids': [101,
  1726,
  1908,
  137,
  3837,
  756,
  4638,
  2512,
  2094,
  131,
  1511,
  8024,
  752,
  2141,
  6395,
  3209,
  8024,
  2769,
  4017,
  3255,
  1555,
  4802,
  2141,
  679,
  1086,
  6844,
  1394,
  794,
  752,
  5554,
  1213,
  1227,
  1220,
  749,
  138,
  3801,
  140,
  138,
  3801,
  140,
  138,
  3801,
  140,
  120,
  120,
  137,
  3837,
  756,
  4638,
  2512,
  2094,
  131,
  711,
  1567,
  1373,
  3331,
  785,
  7931,
  4275,
  8024,
  679,
  3221,
  4500,
  4638,
  2458,
  2552,
  3362,
  1408,
  8043,
  11177,
  11177,
  11177,
  8043,
  138,
  2905,
  7965,
  2241,
  140,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

## 本次微調需要得到正面/負面的判斷結果，因此挑選AutoModelForSequenceClassification
- 輸出結果為正面/負面，因此num_labels=2

In [32]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 利用TrainingArguments設定微調參數

In [33]:
from transformers import TrainingArguments
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="test_trainer_chinese", evaluation_strategy="epoch")




## 利用Trainer進行訓練
- 此處須輸入wandb key

In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpeggypeng865[0m ([33mpeggypeng865-must[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1062,0.080539,0.9837
2,0.0867,0.100385,0.9827
3,0.0722,0.058515,0.9837


TrainOutput(global_step=3750, training_loss=0.09856639404296876, metrics={'train_runtime': 4367.8821, 'train_samples_per_second': 6.868, 'train_steps_per_second': 0.859, 'total_flos': 7893331660800000.0, 'train_loss': 0.09856639404296876, 'epoch': 3.0})

## 利用pipeline進行測試
- LABEL_0：負面
- LABEL_1：正面

In [11]:
from transformers import pipeline
pipe = pipeline("sentiment-analysis", model='test_trainer_chinese/checkpoint-1500', tokenizer=tokenizer)

Device set to use cuda:0


In [12]:
pipe("我喜歡這個產品")

[{'label': 'LABEL_1', 'score': 0.999847412109375}]