# **目標**:
### 使用 bert-base-chinese 訓練 Taipei_FAQ 分類器


# 安裝套件

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


# 確認 GPU 分配

In [None]:
!nvidia-smi

Tue Feb 28 12:42:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    27W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Mount 雲端硬碟

In [None]:
# Mount Google Drive at /content/drive (force_remount=True is passed to drive.mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## cd 到自己的雲端硬碟中的colab

In [None]:
%cd /content/drive/"MyDrive"/"Colab Notebooks"  

/content/drive/MyDrive/Colab Notebooks


In [None]:
!ls

Taipei_FAQ_Classifier  Taipei_FAQ_Model  Untitled0.ipynb


# 資料前處理

In [None]:
import os
from torch.utils import data
import torch
import csv
from sklearn.preprocessing import LabelEncoder
import pickle

In [None]:
### Label-encode the target column and collect the rows into parallel lists ###
###  e.g. questions=[question1, question2, ...], tagets=[t1, t2, ...]      ###

def read_data(fpath):
    """Read a two-column CSV and return its questions and encoded targets.

    Column 0 of every row is used as the question and column 1 as the target
    label. The labels are encoded with the LabelEncoder pickled at
    'Taipei_FAQ_Model/label_encoder.pkl'; when that encoder cannot be loaded
    or cannot encode these labels, a new encoder is fitted on this file's
    labels and pickled to the same path.

    Args:
        fpath: path of the CSV file to read.

    Returns:
        A ``(questions, tagets)`` tuple: the list of question strings and the
        list of encoded label ids, in row order.
    """
    encoder_path = 'Taipei_FAQ_Model/label_encoder.pkl'

    questions = []
    answers = []
    # newline="" is what the csv module asks for so that quoted fields with
    # embedded newlines are parsed correctly; the encoding is made explicit
    # instead of depending on the platform default.
    with open(fpath, newline="", encoding="utf-8") as file:
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to index.
                continue
            questions.append(row[0])
            answers.append(row[1])

    os.makedirs("Taipei_FAQ_Model/", exist_ok=True)    # directory that stores the model artefacts

    try:
        # NOTE: pickle.load can execute arbitrary code; only load an encoder file you created.
        with open(encoder_path, 'rb') as reader:
            le = pickle.load(reader)
        le_target = le.transform(answers)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError, ValueError):
        # Missing/unreadable encoder file, or labels the stored encoder cannot encode.
        # NOTE(review): refitting replaces the stored label -> id mapping, so ids
        # produced earlier with the old encoder may no longer match — confirm this
        # fallback is wanted when the file already exists.
        le = LabelEncoder()
        le_target = le.fit_transform(answers)
        with open(encoder_path, "wb") as writer:
            pickle.dump(le, writer)

    return questions, list(le_target)

In [None]:
# Locations of the training and testing CSV files on the mounted Drive.
train_csv = '/content/drive/MyDrive/新生訓練/drive-download-20230214T072635Z-001/Taipei_FAQ/Taipei_FAQ_classifier_training.csv'
eval_csv = '/content/drive/MyDrive/新生訓練/drive-download-20230214T072635Z-001/Taipei_FAQ/Taipei_FAQ_classifier_testing.csv'

# Read each split into (questions, encoded targets); the training split is read first.
train_questions, train_tagets = read_data(train_csv)
eval_questions, eval_tagets = read_data(eval_csv)

In [None]:
# Scan the encoded training labels for the largest label id and print it.
# Each label value is compared directly; the values must not be used as list
# indices, otherwise only arbitrary positions of the list are inspected.
ans = 0
for taget in train_tagets:
    if taget > ans:
        print("get/n")
        ans = taget
print(ans)

get/n
get/n
get/n
77


In [None]:
# Report the size of both splits, then show the first training example.
print("training data 筆數：", len(train_questions))
print("testing data 筆數：", len(eval_questions))
first_question = train_questions[0]
print("question：", first_question)
first_taget = train_tagets[0]
print("taget：", first_taget)

# Reload the pickled label encoder to turn the encoded target back into its name.
with open('Taipei_FAQ_Model/label_encoder.pkl', 'rb') as reader:
    le = pickle.load(reader)
taget_name = le.inverse_transform([first_taget])
print("taget name：", taget_name)

training data 筆數： 5397
testing data 筆數： 1350
question： 兵役-延期入營問答
taget： 3
taget name： ['臺北市內湖區公所']


# 將資料進行 Tokenize
## 將 input 資料轉換成 token id、type_id 與 attention_mask

In [None]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

# Both splits are tokenized with the same options: truncation enabled, and
# padding=True so every sequence is padded to the longest one in the call.
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_questions, **tokenize_options)
eval_encodings = tokenizer(eval_questions, **tokenize_options)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

## 檢查轉換是否正確

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Show the token ids of the first training example and decode them back to text.
first_input_ids = train_encodings['input_ids'][0]
print(first_input_ids)
print(tokenizer.decode(first_input_ids))

# Show the remaining two features of the same example.
for feature_name in ('token_type_ids', 'attention_mask'):
    print(train_encodings[feature_name][0])

[101, 1070, 2514, 118, 2454, 3309, 1057, 4245, 1558, 5031, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[CLS] 兵 役 - 延 期 入 營 問 答 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

## 加入 Label

In [None]:
def add_tagets(encodings, tagets):
    """Store ``tagets`` in ``encodings`` under the key 'label' (in place, returns None)."""
    encodings['label'] = tagets

# Attach the encoded targets to the matching encodings, training split first.
for _encodings, _tagets in ((train_encodings, train_tagets), (eval_encodings, eval_tagets)):
    add_tagets(_encodings, _tagets)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'label'])

In [None]:
print(train_encodings['label'][0])

3


# 定義 Dataset，並轉換成 tensor 格式

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of feature name -> per-example values.

    ``encodings`` must provide ``.items()`` and an 'input_ids' entry; each
    item is returned as a dict with every feature converted to a tensor.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Take example ``idx`` from every feature and wrap it in a tensor.
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        # Key lookup instead of attribute access, so a plain dict of lists
        # works as well as an object that also exposes ``.input_ids``.
        return len(self.encodings["input_ids"])

# Wrap both sets of encodings in the Dataset class, training split first.
train_dataset, eval_dataset = [Dataset(enc) for enc in (train_encodings, eval_encodings)]

In [None]:
train_dataset[0]

{'input_ids': tensor([ 101, 1070, 2514,  118, 2454, 3309, 1057, 4245, 1558, 5031,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, 

# 載入模型架構( SequenceClassification )

In [None]:
from transformers import BertConfig, BertForSequenceClassification

pretrained_name = 'bert-base-chinese'
num_labels = 78  # number of target classes for the classification head

# Build the config with the class count first, then load the pretrained weights with it.
config = BertConfig.from_pretrained(pretrained_name, num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(pretrained_name, config=config)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## 查看模型架構

In [None]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# 訓練模型

In [None]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.0-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m20.3

In [None]:
pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.16.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.16.0


In [None]:

import logging
import datasets
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
import math

import transformers
from accelerate import Accelerator
from transformers import (
    AdamW,
    AutoConfig,
    default_data_collator,
    get_scheduler
)

## 設定 epoch 與 batch size

In [None]:
train_batch_size = 40      # training batch size
eval_batch_size = 10      # evaluation batch size
num_train_epochs = 10      # number of training epochs

## 將資料丟入 DataLoader


In [None]:
# The same collator is used for both loaders; only the training loader shuffles.
data_collator = default_data_collator
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=eval_batch_size,
    collate_fn=data_collator,
)

## Optimizer 、Learning rate 、Scheduler 設定

In [None]:
learning_rate = 3e-5               # optimizer learning rate
weight_decay = 0.0                 # decay for the first parameter group below; 0.0 disables it
gradient_accumulation_steps = 1    # number of batches accumulated per optimizer update

# Parameters whose names contain one of these substrings go into the second
# group, which is never decayed; all other parameters use `weight_decay`.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
print('max_train_steps', max_train_steps)

# "linear" schedule with zero warmup steps, spanning max_train_steps updates.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

max_train_steps 1350




## 將資料、參數丟入 Accelerator



In [None]:
# Create the accelerator; device placement is left to it in this example.
accelerator = Accelerator()

# Hand the model, optimizer and both dataloaders to the accelerator and keep
# the prepared objects under the same names.
prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)
model, optimizer, train_dataloader, eval_dataloader = prepared

## 設定 metric 方法

In [None]:
# Get the metric function: the "accuracy" metric is loaded through datasets.load_metric.
# NOTE(review): load_metric appears to be deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm before upgrading dependencies.

metric = load_metric("accuracy")

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
print(metric)

Metric(name: "accuracy", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = datasets.load_metric("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

   

## 開始訓練

In [None]:
# Train!
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state)
output_dir = 'Taipei_FAQ_Classifier/'


total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps

logger.info("***** Running training *****")
logger.info(f"  Num examples = {len(train_dataset)}")
logger.info(f"  Num Epochs = {num_train_epochs}")
logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
logger.info(f"  Total optimization steps = {max_train_steps}")


completed_steps = 0
# Best evaluation result so far. "epoch" is 1-based and matches the name of the
# checkpoint directory written for that epoch (0 means "nothing recorded yet").
best_epoch = {"epoch": 0, "acc": 0}

for epoch in trange(num_train_epochs, desc="Epoch"):  # trange shows a progress bar labelled by desc
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        outputs = model(**batch)
        loss = outputs.loss
        # Scale the loss so gradients accumulated over several batches are averaged.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        # Update after every `gradient_accumulation_steps` batches (counted from 1) and on
        # the last batch, which yields exactly num_update_steps_per_epoch updates per epoch.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()        # apply the accumulated gradients
            lr_scheduler.step()     # advance the learning-rate schedule
            optimizer.zero_grad()   # clear gradients before the next accumulation window
            completed_steps += 1

        if step % 50 == 0:
            print({'epoch': epoch, 'step': step, 'loss': loss.item()})

        if completed_steps >= max_train_steps:
            break

    logger.info("***** Running eval *****")
    model.eval()
    # Gradients are not needed for evaluation; disabling them saves memory and time.
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Eval Iteration"):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)  # index of the highest-scoring class
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

    eval_metric = metric.compute()
    logger.info(f"epoch {epoch}: {eval_metric}")
    epoch_number = epoch + 1  # 1-based, so the final epoch is written to 'epoch_<num_train_epochs>/'
    if eval_metric['accuracy'] > best_epoch['acc']:
        best_epoch['epoch'] = epoch_number            # epoch that produced the best accuracy
        best_epoch['acc'] = eval_metric['accuracy']   # best accuracy so far

    if output_dir is not None:
        accelerator.wait_for_everyone()
        # Save the model returned by accelerator.unwrap_model rather than the prepared one.
        unwrapped_model = accelerator.unwrap_model(model)
        # One directory per epoch, so earlier checkpoints are not overwritten
        # (this stores num_train_epochs checkpoints under output_dir).
        unwrapped_model.save_pretrained(
            output_dir + 'epoch_' + str(epoch_number) + '/',
            save_function=accelerator.save,
        )


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 0, 'step': 0, 'loss': 4.379201889038086}
{'epoch': 0, 'step': 50, 'loss': 3.5889346599578857}
{'epoch': 0, 'step': 100, 'loss': 2.728724241256714}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 1, 'step': 0, 'loss': 2.055311918258667}
{'epoch': 1, 'step': 50, 'loss': 1.999396562576294}
{'epoch': 1, 'step': 100, 'loss': 2.0151591300964355}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 2, 'step': 0, 'loss': 1.395156979560852}
{'epoch': 2, 'step': 50, 'loss': 1.1996139287948608}
{'epoch': 2, 'step': 100, 'loss': 0.9824458956718445}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 3, 'step': 0, 'loss': 0.701355516910553}
{'epoch': 3, 'step': 50, 'loss': 0.7251821160316467}
{'epoch': 3, 'step': 100, 'loss': 0.7478501200675964}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 4, 'step': 0, 'loss': 0.47476476430892944}
{'epoch': 4, 'step': 50, 'loss': 0.4029313921928406}
{'epoch': 4, 'step': 100, 'loss': 0.4712938368320465}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 5, 'step': 0, 'loss': 0.3483648896217346}
{'epoch': 5, 'step': 50, 'loss': 0.39600855112075806}
{'epoch': 5, 'step': 100, 'loss': 0.30967047810554504}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 6, 'step': 0, 'loss': 0.3704070448875427}
{'epoch': 6, 'step': 50, 'loss': 0.236676886677742}
{'epoch': 6, 'step': 100, 'loss': 0.293918251991272}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 7, 'step': 0, 'loss': 0.16896331310272217}
{'epoch': 7, 'step': 50, 'loss': 0.17498111724853516}
{'epoch': 7, 'step': 100, 'loss': 0.13823023438453674}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 8, 'step': 0, 'loss': 0.18624632060527802}
{'epoch': 8, 'step': 50, 'loss': 0.21790897846221924}
{'epoch': 8, 'step': 100, 'loss': 0.12742920219898224}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

{'epoch': 9, 'step': 0, 'loss': 0.15643705427646637}
{'epoch': 9, 'step': 50, 'loss': 0.1345149129629135}
{'epoch': 9, 'step': 100, 'loss': 0.15413261950016022}


Eval Iteration:   0%|          | 0/135 [00:00<?, ?it/s]

In [None]:
print(best_epoch)

{'epoch:': 0, 'acc': 0.8407407407407408, 'epoch': 10}


# Inference

In [None]:
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification

In [None]:
# **Prediction helper**
def FAQ_model(model, question):
    """Predict the encoded label id for a single question.

    The question is tokenized with the module-level ``tokenizer``, wrapped in
    a one-item DataLoader, and passed through ``model`` after both have been
    prepared by a fresh ``Accelerator``.

    Args:
        model: model that is called as ``model(**batch)`` and whose output
            exposes ``.logits``; it is switched to eval mode for the call and
            put back into its previous mode afterwards.
        question: the question text to classify.

    Returns:
        The tensor produced by ``logits.argmax(dim=-1)`` for the question;
        callers use ``.item()`` to get the label id.
    """
    # Convert the text into the features the model consumes.
    input_encodings = tokenizer([question], truncation=True, padding=True)
    input_dataset = Dataset(input_encodings)

    data_collator = default_data_collator
    input_dataloader = DataLoader(input_dataset, collate_fn=data_collator, batch_size=1)

    accelerator = Accelerator()
    model, input_dataloader = accelerator.prepare(model, input_dataloader)

    was_training = model.training
    model.eval()  # inference must not run with training-mode layers such as dropout
    try:
        # No gradients are needed for prediction.
        with torch.no_grad():
            for batch in input_dataloader:
                outputs = model(**batch)
                predicted = outputs.logits.argmax(dim=-1)  # highest-scoring label
    finally:
        model.train(was_training)  # leave the caller's model in the mode it arrived in
    return predicted

In [None]:
# Directory holding the pickled label encoder and the fine-tuned checkpoint.
# NOTE(review): the training cell writes checkpoints under 'Taipei_FAQ_Classifier/',
# while this cell reads "epoch_10" from model_path — confirm the checkpoint was copied here.
model_path = "./Taipei_FAQ_Model"
encoder_file = model_path + "/label_encoder.pkl"
checkpoint_dir = model_path + "/epoch_10"

# Label encoder used to turn predicted ids back into label names.
with open(encoder_file, 'rb') as reader:
    le = pickle.load(reader)

tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained(checkpoint_dir)

In [None]:
'''
如何查詢藝文推廣處城市舞台檔期?	臺北市藝文推廣處
個案若安置於機構，是否能使用失能身心障礙日間照顧中心服務?	臺北市政府社會局身心障礙者福利科
105年12月25日是否還是勞工國定假日?國定假日全國一致規定何時開始實施?	臺北市政府勞動局勞動基準科	
數人共同承買的不動產，誰要負責申報?	臺北市政府地政局地價科	
'''

questions = [
    '如何查詢藝文推廣處城市舞台檔期?',
    '個案若安置於機構，是否能使用失能身心障礙日間照顧中心服務?',
    '105年12月25日是否還是勞工國定假日?國定假日全國一致規定何時開始實施?',
    '數人共同承買的不動產，誰要負責申報?',
]

for question in questions:
    predicted = FAQ_model(model, question)
    label_id = predicted.item()
    # le.inverse_transform maps the predicted label id back to its label string.
    print(question, le.inverse_transform([label_id]))


如何查詢藝文推廣處城市舞台檔期? ['臺北市藝文推廣處']
個案若安置於機構，是否能使用失能身心障礙日間照顧中心服務? ['臺北市政府社會局身心障礙者福利科']
105年12月25日是否還是勞工國定假日?國定假日全國一致規定何時開始實施? ['臺北市政府勞動局勞動基準科']
數人共同承買的不動產，誰要負責申報? ['臺北市政府地政局地價科']
