In [1]:
from transformers import BertConfig, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM

config = BertConfig.from_pretrained('bert-base-chinese', num_labels=2)  #num_labels 設定類別數
model = BertForSequenceClassification.from_pretrained("bert-base-chinese",config=config)  
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import pandas as pd

def read_data(dataset):
    claim = list(dataset['claim'].values)
    narrative = list(dataset['narrative'].values)
    label = list(dataset['labels'].values)
    return claim,narrative,label

def add_targets(encodings,label):
    encodings.update({'label':label})

2023-05-24 03:24:40,029 [INFO] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-05-24 03:24:40,030 [INFO] NumExpr defaulting to 8 threads.


In [4]:
train_data = pd.read_csv("Training Dataset_v2/訓練資料集/claim_verification_train.csv")
eval_data = pd.read_csv("Training Dataset_v2/訓練資料集/claim_verification_val.csv")
test_data = pd.read_csv("Training Dataset_v2/訓練資料集/claim_verification_test.csv")

train_claim,train_narrative,trainlabel=read_data(train_data)
eval_claim,eval_narrative,evallabel=read_data(eval_data)
test_claim,test_narrative,testlabel=read_data(test_data)

In [5]:
from torch.utils import data

class Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    item = {key: torch.tensor(eval[idx]) if not isinstance(eval[idx], str) else eval[idx] for key, eval in self.encodings.items()}

    return item

  def __len__(self):
    return len(self.encodings.input_ids)

In [6]:
'''因為trainlabel是以['refutes','supports'...]的方式儲存，需要轉成0,1的tensor才能給Datasets'''
# Define the label-to-index mapping
label2idx = {'refutes': 0, 'supports': 1}

# Convert labels to indices
trainlabel_convert = [label2idx[label] for label in trainlabel]
evallabel_convert = [label2idx[label] for label in evallabel]
# Convert label indices to tensor
trainlabel_convert = torch.tensor(trainlabel_convert)
evallabel_convert = torch.tensor(evallabel_convert)

print(trainlabel_convert)

tensor([0, 0, 0,  ..., 1, 0, 0])


In [7]:
train_encodings = tokenizer(train_claim, train_narrative, truncation=True, padding=True)
eval_encodings = tokenizer(eval_claim, eval_narrative, truncation=True, padding=True)

#原本只有[claim, narrative]，執行底下加入label
add_targets(train_encodings,trainlabel_convert)
add_targets(eval_encodings,evallabel_convert)

train_dataset = Dataset(train_encodings)
eval_dataset = Dataset(eval_encodings)

In [19]:
import logging
import datasets
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
import math

import transformers
from accelerate import Accelerator
from transformers import (
    AdamW,
    AutoConfig,
    default_data_collator,
    get_scheduler
)

train_batch_size = 12      # 設定 training batch size 
eval_batch_size = 12      # 設定 eval batch size
num_train_epochs = 3      # 設定 epoch 

In [9]:
data_collator = default_data_collator
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=train_batch_size)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=eval_batch_size)

In [10]:
learning_rate=3e-5          # 設定 learning_rate
gradient_accumulation_steps = 1   # 設定 幾步後進行反向傳播

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },                                
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
print('max_train_steps', max_train_steps)

# scheduler
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

max_train_steps 1076




In [11]:
from datasets import load_dataset, load_metric

# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()

# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

eval_dataloader
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


In [17]:
model = BertForSequenceClassification.from_pretrained("epoch_4/pytorch_model.bin",config='epoch_4/config.json') 
model.to(device) 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [20]:
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state)
output_dir = '.'  # your folder
 

total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps

logger.info("***** Running training *****")
logger.info(f"  Num examples = {len(train_dataset)}")
logger.info(f"  Num Epochs = {num_train_epochs}")
logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
logger.info(f"  Total optimization steps = {max_train_steps}")


completed_steps = 0
best_epoch = {"epoch": 0, "acc": 0 }

for epoch in trange(num_train_epochs, desc="Epoch"):#trange是print進度條的方式
  model.train()
  for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
    outputs = model(**batch)
    #loss = outputs
    loss = outputs.loss
    #loss = loss / gradient_accumulation_steps
    accelerator.backward(loss)
    #if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: 把if刪掉了
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    completed_steps += 1

    if step % 50 == 0:
      print({'epoch': epoch, 'step': step, 'loss': loss.item()})

    if completed_steps >= max_train_steps:
      break
      
  logger.info("***** Running eval *****")
  model.eval()
  for step, batch in enumerate(tqdm(eval_dataloader, desc="eval Iteration")):
    outputs = model(**batch)
    predictions = outputs.logits.argmax(dim=-1)
    metric.add_batch(
        predictions=accelerator.gather(predictions),
        references=accelerator.gather(batch["labels"]),
    )

  eval_metric = metric.compute()
  logger.info(f"epoch {epoch}: {eval_metric}")
  if eval_metric['accuracy'] > best_epoch['acc']:
    best_epoch.update({"epoch": epoch, "acc": eval_metric['accuracy']})

  if output_dir is not None:
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir + '/' + 'claim_verification_epoch_' + str(epoch), save_function=accelerator.save)

2023-05-24 03:27:51,106 [INFO] Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: no

2023-05-24 03:27:51,107 [INFO] ***** Running training *****
2023-05-24 03:27:51,107 [INFO]   Num examples = 3222
2023-05-24 03:27:51,108 [INFO]   Num Epochs = 3
2023-05-24 03:27:51,108 [INFO]   Instantaneous batch size per device = 12
2023-05-24 03:27:51,109 [INFO]   Total train batch size (w. parallel, distributed & accumulation) = 12
2023-05-24 03:27:51,109 [INFO]   Gradient Accumulation steps = 1
2023-05-24 03:27:51,110 [INFO]   Total optimization steps = 1076
  item = {key: torch.tensor(eval[idx]) if not isinstance(eval[idx], str) else eval[idx] for key, eval in self.encodings.items()}


{'epoch': 0, 'step': 0, 'loss': 0.3067873418331146}




{'epoch': 0, 'step': 50, 'loss': 0.21079297363758087}




{'epoch': 0, 'step': 100, 'loss': 0.35971006751060486}




{'epoch': 0, 'step': 150, 'loss': 0.3457610607147217}




{'epoch': 0, 'step': 200, 'loss': 0.3721717596054077}




{'epoch': 0, 'step': 250, 'loss': 0.2983368933200836}


Iteration: 100%|██████████| 269/269 [01:58<00:00,  2.27it/s]
2023-05-24 03:29:49,634 [INFO] ***** Running eval *****
eval Iteration: 100%|██████████| 69/69 [00:10<00:00,  6.74it/s]
2023-05-24 03:29:59,878 [INFO] epoch 0: {'accuracy': 0.5520581113801453}
  item = {key: torch.tensor(eval[idx]) if not isinstance(eval[idx], str) else eval[idx] for key, eval in self.encodings.items()}


{'epoch': 1, 'step': 0, 'loss': 0.1927836388349533}




{'epoch': 1, 'step': 50, 'loss': 0.44497981667518616}




{'epoch': 1, 'step': 100, 'loss': 0.2777636647224426}




{'epoch': 1, 'step': 150, 'loss': 0.19186241924762726}




{'epoch': 1, 'step': 200, 'loss': 0.2380562573671341}




{'epoch': 1, 'step': 250, 'loss': 0.20319266617298126}


Iteration: 100%|██████████| 269/269 [01:59<00:00,  2.25it/s]
2023-05-24 03:31:59,655 [INFO] ***** Running eval *****
eval Iteration: 100%|██████████| 69/69 [00:10<00:00,  6.73it/s]
2023-05-24 03:32:09,912 [INFO] epoch 1: {'accuracy': 0.5520581113801453}
  item = {key: torch.tensor(eval[idx]) if not isinstance(eval[idx], str) else eval[idx] for key, eval in self.encodings.items()}


{'epoch': 2, 'step': 0, 'loss': 0.3657432794570923}




{'epoch': 2, 'step': 50, 'loss': 0.5146598815917969}




{'epoch': 2, 'step': 100, 'loss': 0.26395276188850403}




{'epoch': 2, 'step': 150, 'loss': 0.19241875410079956}




{'epoch': 2, 'step': 200, 'loss': 0.7060050368309021}




{'epoch': 2, 'step': 250, 'loss': 0.23825357854366302}


Iteration: 100%|██████████| 269/269 [01:59<00:00,  2.25it/s]
2023-05-24 03:34:10,002 [INFO] ***** Running eval *****
eval Iteration: 100%|██████████| 69/69 [00:10<00:00,  6.74it/s]
2023-05-24 03:34:20,251 [INFO] epoch 2: {'accuracy': 0.5520581113801453}
Epoch: 100%|██████████| 3/3 [06:29<00:00, 129.83s/it]


In [21]:
print(best_epoch)

{'epoch': 0, 'acc': 0.5520581113801453}


In [22]:
def mrpc_model(model, sen1, sen2):
  input_encodings = tokenizer([sen1], [sen2], padding='max_length', truncation=True)
  input_dataset = Dataset(input_encodings)
  #print(input_encodings)
  #print(input_dataset[0])
  data_collator = default_data_collator
  input_dataloader = DataLoader(input_dataset, collate_fn=data_collator, batch_size=1)  

  accelerator = Accelerator()
  model, input_dataloader = accelerator.prepare(model, input_dataloader)

  for batch in input_dataloader:
    outputs = model(**batch)
    predicted = outputs.logits.argmax(dim=-1)
  return predicted

In [23]:
sen1="lisa goes to school everyday"
sen2="lisa everyday goes to school"
#sen1="lisa is a singer"
#sen2="lisa is not a singer"

predict = mrpc_model(model, sen1, sen2)
print("sentence= : ", sen1)
print("sentence= : ", sen2)

print("predict_label : ", predict.item())
if predict.item():
  print("有關聯")
else:
  print("沒關聯")


sentence= :  lisa goes to school everyday
sentence= :  lisa everyday goes to school
predict_label :  1
有關聯


In [24]:
len(eval_data)

826

In [26]:
cnt=0
errorcnt=0
for i in range(len(test_data['labels'])):
    if test_data['labels'][i]=='refutes' or test_data['labels'][i]=='supports':
        cnt+=1
        sen1=test_data['claim'][i]
        sen2=test_data['narrative'][i]
        predict=mrpc_model(model,sen1,sen2)
        #print("sentence= : ", sen1)
        #print("sentence= : ", sen2)

        #print("predict_label : ", predict.item())
        if predict.item()!=label2idx[test_data['labels'][i]]:
            errorcnt+=1
        
print(cnt)
print(errorcnt)

553
233
