In [None]:
import os
import numpy as np
import torch
import time
import datetime
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Hyperparameter settings
EPOCHS = 10  # number of passes over train_dataloader
BATCH_SIZE = 10  # mini-batch size used for both the train and test DataLoaders
PRINT_TIME_PER_STEP = 40  # train() prints the elapsed time every this many batches
LEARNING_RATE = 2e-5  # learning rate passed to AdamW
WEIGHT_DECAY = 1e-2  # weight decay for parameters that are not in the no_decay list
EPSILON = 1e-8  # eps passed to AdamW

# Class names, one per class index (31 entries).
# NOTE(review): presumably in the same order as the sorted data file names, since the
# label of a sentence is the index of its file in that sorted list - confirm.
LREC_label = ['app', 'bus', 'calc', 'chat', 'cinemas', 'contacts', 'cookbook', 'datetime', 'email', 'epg', 'flight', 'health', 
              'lottery', 'map', 'match', 'message', 'music', 'news', 'novel', 'poetry', 'radio', 'riddle', 'schedule', 'stock', 
              'telephone', 'train', 'translation', 'tvchannel', 'video', 'weather', 'website']

# Presumably the number of sentences per class in each split (values sum to 2299 for
# train and 667 for test) - TODO confirm. Not referenced by the code visible here.
LREC_train_size = [53, 24, 24, 455, 24, 30, 269, 18, 24, 107, 62, 55,
                   24, 68, 24, 63, 66, 58, 24, 102, 24, 34, 29, 71,
                   63, 70, 61, 71, 182, 66, 54]

LREC_test_size = [18, 8, 8, 51, 8, 10, 90, 6, 8, 36, 21, 18,
                  8, 23, 8, 21, 22, 19, 8, 34, 8, 11, 10, 24,
                  21, 23, 20, 24, 61, 22, 18]

LREC_develop_size = [18, 8, 8, 154, 8, 10, 89, 6, 8, 36, 21, 19,
                     8, 23, 8, 21, 22, 20, 8, 34, 8, 11, 9, 24,
                     21, 24, 21, 23, 60, 22, 18]
 
# classes: 31


In [None]:
# Read a file
def readfile(filename):
    """Return every line of ``filename`` as a list of strings.

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is dropped (``utf-8-sig``). Line terminators are kept on each line.
    """
    with open(filename, encoding="utf-8-sig") as handle:
        return handle.readlines()
    
# Directories holding one text file per class for each split.
train_path = '/BERT/dataset/text_cls/train/'
test_path = '/BERT/dataset/text_cls/test/'
predict_path = '/BERT/dataset/text_cls/develop/'

# File names in lexicographic order; the sort key is the name with its last
# four characters removed.
train_files = sorted(os.listdir(train_path), key=lambda name: str(name[:-4]))

test_files = sorted(os.listdir(test_path), key=lambda name: str(name[:-4]))

predict_files = sorted(os.listdir(predict_path), key=lambda name: str(name[:-4]))

# Load the sentences and build their labels
def get_text_and_label(filepath, files, zero_shot_cls = ()):
    """Read every class file and build the matching label tensor.

    Args:
        filepath: Directory prefix that is concatenated with each file name,
            so it must already end with a path separator.
        files: File names, one per class; the position of a file in this
            sequence is used as its class index.
        zero_shot_cls: Class indices to leave out. Skipped classes keep their
            index reserved, i.e. the remaining labels are not renumbered.

    Returns:
        A tuple ``(total_text, total_target)`` where ``total_text`` is a list
        with one list of lines per loaded file and ``total_target`` is an
        int64 tensor of shape ``(number_of_lines, 1)`` holding the class index
        of every line, in the same order as the flattened ``total_text``.
        When nothing is loaded the result is ``([], tensor of shape (0, 1))``.
    """
    total_text = []
    label_chunks = []
    for class_id, filename in enumerate(files):
        if class_id in zero_shot_cls:
            continue
        current_text = readfile(filepath + filename)
        total_text.append(current_text)
        # Explicit integer dtype: class ids must not be promoted to float.
        label_chunks.append(np.full(len(current_text), class_id, dtype=np.int64))

    # Concatenate once at the end; np.concatenate rejects an empty sequence,
    # so the "no file loaded" case is handled explicitly.
    if label_chunks:
        target = np.concatenate(label_chunks, axis=0)
    else:
        target = np.empty(0, dtype=np.int64)

    total_target = torch.tensor(target.reshape(-1, 1))
    return total_text, total_target

# Class indices to leave out of the train and test sets; an empty list keeps every class.
zero_shot_cls = []

# Each call returns (list of per-class sentence lists, label tensor of shape (N, 1)).
train_text, train_target = get_text_and_label(train_path, train_files, zero_shot_cls)
test_text, test_target = get_text_and_label(test_path, test_files, zero_shot_cls)
# NOTE(review): the develop set is loaded without zero_shot_cls, so it always contains every class.
predict_text, predict_target = get_text_and_label(predict_path, predict_files)

# Tokenizer of the bert-base-chinese checkpoint; downloaded files are cached under transformer_file/.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir = "transformer_file/")

# Test output (recorded from an earlier run)

# training sen: 2299
# training sen Max size: 49

# testing sen: 667
# testing sen Max size: 27

In [None]:
# Convert a sentence to token ids; every result has length 64 (62 + the 2 special tokens at both ends)
def convert_text_to_token(tokenizer, input_text, limit_size = 62):
    """Encode ``input_text`` into exactly ``limit_size + 2`` token ids.

    The text is first cut to its first ``limit_size`` characters and encoded
    with ``tokenizer.encode``. Because one character can expand to several
    ids, the encoded sequence is additionally capped at ``limit_size + 2``
    ids; otherwise sequences of different lengths could not be stacked into
    one tensor. Shorter sequences are right-padded with ``0``, which is the
    value the attention-mask code in this notebook treats as padding.

    Args:
        tokenizer: Object with an ``encode(text)`` method returning a sequence
            of integer ids (here a ``BertTokenizer``).
        input_text: The sentence to encode.
        limit_size: Maximum number of characters / content tokens to keep.

    Returns:
        A list of ``limit_size + 2`` integer ids.
    """
    max_len = limit_size + 2
    tokens = list(tokenizer.encode(input_text[:limit_size]))
    if len(tokens) > max_len:
        # Keep the final id produced by the tokenizer (the closing special
        # token for a BERT tokenizer) and drop the overflow in front of it.
        tokens = tokens[:max_len - 1] + tokens[-1:]
    elif len(tokens) < max_len:
        tokens.extend([0] * (max_len - len(tokens)))
    return tokens

# Flatten the per-class sentence lists and encode every sentence to fixed-length ids.
train_ids = [
    convert_text_to_token(tokenizer, sentence)
    for class_sentences in train_text
    for sentence in class_sentences
]

train_tokens = torch.tensor(train_ids)
print(train_tokens.shape)

test_ids = [
    convert_text_to_token(tokenizer, sentence)
    for class_sentences in test_text
    for sentence in class_sentences
]

test_tokens = torch.tensor(test_ids)
print(test_tokens.shape)

In [None]:
# Build the attention masks
def attention_masks(input_ids):
    """Return one mask per id sequence: 1.0 where the id is > 0, else 0.0.

    ``input_ids`` is an iterable of id sequences; the result is a list of
    lists of floats with the same lengths as the inputs.
    """
    return [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# Build the masks from the plain id lists (not the tensors) and convert them to tensors.
train_atten_masks = attention_masks(train_ids)
train_attention_tokens = torch.tensor(train_atten_masks)  # float values; train()/evaluate() cast with .long()
print(train_attention_tokens.shape)

test_atten_masks = attention_masks(test_ids)
test_attention_tokens = torch.tensor(test_atten_masks)  # float values; train()/evaluate() cast with .long()
print(test_attention_tokens.shape)

In [None]:
# Show the first sentence of the train and test sets next to its encoded form.
for _texts, _tokens, _masks, _targets in (
    (train_text, train_tokens, train_attention_tokens, train_target),
    (test_text, test_tokens, test_attention_tokens, test_target),
):
    print("Original text:\n", _texts[0][0])
    print("Token tensor:\n", _tokens[0])
    print("Attention mask tensor:\n", _masks[0])
    print("Target label tensor:\n", _targets[0])
    if _texts is train_text:
        # Blank line between the two splits, as in the original printout.
        print()

In [None]:
# Create the DataLoaders
# Training batches are drawn in random order, test batches in their original order.
train_data = TensorDataset(train_tokens, train_attention_tokens, train_target)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)

test_data = TensorDataset(test_tokens, test_attention_tokens, test_target)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = BATCH_SIZE)

# Sanity check: print the shapes of the first batch of each loader.
# The loop variables are deliberately NOT called `train` / `test`: this notebook
# later defines a function named `train`, and re-running this cell would
# otherwise overwrite that function with a tensor.
for batch_ids, batch_mask, batch_label in train_dataloader:
    print(batch_ids.shape, batch_mask.shape, batch_label.shape)
    break
print('Train datalodaer length:', len(train_dataloader))
for batch_ids, batch_mask, batch_label in test_dataloader:
    print(batch_ids.shape, batch_mask.shape, batch_label.shape)
    break
print('Test datalodaer length:', len(test_dataloader))

In [None]:
# Build the model
LREC_model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels = 31) # 31 classes in total
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LREC_model.to(device)

# Plain optimizer without per-group weight decay (kept for reference):
#optimizer = AdamW(LREC_model.parameters(), lr = LEARNING_RATE, eps = EPSILON)

# Optimizer: parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay
no_decay = ['bias', 'LayerNorm.weight']
decay_params = []
no_decay_params = []
for param_name, param in LREC_model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer = AdamW(
    [
        {'params': decay_params, 'weight_decay': WEIGHT_DECAY},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ],
    lr = LEARNING_RATE,
    eps = EPSILON,
)

# Learning-rate scheduler: linear schedule over all optimizer steps, no warm-up
epochs = EPOCHS
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [None]:
# Classification accuracy
def cls_acc(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D tensor of per-class scores, one row per sample.
        labels: Tensor of class indices; it is flattened before comparison.

    Returns:
        A Python float in [0, 1].
    """
    predicted = preds.max(dim = 1).indices
    hits = (predicted == labels.flatten()).float()
    return hits.sum().item() / len(hits)

# Elapsed-time formatting
def format_time(elapsed):
    """Format a duration in seconds as ``H:MM:SS`` after rounding to whole seconds."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds = whole_seconds)
    return str(delta)

In [None]:
# Train the model
def train(model, optimizer):
    """Run one epoch over ``train_dataloader`` and return ``(mean_loss, mean_acc)``.

    Uses the notebook globals ``train_dataloader``, ``device``, ``scheduler``
    and ``PRINT_TIME_PER_STEP``. For every batch it runs a forward pass with
    labels, back-propagates the loss, clips the gradient norm to 1.0 and then
    steps the optimizer and the learning-rate scheduler. Both returned values
    are the mean over batches, rounded to three decimals.
    """
    started_at = time.time()
    batch_losses = []
    batch_accs = []

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress line every PRINT_TIME_PER_STEP batches (never for batch 0).
        if step != 0 and step % PRINT_TIME_PER_STEP == 0:
            elapsed = format_time(time.time() - started_at)
            print('Batch {:>5,} of {:>5,}.  Time: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].long().to(device)
        b_input_mask = batch[1].long().to(device)
        b_labels = batch[2].long().to(device)

        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output[0]
        logits = output[1]

        batch_losses.append(loss.item())
        batch_accs.append(cls_acc(logits, b_labels))

        # Order matters: clear old gradients, back-propagate, clip, then update.
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = round(np.array(batch_losses).mean(), 3)
    mean_acc = round(np.array(batch_accs).mean(), 3)
    return mean_loss, mean_acc

In [None]:
# Evaluate the model
def evaluate(model):
    """Return the accuracy of ``model`` on ``test_dataloader``, rounded to 3 decimals.

    Uses the notebook globals ``test_dataloader`` and ``device``. The model is
    switched to evaluation mode and no gradients are tracked.

    Accuracy is computed as (correct predictions) / (number of samples) over
    the whole loader. Averaging per-batch accuracies instead would give the
    last, smaller batch the same weight as a full batch and bias the result
    whenever the dataset size is not a multiple of the batch size.

    Returns:
        The accuracy as a float, or ``nan`` when the loader yields no samples.
    """
    correct = 0
    total = 0
    model.eval()

    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)

            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            # Without labels the first output element holds the logits.
            predicted = torch.max(output[0], dim = 1)[1]
            flat_labels = b_labels.flatten()
            correct += torch.eq(predicted, flat_labels).sum().item()
            total += flat_labels.numel()

    if total == 0:
        return float('nan')
    return round(correct / total, 3)

In [None]:
# Run training and evaluation
# Each epoch trains once over train_dataloader via train() and then measures
# accuracy on test_dataloader via evaluate(); the metrics are printed per epoch.
for epoch in range(epochs):
    train_loss, train_acc = train(LREC_model, optimizer)
    print('epoch = {}, train_acc = {}, train_loss = {}'.format(epoch, train_acc, train_loss))
    test_acc = evaluate(LREC_model)
    print('epoch = {}, test_acc = {}'.format(epoch, test_acc))

In [None]:
# Prediction
def predict(model, sen):
    """Classify a single sentence and return the predicted class index.

    Uses the notebook globals ``tokenizer`` and ``device``. The sentence is
    encoded with ``convert_text_to_token`` and its attention mask marks every
    id greater than 0.

    The model is put into evaluation mode and the forward pass runs under
    ``torch.no_grad()``, so the result does not depend on whichever mode the
    model was left in by an earlier cell and no autograd graph is built.
    Note that the model stays in evaluation mode afterwards.

    Args:
        model: The sequence-classification model to query.
        sen: The sentence (string) to classify.

    Returns:
        A tensor of shape ``(1,)`` holding the index of the highest logit.
    """
    input_id = convert_text_to_token(tokenizer, sen)
    input_token = torch.tensor(input_id).long().to(device)

    atten_mask = [float(i>0) for i in input_id]
    attention_token = torch.tensor(atten_mask).long().to(device)

    model.eval()
    with torch.no_grad():
        # view(1, -1) adds the batch dimension expected by the model.
        output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_token.view(1, -1))
    return torch.max(output[0], dim=1)[1]

# Prediction run over the develop set: count misclassified sentences per class and overall.
error_count = 0
total_count = 0

cls_result = []

# The position of a sentence list in predict_text is its expected class index.
for i, class_sentences in enumerate(predict_text):
    cls_total = len(class_sentences)
    cls_error = 0
    total_count += cls_total
    for input_sen in class_sentences:
        label = predict(LREC_model, input_sen)
        if label != i:
            error_count += 1
            cls_error += 1
            #print('Sentence: ', input_sen[:-1], '\nClass:', int(label), LREC_label[label].ljust(15), 'Correct class:', i, LREC_label[i])
    cls_result.append("%d / %d" % (cls_error, cls_total))

for i, class_summary in enumerate(cls_result):
    print("Class %d errors: %s" %(i, class_summary))

print("\nTotal errors: %d/%d" % (error_count, total_count))

In [None]:
# Print how many sentences each class of the develop set contains.
for class_sentences in predict_text:
    print(len(class_sentences))

In [None]:
# Scratch check: print 0..9 except the values listed in `a`.
a = [3,6]
for i in range(10):
    if i not in a:
        print(i)