In [1]:
from transformers import AutoTokenizer

# Load the tokenizer stored next to the local MacBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    './pretrained_models/chinese-macbert-base'
)

print(tokenizer)

# Smoke test: encode two sample sentences and display the raw encoding
# (input_ids / token_type_ids / attention_mask) as the cell output.
sample_sentences = [
    'hide new secretions from the parental units',
    'contains no wit , only labored gags',
]
tokenizer.batch_encode_plus(sample_sentences)

There was a problem when trying to write in your cache folder (/work/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.


PreTrainedTokenizerFast(name_or_path='./pretrained_models/chinese-macbert-base', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


{'input_ids': [[101, 8913, 8510, 8343, 11634, 11137, 8670, 8174, 9519, 10858, 8315, 12816, 8118, 102], [101, 11485, 8383, 9556, 8275, 8541, 8165, 117, 10110, 11441, 9132, 8168, 11005, 9726, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# 定义数据集

## numpy array 转 pytorch tensor

In [13]:
import numpy as np
import torch

class Dataset(torch.utils.data.Dataset):
    """One split (train / valid / test) of the labelled news matrix.

    The ``.npy`` file is read as a 2-D array whose column 0 is the text and
    whose column 1 is the label. Only rows labelled 0 or 1 are kept. The rows
    are divided 60 % / 20 % / remainder with a fixed random seed, so every
    instance sees exactly the same partition.

    Args:
        split: ``'train'``, ``'valid'`` or ``'test'``.
        data_path: location of the ``.npy`` matrix.

    Raises:
        ValueError: if ``split`` is not one of the three known names.
    """

    _SPLITS = ('train', 'valid', 'test')

    def __init__(self, split, data_path='./data/5.news_data/news_matrix_fine_tuning.npy'):
        # Fail loudly: an unknown split used to fall through and expose the
        # whole, unsplit array, silently mixing train/valid/test rows.
        if split not in self._SPLITS:
            raise ValueError(
                "split must be one of {}, got {!r}".format(self._SPLITS, split))

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files from a trusted source.
        data = np.load(data_path, allow_pickle=True)

        # Keep only rows labelled 0 or 1 (class 0 first, then class 1).
        data_0 = data[data[:, 1] == 0]
        data_1 = data[data[:, 1] == 1]
        data = np.concatenate([data_0, data_1])

        train_len = int(len(data) * 0.6)
        val_len = int(len(data) * 0.2)
        # The test split takes whatever is left so the lengths sum to len(data).
        test_len = len(data) - train_len - val_len

        # Fixed seed -> the same partition on every construction, so the three
        # splits built by separate instances never overlap.
        train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
            dataset=data.tolist(),
            lengths=[train_len, val_len, test_len],
            generator=torch.Generator().manual_seed(1))

        self.data = {
            'train': train_dataset,
            'valid': valid_dataset,
            'test': test_dataset,
        }[split]

    def __len__(self):
        """Number of examples in this split."""
        return len(self.data)

    def __getitem__(self, i):
        """Return ``(text, label)`` for the ``i``-th example of this split."""
        row = self.data[i]
        return row[0], row[1]

# Sanity check: show the size of each split and the first training example.
len(Dataset('train')), len(Dataset('valid')), len(Dataset('test')), Dataset('train')[0]

(2176, 725, 726, ('上海协调机制出台首份政策文件 持续加大复工复产金融支持', 1))

# DataLoader

## 自定义 collate_fn 函数

In [15]:

def collate_fn(batch, max_length=512):
    """Turn a list of ``(text, label)`` pairs into padded model inputs.

    Args:
        batch: sequence of ``(text, label)`` pairs as produced by ``Dataset``.
        max_length: upper bound on the encoded length (special tokens
            included). Longer texts are truncated.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)`` where the
        first three come from the tokenizer with ``return_tensors='pt'`` and
        ``labels`` is a ``torch.LongTensor`` built from the batch labels.
    """
    sents = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]

    # The tokenizer reports an effectively unbounded model_max_len, so an
    # explicit max_length is required for truncation to have any effect.
    # NOTE(review): 512 assumes the checkpoint has 512 position embeddings —
    # confirm against the checkpoint's config.
    encoded = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sents,
        padding='longest',  # pad to the longest sentence in this batch only
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
        return_tensors='pt')

    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels

## train_dataloader

In [16]:
# Training loader: reshuffled every epoch; the final partial batch is dropped
# so every batch holds exactly 16 examples.
train_dataloader = torch.utils.data.DataLoader(
    Dataset('train'),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)

# Pull a single batch so its tensors can be inspected below.
i = 0
input_ids, attention_mask, token_type_ids, labels = next(iter(train_dataloader))

print(len(train_dataloader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

136


(torch.Size([16, 40]),
 torch.Size([16, 40]),
 torch.Size([16, 40]),
 tensor([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1]))

## valid_dataloader

In [17]:
# Validation loader.
# NOTE(review): drop_last=True discards the final partial batch, so a few
# validation examples are never scored. Downstream code averages per-batch
# accuracy and therefore relies on equal batch sizes.
valid_dataloader = torch.utils.data.DataLoader(
    Dataset('valid'),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)

# Pull a single batch so its tensors can be inspected below.
i = 0
input_ids, attention_mask, token_type_ids, labels = next(iter(valid_dataloader))

print(len(valid_dataloader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

45


(torch.Size([16, 44]),
 torch.Size([16, 44]),
 torch.Size([16, 44]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0]))

## test_dataloader

In [18]:
# Test loader.
# NOTE(review): drop_last=True discards the final partial batch, so a few test
# examples are never evaluated. The metric cells flatten per-batch lists with
# np.array(...).flatten(), which relies on every batch having the same size.
test_dataloader = torch.utils.data.DataLoader(
    Dataset('test'),
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)

# Pull a single batch so its tensors can be inspected below.
i = 0
input_ids, attention_mask, token_type_ids, labels = next(iter(test_dataloader))

print(len(test_dataloader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

45


(torch.Size([16, 41]),
 torch.Size([16, 41]),
 torch.Size([16, 41]),
 tensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]))

# 模型

In [19]:
from transformers import BertModel

class Model(torch.nn.Module):
    """Pretrained BERT encoder followed by a small two-class classification head.

    Args:
        pretrained_path: directory of the pretrained checkpoint handed to
            ``BertModel.from_pretrained``. The default is the same directory
            the tokenizer is loaded from, so vocabulary and weights match.
    """

    def __init__(self, pretrained_path='./pretrained_models/chinese-macbert-base'):
        super().__init__()

        # Encoder and tokenizer must come from the same checkpoint; the
        # previous hard-coded path pointed at a different directory than the
        # tokenizer cell and the recorded load logs.
        self.pretrained = BertModel.from_pretrained(pretrained_path)

        # Head: 768 -> 768 -> 2 with ReLU and dropout in between.
        self.fc = torch.nn.Sequential(torch.nn.Linear(768, 768),
                                      torch.nn.ReLU(),
                                      torch.nn.Dropout(p=0.2),
                                      torch.nn.Linear(768, 2))

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the encoder and the head.

        Args:
            input_ids, attention_mask, token_type_ids: tensors as returned by
                ``collate_fn``.
            labels: optional class indices; when given, the cross-entropy loss
                is computed as well.

        Returns:
            ``{'loss': loss_or_None, 'logits': logits}`` where ``logits`` has
            one row per input sequence and two columns.
        """
        encoded = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        # Use the hidden state at position 0 (the leading special token added
        # by the tokenizer) as the representation of the whole sequence.
        sequence_repr = encoded.last_hidden_state[:, 0]
        logits = self.fc(sequence_repr)

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return {'loss': loss, 'logits': logits}

# Build the model and report its size in units of 10,000 parameters.
model = Model()

param_count = sum(p.numel() for p in model.parameters())
print(param_count / 10000)

# Forward the batch left over from the dataloader preview cell to check that
# loss and logits come out with the expected types and shapes.
out = model(input_ids, attention_mask, token_type_ids, labels)

out['loss'], out['logits'].shape

Some weights of the model checkpoint at ./pretrained_models/chinese-macbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


10285.9778


(tensor(0.7349, grad_fn=<NllLossBackward0>), torch.Size([16, 2]))

# 测试函数

In [24]:
def test(model,test_dataloader):
    print("Running Test...")
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu' 
    print('device=', device)
    
    model = model.to(device)
    
    model.eval()
    
    correct = 0
    total = 0
    loop = tqdm(enumerate(test_dataloader), total =len(test_dataloader), desc='Test progress bar', file=sys.stdout)
    for i, (input_ids, attention_mask, token_type_ids, labels) in loop:
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        
        with torch.no_grad(): 
            out = model(input_ids, attention_mask, token_type_ids, None)
        
        correct += (out['logits'].argmax(dim=1) == labels).sum().item()
        total += len(labels)
    
    print("Test Acc: {}".format(correct / total))

# 训练函数

## 加验证集训练

In [65]:
from transformers import AdamW
from transformers.optimization import get_scheduler
import time
import sys
import datetime
from tqdm import tqdm 

def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def train(model, train_dataloader, valid_dataloader, num_epoch,
          save_path='./fine-tuning-add-epoch-gpu-valid-30epoch.model'):
    """Fine-tune ``model`` and validate it after every epoch.

    Args:
        model: module returning ``{'loss': ..., 'logits': ...}`` when called
            with ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``labels``.
        train_dataloader: batches used for optimisation.
        valid_dataloader: batches scored after each epoch, without gradients.
        num_epoch: number of passes over ``train_dataloader``.
        save_path: file the whole model object is written to with
            ``torch.save`` once all epochs have finished.

    The reported accuracy and loss are means of per-batch values.
    """
    print("Running Train...")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('device=', device)

    model = model.to(device)

    # The linear schedule needs the total number of optimiser steps up front.
    total_steps = len(train_dataloader) * num_epoch

    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=total_steps,
                              optimizer=optimizer)

    for epoch in range(num_epoch):

        print("——————Epoch {:} / {:} ——————".format(epoch + 1,num_epoch))

        start = time.time()

        model.train()

        total_train_acc = 0
        total_train_loss = 0.0

        loop = tqdm(enumerate(train_dataloader), total =len(train_dataloader), desc='Train progress bar', file=sys.stdout)
        for i, (input_ids, attention_mask, token_type_ids, labels) in loop:

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            # The optimiser owns every model parameter, so this clears all
            # gradients; a separate model.zero_grad() is redundant.
            optimizer.zero_grad()

            out = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels,
            )

            loss = out['loss']

            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            preds = out['logits'].argmax(dim=1)
            accuracy = (labels == preds).sum().item() / len(labels)
            # Accumulate a plain float: adding the loss tensor itself keeps
            # its autograd history (and device memory) alive all epoch.
            batch_loss = loss.item()
            total_train_acc += accuracy
            total_train_loss += batch_loss

            if i % 50 == 0:
                tqdm.write("Batch: {}, Train Acc: {}, Train Loss: {}, Lr: {}".format(i, accuracy, batch_loss, optimizer.param_groups[0]['lr']))

        training_time = format_time(time.time() - start)
        # Report the epoch 1-based, matching the banner printed above.
        print("Epoch: {}, Train Acc: {}, Train Loss: {}, Lr: {}, Time: {}".format(epoch + 1, total_train_acc/len(train_dataloader), total_train_loss/len(train_dataloader), optimizer.param_groups[0]['lr'], training_time))

        print("Running Validation...")

        model.eval()

        total_valid_acc = 0
        total_valid_loss = 0.0

        loop = tqdm(enumerate(valid_dataloader), total =len(valid_dataloader), desc='Valid progress bar', file=sys.stdout)
        for i, (input_ids, attention_mask, token_type_ids, labels) in loop:

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                out = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels,
                )
                loss = out['loss']
                preds = out['logits'].argmax(dim=1)
                accuracy = (labels == preds).sum().item() / len(labels)

            total_valid_acc += accuracy
            total_valid_loss += loss.item()

        print("Valid Acc: {}, Valid Loss: {}".format(total_valid_acc/len(valid_dataloader),total_valid_loss/len(valid_dataloader)))

    # Saves the whole pickled module (not just a state_dict); loading it later
    # requires the Model class to be importable/defined.
    torch.save(model, save_path) # modify path to './fine-tuned-bert.model'

# 加载模型并测试

In [None]:
def test(model,test_dataloader):
    print("Running Test...")
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu' 
    print('device=', device)
    
    model = model.to(device)
    
    model.eval()
    
    correct = 0
    total = 0
    
    loop = tqdm(enumerate(test_dataloader), total =len(test_dataloader), desc='Test progress bar', file=sys.stdout)
    
    labels_list = []
    outs_list = []
    
    for i, (input_ids, attention_mask, token_type_ids, labels) in loop:
        
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        
        with torch.no_grad():
            out = model(input_ids, attention_mask, token_type_ids, None)
        
        labels_list.append(labels.tolist())
        outs_list.append((out['logits'].argmax(dim=1)).tolist())
        
        correct += (out['logits'].argmax(dim=1) == labels).sum().item()
        total += len(labels)
        
    print("Test Acc: {}".format(correct / total))
    return labels_list, outs_list

# 定义Main函数

In [71]:
if __name__ == '__main__':
    # All three loaders share the same batching options.
    loader_options = dict(batch_size=16, collate_fn=collate_fn, shuffle=True, drop_last=True)
    train_dataloader = torch.utils.data.DataLoader(dataset=Dataset('train'), **loader_options)
    valid_dataloader = torch.utils.data.DataLoader(dataset=Dataset('valid'), **loader_options)
    test_dataloader = torch.utils.data.DataLoader(dataset=Dataset('test'), **loader_options)

    # Fine-tune a fresh model; train() writes it to disk when it finishes.
    model = Model()
    train(model, train_dataloader, valid_dataloader, num_epoch=30) # set num_epoch

    # Reload the saved model from disk and evaluate it on the test split.
    model = torch.load('./fine-tuning-add-epoch-gpu-valid-30epoch.model') # modify path to './fine-tuned-bert.model'
    labels_list, outs_list = test(model, test_dataloader)

Some weights of the model checkpoint at ./pretrained_models/chinese-macbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Train...
device= cuda
——————Epoch 1 / 30 ——————
Batch: 0, Train Acc: 0.375, Train Loss: 0.7514275908470154, Lr: 1.9995098039215686e-05
Batch: 50, Train Acc: 1.0, Train Loss: 0.22080090641975403, Lr: 1.9750000000000002e-05
Batch: 100, Train Acc: 1.0, Train Loss: 0.07771937549114227, Lr: 1.9504901960784315e-05
Train progress bar: 100%|██████████| 136/136 [01:09<00:00,  1.97it/s]
Epoch: 0, Train Acc: 0.8405330882352942, Train Loss: 0.361802339553833, Lr: 1.9333333333333333e-05, Time: 0:01:09
Running Validation...
Valid progress bar: 100%|██████████| 45/45 [00:05<00:00,  7.70it/s]
Valid Acc: 0.925, Valid Loss: 0.22786585986614227
——————Epoch 2 / 30 ——————
Batch: 0, Train Acc: 1.0, Train Loss: 0.11732280254364014, Lr: 1.932843137254902e-05
Batch: 50, Train Acc: 1.0, Train Loss: 0.03351026028394699, Lr: 1.9083333333333338e-05
Batch: 100, Train Acc: 0.9375, Train Loss: 0.23448705673217773, Lr: 1.8838235294117647e-05
Train progress bar: 100%|██████████| 136/136 [01:05<00:00,  2.09it/s]

FileNotFoundError: [Errno 2] No such file or directory: './fine-tuning-add-epoch-gpu-valid-30epoch.model'

In [69]:
# Evaluate the model currently held in memory on test_dataloader.
test(model,test_dataloader)

Running Test...
device= cuda
Test progress bar: 100%|██████████| 45/45 [00:06<00:00,  7.16it/s]
Test Acc: 0.9166666666666666


In [75]:
# 4 epoch
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

# Load the saved checkpoint and collect per-batch labels / predictions.
model = torch.load('./fine-tuning-add-epoch-gpu-valid.model')
labels_list, outs_list = test(model, test_dataloader)

# Flatten the per-batch lists into 1-D arrays (relies on equal batch sizes).
labels_list_flatten = np.array(labels_list).flatten()
outs_list_flatten = np.array(outs_list).flatten()

for report_template, metric in (
    ('accuracy:{}', accuracy_score),
    ('precision:{}', precision_score),
    ('recall:{}', recall_score),
    ('f1_score:{}', f1_score),
):
    print(report_template.format(metric(labels_list_flatten, outs_list_flatten)))

# NOTE(review): roc_curve receives hard 0/1 predictions rather than scores, so
# the curve has a single operating point.
fpr, tpr, threshold = roc_curve(labels_list_flatten, outs_list_flatten)
roc_auc = auc(fpr, tpr)
print('roc_auc:{}'.format(roc_auc))

Running Test...
device= cuda
Test progress bar: 100%|██████████| 45/45 [00:06<00:00,  7.32it/s]
Test Acc: 0.9152777777777777
accuracy:0.9152777777777777
precision:0.8260869565217391
recall:0.8397790055248618
f1_score:0.8328767123287671
roc_auc:0.8902049016492584


In [73]:
# 10 epoch
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

# Load the saved checkpoint and collect per-batch labels / predictions.
model = torch.load('./fine-tuning-add-epoch-gpu-valid-10epoch.model')
labels_list, outs_list = test(model, test_dataloader)

# Flatten the per-batch lists into 1-D arrays (relies on equal batch sizes).
labels_list_flatten = np.array(labels_list).flatten()
outs_list_flatten = np.array(outs_list).flatten()

for report_template, metric in (
    ('accuracy:{}', accuracy_score),
    ('precision:{}', precision_score),
    ('recall:{}', recall_score),
    ('f1_score:{}', f1_score),
):
    print(report_template.format(metric(labels_list_flatten, outs_list_flatten)))

# NOTE(review): roc_curve receives hard 0/1 predictions rather than scores, so
# the curve has a single operating point.
fpr, tpr, threshold = roc_curve(labels_list_flatten, outs_list_flatten)
roc_auc = auc(fpr, tpr)
print('roc_auc:{}'.format(roc_auc))

Running Test...
device= cuda
Test progress bar: 100%|██████████| 45/45 [00:06<00:00,  7.12it/s]
Test Acc: 0.9194444444444444
accuracy:0.9194444444444444
precision:0.8324324324324325
recall:0.850828729281768
f1_score:0.8415300546448088
roc_auc:0.8966574073124981


In [74]:
# 30 epoch
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

# Load the saved checkpoint and collect per-batch labels / predictions.
model = torch.load('./fine-tuning-add-epoch-gpu-valid-30epoch.model')
labels_list, outs_list = test(model, test_dataloader)

# Flatten the per-batch lists into 1-D arrays (relies on equal batch sizes).
labels_list_flatten = np.array(labels_list).flatten()
outs_list_flatten = np.array(outs_list).flatten()

for report_template, metric in (
    ('accuracy:{}', accuracy_score),
    ('precision:{}', precision_score),
    ('recall:{}', recall_score),
    ('f1_score:{}', f1_score),
):
    print(report_template.format(metric(labels_list_flatten, outs_list_flatten)))

# NOTE(review): roc_curve receives hard 0/1 predictions rather than scores, so
# the curve has a single operating point.
fpr, tpr, threshold = roc_curve(labels_list_flatten, outs_list_flatten)
roc_auc = auc(fpr, tpr)
print('roc_auc:{}'.format(roc_auc))

Running Test...
device= cuda
Test progress bar: 100%|██████████| 45/45 [00:06<00:00,  7.35it/s]
Test Acc: 0.9166666666666666
accuracy:0.9166666666666666
precision:0.8296703296703297
recall:0.8388888888888889
f1_score:0.8342541436464089
roc_auc:0.8907407407407408
