In [1]:
import torch
from transformers import BertModel,BertTokenizer
from transformers import AdamW
from datasets import load_dataset, load_from_disk


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [2]:
#加载预训练模型
pretrained_model = BertModel.from_pretrained('bert-base-chinese')
#需要移动到cuda上
pretrained_model.to(device)

#不训练,反向传播时不需要计算梯度
for param in pretrained_model.parameters():
    param.requires_grad_(False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
#定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # 所有计算得到的tensor的requires_grad自动设置为False
        with torch.no_grad():
            out = pretrained_model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
#         print(out.last_hidden_state.shape)  # torch.Size([16, 100, 768])
#         print(out.last_hidden_state[:, 0].shape)  # torch.Size([16, 768]) # 注意取0位置
        out = self.fc(out.last_hidden_state[:, 0])
        # 经过激活函数
        out = out.softmax(dim=1)  
        return out

# 加载整个模型
model = Model()
# 同样要移动到cuda
model.to(device)

# 虚拟一批数据,需要把所有的数据都移动到cuda上
input_ids = torch.ones(16, 100).long().to(device) 
attention_mask = torch.ones(16, 100).long().to(device)
token_type_ids = torch.ones(16, 100).long().to(device)
labels = torch.ones(16).long().to(device)

#试算
out = model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids)
out
out.shape
#后面的计算和中文分类完全一样，只是放在了cuda上计算

torch.Size([16, 2])

In [4]:
#定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)
#         self.dataset = load_from_disk('./data/ChnSentiCorp')[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


dataset = Dataset('train')
len(dataset), dataset[0]

Found cached dataset chn_senti_corp (C:/Users/lizhong/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


(9600,
 ('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  1))

In [5]:
#加载字典和分词工具，进行编码
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    #编码
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=50,
                                   return_tensors='pt',
                                   return_length=True)

 
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)

    #print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


#数据加载器
loader = torch.utils.data.DataLoader(dataset=Dataset("train"),
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

# for i, (input_ids, attention_mask, token_type_ids,
#         labels) in enumerate(loader):
#     break

Found cached dataset chn_senti_corp (C:/Users/lizhong/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


In [7]:
len(loader)

600

In [8]:
# 优化器
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    # 训练
    model.train()
    
    for i, (input_ids, attention_mask, token_type_ids,labels) in enumerate(loader):
        # 正向传播
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
       # 计算损失
        loss = criterion(out, labels)
        # 反向传播，计算梯度
        loss.backward()
        # 更新参数
        optimizer.step()
        # 梯度归零
        optimizer.zero_grad()

        if i % 50 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)

            print(f"第{i}批数据：", loss.item(), accuracy)

        if i == 1000:
            break
            
if __name__ == "__main__":
    print("start...")
    train()
    print("end...")



start...
第0批数据： 0.6922425031661987 0.4375
第50批数据： 0.509463906288147 0.9375
第100批数据： 0.4561876654624939 0.9375
第150批数据： 0.457298219203949 0.875
第200批数据： 0.3816230893135071 1.0
第250批数据： 0.5415152311325073 0.8125
第300批数据： 0.417835533618927 0.9375
第350批数据： 0.43066173791885376 0.9375
第400批数据： 0.6028048992156982 0.6875
第450批数据： 0.4416239261627197 0.875
第500批数据： 0.43899375200271606 0.875
第550批数据： 0.45590609312057495 0.8125
end...


In [10]:
loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

#测试
def test():
    model.eval()
    correct = 0
    total = 0

    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)
        
#         print(i)
        if i == 1000:
            break

    print(correct / total)

if __name__ == "__main__":
    print("start...")
    test()
    print("end...")

Found cached dataset chn_senti_corp (C:/Users/lizhong/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


start...
0.831081081081081
end...
