In [2]:
import os
import jieba
import torch
import pickle
import pandas as pd
import torch.nn as nn

from ark_nlp.model.tc.bert import Bert
from ark_nlp.model.tc.bert import BertConfig
from ark_nlp.model.tc.bert import Dataset
from ark_nlp.model.tc.bert import Task
from ark_nlp.model.tc.bert import get_default_model_optimizer
from ark_nlp.model.tc.bert import Tokenizer

### 一、数据读入与处理

#### 1. 数据读入

In [3]:
# Both CHIP-CDN splits sit in the same directory; only the file name differs.
cdn_data_dir = '../mydata/data_origin/220602_0902-cblue-nlp-医疗nlp打榜/CHIP-CDN'

train_data_df = pd.read_json(f'{cdn_data_dir}/CHIP-CDN_train.json')
dev_data_df = pd.read_json(f'{cdn_data_dir}/CHIP-CDN_dev.json')

In [4]:
def count_to_label(result_count):
    """Map a normalized-term count to a class label.

    Counts of 1 and 2 are used as the label directly; any count above 2 is
    folded into the catch-all label 0.
    """
    return result_count if result_count <= 2 else 0


# `normalized_result` holds '##'-separated entries; the number of entries is
# the quantity the classifier is trained to predict.
for split_df in (train_data_df, dev_data_df):
    split_df['normalized_result_num'] = split_df['normalized_result'].apply(
        lambda joined: len(joined.split('##'))
    )
    split_df['normalized_result_num_label'] = split_df['normalized_result_num'].apply(count_to_label)

In [5]:
# Keep only the input text and the derived label, exposing the latter under
# the column name 'label'.
kept_columns = ['text', 'normalized_result_num_label']
column_renames = {'normalized_result_num_label': 'label'}

train_data_df = train_data_df[kept_columns].rename(columns=column_renames)
dev_data_df = dev_data_df[kept_columns].rename(columns=column_renames)

In [6]:
# Build the training dataset first: its category list defines the label <-> id
# mapping (`cat2id`) that the model config, predictor and saved pickle all use.
tc_train_dataset = Dataset(train_data_df)

# Reuse the training categories for the dev split instead of letting it infer
# its own. If the dev data ever lacked one of the classes, an independently
# inferred mapping would assign different ids to the same labels and silently
# corrupt the evaluation metrics.
tc_dev_dataset = Dataset(dev_data_df, categories=tc_train_dataset.categories)

#### 2. 词典创建和生成分词器

In [7]:
# Tokenizer backed by the ERNIE 1.0 vocabulary.
# NOTE(review): max_seq_len=100 is presumably the truncation/padding length in
# tokens - confirm it covers the longest mention in the data.
tokenizer = Tokenizer(
    vocab='nghuyong/ernie-1.0',
    max_seq_len=100,
)

#### 3. ID化

In [8]:
# Convert both splits to model inputs with the same tokenizer, training split first.
for tc_dataset in (tc_train_dataset, tc_dev_dataset):
    tc_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [9]:
# The classifier head gets one output per category seen in the training set.
num_labels = len(tc_train_dataset.cat2id)

config = BertConfig.from_pretrained('nghuyong/ernie-1.0', num_labels=num_labels)

#### 2. 模型创建

In [10]:
# Release unused cached GPU memory held by PyTorch's caching allocator before
# the model is loaded (useful when re-running this notebook in a live kernel).
torch.cuda.empty_cache()

In [11]:
# Load the pretrained ERNIE 1.0 weights into the classification model. The
# classifier head is newly initialised (see the warning below), so the model
# must be fine-tuned before use.
dl_module = Bert.from_pretrained(
    'nghuyong/ernie-1.0',
    config=config,
)

Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing Bert: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing Bert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Bert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Bert were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [12]:
# 设置运行次数
# Training hyper-parameters passed to `model.fit` further down:
# mini-batch size and number of training epochs.
batch_size = 32
num_epoches = 5

In [13]:
# Name fragments identifying parameters that must not be weight-decayed
# (biases and LayerNorm scale/shift).
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

# (name, parameter) pairs to optimise; anything belonging to the pooler is left out.
param_optimizer = [
    (param_name, param)
    for param_name, param in dl_module.named_parameters()
    if 'pooler' not in param_name
]

# Partition the parameters into the decayed and non-decayed groups,
# preserving the model's parameter order inside each group.
decayed_params = []
undecayed_params = []
for param_name, param in param_optimizer:
    if any(fragment in param_name for fragment in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

#### 2. 任务创建

In [15]:
# Shell escape: print the current GPU status (utilisation, memory in use)
# before choosing a device for training.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Thu Jun 16 06:20:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:06.0 Off |                    0 |
| N/A   72C    P0    72W /  70W |   9760MiB / 15109MiB |    100%      Default |
|                               |            

In [16]:
# Wrap the network in the training task.
# 'adamw' / 'lsce' are the optimizer and loss-function identifiers understood
# by ark_nlp (presumably AdamW and label-smoothing cross-entropy - confirm).
# NOTE(review): cuda_device=3 is hard-coded; make sure the target machine
# actually exposes a GPU with index 3.
# ema_decay enables the `model.ema` helper used after training.
model = Task(
    dl_module,
    'adamw',
    'lsce',
    cuda_device=3,
    ema_decay=0.995,
)

#### 3. 训练

In [17]:
# Keyword options for the training run, gathered in one place.
fit_options = {
    'lr': 3e-5,
    'epochs': num_epoches,
    'batch_size': batch_size,
    # Parameter groups with/without weight decay, built above.
    'params': optimizer_grouped_parameters,
}

# Train on the training split and evaluate on the dev split.
model.fit(tc_train_dataset, tc_dev_dataset, **fit_options)

 53% 100/188 [00:48<00:43,  2.04it/s]

[99/188],train loss is:0.888844,train evaluation is:0.589063


100% 188/188 [01:31<00:00,  2.05it/s]


epoch:[0],train loss is:0.831329,train evaluation is:0.636469 

classification_report: 
               precision    recall  f1-score   support

           0       0.82      0.65      0.73       277
           1       0.76      0.79      0.77       972
           2       0.63      0.64      0.63       751

    accuracy                           0.71      2000
   macro avg       0.73      0.69      0.71      2000
weighted avg       0.72      0.71      0.71      2000

confusion_matrix_: 
 [[181   9  87]
 [  4 769 199]
 [ 36 237 478]]
test loss is:0.730034,test acc is:0.714000,f1_score is:0.710654


 53% 100/188 [00:49<00:44,  2.00it/s]

[99/188],train loss is:0.702268,train evaluation is:0.745313


100% 188/188 [01:33<00:00,  2.01it/s]


epoch:[1],train loss is:0.693873,train evaluation is:0.747008 

classification_report: 
               precision    recall  f1-score   support

           0       0.79      0.74      0.76       277
           1       0.78      0.83      0.80       972
           2       0.68      0.65      0.66       751

    accuracy                           0.75      2000
   macro avg       0.75      0.74      0.74      2000
weighted avg       0.75      0.75      0.75      2000

confusion_matrix_: 
 [[204   7  66]
 [  7 806 159]
 [ 46 219 486]]
test loss is:0.697966,test acc is:0.748000,f1_score is:0.744426


 53% 100/188 [00:50<00:44,  1.99it/s]

[99/188],train loss is:0.608291,train evaluation is:0.809063


100% 188/188 [01:34<00:00,  1.99it/s]


epoch:[2],train loss is:0.611963,train evaluation is:0.806682 

classification_report: 
               precision    recall  f1-score   support

           0       0.83      0.73      0.78       277
           1       0.81      0.82      0.81       972
           2       0.69      0.71      0.70       751

    accuracy                           0.77      2000
   macro avg       0.78      0.75      0.76      2000
weighted avg       0.77      0.77      0.77      2000

confusion_matrix_: 
 [[202   6  69]
 [  6 794 172]
 [ 34 183 534]]
test loss is:0.701244,test acc is:0.765000,f1_score is:0.763522


 53% 100/188 [00:50<00:44,  1.99it/s]

[99/188],train loss is:0.532102,train evaluation is:0.866875


100% 188/188 [01:34<00:00,  1.99it/s]


epoch:[3],train loss is:0.540341,train evaluation is:0.861868 

classification_report: 
               precision    recall  f1-score   support

           0       0.82      0.74      0.77       277
           1       0.82      0.80      0.81       972
           2       0.68      0.72      0.70       751

    accuracy                           0.76      2000
   macro avg       0.77      0.75      0.76      2000
weighted avg       0.76      0.76      0.76      2000

confusion_matrix_: 
 [[204   6  67]
 [  5 776 191]
 [ 41 169 541]]
test loss is:0.716088,test acc is:0.760500,f1_score is:0.759777


 53% 100/188 [00:50<00:44,  1.97it/s]

[99/188],train loss is:0.479787,train evaluation is:0.896250


100% 188/188 [01:34<00:00,  1.98it/s]


epoch:[4],train loss is:0.489596,train evaluation is:0.892287 

classification_report: 
               precision    recall  f1-score   support

           0       0.82      0.74      0.78       277
           1       0.83      0.80      0.81       972
           2       0.68      0.73      0.70       751

    accuracy                           0.77      2000
   macro avg       0.78      0.76      0.76      2000
weighted avg       0.77      0.77      0.77      2000

confusion_matrix_: 
 [[205   2  70]
 [  6 774 192]
 [ 38 162 551]]
test loss is:0.744606,test acc is:0.765000,f1_score is:0.764847


In [18]:
# Swap in the EMA weights for inference and saving.
# `store` is called first so the current (non-averaged) parameters are kept
# by the EMA helper, then `copy_to` writes the averaged values into the module
# (presumed semantics of the ark_nlp EMA helper - confirm against its docs).
# `parameters()` returns a fresh iterator on each call, so it is invoked twice.
ema_helper = model.ema
ema_helper.store(model.module.parameters())
ema_helper.copy_to(model.module.parameters())

<br>

### 四、模型验证与保存

#### 1. 模型验证

In [19]:
from ark_nlp.factory.predictor import TCPredictor

In [20]:
# Predictor built from the trained module, the tokenizer used for training and
# the training label mapping.
tc_predictor_instance = TCPredictor(
    model.module,
    tokenizer,
    tc_train_dataset.cat2id,
)

In [21]:
# Smoke-test the predictor on a single mention; with return_proba=True the
# result pairs the predicted label with its probability.
tc_predictor_instance.predict_one_sample(
    '怀孕伴精神障碍',
    return_proba=True,
)

[('2', 0.7087128162384033)]

#### 2. 模型保存

In [22]:
# Create the checkpoint directory (and any missing parents) in pure Python.
# Unlike the `!mkdir -p` shell escape this does not fork the kernel process
# (which triggers the huggingface/tokenizers fork warning) and it works on any
# platform. exist_ok=True keeps the cell safe to re-run.
os.makedirs('checkpoint/predict_num', exist_ok=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
import pickle

In [24]:
# Persist only the network weights (state dict), not the whole Task wrapper.
module_weights_path = 'checkpoint/predict_num/module.pth'
torch.save(model.module.state_dict(), module_weights_path)

In [25]:
# Save the label -> id mapping next to the weights so predictions can be
# decoded when the checkpoint is reloaded.
cat2id_path = 'checkpoint/predict_num/cat2id.pkl'
with open(cat2id_path, "wb") as cat2id_file:
    pickle.dump(tc_train_dataset.cat2id, cat2id_file)