# 医疗问诊意图识别

## 安装包

In [24]:
!pip install transformers
!pip install torch
!pip install tensorboardX

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
You should consider upgrading via the '/Users/teletraan/.pyenv/versions/3.9.4/bin/python3.9 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
You should consider upgrading via the '/Users/teletraan/.pyenv/versions/3.9.4/bin/python3.9 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting tensorboardX
  Downloading https://mirrors.aliyun.com/pypi/packages/98/88/977b2f03fd0f8a2490fc7a1ad691d5e44cee5f1dc90c57078c5c168e2e70/tensorboardX-2.4.1-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 3.4 MB/s eta 0:00:01
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.4.1
You should consider upgrading via the '/Users/teletraan/.pyenv/versions/3.9.4/bin/python3.9 -m pip install --upgrade pip' command.[0m


## 读取和处理数据

### 原始数据读取

In [1]:
import pandas as pd
# Training data
df = pd.read_csv('train.csv')
# Validation data
df_val = pd.read_csv('validation.csv')
# Test data
df_test = pd.read_csv('test.csv')

In [2]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,text,label_class,label
0,您好！我爸爸七年前得过肺结核！在前几天突然咳血，血液量大，有血块！现在住院，八天了也没查出结...,临床表现(病症表现),3
1,喉咙痒，咳嗽，白色泡沫痰嗓子红肿怀疑是检地上的东西吃引起的，请问怎么治疗,治疗方法,5
2,强迫症会引起幻听吗,相关病症,4
3,胃涨不消化怎么办,治疗方法,5
4,我就是经常在走路的时候出现手脚乱动，然后还有点心慌的情况，手脚就开始不停的发抖，老是有点精神...,治疗方法,5


### 样本标签分布情况

In [3]:
# Number of training samples per label id.
df.groupby("label")["label"].count()

label
0      386
1      854
2      119
3     1137
4      215
5     1802
6       21
7       37
8      122
9      321
10      36
11      68
12     881
Name: label, dtype: int64

In [4]:
# Number of validation samples per label id.
df_val.groupby("label")["label"].count()

label
0     13
1     22
2      2
3     45
4      6
5     63
6      1
7      2
8      5
9     13
11     2
12    41
Name: label, dtype: int64

In [5]:
# Number of test samples per label id.
df_test.groupby("label")["label"].count()

label
0      41
1      94
2      17
3     131
4      21
5     209
6       4
7       4
8      17
9      44
10      4
11     10
12     95
Name: label, dtype: int64

### 数据预处理

#### 清除英文字母、数字、乘号（×）和空格

In [6]:
import re

# Runs of ASCII letters, digits and the multiplication sign. `+` (instead of
# `*`) keeps the regex from matching the empty string at every position; the
# cleaned text is the same either way.
_NOISE_PATTERN = re.compile('[a-zA-Z0-9×]+')


def clean_text(text):
    """Remove ASCII letters, digits, the '×' sign and space characters from `text`."""
    return _NOISE_PATTERN.sub('', text).replace(' ', '')


# Apply the same cleaning to all three splits.
df.text = df.text.map(clean_text)
df_val.text = df_val.text.map(clean_text)
df_test.text = df_test.text.map(clean_text)

#### 截取文本或补齐文本到固定长度

In [7]:
def padding_sentences(input_sentences, padding_token, padding_sentence_length = 230):
    """Truncate or right-pad every sentence to a target length.

    A sentence longer than `padding_sentence_length` characters is cut to that
    many characters. Any other sentence gets `padding_token` appended once per
    missing character, so with a multi-character token the padded string is
    longer than the target when measured in characters.

    Args:
        input_sentences: iterable of sentences (strings).
        padding_token: string appended to short sentences.
        padding_sentence_length: target length, in characters.

    Returns:
        A tuple `(sentences, padding_sentence_length)` where `sentences` is the
        list of truncated/padded sentences in input order.
    """
    target = padding_sentence_length

    def _fit(text):
        missing = target - len(text)
        if missing < 0:
            return text[:target]
        return text + padding_token * missing

    return ([_fit(text) for text in input_sentences], target)

In [8]:
# Truncate/pad the text of every split with the '[PAD]' token, using the
# default target length of padding_sentences.
sentences, max_document_length = padding_sentences(df['text'], '[PAD]')
sentences_val, _ = padding_sentences(df_val['text'], '[PAD]')
sentences_test, _ = padding_sentences(df_test['text'], '[PAD]')

#### 文本转为词向量

In [9]:
from transformers import AutoTokenizer, AlbertForMaskedLM, AlbertModel
import torch
from torch.nn.functional import softmax

# Name of the pretrained checkpoint; the tokenizer and the ALBERT encoder are
# both loaded from it.
pretrained = 'voidful/albert_chinese_tiny'
tokenizer = AutoTokenizer.from_pretrained(pretrained)
albert_model = AlbertModel.from_pretrained(pretrained)

Some weights of the model checkpoint at albert_chinese_tiny were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Encode every padded training sentence with the ALBERT encoder and keep its
# last hidden state together with the sentence's label.
train_data = []
# The encoder is only used as a fixed feature extractor, so run it without
# autograd bookkeeping instead of building a graph and detaching afterwards.
with torch.no_grad():
    for i, s in enumerate(sentences):
        input_ids = torch.tensor(tokenizer.encode(s, add_special_tokens=False)).unsqueeze(0)
        outputs = albert_model(input_ids)
        # .iloc pairs sentence i with row i by position, whatever the frame's index labels are.
        train_data.append([outputs['last_hidden_state'].numpy(), df['label'].iloc[i]])


In [12]:
# Encode every padded validation sentence with the ALBERT encoder and keep its
# last hidden state together with the sentence's label.
val_data = []
# The encoder is only used as a fixed feature extractor, so run it without
# autograd bookkeeping instead of building a graph and detaching afterwards.
with torch.no_grad():
    for i, s in enumerate(sentences_val):
        input_ids = torch.tensor(tokenizer.encode(s, add_special_tokens=False)).unsqueeze(0)
        outputs = albert_model(input_ids)
        # .iloc pairs sentence i with row i by position, whatever the frame's index labels are.
        val_data.append([outputs['last_hidden_state'].numpy(), df_val['label'].iloc[i]])


In [13]:
# Encode every padded test sentence with the ALBERT encoder and keep its
# last hidden state together with the sentence's label.
test_data = []
# The encoder is only used as a fixed feature extractor, so run it without
# autograd bookkeeping instead of building a graph and detaching afterwards.
with torch.no_grad():
    for i, s in enumerate(sentences_test):
        input_ids = torch.tensor(tokenizer.encode(s, add_special_tokens=False)).unsqueeze(0)
        outputs = albert_model(input_ids)
        # .iloc pairs sentence i with row i by position, whatever the frame's index labels are.
        test_data.append([outputs['last_hidden_state'].numpy(), df_test['label'].iloc[i]])

### 数据迭代器

In [14]:
class DatasetIterater(object):
    """Re-iterable mini-batch iterator over a list of `(features, label)` pairs.

    Each batch is returned as `(x, y)`: `x` is a float32 tensor stacked from
    the features and `y` a long tensor built from the labels, both moved to
    `device`. When the number of samples is not a multiple of `batch_size`, a
    final, smaller batch is produced as well.

    Args:
        batches: sliceable sequence of `(features, label)` pairs.
        batch_size: number of samples per full batch; must be positive.
        device: device the batch tensors are moved to.

    Raises:
        ValueError: if `batch_size` is not positive.
    """

    def __init__(self, batches, batch_size, device):
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size  # number of full batches
        # True when a trailing partial batch exists. The remainder has to be
        # taken with respect to batch_size: dividing by n_batches drops the
        # tail for e.g. 10 samples / batch 4 and divides by zero when the
        # dataset is smaller than one batch.
        self.residue = len(batches) % batch_size != 0
        self.index = 0  # position of the next batch to return
        self.device = device

    def _to_tensor(self, datas):
        """Convert a list of `(features, label)` pairs into `(x, y)` tensors."""
        # Stack per-sample tensors instead of feeding a Python list of arrays
        # to the tensor constructor, which converts element by element.
        x = torch.stack(
            [torch.as_tensor(item[0], dtype=torch.float32) for item in datas]
        ).to(self.device)
        y = torch.LongTensor([item[1] for item in datas]).to(self.device)
        return x, y

    def __next__(self):
        if self.index < self.n_batches:
            start = self.index * self.batch_size
            chunk = self.batches[start:start + self.batch_size]
        elif self.residue and self.index == self.n_batches:
            # Trailing partial batch.
            chunk = self.batches[self.index * self.batch_size:]
        else:
            # Exhausted: rewind so the object can be iterated again.
            self.index = 0
            raise StopIteration
        self.index += 1
        return self._to_tensor(chunk)

    def __iter__(self):
        # Always start a new pass from the first batch, even if the previous
        # pass was abandoned early (e.g. with `break`).
        self.index = 0
        return self

    def __len__(self):
        """Number of batches per pass, counting the trailing partial batch."""
        return self.n_batches + 1 if self.residue else self.n_batches


def build_iterator(dataset, batch_size=64):
    """Wrap `dataset` in a `DatasetIterater`.

    Batches are placed on the CUDA device when one is available, otherwise on
    the CPU.

    Args:
        dataset: sequence of `(features, label)` pairs.
        batch_size: number of samples per batch (defaults to 64).

    Returns:
        A `DatasetIterater` over `dataset`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return DatasetIterater(dataset, batch_size, device)

In [15]:
# One batch iterator per split.
train_iter = build_iterator(train_data)
val_iter = build_iterator(val_data) 
test_iter = build_iterator(test_data) 

In [16]:
# Smoke-test the iterator: print the labels of the first batch only.
for i, (trains, labels) in enumerate(train_iter):
    print('label',labels)
    if i == 0:
        break
# Breaking out early leaves the iterator's cursor past the first batch; rewind
# it explicitly so that training starts from the first batch again.
train_iter.index = 0

label tensor([ 3,  5,  4,  5,  5,  5,  1, 12,  5,  5, 12,  3,  9,  5,  5,  3,  9,  5,
        12,  3,  1,  8, 12,  3,  5,  1,  4,  3,  3,  1,  5,  9,  3,  5,  5,  0,
         1,  1, 12,  5,  2, 11,  5,  0,  5,  3,  5,  5,  3,  5, 12,  3,  1,  5,
         5,  5,  0, 12, 12,  7,  8, 12,  3,  5])


## 模型构建

In [17]:
import torch.nn as nn

class Model(nn.Module):
    """1-D convolutional text classifier over precomputed token embeddings.

    Parallel `Conv1d` layers (one per kernel size) are applied along the
    sequence axis; each output is ReLU-activated and max-pooled over the whole
    sequence. The pooled features are concatenated, passed through dropout and
    two linear layers, and returned as unnormalised class scores.

    Args:
        embed_dim: size of each token embedding (input channels of the convs).
        num_filters: output channels of every convolution.
        kernel_sizes: kernel widths, one convolution per entry.
        hidden_dim: output size of the first linear layer.
        num_classes: number of output classes.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, embed_dim=312, num_filters=256, kernel_sizes=(2, 3, 4),
                 hidden_dim=128, num_classes=13, dropout=0.3):
        super().__init__()
        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_dim, num_filters, k) for k in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(num_filters * len(kernel_sizes), hidden_dim)
        # NOTE(review): there is no non-linearity between fc1 and fc2; kept
        # as-is so that existing checkpoints produce the same outputs.
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def conv_and_pool(self, x, conv):
        """Apply `conv` + ReLU, then max-pool over the full sequence axis."""
        # nn.functional is used so the class only depends on the `nn` import.
        x = nn.functional.relu(conv(x))
        x = nn.functional.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        `x` is either (batch, 1, seq_len, embed_dim) or
        (batch, seq_len, embed_dim).
        """
        # Drop only the singleton axis at position 1. A bare squeeze() would
        # also remove the batch axis for a batch holding a single sample.
        if x.dim() == 4:
            x = x.squeeze(1)
        x = x.permute(0, 2, 1)  # Conv1d expects (batch, channels, seq_len)
        out = torch.cat([self.conv_and_pool(x, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [18]:
# Put the model on the same device that build_iterator places the batches on
# (CUDA when available, otherwise CPU) so inputs and weights always match.
model = Model().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model

Model(
  (convs): ModuleList(
    (0): Conv1d(312, 256, kernel_size=(2,), stride=(1,))
    (1): Conv1d(312, 256, kernel_size=(3,), stride=(1,))
    (2): Conv1d(312, 256, kernel_size=(4,), stride=(1,))
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=13, bias=True)
)

## 模型训练

In [19]:
def evaluate(model, data_iter, test=False):
    """Evaluate `model` on every batch produced by `data_iter`.

    The model is switched to eval mode and left there; callers that continue
    training must call `model.train()` afterwards.

    Args:
        model: module mapping a batch of inputs to class scores.
        data_iter: sized iterable of `(inputs, labels)` batches.
        test: when True, also return a classification report and a confusion
            matrix.

    Returns:
        `(accuracy, mean_loss)`, or
        `(accuracy, mean_loss, report, confusion)` when `test` is True.
        `mean_loss` is a Python float: the average of the per-batch
        cross-entropy losses.
    """
    model.eval()
    loss_total = 0.0
    label_chunks = []
    predict_chunks = []
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            # .item() keeps the running total a plain float rather than a tensor.
            loss_total += F.cross_entropy(outputs, labels).item()
            label_chunks.append(labels.data.cpu().numpy())
            predict_chunks.append(torch.max(outputs.data, 1)[1].cpu().numpy())

    # Concatenate once at the end instead of re-allocating with np.append per batch.
    empty = np.array([], dtype=int)
    labels_all = np.concatenate(label_chunks) if label_chunks else empty
    predict_all = np.concatenate(predict_chunks) if predict_chunks else empty

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)

In [20]:
import time
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn import metrics
from datetime import timedelta


def get_time_dif(start_time):
    """Return the time elapsed since `start_time` as a whole-second timedelta.

    `start_time` is a timestamp as returned by `time.time()`.
    """
    elapsed_seconds = int(round(time.time() - start_time))
    return timedelta(seconds=elapsed_seconds)

def train(model, train_iter, val_iter, num_epochs, lr=1e-3, eval_every=100,
          patience=1000, save_path='./model'):
    """Train `model` with Adam, periodic validation, checkpointing and early stopping.

    Every `eval_every` batches the model is evaluated on `val_iter`; whenever
    the validation loss improves, the state dict is saved to `save_path`.
    Training stops early once more than `patience` batches have passed without
    an improvement. Metrics are logged to a tensorboardX `SummaryWriter` whose
    log directory is named after the current local time.

    Args:
        model: module to train (modified in place).
        train_iter: iterable of `(inputs, labels)` training batches.
        val_iter: iterable of validation batches, passed to `evaluate`.
        num_epochs: maximum number of passes over `train_iter`.
        lr: initial Adam learning rate.
        eval_every: number of batches between validation runs.
        patience: early-stopping threshold, in batches.
        save_path: where the best state dict is written.
    """
    start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Exponential learning-rate decay: after every epoch, lr = gamma * lr.
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    total_batch = 0  # number of batches processed so far
    val_best_loss = float('inf')
    last_improve = 0  # batch count at the last validation-loss improvement
    flag = False  # set once early stopping has triggered
    writer = SummaryWriter(log_dir='./' + time.strftime('%m-%d_%H.%M', time.localtime()))
    # try/finally so the writer is flushed and closed even if training raises.
    try:
        for epoch in range(num_epochs):
            print('Epoch [{}/{}]'.format(epoch + 1, num_epochs))
            for trains, labels in train_iter:
                optimizer.zero_grad()
                outputs = model(trains)
                loss = F.cross_entropy(outputs, labels)
                loss.backward()
                optimizer.step()
                if total_batch % eval_every == 0:
                    # Report metrics on the current training batch and on the validation set.
                    true = labels.data.cpu()
                    predic = torch.max(outputs.data, 1)[1].cpu()
                    train_acc = metrics.accuracy_score(true, predic)
                    val_acc, val_loss = evaluate(model, val_iter)
                    # Accept either a 0-dim tensor or a number from evaluate.
                    val_loss = float(val_loss)
                    if val_loss < val_best_loss:
                        val_best_loss = val_loss
                        torch.save(model.state_dict(), save_path)
                        improve = '*'
                        last_improve = total_batch
                    else:
                        improve = ''
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                    print(msg.format(total_batch, loss.item(), train_acc, val_loss, val_acc, time_dif, improve))
                    writer.add_scalar("loss/train", loss.item(), total_batch)
                    writer.add_scalar("loss/dev", val_loss, total_batch)
                    writer.add_scalar("acc/train", train_acc, total_batch)
                    writer.add_scalar("acc/dev", val_acc, total_batch)
                    # evaluate() switched the model to eval mode; switch back.
                    model.train()
                total_batch += 1
                if total_batch - last_improve > patience:
                    # Validation loss has not improved for more than `patience` batches: stop.
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            scheduler.step()  # learning-rate decay
            if flag:
                break
    finally:
        writer.close()

In [21]:
# Train for at most 5 epochs, validating on val_iter.
train(model, train_iter, val_iter, 5)

Epoch [1/5]
Iter:      0,  Train Loss:   2.4,  Train Acc: 17.19%,  Val Loss:   2.4,  Val Acc: 29.30%,  Time: 0:00:03 *
Epoch [2/5]
Iter:    100,  Train Loss:   1.4,  Train Acc: 53.12%,  Val Loss:   1.3,  Val Acc: 55.35%,  Time: 0:01:05 *
Epoch [3/5]
Iter:    200,  Train Loss:   1.2,  Train Acc: 56.25%,  Val Loss:   1.2,  Val Acc: 58.14%,  Time: 0:02:03 *
Epoch [4/5]
Iter:    300,  Train Loss:   1.0,  Train Acc: 57.81%,  Val Loss:   1.1,  Val Acc: 60.47%,  Time: 0:03:01 *
Epoch [5/5]
Iter:    400,  Train Loss:  0.65,  Train Acc: 82.81%,  Val Loss:   1.2,  Val Acc: 58.14%,  Time: 0:03:59 


## 模型测试

In [22]:
def test(model, test_iter, checkpoint_path=None):
    """Evaluate `model` on `test_iter` and print the results.

    Prints the loss and accuracy, a per-class precision/recall/F1 report, the
    confusion matrix and the elapsed time.

    Args:
        model: module to evaluate.
        test_iter: iterable of `(inputs, labels)` test batches.
        checkpoint_path: optional path of a saved state dict to load into
            `model` before evaluating. With the default `None` the model is
            evaluated with its current weights.
    """
    if checkpoint_path is not None:
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


In [23]:
# Evaluate the model, with its current weights, on the test split.
test(model, test_iter)

Test Loss:   1.2,  Test Acc: 59.48%
Precision, Recall and F1-Score...
              precision    recall  f1-score   support

           0     0.5200    0.6341    0.5714        41
           1     0.5849    0.6596    0.6200        94
           2     0.6250    0.5882    0.6061        17
           3     0.5393    0.3664    0.4364       131
           4     0.4000    0.2857    0.3333        21
           5     0.7131    0.8086    0.7578       209
           6     1.0000    0.2500    0.4000         4
           7     0.0000    0.0000    0.0000         4
           8     0.8571    0.3529    0.5000        17
           9     0.8333    0.3409    0.4839        44
          10     0.6667    0.5000    0.5714         4
          11     1.0000    0.4000    0.5714        10
          12     0.4336    0.6526    0.5210        95

    accuracy                         0.5948       691
   macro avg     0.6287    0.4492    0.4902       691
weighted avg     0.6138    0.5948    0.5839       691

Confusion

## 模型保存

In [24]:
# Save the model's current parameters (state dict only) to ./model.pt.
torch.save(model.state_dict(), './model.pt')

## 总结

数据的标签分布不均衡（例如训练集中标签 6 仅有 21 条样本，而标签 5 有 1802 条）。最终模型出现了过拟合：训练后期训练批次上的准确率升至 82.81%，而验证集准确率停留在 58% 左右。后续可通过增加数据、变更或调整模型来进行优化。