公共配置

In [1]:
import pandas as pd
from glob import glob
import os
import random

import torch
from torch.utils import data
import torch.nn as nn
from torchcrf import CRF


In [2]:
EVENTFILE = './EventFile/'
OUTPUTFILE = './OutputFile/'

TRAINDATA = './OutputFile/trainData.txt'
TESTDATA = './OutputFile/testData.txt'
TESTDATASIZE = 0.1

VOCAB_PATH = './OutputFile/vocab.txt'
LABEL_PATH = './OutputFile/label.txt'

WORD_PAD = '<PAD>'
WORD_UNK = '<UNK>'
VOCAB_SIZE = 3000

WORD_PAD_ID = 0
LABEL_O_ID = 0
WORD_UNK_ID = 1

EMBEDDING_DIM = 100
HIDDEN_SIZE = 256
TARGET_SIZE = 3
LR = 1e-3
EPOCH = 100
MODEL_DIR = './OutputFile/model/'


数据处理

In [3]:
# 获得打标签的位置
def get_annotation(annPath):
    with open(annPath, encoding='utf-8') as file:
        anns = {}
        for line in file.readlines():
            arr = line.split('\t')[1].split()
            name = arr[0]
            start = int(arr[1])
            end = int(arr[-1])
            
            # 标注太长显然不正确
            if end - start > 50:
                continue

            anns[start] = 'B-' + name
            for i in range(start + 1, end):
                anns[i] = 'I-' + name
        
        return anns


In [4]:
# 获得文件
def get_text(textPath):
    text = ''
    with open(textPath, encoding='utf-8') as file:
        for t in file.read():
            if t == '\n':
                text += '/n'
            elif t == ',':
                text += '。'
            else:
                text += t
    return text


In [5]:
# 标签和文本结合
def generate_annotation():
    for textPath in glob(EVENTFILE + '*.txt'):
        annPath = textPath[:-3] + 'ann'
        anns = get_annotation(annPath)
        text = get_text(textPath)
        # 打标签 先全打O 然后替换
        df = pd.DataFrame({'word': list(text), 'label': ['O'] * len(text)})
        df.loc[anns.keys(), 'label'] = list(anns.values())
        # 导出文件
        fileName = os.path.split(textPath)[1]
        df.to_csv(OUTPUTFILE + fileName, header=None, index=None)


In [6]:
# 拆分数据集
def get_data():
    data = glob(OUTPUTFILE + '*.txt')
    random.seed(0)
    random.shuffle(data)
    n = int(len(data) * TESTDATASIZE)
    testData = data[:n]
    trainData = data[n:]
    merge_file(trainData, TRAINDATA)
    merge_file(testData, TESTDATA)


# 合并文件
def merge_file(data, path):
    with open(path, 'a', encoding='utf-8') as file:
        for f in data:
            text = open(f, encoding='utf-8').read()
            file.write(text)

In [7]:
# 生成词表
def generate_vocab():
    df = pd.read_csv(TRAINDATA, usecols=[0], names=['word'])
    vocal_list = [WORD_PAD, WORD_UNK] + df['word'].value_counts().keys().tolist()
    vocal_list = vocal_list[:VOCAB_SIZE]
    vocal_dict = {v: k for k, v in enumerate(vocal_list)}
    vocal = pd.DataFrame(list(vocal_dict.items()))
    vocal.to_csv(VOCAB_PATH, header=None, index=None)


# 生成标签表
def generate_label():
    df = pd.read_csv(TRAINDATA, usecols=[1], names=['label'])
    label_list = df['label'].value_counts().keys().tolist()
    label_dict = {v: k for k, v in enumerate(label_list)}
    label = pd.DataFrame(list(label_dict.items()))
    label.to_csv(LABEL_PATH, header=None, index=None)

In [19]:
# generate_annotation()

# get_data()

# generate_vocab()
# generate_label()


数据加载

In [8]:
# 得到关系转化表
def get_vocab():
    df = pd.read_csv(VOCAB_PATH, names=['word', 'id'])
    return list(df['word']), dict(df.values)


def get_label():
    df = pd.read_csv(LABEL_PATH, names=['label', 'id'])
    return list(df['label']), dict(df.values)

In [9]:
class Dataset(data.Dataset):
    def __init__(self, type='train', base_len=50):
        super().__init__()
        self.base_len = base_len
        data_path = TRAINDATA if type == 'train' else TESTDATA
        self.df = pd.read_csv(data_path, encoding='utf-8', names=['word', 'label'])
        _, self.word2id = get_vocab()
        _, self.label2id = get_label()
        self.get_points()


    # 找切分点
    def get_points(self):
        self.points = [0]
        i = 0
        while True:
            if i + self.base_len >= len(self.df):
                self.points.append(len(self.df))
                break
            if self.df.loc[i + self.base_len, 'label'] == 'O':
                i += self.base_len
                self.points.append(i)
            else:
                i += 1


    def __len__(self):
        return len(self.points) - 1

    
    # 向量化
    def __getitem__(self, index):
        df = self.df[self.points[index] : self.points[index + 1]]
        wordUnkId = self.word2id[WORD_UNK]
        labelOId = self.label2id['O']
        input = [self.word2id.get(w, wordUnkId) for w in df['word']]
        target = [self.label2id.get(l, labelOId) for l in df['label']]
        return input, target

In [10]:
# 填充
def collate_fn(batch):
    # 按句子长度从大到小排序
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    # 拿到最长句子
    max_len = len(batch[0][0])
    input = []
    target = []
    mask = []

    for item in batch:
        pad_len = max_len - len(item[0])
        input.append(item[0] + [WORD_PAD_ID] * pad_len)
        target.append(item[1] + [LABEL_O_ID] * pad_len)
        mask.append([1] * len(item[0]) + [0] * pad_len)
    
    return torch.tensor(input), torch.tensor(target), torch.tensor(mask).bool()

网络定义

In [11]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM, WORD_PAD_ID)
        self.lstm = nn.LSTM(
            EMBEDDING_DIM,
            HIDDEN_SIZE,
            batch_first = True,
            bidirectional = True
        )
        self.linear = nn.Linear(2 * HIDDEN_SIZE, TARGET_SIZE)
        self.crf = CRF(TARGET_SIZE)
    

    def _get_lstm_feature(self, input):
        out = self.embed(input)
        out, _ = self.lstm(out)
        return self.linear(out)


    def forward(self, input, mask):
        out = self._get_lstm_feature(input)
        return self.crf.decode(out, mask)

    
    def loss_fn(self, input, target, mask):
        y_pred = self._get_lstm_feature(input)
        return -self.crf.forward(y_pred, target, mask, reduction='mean')

训练

In [12]:
dataset = Dataset()
loader = data.DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for e in range(EPOCH):
    for b, (input, target, mask) in enumerate(loader):
        y_pred = model(input, mask)
        loss = model.loss_fn(input, target, mask)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if b % 10 == 0:
            print(f'Epoch: {e} Loss: {loss.item()}')
    if e % 10 == 0:
        torch.save(model, MODEL_DIR + f'model_{e}.pth')

KeyboardInterrupt: 

测试

In [16]:
dataset = Dataset('test')
loader = data.DataLoader(
    dataset,
    batch_size=64,
    collate_fn=collate_fn
)

with torch.no_grad():
    model = torch.load(MODEL_DIR + 'model_100.pth')

    y_ture_list = []
    y_pred_list = []

    for b, (input, target, mask) in enumerate(loader):
        y_pred = model(input, mask)
        # loss = model.loss_fn(input, target, mask)

        for lst in y_pred:
            y_pred_list += lst

        for y, m in zip(target, mask):
            y_ture_list += y[m == True].tolist()

    y_ture_tensor = torch.tensor(y_ture_list)
    y_pred_tensor = torch.tensor(y_pred_list)

    accuracy = (y_ture_tensor == y_pred_tensor).sum() / len(y_ture_tensor)
    print(f'total: {len(y_ture_tensor)} accuracy: {accuracy.item()}')

total: 19536 accuracy: 0.7291154861450195


使用

In [14]:
def extract(label, text):
    i = 0
    res = []
    while i < len(label):
        if label[i] != 'O':
            prefix, name = label[i].split('-')
            start = end = i
            i += 1
            while i < len(label) and label[i] == 'I-' + name:
                end = i
                i += 1
            
            res.append(text[start:end + 1])
        else:
            i += 1
    
    return res

In [94]:
# 用户添加
def userAdd(text, arr):
    userWord = []
    with open('./userDict.txt', encoding='utf-8') as file:
        for l in file.readlines():
            userWord.append(l.split('\n')[0])

    for w in userWord:
        if w in text:
            arr.append(w)

    return arr    

In [17]:
text = """
一个端口支持多种协议可以使部署和运维更为方便，甚至在一些特殊的开发场景也能降低复杂度。
1、对于协议切换/架构升级的场景，通常需要同时暴露多个协议，单端口多协议能最大程度上带来运维的便捷性。
2、从后端开发的角度一套业务代码加少许配置就能暴露多个端口也能带来降低开发成本 目前 Dubbo 支持一个应用对外发布多种 RPC 协议，但这些 RPC 协议都需要独立占用一个服务端端口，另外 Dubbo QoS 也同样占用了一个端口。维护这些端口的监听需要消耗一定的资源，同时暴露多端口对于运维也存在一定复杂度，如 VIP /域名等。因此可以通过在同一个端口支持多种复用协议来降低复杂度，提高易用性
例如：一个业务逻辑需要提供给不同语言、不同业务方进行调用。 
当前大多数 rpc 框架均不支持该逻辑（包括 Dubbo）以 Dubbo 举例，使用Triple 协议并开启所有默认服务，会开启如下默认端口 Triple: 50051、Metadata : 20880、Qos: 22222。如果后续增加其他功能可能还会更多，这很不优雅，并且还启动了多个 Netty Server，造成了资源浪费。 如果本地需要测试，在不修改配置的情况下端口会冲突导致启动失败，体验很差。 

实现服务端同端口多协议暴露,将各种协议服务使用一种统一的方式使用同一个 Netty Server 进行暴露
将Qos 协议和 Triple 协议使用同一个端口暴露
将 Dubbo 的其他协议进行接入

熟练使用 Java
了解基础网络通信原理
了解Netty工作原理
"""
_, word2id = get_vocab()

input = torch.tensor([[word2id.get(w, WORD_PAD_ID) for w in text]])
mask = torch.tensor([[1] * len(text)]).bool()

model = torch.load(MODEL_DIR + 'model_90.pth')
y_pred = model(input, mask)

id2label, _ = get_label()

label = []
for l in y_pred:
    label.append(id2label[l[0]])

res = extract(label, text)
res = list(set(res))
print(res)

['Triple 协议', '\n', 'Dubbo QoS', 'RPC 协议', 'Dubbo', '网络通信原理', 'Netty', 'Qos 协议', 'Netty Server', 'Java', 'rpc 框架']
