# 数据处理模块



**目录：**

0. 数据读取

1. 句子特征提取

2. 标签类别收集

3. 样本转化为模型可读的特征

---


In [1]:
import argparse

import os
import copy
import json
import logging

import numpy as np
from copy import deepcopy
from collections import Counter
from collections import OrderedDict
from ordered_set import OrderedSet


import torch
from torch.utils.data import TensorDataset, RandomSampler, DataLoader
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

logger = logging.getLogger(__name__)

## 1 训练数据形式回顾


### 两个数据集

<img src="./数据结构0.PNG"  width="400" height="600" align="left" />

### 数据集格式

Stanford数据集有三个domain，所以一个domain做train，一个做dev，一个做test；

json文件的key就是domain名称；

数据文件中每个 domain 的内容外面多包了一层不必要的 list，直接遍历会报错；下面的代码通过 `domain[0]` 取出真正的 episode 列表，换用其他数据时需要注意这一点；

<img src="./数据结构1.PNG"  width="300" height="300" align="left" />


每个episode的数据

1. 每个episode中"support"的句子数量是不一定的；

2. "seq_outs"是指slot标签数据，这里是没有提供的，也是本文不会去研究的；

<img src="./数据结构2.PNG"  width="300" height="300" align="left" />

回忆：support set 需要满足 minimum-including 条件：

1. domain 下每个标签至少出现 K 次（K-shot）；
2. 从 support set 中去掉任意一个样本后，条件 1 就不再成立；

<img src="./数据结构3.PNG"  width="300" height="300" align="left" />

## 2 数据集加载

In [2]:
# coding:utf-8
import json
import collections
import logging

import sys
from typing import List, Tuple, Dict

In [3]:
class RawDataLoaderBase:
    """Common parent of the raw-data loaders; both hooks do nothing here."""

    def __init__(self, *args, **kwargs):
        """Accept any constructor arguments and ignore them."""

    def load_data(self, path: str):
        """Placeholder for reading the file at ``path``; returns ``None``."""
        return None

In [4]:
# Record type for one plain input sentence. Note: this is NOT a few-shot
# sample, which pairs a query sentence with a whole support set.
DataItem = collections.namedtuple("DataItem", "seq_in seq_out label")

# Why namedtuple:
# the positions of a plain tuple carry no names, so a reader often cannot tell
# what a given tuple is meant to express. The collections.namedtuple factory
# builds a tuple type whose positions also have field names.

import collections

# Field names can be passed as a list of strings or as a single
# whitespace-separated string; the string form is used below.
User = collections.namedtuple("User", "name sex age")
# List form: collections.namedtuple('User', ['name', 'sex', 'age'])
user = User(name="tester", sex="男", age="12")

print(user)

# Read the fields back as attributes, one per output line.
print(user.name, user.sex, user.age, sep="\n")

User(name='tester', sex='男', age='12')
tester
男
12


In [5]:
class FewShotExample(object):
    """
    One few-shot sample: a single query item paired with a support set.

    Attributes:
        gid: id given by the caller.
        batch_id: id of the few-shot batch (episode) given by the caller.
        test_id: relative index of the query inside its episode.
        domain_name: name of the domain the episode belongs to.
        support_data_items: all support data items.
        test_data_item: the one query data item.
    """

    def __init__(
            self,
            gid: int,
            batch_id: int,
            test_id: int,
            domain_name: str,
            support_data_items: List[DataItem],
            test_data_item: DataItem
    ):
        self.gid, self.batch_id, self.test_id = gid, batch_id, test_id
        self.domain_name = domain_name

        # The query comes with the complete support set, stored as given.
        self.support_data_items = support_data_items
        self.test_data_item = test_data_item

    def __str__(self):
        return repr(self)

    def __repr__(self):
        query = self.test_data_item
        shown = (self.gid, self.domain_name, query.seq_in, query.seq_out, self.support_data_items)
        return 'gid:{}\n\tdomain:{}\n\ttest_data:{}\n\ttest_label:{}\n\tsupport_data:{}'.format(*shown)

In [6]:
class FewShotRawDataLoader(RawDataLoaderBase):
    """Read a few-shot episode file and pair every query with its support set."""

    def __init__(self, opt):
        """
        :param opt: option object; only ``opt.do_debug`` is read by this class.
        """
        super(FewShotRawDataLoader, self).__init__()
        self.opt = opt
        self.debugging = opt.do_debug

    def load_data(self, path: str) -> (List[FewShotExample], List[List[FewShotExample]], int):
        """
        Load a few-shot data set from a JSON file.

        :param path: path of the JSON file
        :return: a triple
            examples: list of all examples built from the file
            few_shot_batches: list of few-shot batches, each a list of examples
            max_support_size: largest number of support items in any episode
                (computed over the whole file, also in debug mode)
        """
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as reader:
            raw_data = json.load(reader)
        examples, few_shot_batches, max_support_size = self.raw_data2examples(raw_data)
        if self.debugging:
            # Debug mode keeps only a small prefix of the data.
            examples, few_shot_batches = examples[:8], few_shot_batches[:2]
        return examples, few_shot_batches, max_support_size

    def raw_data2examples(self, raw_data: Dict) -> (List[FewShotExample], List[List[FewShotExample]], int):
        """
        Convert the parsed JSON into few-shot examples.

        ``raw_data`` maps a domain name to a list; only its first element,
        the list of episodes, is read. Each episode has a 'support' part and
        a 'query' part.

        :return: (examples, few_shot_batches, max_support_size);
            max_support_size is 0 when the data holds no episode.
        """
        examples = []
        few_shot_batches = []
        all_support_size = []
        for domain_n, domain in raw_data.items():
            # NOTE: "batch" here means a few-shot episode, not a training batch.
            for batch_id, batch in enumerate(domain[0]):
                support_data_items, test_data_items = self.batch2data_items(batch)
                all_support_size.append(len(support_data_items))

                # Pair each query sample with the full support set of its episode.
                # gid is the running index of the example over the whole file.
                first_gid = len(examples)
                one_batch_examples = [
                    FewShotExample(
                        gid=first_gid + test_id,
                        batch_id=batch_id,
                        test_id=test_id,
                        domain_name=domain_n,
                        test_data_item=test_data_item,
                        support_data_items=support_data_items,
                    )
                    for test_id, test_data_item in enumerate(test_data_items)
                ]
                examples.extend(one_batch_examples)
                few_shot_batches.append(one_batch_examples)

        # default=0 keeps an episode-free file from crashing inside max().
        max_support_size = max(all_support_size, default=0)
        return examples, few_shot_batches, max_support_size

    def batch2data_items(self, batch: dict) -> (List[DataItem], List[DataItem]):
        """Return (support items, query items) of one episode."""
        support_data_items = self.get_data_items(parts=batch['support'])
        test_data_items = self.get_data_items(parts=batch['query'])
        return support_data_items, test_data_items

    def get_data_items(self, parts: dict) -> List[DataItem]:
        """
        Turn the parallel lists 'seq_ins', 'seq_outs' and 'labels' of one
        episode part into a list of ``DataItem`` named tuples.

        The three lists are zipped, so the result is as long as the shortest.
        """
        # todo: move word-piecing into preprocessing module
        return [
            DataItem(seq_in=seq_in, seq_out=seq_out, label=label)
            for seq_in, seq_out, label in zip(parts['seq_ins'], parts['seq_outs'], parts['labels'])
        ]

In [7]:
### Data loading: worked example

class Config:
    do_debug = False


opt = Config()
opt.do_debug = False

path = "../data/stanford/stanford.0.spt_s_1.q_s_32.ep_200--use_schema--label_num_schema2/train.json"
data_loader = FewShotRawDataLoader(opt)
examples, few_shot_batches, max_support_size = data_loader.load_data(path)

separator = "-" * 50

# Overall number of examples, then the first one in full.
print(separator)
print("num of examples: ", len(examples))
print(separator)
print("the first example: ", examples[0])
print(separator)

# Number of few-shot batches and the size of the first one.
print("num of batches: ", len(few_shot_batches))
print(separator)
print("the first batch: ", len(few_shot_batches[0]))
print(separator)

--------------------------------------------------
num of examples:  6400
--------------------------------------------------
the first example:  gid:0
	domain:weather
	test_data:['Thank', 'you', 'very', 'much', 'car', '!']
	test_label:['O', 'O', 'O', 'O', 'O', 'O']
	support_data:[DataItem(seq_in=['I', 'need', 'to', 'know', 'what', 'the', 'lowest', 'temperature', 'will', 'be', 'in', 'New', 'York', 'in', 'the', 'next', 'few', 'days', '.'], seq_out=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], label=['request_low_temperature']), DataItem(seq_in=['Which', 'one', 'is', 'gon', 'na', 'be', 'the', 'lowest', 'temperature', 'today', 'and', 'tomorrow', 'in', 'Fresno', '?'], seq_out=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], label=['request_time']), DataItem(seq_in=['thank', 'you'], seq_out=['O', 'O'], label=['appreciate']), DataItem(seq_in=['Find', 'what', 'the', 'temperature', 'will', 'be', 'in', 'downtown', 'Ch

### label集合收集：

In [8]:
def flatten(l):
    """Concatenate the sub-lists of ``l`` into one flat list, keeping order."""
    # As a comprehension this is [item for sublist in l for item in sublist];
    # the two `for` clauses must not be swapped.
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

l_0 = [[4, 5], [2, 3]]
print(flatten(l_0))

[4, 5, 2, 3]


In [9]:
from typing import List, Tuple, Dict

def make_dict(opt, examples: List["FewShotExample"]) -> (Dict[str, int], Dict[int, str]):
    """
    Build the label2id / id2label mappings from all query and support items.

    label2id follows these rules:
    For sequence labeling (``opt.task == 'sl'``):
        1. id(PAD)=0 id(O)=1  2. id(B-X)=i  id(I-X)=i+1
    For every other task value (sentence-level labels):
        1. id(PAD)=0, then one id per label name

    Label names are sorted before ids are assigned, so the ids are stable
    across runs. ``opt.index_label`` / ``opt.unused_label`` are not consulted.

    :param opt: option object; only ``opt.task`` is read.
    :param examples: examples whose ``test_data_item`` and
        ``support_data_items`` carry ``seq_out`` (token labels) and ``label``
        (sentence labels).
    :return: (label2id, id2label)
    """

    def strip_bio_prefix(tag):
        # Only a leading 'B-' / 'I-' is a BIO marker. str.replace would also
        # delete those characters inside a name (e.g. 'WI-FI' -> 'WFI'), and
        # the rebuilt 'B-WFI' key would then never match the data.
        return tag[2:] if tag.startswith(('B-', 'I-')) else tag

    is_sl = opt.task == 'sl'

    # Collect every label name from all query items and all support items.
    label_names = set()
    for example in examples:
        for data_item in [example.test_data_item, *example.support_data_items]:
            raw_labels = data_item.seq_out if is_sl else data_item.label
            label_names.update(strip_bio_prefix(tag) for tag in raw_labels)
    label_set = sorted(label_names)  # sort to make embedding id fixed

    # '[PAD]' always takes the first position, id 0.
    label2id = {'[PAD]': 0}
    if is_sl:
        label2id['O'] = len(label2id)
        for label in label_set:
            if label == 'O':
                continue
            label2id['B-' + label] = len(label2id)
            label2id['I-' + label] = len(label2id)
    else:  # mlc. sc
        for label in label_set:
            label2id[label] = len(label2id)

    id2label = {idx: label for label, idx in label2id.items()}
    return label2id, id2label

In [10]:
# Example: build the label vocabulary for the examples loaded above.

# Any task value other than 'sl' makes make_dict use the sentence-level labels.
opt.task = "stanford"
opt.index_label = opt.unused_label = False

label2id, id2label = make_dict(opt, examples)
print(label2id)

{'[PAD]': 0, 'appreciate': 1, 'inform': 2, 'query': 3, 'request_high_temperature': 4, 'request_low_temperature': 5, 'request_temperature': 6, 'request_time': 7, 'request_weather': 8}


## 句子特征提取

### 使用stanford pos-tagger做词性标注

https://nlp.stanford.edu/software/tagger.html


<img src="./stanford-nlp-group-photo.jpg"  width="900" height="900" align="left" />

<img src="./pos_tagger.PNG"  width="900" height="900" align="left" />

In [11]:
import os
import json
from nltk.tag import StanfordPOSTagger

In [12]:
def load_tagger(model_dir):
    """
    Create a ``StanfordPOSTagger`` from the files found under ``model_dir``.

    The model is read from ``models/english-bidirectional-distsim.tagger`` and
    the jar from ``stanford-postagger.jar``, both relative to ``model_dir``.
    """
    return StanfordPOSTagger(
        model_filename=os.path.join(model_dir, 'models/english-bidirectional-distsim.tagger'),
        path_to_jar=os.path.join(model_dir, 'stanford-postagger.jar'),
    )

In [13]:
def get_tag_data(tagger, prefix=''):
    """
    POS-tag every distinct sentence of the data sets under ``DATA_DIR``.

    Every sub-directory of ``DATA_DIR`` whose name starts with ``prefix`` is
    scanned for train.json, dev.json and test.json. For each episode the
    'seq_ins' of its 'support' part and then of its 'query' part are tagged.
    A sentence is tagged once; later occurrences are skipped.

    ``DATA_DIR`` and ``DEBUG`` are module-level names that must be defined
    before the call. When ``DEBUG`` is truthy only the first sub-directory and
    only dev.json are processed.

    :param tagger: object with a ``tag(tokens)`` method
    :param prefix: sub-directory name prefix used as a filter
    :return: dict mapping the space-joined sentence to the result of ``tagger.tag``
    """
    all_sub_dirs = [sub_dir for sub_dir in os.listdir(DATA_DIR) if sub_dir.startswith(prefix)]
    print('all_sub_dirs: {}'.format(all_sub_dirs))
    json_file_lst = ['train.json', 'dev.json', 'test.json']

    if DEBUG:
        all_sub_dirs = all_sub_dirs[:1]
        json_file_lst = ['dev.json']

    res = {}
    for sub_dir in all_sub_dirs:
        print(sub_dir)
        for json_file in json_file_lst:
            print(json_file)
            filename = os.path.join(DATA_DIR, sub_dir, json_file)
            # JSON text is UTF-8; do not depend on the platform's default encoding.
            with open(filename, 'r', encoding='utf-8') as fr:
                json_data = json.load(fr)
            for key, episode_data in json_data.items():
                print(key)
                print(type(episode_data))
                print(len(episode_data))
                print(len(episode_data[0]))
                # Only the first element of the domain list is read.
                for e_item in episode_data[0]:
                    # Same treatment for both parts: support first, then query.
                    for part in ('support', 'query'):
                        for seq_in in e_item[part]['seq_ins']:
                            text = ' '.join(seq_in)
                            if text in res:
                                continue
                            res[text] = tagger.tag(seq_in)
                            # Progress report after every 10 newly tagged sentences.
                            if len(res) % 10 == 0:
                                print('d_count - {}'.format(len(res)))
    return res

#### 举例


['So', 'this', 'is', 'more', 'high', '-tech', "it's", 'what', 'I', 'think', 'people', 'like', 'today', '.']

[('So', 'RB'), ('this', 'DT'), ('is', 'VBZ'), ('more', 'RBR'), ('high', 'JJ'), ('-tech', 'IN'), ("it's", 'PRP$'), ('what', 'WP'), ('I', 'PRP'), ('think', 'VBP'), ('people', 'NNS'), ('like', 'IN'), ('today', 'NN'), ('.', '.')]


### 预处理器

预处理器由 InputBuilder、OutputBuilder、FeatureConstructor 三个部分组成；


#### InputBuilder

In [14]:
# Text-level (raw) features of one sentence.
FeatureItem = collections.namedtuple(
    "FeatureItem",
    (
        # tokens matching the input token ids, e.g. word-piece tokens with [CLS], [SEP]
        "tokens",
        # labels for all input positions, e.g. labels for word-piece tokens
        "labels",
        "data_item",
        "token_ids",
        "segment_ids",
        "nwp_index",
        "input_mask",
        "output_mask",
    ),
)


# Numeric features used for computation.
# Shape of every element: test: (1, test_len); support: (support_size, support_len).
ModelInput = collections.namedtuple(
    "ModelInput",
    (
        # token index list
        "token_ids",
        # bert [SEP] ids
        "segment_ids",
        # non-word-piece word index, used to extract the representations of
        # non-word-piece tokens (only useful for bert)
        "nwp_index",
        # [1] * len(sent): 1 for valid positions (tokens, cls, sep, word piece),
        # 0 for padding added during batch construction
        "input_mask",
        # [1] * len(sent): 1 for valid output, 0 for padding,
        # e.g. 1 for original tokens in the sl task
        "output_mask",
    ),
)

In [15]:
# InputBuilder的基础父类

class InputBuilderBase:
    """
    Base class of the input builders.

    Provides the shared helpers that turn ``FeatureItem`` values into
    ``ModelInput`` tensors; ``__call__`` and ``data_item2feature_item`` must be
    supplied by subclasses.
    """

    def __init__(self, tokenizer):
        """
        :param tokenizer: tokenizer exposing ``vocab`` (must contain '[PAD]')
            and ``convert_tokens_to_ids``.
        """
        self.tokenizer = tokenizer

    def __call__(self, example, max_support_size, label2id
    ) -> (FeatureItem, ModelInput, List[FeatureItem], ModelInput):
        raise NotImplementedError

    def data_item2feature_item(self, data_item: DataItem, seg_id: int) -> FeatureItem:
        raise NotImplementedError

    def get_test_model_input(self, feature_item: FeatureItem) -> ModelInput:
        """Convert one query FeatureItem into LongTensors usable as model input."""
        ret = ModelInput(
            token_ids=torch.LongTensor(feature_item.token_ids),
            segment_ids=torch.LongTensor(feature_item.segment_ids),
            nwp_index=torch.LongTensor(feature_item.nwp_index),
            input_mask=torch.LongTensor(feature_item.input_mask),
            output_mask=torch.LongTensor(feature_item.output_mask)
        )
        return ret

    def get_support_model_input(self, feature_items: List[FeatureItem], max_support_size: int) -> ModelInput:
        """
        Pad all support FeatureItems to one rectangular block and convert each
        field into a LongTensor.

        Token ids are padded with the tokenizer's '[PAD]' id, nwp_index with
        ``[0]`` (its entries are one-element lists), every other field with 0.
        """
        pad_id = self.tokenizer.vocab['[PAD]']
        token_ids = self.pad_support_set([f.token_ids for f in feature_items], pad_id, max_support_size)
        segment_ids = self.pad_support_set([f.segment_ids for f in feature_items], 0, max_support_size)
        nwp_index = self.pad_support_set([f.nwp_index for f in feature_items], [0], max_support_size)
        input_mask = self.pad_support_set([f.input_mask for f in feature_items], 0, max_support_size)
        output_mask = self.pad_support_set([f.output_mask for f in feature_items], 0, max_support_size)
        ret = ModelInput(
            token_ids=torch.LongTensor(token_ids),
            segment_ids=torch.LongTensor(segment_ids),
            nwp_index=torch.LongTensor(nwp_index),
            input_mask=torch.LongTensor(input_mask),
            output_mask=torch.LongTensor(output_mask)
        )
        return ret

    def pad_support_set(self, item_lst: List[list], pad_value, max_support_size: int) -> List[list]:
        """
        Pre-pad a support set to ensure:
            1. each support set has the same number of sentences
            2. each sentence has the same length
        (padding is done here because:
                1. all support sentences are fed as one tensor input
                2. the support set size is small
            )

        The input lists are not modified, and every returned row is a distinct
        list object. Nothing is truncated: if ``item_lst`` already has more
        than ``max_support_size`` rows, all of them are returned.

        :param item_lst: one list per support sentence
        :param pad_value: value appended at padded positions (an int, or a
            list such as ``[0]`` for nwp_index)
        :param max_support_size: minimum number of rows in the result
        :return: the padded rows
        """
        # Pad sentences up to the longest one; default=0 keeps an empty
        # support set from crashing inside max().
        max_sent_len = max((len(sent) for sent in item_lst), default=0)
        ret = [list(sent) + [pad_value] * (max_sent_len - len(sent)) for sent in item_lst]

        # Pad the support set size. A fresh row is built each time so that
        # the padded rows do not alias one another.
        while len(ret) < max_support_size:
            ret.append([pad_value] * max_sent_len)
        return ret

    def digitizing_input(self, tokens: List[str], seg_id: int) -> (List[int], List[int]):
        """Map tokens to ids and give every token the segment id ``seg_id``."""
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        segment_ids = [seg_id] * len(tokens)
        return token_ids, segment_ids

    def tokenizing(self, item: DataItem):
        """
        Possible tokenizing for item; may differ from model to model.
        The base implementation does nothing and returns ``None``.
        """
        pass

In [16]:
# LabelNumSchemaInputBuilder

# 先看 BertInputBuilder
class BertInputBuilder(InputBuilderBase):
    """Build BERT-style features (word pieces, ids, masks) for query and support sentences."""

    # Penn Treebank tags counted by extract_label_num_feature.
    _VERB_TAGS = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    # WP: wh-pronoun; WRB: wh-adverb; WDT: wh-determiner; WP$: possessive wh-pronoun
    _WH_TAGS = ("WP", "WRB", "WDT", "WP$")

    def __init__(self, tokenizer, opt):
        """
        :param tokenizer: tokenizer passed to the base class; it must also expose
            ``wordpiece_tokenizer.tokenize``.
        :param opt: option object; ``context_emb`` and ``task`` are read here,
            ``label_wp`` and ``train_path`` by other methods.
        """
        super(BertInputBuilder, self).__init__(tokenizer)
        self.opt = opt
        self.test_seg_id = 0
        # Segment id 1 concatenates support after the query to get the
        # representations; 0 ('sep_bert') encodes support and query separately.
        self.support_seg_id = 0 if opt.context_emb == 'sep_bert' else 1

        # POS-tag dictionary; loaded into memory only for the 'mlc' task.
        self.tag_data_dict = None
        if opt.task == 'mlc':
            self.tag_data_dict = self.load_tag_data_dict()

        # Cache: space-joined sentence -> label-number feature tensor.
        self.seq_ins = {}

    def __call__(self, example, max_support_size, label2id) -> (FeatureItem, ModelInput, List[FeatureItem], ModelInput):
        """Return (query feature item, query input, support feature items, support input)."""
        test_feature_item, test_input = self.prepare_test(example)
        support_feature_items, support_input = self.prepare_support(example, max_support_size)
        return test_feature_item, test_input, support_feature_items, support_input

    def prepare_test(self, example):
        """Convert the query sentence of ``example`` into features and tensors."""
        test_feature_item = self.data_item2feature_item(
            data_item=example.test_data_item, seg_id=self.test_seg_id)
        test_input = self.get_test_model_input(test_feature_item)
        return test_feature_item, test_input

    def prepare_support(self, example, max_support_size):
        """Convert all support sentences of ``example`` into features and padded tensors."""
        support_feature_items = [
            self.data_item2feature_item(data_item=s_item, seg_id=self.support_seg_id)
            for s_item in example.support_data_items
        ]
        support_input = self.get_support_model_input(support_feature_items, max_support_size)
        return support_feature_items, support_input

    def data_item2feature_item(self, data_item: DataItem, seg_id: int) -> FeatureItem:
        """
        Get the feature item for bert. Steps:
            1. word-piece tokenizing
            2. digitizing
            3. making masks
        """
        wp_mark, wp_text = self.tokenizing(data_item)

        if self.opt.task == 'sl':  # use word-level labels  [opt.label_wp is supported by model now.]
            labels = self.get_wp_label(data_item.seq_out, wp_text, wp_mark) if self.opt.label_wp else data_item.seq_out
        else:  # use sentence level labels
            labels = data_item.label

        # BERT input format: only segment 0 gets a leading [CLS].
        tokens = ['[CLS]'] + wp_text + ['[SEP]'] if seg_id == 0 else wp_text + ['[SEP]']
        token_ids, segment_ids = self.digitizing_input(tokens=tokens, seg_id=seg_id)

        nwp_index = self.get_nwp_index(wp_mark)
        input_mask = [1] * len(token_ids)
        output_mask = [1] * len(labels)   # For sl: it is original tokens; For mlc: it is labels

        ret = FeatureItem(
            tokens=tokens,
            labels=labels,
            data_item=data_item,
            token_ids=token_ids,
            segment_ids=segment_ids,
            nwp_index=nwp_index,
            input_mask=input_mask,
            output_mask=output_mask,
        )
        return ret

    def get_nwp_index(self, word_piece_mark: list) -> List[List[int]]:
        """
        Get the index of non-word-piece tokens, which is used to extract
        non-wp bert embeddings in a batch manner.

        Returns the positions whose mark is not 1, each wrapped in its own
        one-element list (the ``tolist()`` form of ``torch.nonzero``).
        """
        # Word pieces are marked with 1, so after "- 1" only the others are non-zero.
        return torch.nonzero(torch.LongTensor(word_piece_mark) - 1).tolist()

    def tokenizing(self, item: DataItem):
        """
        Do word-piece tokenizing.

        :return: (wp_mark, wp_text); wp_mark holds 1 for every continuation
            piece ('##...') of a split word and 0 otherwise.
        """
        wp_text = self.tokenizer.wordpiece_tokenizer.tokenize(' '.join(item.seq_in))
        wp_mark = [int(len(w) > 2 and w.startswith('##')) for w in wp_text]  # mark wp as 1
        return wp_mark, wp_text

    def get_wp_label(self, label_lst, wp_text, wp_mark, label_pieced_words=False):
        """
        Get labels on pieced words (one label per word piece).

        :param label_lst: word-level labels
        :param wp_text: word-piece tokens
        :param wp_mark: 0 for the first piece of a word, 1 for continuation pieces
        :param label_pieced_words: label continuation pieces with the whole
            word's label ('B-' turned into 'I-') instead of '[PAD]'
        :return: list of labels aligned with ``wp_text``
        :raises RuntimeError: on an empty label or a length mismatch
        """
        wp_label, label_idx = [], 0
        for mark in wp_mark:
            if mark == 0:  # label non-pieced token with original label
                wp_label.append(label_lst[label_idx])
                label_idx += 1  # pointer on non-wp labels
            elif mark == 1:  # label word-piece with whole word's label or with [PAD] label
                pieced_label = wp_label[-1].replace('B-', 'I-') if label_pieced_words else '[PAD]'
                wp_label.append(pieced_label)
            if not wp_label[-1]:
                raise RuntimeError('Empty label')
        if not (len(wp_label) == len(wp_text) == len(wp_mark)):
            raise RuntimeError('ERROR: Failed to generate wp labels:{}{}{}{}{}{}{}{}{}{}{}'.format(
                len(wp_label), len(wp_text), len(wp_mark),
                '\nwp_lb', wp_label, '\nwp_text', wp_text, '\nwp_mk', wp_mark, '\nlabel', label_lst))
        # The computed labels were previously dropped (implicit None).
        return wp_label

    def prepare_label_num_features(self, example, label2id):
        """Return (stacked support features, query feature) used to predict the label count."""
        support_label_num_features = [self.extract_label_num_feature(s_item) for s_item in example.support_data_items]
        # torch.stack joins the per-sentence vectors along a new leading dimension.
        support_label_num_features = torch.stack(support_label_num_features, dim=0)
        test_label_num_features = self.extract_label_num_feature(example.test_data_item)
        return support_label_num_features, test_label_num_features

    def prepare_label_num_target(self, example, label2id):
        """Return the number of labels of every support item and of the query item as tensors."""
        support_label_num_target = torch.Tensor([len(s_item.label) for s_item in example.support_data_items])
        test_label_num_target = torch.Tensor([len(example.test_data_item.label)])
        return support_label_num_target, test_label_num_target

    def extract_label_num_feature(self, item: DataItem) -> torch.Tensor:
        """
        Build the 5-dim sentence feature
        [sentence length, #conjunctions, #verbs, #punctuation, #wh-words]
        from the pre-computed POS tags of ``item.seq_in``.

        Results are cached per sentence text in ``self.seq_ins``.

        :raises ValueError: if the tag dictionary is not loaded or has no
            entry for this sentence
        """
        seq_in = item.seq_in
        seq_in_text = ' '.join(seq_in)
        if seq_in_text in self.seq_ins:
            return self.seq_ins[seq_in_text]

        if self.tag_data_dict is None:
            # The dictionary is only loaded when opt.task == 'mlc'.
            raise ValueError('the tag data dict is not loaded')
        if seq_in_text not in self.tag_data_dict:
            raise ValueError('the tag data dict is not complete: no entry for "{}"'.format(seq_in_text))
        tag_data = self.tag_data_dict[seq_in_text]

        conj_num = verb_num = punc_num = qst_num = 0
        for origin_item, tag_item in tag_data:
            if tag_item in self._VERB_TAGS:  # verbs
                verb_num += 1
            elif tag_item == "CC":  # coordinating conjunction
                conj_num += 1
            elif tag_item in [",", "."] or origin_item == ";":  # `! ?` are represented as `.`
                punc_num += 1
            elif tag_item in self._WH_TAGS:  # interrogative pronouns, adverbs, determiners
                qst_num += 1

        sf_item = torch.Tensor([len(seq_in), conj_num, verb_num, punc_num, qst_num])
        self.seq_ins[seq_in_text] = sf_item
        return sf_item

    def load_tag_data_dict(self):
        """
        Load the dictionary of all POS-tagged sentences into memory.

        The file 'tag_data.dict.all' is looked up two directory levels above
        ``opt.train_path``.
        """
        tag_data_dir = os.path.dirname(os.path.dirname(self.opt.train_path))
        tag_data_path = os.path.join(tag_data_dir, 'tag_data.dict.all')
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(tag_data_path, 'r', encoding='utf-8') as fr:
            tag_data_dict = json.load(fr)
        return tag_data_dict

In [17]:
# label name的处理；

class SchemaInputBuilder(BertInputBuilder):
    """BERT input builder that additionally encodes the label names."""

    # Special expansions of label names into natural-language text.
    # Further dialogue-act expansions for the toursg data (CANCEL, CLOSING,
    # COMMIT, ...) were disabled in the original table and are not listed.
    _NAME_TRANSLATIONS = {
        'PER': 'person',
        'ORG': 'organization',
        'LOC': 'location',
        'MISC': 'miscellaneous',
        'GPE': 'geographical political',
        'NORP': 'nationalities or religious or political groups',
        # toursg data
        'ACK': 'acknowledgment, as well as common expressions used for grounding',
        'INFO': 'information request',
    }

    def __init__(self, tokenizer, opt):
        super(SchemaInputBuilder, self).__init__(tokenizer, opt)

    def __call__(self, example, max_support_size, label2id) -> (FeatureItem, ModelInput, List[FeatureItem], ModelInput):
        """
        Build query, support and label-name features.

        :return: a 6-tuple (test_feature_item, test_input, support_feature_items,
            support_input, label_items, label_input). The last two are ``None``
            when ``opt.label_reps`` is not one of 'cat', 'sep', 'sep_sum'.
        """
        test_feature_item, test_input = self.prepare_test(example)
        support_feature_items, support_input = self.prepare_support(example, max_support_size)

        # Default to None so an unrecognised label_reps does not leave the
        # names unbound at the return statement.
        label_input = label_items = None
        if self.opt.label_reps in ['cat']:  # represent labels by concatenating all labels
            label_input, label_items = self.prepare_label_feature(label2id)
        elif self.opt.label_reps in ['sep', 'sep_sum']:  # represent each label independently
            label_input, label_items = self.prepare_sep_label_feature(label2id)
        return test_feature_item, test_input, support_feature_items, support_input, label_items, label_input,

    def prepare_label_feature(self, label2id: dict):
        """
        Prepare digital input for the label feature in concatenate style:
        the texts of all labels (ordered by id, '[PAD]' skipped) form one sentence.

        :return: (label_input, label_item) with a single feature item
        """
        text = []
        for label_name, _ in sorted(label2id.items(), key=lambda x: x[1]):
            if label_name == '[PAD]':
                continue
            text.extend(self.convert_label_name(label_name))
        # DataItem has exactly three fields; word pieces and marks are
        # recomputed inside data_item2feature_item, so only the word-level
        # text and dummy labels are passed.
        label_data_item = DataItem(seq_in=text, seq_out=['O'] * len(text), label=['None'])
        label_item = self.data_item2feature_item(label_data_item, 0)
        label_input = self.get_test_model_input(label_item)
        return label_input, label_item

    def prepare_sep_label_feature(self, label2id):
        """
        Prepare digital input for the label feature separately: one feature
        item per label ('[PAD]' skipped), padded like a support set.

        :return: (label_input, label_items)
        """
        label_items = []
        for label_name in label2id:
            if label_name == '[PAD]':
                continue
            seq_in = self.convert_label_name(label_name)
            seq_out = ['None'] * len(seq_in)
            label = ['None']
            label_items.append(self.data_item2feature_item(DataItem(seq_in, seq_out, label), 0))
        label_input = self.get_support_model_input(label_items, len(label2id) - 1)  # no pad, so - 1
        return label_input, label_items

    def convert_label_name(self, name):
        """
        Turn a label name into a list of words.

        A leading 'B-' / 'I-' becomes 'begin' / 'inner' and the name 'O'
        becomes 'ordinary'. The remaining name is replaced by its entry in the
        translation table if there is one; otherwise it is lower-cased and
        split on '_'.
        """
        text = []
        tmp_name = name
        # Only a leading marker counts; a 'B-' inside a name is part of the name.
        if name.startswith('B-'):
            text.append('begin')
            tmp_name = name[2:]
        elif name.startswith('I-'):
            text.append('inner')
            tmp_name = name[2:]
        elif name == 'O':
            text.append('ordinary')
            tmp_name = ''

        if tmp_name:
            long_name = self._NAME_TRANSLATIONS.get(tmp_name)
            if long_name is not None:
                text.append(long_name)
            else:
                text.extend(tmp_name.lower().split('_'))
        return text

In [18]:
class LabelNumSchemaInputBuilder(SchemaInputBuilder):
    """Input builder that returns label-name and label-count features next to the usual inputs.

    All feature extraction is delegated to ``prepare_*`` methods that are expected
    to be provided by the parent class; this class only decides which ones to call
    and in which order the results are returned.
    """

    def __init__(self, tokenizer, opt):
        super().__init__(tokenizer, opt)

    def __call__(self, example, max_support_size, label2id) -> tuple:
        """Build every input needed for one few-shot example.

        :param example: the few-shot example to convert.
        :param max_support_size: forwarded to ``prepare_support``.
        :param label2id: label -> id mapping, forwarded to the label helpers.
        :return: a 10-tuple, in this order:
            ``(test_feature_item, test_input, support_feature_items, support_input,
            label_items, label_input, support_label_num_features,
            test_label_num_features, support_label_num_target, test_label_num_target)``.
            ``label_items`` and ``label_input`` are ``None`` when ``opt.label_reps``
            is none of ``'cat'``, ``'sep'``, ``'sep_sum'``.
        """
        test_feature_item, test_input = self.prepare_test(example)
        support_feature_items, support_input = self.prepare_support(example, max_support_size)

        # Label-name features: stay None unless a known representation is configured.
        label_input = label_items = None
        if self.opt.label_reps in ['cat']:  # represent labels by concatenating all labels
            label_input, label_items = self.prepare_label_feature(label2id)
        elif self.opt.label_reps in ['sep', 'sep_sum']:  # represent each label independently
            label_input, label_items = self.prepare_sep_label_feature(label2id)

        # Label-count features and targets for the support and test sentences.
        support_label_num_features, test_label_num_features = self.prepare_label_num_features(example, label2id)
        support_label_num_target, test_label_num_target = self.prepare_label_num_target(example, label2id)

        # NOTE: label_items comes before label_input here, the reverse of how the
        # helpers return them; callers unpack in this exact order.
        return test_feature_item, test_input, support_feature_items, support_input, label_items, label_input, \
            support_label_num_features, test_label_num_features, support_label_num_target, test_label_num_target


In [26]:
# Example: run the input builder on a single few-shot example.
# NOTE(review): `opt`, `examples`, `max_support_size` and `label2id` are not defined in
# this cell; they are presumably left over from earlier cells — confirm before running.

from transformers import BertTokenizer

# Demo-specific option values, set on the shared `opt` object.
opt.bert_vocab = "../resources/bert_base_uncased/vocab.txt"
opt.context_emb = "sep"
# "sep" makes LabelNumSchemaInputBuilder.__call__ use prepare_sep_label_feature.
opt.label_reps = "sep"
opt.task = "mlc"
opt.train_path = "../data/stanford/stanford.0.spt_s_1.q_s_32.ep_200--use_schema--label_num_schema2/train.json"

tokenizer = BertTokenizer.from_pretrained(opt.bert_vocab)
input_builder = LabelNumSchemaInputBuilder(tokenizer, opt)


# Unpack the ten values returned by the builder for the first example.
test_feature_item, test_input, support_feature_items, support_input, label_items, label_input, \
            support_label_num_features, test_label_num_features, support_label_num_target, test_label_num_target = input_builder(
    examples[0], max_support_size, label2id
)

In [29]:
# Inspect the values produced by the input builder call above.
print("test_feature_item: ", test_feature_item)
print("test_input: ", test_input)
# The support-set prints are disabled in this cell.
# print("support_feature_items: ", support_feature_items)
# print("support_input: ", support_input)
print("label_items: ", label_items)
print("label_input: ", label_input)
print("support_label_num_features: ", support_label_num_features)
print("test_label_num_features: ", test_label_num_features)
print("support_label_num_target: ", support_label_num_target)
print("test_label_num_target: ", test_label_num_target)

test_feature_item:  FeatureItem(tokens=['[CLS]', '[UNK]', 'you', 'very', 'much', 'car', '!', '[SEP]'], labels=['appreciate'], data_item=DataItem(seq_in=['Thank', 'you', 'very', 'much', 'car', '!'], seq_out=['O', 'O', 'O', 'O', 'O', 'O'], label=['appreciate']), token_ids=[101, 100, 2017, 2200, 2172, 2482, 999, 102], segment_ids=[0, 0, 0, 0, 0, 0, 0, 0], nwp_index=[[0], [1], [2], [3], [4], [5]], input_mask=[1, 1, 1, 1, 1, 1, 1, 1], output_mask=[1])
test_input:  ModelInput(token_ids=tensor([ 101,  100, 2017, 2200, 2172, 2482,  999,  102]), segment_ids=tensor([0, 0, 0, 0, 0, 0, 0, 0]), nwp_index=tensor([[0],
        [1],
        [2],
        [3],
        [4],
        [5]]), input_mask=tensor([1, 1, 1, 1, 1, 1, 1, 1]), output_mask=tensor([1]))
label_items:  [FeatureItem(tokens=['[CLS]', 'appreciate', '[SEP]'], labels=['None'], data_item=DataItem(seq_in=['appreciate'], seq_out=['None'], label=['None']), token_ids=[101, 9120, 102], segment_ids=[0, 0, 0], nwp_index=[[0]], input_mask=[1, 1, 1],

### OutputBuilder

multi-label classification：还没有处理每个句子的labels；

In [None]:
class OutputBuilderBase:
    """Base class for builders that turn output targets into numbers.

    Subclasses implement ``__call__``; ``pad_support_set`` is a shared helper.
    """

    def __init__(self):
        pass

    def __call__(self, test_feature_item: "FeatureItem", support_feature_items: "List[FeatureItem]",
                 label2id: dict, max_support_size: int):
        """Build the targets for one test item and its support items (abstract).

        :raises NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def pad_support_set(self, item_lst: List[List[int]], pad_value: int, max_support_size: int) -> List[List[int]]:
        """Pad a support set so that it has a rectangular shape.

        Padding is done here, before tensor conversion, so that
            1. every sentence has the same length (the longest one in ``item_lst``), and
            2. every support set has at least ``max_support_size`` sentences.

        The input is not modified. Every returned row is a distinct list, so
        editing one padding row never changes another. ``pad_value`` itself is
        reused for every padded position and is not copied.

        :param item_lst: one sequence of target values per support sentence; may be empty.
        :param pad_value: value inserted at every padded position.
        :param max_support_size: minimum number of rows in the result; a longer
            ``item_lst`` is returned un-truncated.
        :return: the padded rows.
        """
        # default=0 keeps an empty support set from raising ValueError.
        max_sent_len = max((len(sent) for sent in item_lst), default=0)

        # Pad each sentence to the common length, copying so the caller's lists stay untouched.
        padded = [list(sent) + [pad_value] * (max_sent_len - len(sent)) for sent in item_lst]

        # Pad the number of sentences; build a fresh row each time to avoid aliasing.
        for _ in range(max_support_size - len(padded)):
            padded.append([pad_value] * max_sent_len)
        return padded

In [None]:
class FewShotOutputBuilder(OutputBuilderBase):
    """Converts labels to ids for the test item and to one-hot vectors for the support items."""

    def __init__(self):
        super(FewShotOutputBuilder, self).__init__()

    def __call__(self, test_feature_item: FeatureItem,
                 support_feature_items: FeatureItem,
                 label2id: dict,
                 max_support_size: int):
        """Return ``(test_target, support_target)`` as ``torch.LongTensor`` objects.

        The test target holds label ids. The support target holds one one-hot
        vector per label, padded with the one-hot vector of ``'[PAD]'`` through
        ``pad_support_set``.
        """
        test_ids = self.item2label_ids(test_feature_item, label2id)

        # Support labels are one-hot encoded (used to estimate emission).
        support_onehots = []
        for support_item in support_feature_items:
            support_onehots.append(self.item2label_onehot(support_item, label2id))

        # Pad so that every support set has the same shape.
        pad_vector = self.label2onehot('[PAD]', label2id)
        support_onehots = self.pad_support_set(support_onehots, pad_vector, max_support_size)

        return torch.LongTensor(test_ids), torch.LongTensor(support_onehots)

    def item2label_ids(self, f_item: FeatureItem, label2id: dict):
        """Look up the id of every label of ``f_item``."""
        ids = []
        for name in f_item.labels:
            ids.append(label2id[name])
        return ids

    def item2label_onehot(self, f_item: FeatureItem, label2id: dict):
        """Return one one-hot vector per label of ``f_item``."""
        vectors = []
        for name in f_item.labels:
            vectors.append(self.label2onehot(name, label2id))
        return vectors

    def label2onehot(self, label: str, label2id: dict):
        """Return a list of ``len(label2id)`` zeros with a 1 at the id of ``label``."""
        vector = [0] * len(label2id)
        vector[label2id[label]] = 1
        return vector

### FeatureConstructor

将预处理的流程集中起来；

In [None]:
class FeatureConstructor:
    """
    Turns few-shot examples into features by chaining an input builder and an output builder.
    Main functions:
        construct_feature: convert a list of examples into a list of features.
        example2feature: convert a single example.
    """

    def __init__(self, input_builder: InputBuilderBase, output_builder: OutputBuilderBase):
        self.input_builder = input_builder
        self.output_builder = output_builder

    def construct_feature(
            self,
            examples: List[FewShotExample],
            max_support_size: int,
            label2id: dict,
            id2label: dict,
    ) -> List[FewShotFeature]:
        """Convert every example, in order, with ``example2feature``."""
        return [
            self.example2feature(one_example, max_support_size, label2id, id2label)
            for one_example in examples
        ]

    def example2feature(
            self,
            example: FewShotExample,
            max_support_size: int,
            label2id: dict,
            id2label: dict
    ) -> FewShotFeature:
        """Build the inputs, then the targets, and pack them into a ``FewShotFeature``.

        ``id2label`` is accepted but not used here.
        """
        # The input builder is expected to return exactly four values.
        built_inputs = self.input_builder(example, max_support_size, label2id)
        test_feature_item, test_input, support_feature_items, support_input = built_inputs

        built_targets = self.output_builder(
            test_feature_item, support_feature_items, label2id, max_support_size)
        test_target, support_target = built_targets

        fields = {
            'gid': example.gid,
            'test_gid': example.test_id,
            'batch_gid': example.batch_id,
            'test_input': test_input,
            'test_feature_item': test_feature_item,
            'support_input': support_input,
            'support_feature_items': support_feature_items,
            'test_target': test_target,
            'support_target': support_target,
        }
        return FewShotFeature(**fields)

In [None]:
class LabelNumSchemaFeatureConstructor(FeatureConstructor):
    """Feature constructor for input builders that also return label and label-count features."""

    def __init__(self, input_builder: InputBuilderBase, output_builder: OutputBuilderBase):
        super(LabelNumSchemaFeatureConstructor, self).__init__(input_builder, output_builder)

    def example2feature(
            self,
            example: FewShotExample,
            max_support_size: int,
            label2id: dict,
            id2label: dict
    ) -> FewShotFeature:
        """Build inputs and targets for one example and pack them into a ``FewShotFeature``.

        The input builder is expected to return ten values; ``id2label`` is
        accepted but not used here.
        """
        (
            test_feature_item,
            test_input,
            support_feature_items,
            support_input,
            label_items,
            label_input,
            support_label_num_features,
            test_label_num_features,
            support_label_num_target,
            test_label_num_target,
        ) = self.input_builder(example, max_support_size, label2id)

        built_targets = self.output_builder(
            test_feature_item, support_feature_items, label2id, max_support_size)
        test_target, support_target = built_targets

        fields = {
            'gid': example.gid,
            'test_gid': example.test_id,
            'batch_gid': example.batch_id,
            'test_input': test_input,
            'test_feature_item': test_feature_item,
            'support_input': support_input,
            'support_feature_items': support_feature_items,
            'test_target': test_target,
            'support_target': support_target,
            'label_input': label_input,
            'label_items': label_items,
            # The feature fields use singular names; the local values are plural.
            'support_label_num_feature': support_label_num_features,
            'test_label_num_feature': test_label_num_features,
            'support_label_num_target': support_label_num_target,
            'test_label_num_target': test_label_num_target,
        }
        return FewShotFeature(**fields)

### 批量处理数据集

In [None]:

def load_feature(path):
    """Read a pickled feature cache and return ``(features, label2id, id2label)``.

    The file must contain a pickled mapping with the keys ``'features'``,
    ``'label2id'`` and ``'id2label'``.

    NOTE: unpickling can execute arbitrary code; only load cache files that
    this project wrote itself.
    """
    with open(path, 'rb') as reader:
        cached = pickle.load(reader)
    return cached['features'], cached['label2id'], cached['id2label']


def get_training_data_and_feature(opt, data_loader, preprocessor):
    """Prepare the train and dev features together with their label dictionaries.

    When ``opt.load_feature`` is set, both caches (``*.saved.pk`` next to the
    json files) are loaded. If either file is missing, the features are rebuilt
    from the json data and written to the cache. Otherwise the features are
    built from the json data and cached only when ``opt.save_feature`` is set.

    :param opt: options object; reads ``load_feature``, ``save_feature``,
        ``train_path`` and ``dev_path`` and is passed on to ``make_dict``.
        ``load_feature`` / ``save_feature`` are switched temporarily during a
        cache rebuild and are always put back to the values they had on entry.
    :param data_loader: object whose ``load_data(path=...)`` returns
        ``(examples, max_len, max_support_size)``.
    :param preprocessor: object whose ``construct_feature(examples,
        max_support_size, label2id, id2label)`` returns the features.
    :return: ``(train_features, train_label2id, train_id2label,
        dev_features, dev_label2id, dev_id2label)``.
    """
    if opt.load_feature:
        # A cache was requested: try to load it.
        try:
            train_features, train_label2id, train_id2label = load_feature(
                opt.train_path.replace('.json', '.saved.pk'))
            dev_features, dev_label2id, dev_id2label = load_feature(
                opt.dev_path.replace('.json', '.saved.pk'))
        except FileNotFoundError:
            # No cache file yet: build the features and save them. Remember the
            # caller's flags and restore them even if the rebuild fails, instead
            # of overwriting them with fixed values.
            previous_flags = (opt.load_feature, opt.save_feature)
            opt.load_feature, opt.save_feature = False, True
            try:
                train_features, train_label2id, train_id2label, dev_features, dev_label2id, dev_id2label = \
                    get_training_data_and_feature(opt, data_loader, preprocessor)
            finally:
                opt.load_feature, opt.save_feature = previous_flags
    else:
        # Read the examples from the json files.
        logger.info("opt.train_path: {}".format(opt.train_path))
        train_examples, _, train_max_support_size = data_loader.load_data(path=opt.train_path)
        dev_examples, _, dev_max_support_size = data_loader.load_data(path=opt.dev_path)

        # Number the labels. Train and dev get separate dictionaries because
        # in few-shot learning their label sets differ.
        train_label2id, train_id2label = make_dict(opt, train_examples)
        dev_label2id, dev_id2label = make_dict(opt, dev_examples)
        print("train_label2id: \n", train_label2id)
        print("dev_label2id: \n", dev_label2id)
        logger.info(' Finish train dev prepare dict ')

        # Convert text and labels into features.
        train_features = preprocessor.construct_feature(
            train_examples,
            train_max_support_size,
            train_label2id,
            train_id2label
        )
        dev_features = preprocessor.construct_feature(
            dev_examples,
            dev_max_support_size,
            dev_label2id,
            dev_id2label
        )
        logger.info(' Finish prepare train dev features ')

        # Write the cache files.
        if opt.save_feature:
            save_feature(opt.train_path.replace('.json', '.saved.pk'), train_features, train_label2id, train_id2label)
            save_feature(opt.dev_path.replace('.json', '.saved.pk'), dev_features, dev_label2id, dev_id2label)
    return train_features, train_label2id, train_id2label, dev_features, dev_label2id, dev_id2label


In [None]:
def get_testing_data_feature(opt, data_loader, preprocessor):
    """Prepare the test features together with their label dictionaries.

    When ``opt.load_feature`` is set, the cache (``*.saved.pk`` next to the json
    file) is loaded. If it is missing, the features are rebuilt from the json
    data and written to the cache. Otherwise the features are built from the
    json data and cached only when ``opt.save_feature`` is set.

    :param opt: options object; reads ``load_feature``, ``save_feature`` and
        ``test_path`` and is passed on to ``make_dict``. ``load_feature`` /
        ``save_feature`` are switched temporarily during a cache rebuild and are
        always put back to the values they had on entry.
    :param data_loader: object whose ``load_data(path=...)`` returns
        ``(examples, max_len, max_support_size)``.
    :param preprocessor: object whose ``construct_feature(examples,
        max_support_size, label2id, id2label)`` returns the features.
    :return: ``(test_features, test_label2id, test_id2label)``.
    """
    if opt.load_feature:
        try:
            test_features, test_label2id, test_id2label = load_feature(
                opt.test_path.replace('.json', '.saved.pk'))
        except FileNotFoundError:
            # No cache file yet: build the features and save them. Remember the
            # caller's flags and restore them even if the rebuild fails, instead
            # of overwriting them with fixed values.
            previous_flags = (opt.load_feature, opt.save_feature)
            opt.load_feature, opt.save_feature = False, True
            try:
                test_features, test_label2id, test_id2label = get_testing_data_feature(
                    opt, data_loader, preprocessor)
            finally:
                opt.load_feature, opt.save_feature = previous_flags
    else:
        test_examples, _, test_max_support_size = data_loader.load_data(path=opt.test_path)
        test_label2id, test_id2label = make_dict(opt, test_examples)
        logger.info(' Finish prepare test dict')
        test_features = preprocessor.construct_feature(
            test_examples, test_max_support_size, test_label2id, test_id2label)
        logger.info(' Finish prepare test feature')
        if opt.save_feature:
            save_feature(opt.test_path.replace('.json', '.saved.pk'), test_features, test_label2id, test_id2label)
    return test_features, test_label2id, test_id2label