# 数据处理模块



**目录：**
1. 标签类别收集

2. 训练样本读取

3. 样本转化为符合BERT模型的特征

---


In [17]:
import argparse

import os
import copy
import json
import logging


import torch
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

logger = logging.getLogger(__name__)

## 1 训练数据形式回顾

　　atis和snips数据集已经将训练集、验证集和测试集区分好
- label文件保存了意图识别的标签
- seq.in文件每行保存了一句输入样本
- seq.out文件每行保存了样本的插槽标签序列，以空格隔开

<img src="./imgs/数据集结构.png"  width="300" height="300" align="left" />

## 2 标签集：将所有出现的意图标签和槽位标签统计出来

In [18]:
def vocab_process(data_dir):
    """
    Collect the intent and slot label vocabularies of one dataset.

    Reads ``<data_dir>/train/label`` (one intent label per line) and
    ``<data_dir>/train/seq.out`` (one whitespace-separated slot-tag sequence
    per line) and writes every distinct label, one per line, to
    ``<data_dir>/intent_label.txt`` and ``<data_dir>/slot_label.txt``.

    Args:
        data_dir: directory of the dataset; it must contain a ``train`` folder.

    Returns:
        None. The two vocabulary files are the result of the call.
    """
    train_dir = os.path.join(data_dir, 'train')
    intent_out_path = os.path.join(data_dir, 'intent_label.txt')
    slot_out_path = os.path.join(data_dir, 'slot_label.txt')

    # ---- intent labels ----
    with open(os.path.join(train_dir, 'label'), 'r', encoding='utf-8') as src, \
            open(intent_out_path, 'w', encoding='utf-8') as dst:
        # Every distinct (stripped) line of the label file is one intent.
        seen_intents = {row.strip() for row in src}

        # "UNK" is written first: only the train split is scanned here, so
        # labels that show up only in other splits are mapped to "UNK" later.
        # The real intents follow in lexicographic order.
        dst.writelines(name + '\n' for name in ["UNK"] + sorted(seen_intents))

    # ---- slot labels ----
    with open(os.path.join(train_dir, 'seq.out'), 'r', encoding='utf-8') as src, \
            open(slot_out_path, 'w', encoding='utf-8') as dst:
        seen_slots = set()
        for row in src:
            # A row looks like: O O B-fromloc.city_name O B-toloc.city_name ...
            seen_slots.update(row.strip().split())

        # Tags use the BIO scheme: order by the entity name after the
        # two-character "B-"/"I-" prefix first, then by the prefix itself.
        ordered_slots = sorted(seen_slots, key=lambda tag: (tag[2:], tag[:2]))

        # "PAD" labels padded positions, "UNK" labels unseen tags; both
        # precede the real tags.
        dst.writelines(name + '\n' for name in ["PAD", "UNK"] + ordered_slots)

# Generate intent_label.txt / slot_label.txt inside the ATIS dataset folder.
atis_dir = "./data/atis/"
vocab_process(atis_dir)

# Same for the SNIPS dataset folder.
snips_dir = "./data/snips/"
vocab_process(snips_dir)

In [19]:
# 后面两类标签就分别通过简单的读取函数就可以读取出来了

def get_intent_labels(args):
    """
    Read the intent label vocabulary file.

    Args:
        args: object with the attributes ``data_dir``, ``task`` and
            ``intent_label_file``; the file is looked up at
            ``<data_dir>/<task>/<intent_label_file>``.

    Returns:
        list of str: one stripped label per line of the file, in file order.
    """
    label_path = os.path.join(args.data_dir, args.task, args.intent_label_file)
    # The context manager closes the file deterministically instead of
    # leaving the handle open until garbage collection.
    with open(label_path, 'r', encoding='utf-8') as f:
        return [label.strip() for label in f]


def get_slot_labels(args):
    """
    Read the slot label vocabulary file.

    Args:
        args: object with the attributes ``data_dir``, ``task`` and
            ``slot_label_file``; the file is looked up at
            ``<data_dir>/<task>/<slot_label_file>``.

    Returns:
        list of str: one stripped label per line of the file, in file order.
    """
    label_path = os.path.join(args.data_dir, args.task, args.slot_label_file)
    # The context manager closes the file deterministically instead of
    # leaving the handle open until garbage collection.
    with open(label_path, 'r', encoding='utf-8') as f:
        return [label.strip() for label in f]



## 3 样本读取为样本实例

In [20]:
class InputExample:
    """
    One raw sample (training, dev or test) before tokenization.

    A plain dict could hold the same data; a small class additionally gives
    a readable ``repr`` and JSON serialization helpers.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) The intent label of the example.
        slot_labels: (Optional) list. The slot labels of the example.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None):
        self.guid = guid
        self.words = words
        self.intent_label = intent_label
        self.slot_labels = slot_labels

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        # vars(self) is the instance __dict__, i.e. everything assigned
        # through self.xxx; the deep copy keeps callers from mutating us.
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON plus a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

    def __repr__(self):
        # Show the JSON form instead of the default "<Class object at 0x...>".
        return str(self.to_json_string())
    
    

In [21]:
class JointProcessor(object):
    """
    Data processor for the JointBERT datasets.

    Loads the intent/slot label vocabularies once and turns the raw
    ``seq.in`` / ``label`` / ``seq.out`` files of a split into
    ``InputExample`` objects whose labels are integer ids.
    """

    def __init__(self, args):
        """
        Args:
            args: configuration object; read here through
                ``get_intent_labels`` / ``get_slot_labels`` and later for
                ``data_dir`` and ``task`` in ``get_examples``.
        """
        self.args = args

        # Label vocabularies prepared beforehand; a label's position in the
        # list is its integer id.
        self.intent_labels = get_intent_labels(args)
        self.slot_labels = get_slot_labels(args)

        # Every split folder uses the same file names.
        self.input_text_file = 'seq.in'
        self.intent_label_file = 'label'
        self.slot_labels_file = 'seq.out'

    @classmethod
    def _read_file(cls, input_file):
        """
        Read a UTF-8 text file line by line.

        Args:
            input_file: path of the file to read.

        Returns:
            list of str: every line with surrounding whitespace stripped
            (blank lines are kept as empty strings).
        """
        with open(input_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    @staticmethod
    def _first_index_map(labels):
        """Map each label to the index of its first occurrence in ``labels``."""
        mapping = {}
        for idx, label in enumerate(labels):
            # setdefault keeps the first index, matching list.index().
            mapping.setdefault(label, idx)
        return mapping

    def _create_examples(self, texts, intents, slots, set_type):
        """
        Build ``InputExample`` objects from the raw lines of one split.

        Args:
            texts: list. Unsplit input sentences.
            intents: list. Intent label of each sentence.
            slots: list. Unsplit slot label sequence of each sentence.
            set_type: str. train, dev or test; used as guid prefix.

        Returns:
            list of InputExample with integer ``intent_label`` and
            ``slot_labels``. Labels missing from the vocabulary are mapped
            to the id of "UNK".

        Raises:
            ValueError: if an unknown label is met and the vocabulary has
                no "UNK" entry.
            AssertionError: if a sentence and its slot sequence differ in
                length.
        """
        # Build the lookup tables once: O(1) per label instead of two linear
        # list scans (`in` + `.index`) for every token.
        intent_to_id = self._first_index_map(self.intent_labels)
        slot_to_id = self._first_index_map(self.slot_labels)

        examples = []
        for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
            guid = "%s-%s" % (set_type, i)   # running id of the sample
            # 1. input_text
            words = text.split()
            # 2. intent
            intent_label = intent_to_id.get(intent)
            if intent_label is None:
                # Resolved lazily so a vocabulary without "UNK" only fails
                # when an unknown label is actually encountered.
                intent_label = self.intent_labels.index("UNK")
            # 3. slot
            slot_labels = []
            for s in slot.split():
                slot_id = slot_to_id.get(s)
                if slot_id is None:
                    slot_id = self.slot_labels.index("UNK")
                slot_labels.append(slot_id)

            assert len(words) == len(slot_labels), \
                "%s: %d words but %d slot labels" % (guid, len(words), len(slot_labels))
            examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
        return examples

    def get_examples(self, mode):
        """
        Load one split of the current task.

        Args:
            mode: train, dev, test; name of the split sub-folder under
                ``<data_dir>/<task>``.

        Returns:
            list of InputExample for that split.
        """
        data_path = os.path.join(self.args.data_dir, self.args.task, mode)
        logger.info("LOOKING AT %s", data_path)
        return self._create_examples(texts=self._read_file(os.path.join(data_path, self.input_text_file)),
                                     intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
                                     slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
                                     set_type=mode)

In [22]:
# 先构建参数

# parser = argparse.ArgumentParser()

# 实际使用应该是命令行传入的参数，不过我这里直接赋值传入
# parser.add_argument("--task", default=None, required=True, type=str, help="The name of the task to train")
# parser.add_argument("--data_dir", default="./data", type=str, help="The input data dir")
# parser.add_argument("--intent_label_file", default="intent_label.txt", type=str, help="Intent Label file")
# parser.add_argument("--slot_label_file", default="slot_label.txt", type=str, help="Slot Label file")

# args = parser.parse_args()

# Minimal stand-in for the argparse namespace: class-level defaults are None
# and the real values are assigned on the instance below.
class Args():
    task =  None
    data_dir =  None
    intent_label_file =  None
    slot_label_file =  None

args = Args()
args.task = "atis"  # dataset sub-folder under data_dir
args.data_dir = "./data"
args.intent_label_file = "intent_label.txt"
args.slot_label_file = "slot_label.txt"

In [23]:
# Instantiate the processor; this reads both label vocabulary files.

processor = JointProcessor(args)

In [24]:
# Inspect the label vocabularies held by the processor.

print(processor.intent_labels)
print(processor.slot_labels)

['UNK', 'atis_abbreviation', 'atis_aircraft', 'atis_aircraft#atis_flight#atis_flight_no', 'atis_airfare', 'atis_airline', 'atis_airline#atis_flight_no', 'atis_airport', 'atis_capacity', 'atis_cheapest', 'atis_city', 'atis_distance', 'atis_flight', 'atis_flight#atis_airfare', 'atis_flight_no', 'atis_flight_time', 'atis_ground_fare', 'atis_ground_service', 'atis_ground_service#atis_ground_fare', 'atis_meal', 'atis_quantity', 'atis_restriction']
['PAD', 'UNK', 'O', 'B-aircraft_code', 'B-airline_code', 'B-airline_name', 'I-airline_name', 'B-airport_code', 'B-airport_name', 'I-airport_name', 'B-arrive_date.date_relative', 'B-arrive_date.day_name', 'B-arrive_date.day_number', 'I-arrive_date.day_number', 'B-arrive_date.month_name', 'B-arrive_date.today_relative', 'B-arrive_time.end_time', 'I-arrive_time.end_time', 'B-arrive_time.period_mod', 'B-arrive_time.period_of_day', 'I-arrive_time.period_of_day', 'B-arrive_time.start_time', 'I-arrive_time.start_time', 'B-arrive_time.time', 'I-arrive_tim

In [26]:
# Load the train split and show its size and one sample.
train_examples = processor.get_examples("train")
print(len(train_examples))
print(train_examples[5])


4478
{
  "guid": "train-5",
  "intent_label": 12,
  "slot_labels": [
    2,
    2,
    2,
    2,
    2,
    2,
    73,
    2,
    114,
    115,
    2,
    2,
    2,
    103,
    104,
    2,
    2,
    81,
    2,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "words": [
    "i'm",
    "looking",
    "for",
    "a",
    "flight",
    "from",
    "charlotte",
    "to",
    "las",
    "vegas",
    "that",
    "stops",
    "in",
    "st.",
    "louis",
    "hopefully",
    "a",
    "dinner",
    "flight",
    "how",
    "can",
    "i",
    "find",
    "that",
    "out"
  ]
}



In [10]:
# Processor class per task name; both datasets share the same file layout,
# so both use JointProcessor.

processors = {
    "atis": JointProcessor,
    "snips": JointProcessor
}

## 4 将数据处理成可以喂给模型的特征

In [27]:
class InputFeatures:
    """
    Model-ready features of one sample.

    Args:
        input_ids: token ids of the input sequence in the tokenizer vocabulary.
        attention_mask: attention mask over ``input_ids`` (padding vs. real tokens).
        token_type_ids: segment id of every token (sentence 1 or sentence 2).
        intent_label_id: id of the intent label.
        slot_labels_ids: slot label id for every position.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.intent_label_id = intent_label_id
        self.slot_labels_ids = slot_labels_ids

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON plus a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

    def __repr__(self):
        # Printing a feature object shows its JSON form.
        return str(self.to_json_string())

In [36]:
def convert_examples_to_features(examples, 
                                 max_seq_len, 
                                 tokenizer,
                                 pad_token_label_id=-100,  
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True,
                                 num_examples_to_print=105):
    """
    Turn examples into fixed-length model inputs.

    Every example is tokenized word by word, truncated to leave room for the
    two special tokens, wrapped in [CLS] ... [SEP], converted to ids and
    padded to ``max_seq_len``.

    Args:
        examples: sequence of example objects exposing ``guid``, ``words``,
            ``slot_labels`` (one id per word) and ``intent_label``.
        max_seq_len: length of every produced sequence, special tokens
            included.
        tokenizer: object exposing ``cls_token``, ``sep_token``,
            ``unk_token``, ``pad_token_id``, ``tokenize()`` and
            ``convert_tokens_to_ids()``.
        pad_token_label_id: slot label id used for every position without a
            real label ([CLS], [SEP], non-first sub-words, padding). Use the
            cross-entropy ignore index so only real labels contribute to
            the loss later.
        cls_token_segment_id: segment id of the [CLS] token.
        pad_token_segment_id: segment id of padded positions.
        sequence_a_segment_id: segment id of the sentence tokens and [SEP].
        mask_padding_with_zero: if True the attention mask is 1 for real
            tokens and 0 for padding; if False the values are swapped.
        num_examples_to_print: the first this-many examples are printed for
            inspection (default 105, the previously hard-coded value).

    Returns:
        list of InputFeatures, one per example, in input order.

    Raises:
        AssertionError: if a produced sequence is not exactly
            ``max_seq_len`` long (for instance when ``max_seq_len`` is too
            small to hold [CLS] and [SEP]).
    """
    # Special tokens of the current tokenizer.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Account for [CLS] and [SEP]
    special_tokens_count = 2
    max_content_len = max_seq_len - special_tokens_count

    # Loop-invariant attention-mask values for real and padded positions.
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 1000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        # Tokenize word by word (for NER). The tokenizer may split one word
        # into several sub-words: the first sub-word carries the word's slot
        # label, the remaining ones get the padding label.
        # e.g. "principle" -> "prin", "cip", "le" with label B-ENT becomes
        #      B-ENT, pad, pad. Other schemes would repeat the label or use
        #      I-ENT / O for the tail; they are not used here.
        tokens = []
        slot_labels_ids = []
        for word, slot_label in zip(example.words, example.slot_labels):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # For handling the bad-encoded word

            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Truncate sentences that are too long.
        if len(tokens) > max_content_len:
            tokens = tokens[:max_content_len]
            slot_labels_ids = slot_labels_ids[:max_content_len]

        # Add [SEP] token
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        # Convert tokens to vocabulary ids.
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask marks real tokens; only real tokens are attended to.
        attention_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([pad_mask_value] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)

        # Check that every sequence has the expected length.
        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)

        intent_label_id = int(example.intent_label)

        # Print the first few examples for inspection.
        if ex_index < num_examples_to_print:
            print("*** Example ***")
            print("guid: %s" % example.guid)
            print("tokens: %s" % " ".join([str(x) for x in tokens]))
            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            print("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            print("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
            print("slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          intent_label_id=intent_label_id,
                          slot_labels_ids=slot_labels_ids
                          ))
    
    return features

In [37]:
def load_and_cache_examples(args, tokenizer, mode):
    """
    Build the TensorDataset of one split and save its features to disk.

    Args:
        args: configuration object; reads ``task``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_len`` and ``ignore_index``.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        mode: "train", "dev" or "test".

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids,
        intent_label_ids, slot_labels_ids), all ``torch.long``.

    Raises:
        Exception: if ``mode`` is not one of train, dev, test.
    """
    processor = processors[args.task](args)

    # The cache file name encodes split, task, the last non-empty component
    # of the model path, and the maximum sequence length.
    model_name = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cache_name = 'cached_{}_{}_{}_{}'.format(mode, args.task, model_name, args.max_seq_len)
    cached_features_file = os.path.join(args.data_dir, cache_name)
    print(cached_features_file)

    # NOTE: the trailing `and False` disables cache loading on purpose in
    # this notebook, so features are always rebuilt from the dataset files.
    cache_hit = os.path.exists(cached_features_file) and False
    if not cache_hit:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode not in ("train", "dev", "test"):
            raise Exception("For mode, Only train, dev, test is available")
        examples = processor.get_examples(mode)

        # args.ignore_index is used as the padding label id: torch's cross
        # entropy skips targets equal to its ignore index, so only real
        # label ids contribute to the loss later.
        features = convert_examples_to_features(examples,
                                                args.max_seq_len,
                                                tokenizer,
                                                pad_token_label_id=args.ignore_index)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)

    def _as_long_tensor(field):
        # Stack one attribute of every feature object into a long tensor.
        return torch.tensor([getattr(f, field) for f in features], dtype=torch.long)

    # TensorDataset bundles the tensors sample-wise (similar to zip); they
    # must agree in their first dimension.
    return TensorDataset(
        _as_long_tensor("input_ids"),
        _as_long_tensor("attention_mask"),
        _as_long_tensor("token_type_ids"),
        _as_long_tensor("intent_label_id"),
        _as_long_tensor("slot_labels_ids"),
    )

In [38]:
# 这一步涉及到不同模型的tokenizer

from transformers import BertConfig, DistilBertConfig, AlbertConfig
from transformers import BertTokenizer, DistilBertTokenizer, AlbertTokenizer

from JointBERT.model import JointBERT, JointDistilBERT, JointAlbert




# model type -> (config class, joint model class, tokenizer class)
MODEL_CLASSES = {
    'bert': (BertConfig, JointBERT, BertTokenizer),
    'distilbert': (DistilBertConfig, JointDistilBERT, DistilBertTokenizer),
    'albert': (AlbertConfig, JointAlbert, AlbertTokenizer)
}

# model type -> value later passed to from_pretrained() as model_name_or_path
MODEL_PATH_MAP = {
    'bert': 'resources/bert_base_uncased',
    'distilbert': 'distilbert-base-uncased',
    'albert': 'albert-xxlarge-v1'
}

def load_tokenizer(args):
    """
    Instantiate the tokenizer that belongs to ``args.model_type``.

    Args:
        args: object with ``model_type`` (a key of ``MODEL_CLASSES``) and
            ``model_name_or_path`` (passed to ``from_pretrained``).

    Returns:
        The tokenizer returned by the tokenizer class' ``from_pretrained``.
    """
    # The tokenizer class is the third entry of the MODEL_CLASSES tuple.
    tokenizer_cls = MODEL_CLASSES[args.model_type][2]
    return tokenizer_cls.from_pretrained(args.model_name_or_path)

In [39]:
# Build the arguments first (stand-in for an argparse namespace).
class Args():
    task =  None
    data_dir =  None
    intent_label_file =  None
    slot_label_file =  None

args = Args()
args.task = "atis"
args.data_dir = "./data"
args.intent_label_file = "intent_label.txt"
args.slot_label_file = "slot_label.txt"
args.max_seq_len = 50  # every feature sequence is truncated/padded to this length
args.model_type = "bert"
args.model_dir = "experiments/jointbert_0"
args.model_name_or_path = MODEL_PATH_MAP[args.model_type]

args.ignore_index = -100   
# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
# i.e. the label value that is skipped automatically when the cross entropy is computed

args.train_batch_size = 4

# Load the tokenizer and build the features of the train split.
tokenizer = load_tokenizer(args)
load_and_cache_examples(args, tokenizer, mode="train")

./data\cached_train_atis_bert_base_uncased_50
*** Example ***
guid: train-0
tokens: [CLS] i want to fly from baltimore to dallas round trip [SEP]
input_ids: 101 1045 2215 2000 4875 2013 6222 2000 5759 2461 4440 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
intent_label: 12 (id = 12)
slot_labels: -100 2 2 2 2 2 73 2 114 98 99 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
*** Example ***
guid: train-1
tokens: [CLS] round trip fares from baltimore to philadelphia less than 1000 dollars round trip fares from denver to philadelphia less than 1000 dollars round trip fares from pittsbu

input_ids: 101 2265 2033 2035 2034 2465 7597 2013 5759 2000 6222 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
intent_label: 4 (id = 4)
slot_labels: -100 2 2 2 29 30 2 2 73 2 114 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
*** Example ***
guid: train-87
tokens: [CLS] is there a flight between san francisco and boston with a stop ##over in dallas fort worth [SEP]
input_ids: 101 2003 2045 1037 3462 2090 2624 3799 1998 3731 2007 1037 2644 7840 1999 5759 3481 4276 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

<torch.utils.data.dataset.TensorDataset at 0x207f6ea98e0>

## 5 PyTorch 的 DataLoader 加载函数

这里是 PyTorch DataLoader 的 pipeline 固定写法，同学们需要了解清楚`执行流程`

In [16]:
# Build the datasets of all three splits with the same tokenizer.
tokenizer = load_tokenizer(args)
train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

# Sampler shipped with torch; it yields the sample indices in random order.
train_sampler = RandomSampler(train_dataset)
# The DataLoader groups the sampled items into batches.
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

device = "cpu"
for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch) # move every tensor of the batch to `device`
    # Tensor order follows the TensorDataset built in load_and_cache_examples.
    inputs = {"input_ids": batch[0],
              "attention_mask": batch[1],
              "token_type_ids": batch[2],
              "intent_label_ids": batch[3],
              "slot_labels_ids": batch[4]}
    
    # Show the tensors and their shapes for the first batch only.
    if step == 0:
        print(inputs["input_ids"], inputs["input_ids"].shape)
        print(inputs["attention_mask"], inputs["attention_mask"].shape)
        print(inputs["token_type_ids"], inputs["token_type_ids"].shape)
        print(inputs["intent_label_ids"], inputs["intent_label_ids"].shape)
        print(inputs["slot_labels_ids"], inputs["slot_labels_ids"].shape)
    
    # How the batch would be fed to the model:
    # outputs = self.model(**inputs)

./data\cached_train_atis_bert_base_uncased_50
./data\cached_dev_atis_bert_base_uncased_50
./data\cached_test_atis_bert_base_uncased_50
tensor([[ 101, 1045, 1005, 1040, 2066, 1037, 3462, 2013, 5111, 2103, 2000, 3050,
         3349, 2008, 8480, 1999, 3050, 3349, 1999, 1996, 2397, 5027,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [ 101, 1045, 2342, 1037, 3462, 2013, 7797, 2000, 2624, 4560, 2975, 2044,
         1020, 1999, 1996, 3944,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [ 101, 2265, 2033, 1996, 7599, 2077, 1022, 2572, 2006, 2257, 2117, 2013,
         3731, 2000, 7573, 2006, 7160,  102,    0,    0,    0,    0,    0,    0,
           