In [1]:
import os
import re
import random
import json

from pymongo import MongoClient

In [72]:
if not os.path.exists('clm_train.txt') or not os.path.exists('clm_valid.txt'):
    c = MongoClient()
    train_file = open('clm_train.txt', 'w')
    test_file = open('clm_valid.txt', 'w')
    for doc in c.article.crawl.find(projection=['summary']):
        if random.random() < 0.1:
            test_file.write(doc['summary'])
        train_file.write(doc['summary'])      

In [2]:
from datasets import load_dataset
datasets = load_dataset("text", data_files={"train": 'clm_valid.txt', "validation": 'clm_valid.txt'})

Using custom data configuration default-398c787d8b9b2cbf
Reusing dataset text (/home/min/.cache/huggingface/datasets/text/default-398c787d8b9b2cbf/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [3]:
model_checkpoint = "hfl/chinese-xlnet-base"

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [6]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [7]:
block_size = 128

In [8]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [10]:
tokenizer.decode(lm_datasets["train"][0]["input_ids"])

'新华社北京8月1日电为全面展现和生动反映以习近平同志为核心的党中央团结带领全党全国各族人民顽强奋斗、如期全面建成小康社会的伟大历程和辉煌成就,由中央宣传部指导、中央广播电视总台承制的5集电视专题片《人民的小康》,将于2日起在中央电视台综合频道黄金时段播出。据介绍,该片分为《一诺千钧》《脱贫攻坚》《民生福祉》《美好生活》《关键一步》等5个篇章,充分展现以习近平同志为核心的党中央'

In [84]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [11]:
def textgen():
    prefix = '在所有的木偶表演中，提线木偶难度最大。提线木偶是演员自上而下以数十条丝线操纵木偶表演的艺术。演员必须熟练掌握10多种理线技巧和30多种组织提线以表演各个行当、各种动作的“线规”，才有资格走上舞台。'
    prompt_text = prefix + '我希望'
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt')
    output_sequences = model.generate(
        encoded_prompt, 
        max_length=20+len(encoded_prompt[0]),
        temperature=0.1,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        do_sample=True,
    )
    print(tokenizer.decode(output_sequences[0], clean_up_tokenization_spaces=True))

In [12]:
textgen()

在所有的木偶表演中,提线木偶难度最大。提线木偶是演员自上而下以数十条丝线操纵木偶表演的艺术。演员必须熟练掌握10多种理线技巧和30多种组织提线以表演各个行当、各种动作的“线规”,才有资格走上舞台。我希望,在我看来,提线木偶是个好演员。 好演员,是个


In [1]:
from transformers import Trainer, TrainingArguments

In [2]:
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01
)

  return torch._C._cuda_getDeviceCount() > 0


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 8094
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3036


Epoch,Training Loss,Validation Loss


In [21]:
textgen()

In [2]:
import torch
torch.cuda.is_available()

True

In [11]:
print(lm_datasets['train'][0])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [19, 21222, 528, 69, 25, 41, 29, 910, 39, 2681, 14027, 24, 153, 1004, 6147, 37, 17163, 6796, 39, 2756, 20, 654, 432, 10231, 7905, 213, 654, 1040, 291, 584, 744, 22206, 1092, 13987, 2324, 21, 174, 449, 2681, 2907, 96, 548, 1068, 20, 14582, 2710, 976, 24, 6053, 17389, 4425, 17, 53, 432, 16505, 5625, 21, 432, 17917, 682, 200, 1252, 8693, 56, 300, 2671, 20376, 463, 36, 744, 3081, 548, 31, 17, 134, 72, 45, 1947, 23, 19158, 3799, 3530, 7170, 17687, 1224, 18, 4490, 5728, 17, 11607, 2554, 36, 59, 844, 850, 14150, 12294, 4061, 9494, 1944, 3811, 12294, 11675, 512, 23159, 12294, 17040, 

In [13]:
print(tokenized_datasets['train'][0])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [19, 21222, 528, 69, 25, 41, 29, 910, 39, 2681, 14027, 24, 153, 1004, 6147, 37, 17163, 6796, 39, 2756, 20, 654, 432, 10231, 7905, 213, 654, 1040, 291, 584, 744, 22206, 1092, 13987, 2324, 21, 174, 449, 2681, 2907, 96, 548, 1068, 20, 14

In [14]:
datasets['train'][0]

{'text': '\t新华社北京8月1日电为全面展现和生动反映以习近平同志为核心的党中央团结带领全党全国各族人民顽强奋斗、如期全面建成小康社会的伟大历程和辉煌成就，由中央宣传部指导、中央广播电视总台承制的5集电视专题片《人民的小康》，将于2日起在中央电视台综合频道黄金时段播出。据介绍，该片分为《一诺千钧》《脱贫攻坚》《民生福祉》《美好生活》《关键一步》等5个篇章，充分展现以习近平同志为核心的党中央对决胜全面建成小康社会的战略擘画和重大部署，全方位呈现全面建成小康社会的历史性成就，立体化反映老百姓的幸福小康生活和昂扬精神风貌。据悉，该片创作坚持以思想引领为主线，以具体事例为主体，以普通人物为主角，力求体现思想性和生动性有机统一。摄制团队深入全国各地，拍摄了许多感人故事、清新画面，真实展现了中华大地实现全面小康的新面貌、新气象。'}

In [20]:
tokenizer.encode('<sep>')

[4, 4, 3]

In [73]:
datasets = load_dataset("text", data_files={"train": 'clm_train.txt', "validation": 'clm_valid.txt'}, streaming=True)
tokenized_datasets = datasets['train'].map(tokenize_function, batched=True)
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
)

Using custom data configuration default-cb1381deea317b8a


In [74]:
next(iter(datasets['train']))

{'text': ''}

In [75]:
print(next(iter(tokenized_datasets))['input_ids'])

[4, 3]


In [76]:
print(next(iter(lm_datasets))['input_ids'])

[4, 3, 19, 654, 20, 3497, 44, 3377, 17, 2136, 69, 22, 87, 20, 3428, 13987, 2324, 17, 64, 10311, 2542, 17, 159, 174, 449, 600, 84, 2603, 4061, 9494, 1944, 3811, 4132, 2361, 17, 15371, 421, 17124, 18320, 5111, 4167, 17, 8372, 35, 10068, 15371, 18, 8372, 10068, 15371, 657, 17, 7308, 202, 17123, 17799, 12070, 4061, 9494, 1944, 3811, 6712, 203, 12208, 1702, 1106, 2774, 9602, 740, 18177, 258, 17, 908, 4061, 9494, 5161, 2758, 9567, 2012, 21, 18067, 219, 119, 3428, 18, 544, 29, 17, 22695, 1096, 1506, 10372, 198, 1493, 137, 28, 2118, 614, 168, 2662, 7841, 4061, 9494, 1944, 3811, 66, 12208, 1702, 1106, 2774, 9602, 740, 1205, 357, 20376, 1022, 4286, 18, 341, 614, 23, 4311, 4584, 10568, 2177, 220, 76, 17]


In [77]:
tokenizer.decode(next(iter(lm_datasets))["input_ids"])

'<sep><cls> 党的十八大以来,经过8年多的持续奋斗,到2020年底,中国如期完成新时代脱贫攻坚目标任务,贫困地区落后面貌根本改变,消除了绝对贫困。消除绝对贫困之后,关键要做好巩固拓展脱贫攻坚成果同乡村振兴有效衔接各项工作,让脱贫基础更加稳固、成效更可持续。近日,课题组赴重庆市石柱县中益乡就如何推进脱贫攻坚与乡村振兴有效衔接问题进行专题调研。该乡在融创中国的助力下,'

In [62]:
print(next(iter(lm_datasets)))

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [19, 3799, 3466, 17, 10779, 22, 254, 266, 10448, 578, 455, 9594, 2713, 66, 1323, 1836, 2348, 1284, 6535, 12154, 17, 1706, 1084, 2761, 16529, 17, 2720, 455, 3654, 16628, 24, 999, 2078, 6163, 5368, 20, 8770, 18, 9612, 11920, 19694, 1359, 57, 29, 262, 17, 6592, 421, 20359, 578, 455, 1084, 144, 17, 52, 5476, 5794, 5000, 7119, 2758, 14032, 18, 4, 3, 19, 65, 2713, 1218, 14624, 421, 17, 14297, 978, 1753, 596, 9803, 281, 305, 17, 3428, 17753, 1226, 29, 254, 6670, 11454, 627, 15361, 18, 140, 15361, 1299, 3377, 17, 15467, 27, 919, 192, 4577, 15381, 18, 2262, 1187, 120, 1740, 1901, 21, 

In [60]:
print(lm_datasets['train'][0])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [19, 3799, 3466, 17, 10779, 22, 254, 266, 10448, 578, 455, 9594, 2713, 66, 1323, 1836, 2348, 1284, 6535, 12154, 17, 1706, 1084, 2761, 16529, 17, 2720, 455, 3654, 16628, 24, 999, 2078, 6163, 5368, 20, 8770, 18, 9612, 11920, 19694, 1359, 57, 29, 262, 17, 6592, 421, 20359, 578, 455, 1084, 144, 17, 52, 5476, 5794, 5000, 7119, 2758, 14032, 18, 4, 3, 19, 65, 2713, 1218, 14624, 421, 17, 14297, 978, 1753, 596, 9803, 281, 305, 17, 3428, 17753, 1226, 29, 254, 6670, 11454, 627, 15361, 18, 140, 15361, 1299, 3377, 17, 15467, 27, 919, 192, 4577, 15381, 18, 2262, 1187, 120, 1740, 1901, 21, 

In [81]:
gen = iter(lm_datasets)
count = 0
for data in gen:
    count += 1
print(count)

In [85]:
model

XLNetLMHeadModel(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwi

In [5]:
def train_from_scratch():
    vocab_file = 'vocab.json'
    train_file = 'clm_train.txt'
    
    if not os.path.exists(vocab_file):
        char_set = set()
        for line in open(train_file):
            line = re.sub(r'\s', '', line)
            for char in line:
                char_set.add(char)
        
        vocab_dict = {'id2token': {}, 'token2id': {}}
        idx = 0
        for char in char_set:
            vocab_dict['id2token'][idx] = char
            vocab_dict['token2id'][char] = idx
            idx += 1
            
        json.dump(vocab_dict, open(vocab_file, 'w'), ensure_ascii=False)
    
    tokenizer = myTokenizer(vocab_file)
    return tokenizer

In [6]:
class myTokenizer:
    def __init__(self, vocab_file):
        vocab_dict = json.load(open(vocab_file))
        self.id2token = vocab_dict['id2token']
        self.token2id = vocab_dict['token2id']
        self.bos_token_id = len(self.id2token) # 对应\t
        self.id2token[self.bos_token_id] = '\t'
        self.token2id['\t'] = self.bos_token_id
    
    @property
    def size(self):
        return len(self.id2token)
    
    def __call__(self, text):
        return self.encode(text)
    
    def encode(self, text):
        data = {'input_ids': [], 'attention_mask': []}
        if isinstance(text, str):
            for char in text:
                token_id = self.token2id.get(char)
                if token_id:
                    data['input_ids'].append(token_id)
                    data['attention_mask'].append(1)
        elif isinstance(text, list):
            for ele in text:
                input_ids = []
                attention_mask = []
                for char in ele:
                    token_id = self.token2id.get(char)
                    if token_id:
                        input_ids.append(token_id)
                        attention_mask.append(1)
                data['input_ids'].append(input_ids)
                data['attention_mask'].append(attention_mask)
        return data
    
    def decode(self, id_list):
        if isinstance(id_list, int):
            id_list = [id_list]
            
        text = ''
        for token_id in id_list:
            if token_id == self.bos_token_id:
                text += '\t'
            else:
                token = self.id2token.get(str(token_id))
                if token:
                    text += token
        return text

In [7]:
tokenizer = train_from_scratch()

In [8]:
tokenizer.encode(['看看是什么', '测试'])

{'input_ids': [[4959, 4959, 3020, 2406, 2758], [5888, 4559]],
 'attention_mask': [[1, 1, 1, 1, 1], [1, 1]]}

In [9]:
tokenizer.decode([6447, 4959, 4959, 3020, 2406, 2758])

'\t看看是什么'

In [10]:
tokenizer.size

6448

In [11]:
def tokenize_function(examples):
    return tokenizer.encode(examples["text"])

datasets = load_dataset("text", data_files={"train": 'clm_valid.txt', "validation": 'clm_valid.txt'})
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Using custom data configuration default-398c787d8b9b2cbf
Reusing dataset text (/home/min/.cache/huggingface/datasets/text/default-398c787d8b9b2cbf/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [13]:
datasets['train'][0]

{'text': '\t新华社北京8月1日电为全面展现和生动反映以习近平同志为核心的党中央团结带领全党全国各族人民顽强奋斗、如期全面建成小康社会的伟大历程和辉煌成就，由中央宣传部指导、中央广播电视总台承制的5集电视专题片《人民的小康》，将于2日起在中央电视台综合频道黄金时段播出。据介绍，该片分为《一诺千钧》《脱贫攻坚》《民生福祉》《美好生活》《关键一步》等5个篇章，充分展现以习近平同志为核心的党中央对决胜全面建成小康社会的战略擘画和重大部署，全方位呈现全面建成小康社会的历史性成就，立体化反映老百姓的幸福小康生活和昂扬精神风貌。据悉，该片创作坚持以思想引领为主线，以具体事例为主体，以普通人物为主角，力求体现思想性和生动性有机统一。摄制团队深入全国各地，拍摄了许多感人故事、清新画面，真实展现了中华大地实现全面小康的新面貌、新气象。'}

In [14]:
print(tokenized_datasets['train'][0])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [19]:
print(lm_datasets['train'][1])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2425, 2135, 5939, 5409, 1217, 3719, 1666, 2887, 2117, 5226, 854, 5022, 4285, 629, 963, 5226, 6417, 2142, 5573, 5284, 963, 5226, 5929, 373, 2043, 1086, 963, 5226, 3731, 3279, 373, 1851, 963, 5226, 1476, 2284, 854, 1556, 963, 4606, 2811, 3527, 2198, 5420, 1217, 5489, 2887, 1336, 4557, 5095, 3021, 4282, 2339, 4590, 1679, 2117, 4845, 4303, 4250, 2141, 2258, 5748, 1534, 972, 3679, 3634, 4328, 3139, 4228, 2079, 943, 5849, 5303, 4250, 6381, 1223, 2030, 5026, 5734, 1541, 2935, 5343, 1552, 1217, 3634, 3180, 4385, 2144, 4557, 3634, 4328, 3139, 4228, 2079, 943, 5849, 5303, 4250, 1525, 