In [75]:
from datasets import load_dataset

# Load the "main" configuration of the openai/gsm8k dataset, using ./dataset as the cache directory.
ds = load_dataset("openai/gsm8k", "main", cache_dir="./dataset")

In [116]:
# Number of examples in the training split.
ds["train"].num_rows

7473

In [76]:
# Print the dataset summary (splits, feature names, row counts).
print(ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})


In [77]:
def data_process(data):
    """
    Split a GSM8K split (or batch) into questions, final answers and full solutions.

    Each entry of ``data["answer"]`` is expected to contain the worked
    reasoning followed by a final part of the form ``#### <result>``.

    Args:
        data: Mapping-like object exposing ``"question"`` and ``"answer"``
            columns as sequences of strings in the same order.

    Returns:
        A tuple ``(question, answer, thought_answer)`` where

        - ``question`` is the ``"question"`` column, unchanged;
        - ``answer`` is a list holding, for each solution, the text after the
          last ``"#### "`` marker with surrounding whitespace removed (the
          whole stripped solution when no marker is present);
        - ``thought_answer`` is the ``"answer"`` column, unchanged, i.e. the
          reasoning together with the final answer.
    """
    question = data["question"]
    thought_answer = data["answer"]
    # Cut only at the last marker, then strip so a trailing newline or space
    # after the result does not end up inside the extracted answer.
    answer = [t.rsplit("#### ", 1)[-1].strip() for t in thought_answer]
    return question, answer, thought_answer


# Run data_process on the training split and keep its three returned sequences.
train_data = ds["train"]
train_question, train_answer, train_thought_answer = data_process(train_data)

In [78]:
# Inspect the extracted final answers.
# NOTE(review): this echoes the entire list; a slice such as train_answer[:10] would keep the output short.
train_answer

['72',
 '10',
 '5',
 '42',
 '624',
 '35',
 '48',
 '16',
 '41',
 '990',
 '121',
 '5',
 '85',
 '35',
 '5',
 '448000',
 '800',
 '43',
 '16',
 '16',
 '38',
 '1080',
 '7',
 '5',
 '62',
 '110',
 '400',
 '400',
 '8',
 '1000',
 '6',
 '1200',
 '10',
 '34',
 '5250',
 '36',
 '15',
 '5',
 '9',
 '15',
 '476',
 '500',
 '99',
 '60',
 '300',
 '99',
 '1920',
 '15',
 '10',
 '48',
 '5',
 '160',
 '5',
 '36',
 '11',
 '75',
 '45',
 '2',
 '320',
 '120',
 '96',
 '200',
 '15',
 '59',
 '840',
 '558',
 '520',
 '6',
 '90',
 '49',
 '19',
 '25',
 '54',
 '3',
 '28',
 '15',
 '768',
 '85',
 '4',
 '70',
 '100',
 '14',
 '700',
 '54',
 '90',
 '5',
 '6',
 '600',
 '258',
 '216',
 '90',
 '10',
 '1825',
 '14000',
 '60',
 '64',
 '126',
 '46',
 '45',
 '3',
 '15',
 '36',
 '25',
 '258',
 '96',
 '320',
 '50',
 '97',
 '41',
 '20',
 '140',
 '6',
 '920',
 '60',
 '15',
 '4000',
 '21',
 '16',
 '13',
 '140',
 '720',
 '6',
 '48',
 '25',
 '18',
 '50',
 '25',
 '55',
 '6',
 '10',
 '55',
 '500',
 '110',
 '9',
 '74',
 '2350',
 '20',
 '6',
 '

In [79]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [80]:
# Load the tokenizer and the pretrained seq2seq model for the google-t5/t5-small checkpoint.
model_name = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [None]:
# Tokenize the first training question into PyTorch tensors for a single-example experiment.
one_input = train_question[0]
one_input_inputs = tokenizer(one_input, return_tensors="pt", add_special_tokens=True)

In [92]:
# Full worked solution (reasoning plus the final "#### <answer>" part) of the first training example.
train_thought_answer[0]

'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'

In [84]:
# Tokenize the first worked solution (used as the labels in the forward pass) and display the encoding.
label = tokenizer(train_thought_answer[0], return_tensors="pt")
label

{'input_ids': tensor([[ 9267,  5434,  1916,  4678, 13311,  3274,     3,     2,  3707, 13311,
          2423,  2266,  3155,  3155,  2266, 16234,    16,   932,     5,  9267,
          5434,  1916,  4678,  1220,  2266,  3274,     3,     2,  3707,  1220,
          2266,  2423,  5865,  3155,  3155,  5865, 16234, 16889,    16,  1186,
            11,   932,     5,     3, 30345, 30345,  9455,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [87]:
# Forward pass on the single example, with the tokenized solution supplied as labels.
# one_input_inputs carries the input_ids and attention_mask produced by the tokenizer.
outputs = model(**one_input_inputs, labels=label["input_ids"])

In [67]:
# Generate from the single tokenized question with the default generation settings, then decode to text.
# NOTE(review): the recorded output stops mid-sentence, whereas the later batched call with
# max_length=128 completes the sentence; presumably the default length cap truncates here (confirm).
generated_ids = model.generate(**one_input_inputs)
decode_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
decode_preds

['Natalia sold clips to 48 of her friends in April, and then she sold half as many']

In [100]:
# Tokenize the first two training questions as one batch of PyTorch tensors,
# with padding and truncation enabled.
inputs = tokenizer(
    train_question[:2],
    add_special_tokens=True,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

In [None]:
# Batched generation for the two tokenized questions with max_length=128.
# inputs carries the input_ids and attention_mask produced by the tokenizer.
generated_ids = model.generate(**inputs, max_length=128)

In [117]:
# Raw generated token ids for the batch.
generated_ids

tensor([[    0,  9267,  5434,  1916, 16234,    12,  4678,    13,   160,   803,
            16,  1186,     6,    11,   258,   255,  1916,   985,    38,   186,
         16234,    16,   932,     5,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [    0, 32099,  1725,  3807,     7, 21487,    46,  1781,    21,  1871,
             7,    23,  6031,     5, 18566,     6,    62,  1725,   131,   410,
           943,   676,    13,  1871,     7,    23,  6031,     5,   571,   231,
           410,   255,  3807,    58,     1]])

In [105]:
# Decode the generated ids back to strings, skipping special tokens, and display them.
decode_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
decode_preds

['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May.',
 'ng earns $12 an hour for babysitting. Yesterday, weng just did 50 minutes of babysitting. How much did she earn?']