In [74]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from promptsource.templates import DatasetTemplates
from datasets import load_dataset
import pandas as pd
from torch.utils.data import dataloader


In [2]:
# checkpoint = "bigscience/T0pp"
checkpoint = "bigscience/T0_3B"

model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
)
model.parallelize()
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [3]:
def t0(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt").cuda()
    outputs = model.generate(inputs, max_length=512, temperature=1, top_p=0.8)
    message = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return message


In [4]:
dataset = load_dataset("super_glue", "rte", split="validation")
example = dataset[0]
example


Found cached dataset super_glue (/workspaces/seed/cache/hf_dataset/super_glue/rte/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed)


{'premise': 'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.',
 'hypothesis': 'Christopher Reeve had an accident.',
 'idx': 0,
 'label': 1}

In [9]:
dataset.column_names


['premise', 'hypothesis', 'idx', 'label']

In [8]:
df = pd.DataFrame(dataset)
df.head()


Unnamed: 0,premise,hypothesis,idx,label
0,"Dana Reeve, the widow of the actor Christopher...",Christopher Reeve had an accident.,0,1
1,"Yet, we now are discovering that antibiotics a...",Bacteria is winning the war against antibiotics.,1,0
2,Cairo is now home to some 15 million people - ...,15 million tonnes of rubbish are produced dail...,2,1
3,"The Amish community in Pennsylvania, which num...",Pennsylvania has the biggest Amish community i...,3,1
4,Security forces were on high alert after an el...,Security forces were on high alert after a cam...,4,0


In [14]:
tokenizer.eos_token


'</s>'

In [15]:
tokenizer.pad_token


'<pad>'

In [16]:
tokenizer.padding_side


'right'

In [17]:
tokenizer.is_fast


True

In [19]:
rte = DatasetTemplates("super_glue", "rte")
prompt = rte["MNLI crowdsource"]


In [23]:
i, o = prompt.apply(example)
print(i, "\n", o)


Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Using only the above description and what you know about the world, is "Christopher Reeve had an accident." definitely correct? Yes or no? 
 No


In [27]:
examples = dataset[:8]


In [34]:
column_names = dataset.column_names
bs = len(examples[column_names[0]])
bs


8

In [44]:
ee = pd.DataFrame(examples).to_dict(orient="records")
ee[0]


{'premise': 'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.',
 'hypothesis': 'Christopher Reeve had an accident.',
 'idx': 0,
 'label': 1}

In [46]:
prompt.apply(ee[0])


['Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Using only the above description and what you know about the world, is "Christopher Reeve had an accident." definitely correct? Yes or no?',
 'No']

In [47]:
prompt.get_answer_choices_list(ee[0])


['Yes', 'No']

In [53]:
input_text, target_text, choices_text = [], [], []


In [54]:
for e in ee:
    i, o = prompt.apply(e)
    input_text.append(i)
    target_text.append(o)
    choices_text.append(prompt.get_answer_choices_list(e))


In [56]:
target_text


['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes']

In [59]:
tokenized_inputs = tokenizer(
    input_text,
    padding="max_length",
    max_length=1024,
    truncation=True,
    add_special_tokens=False,
)


In [62]:
tokenized_targets = [
    tokenizer(
        ans_choi,
        padding=False,
        max_length=1024,
        truncation=True,
    )
    for ans_choi in choices_text
]


In [64]:
tokenized_targets


[{'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]},
 {'input_ids': [[2163, 1], [465, 1]], 'attention_mask': [[1, 1], [1, 1]]}]

In [65]:
features = {
            k: [
                [elem for _ in range(len(tokenized_targets[idx]["input_ids"]))]
                for idx, elem in enumerate(v)
            ]
            for k, v in tokenized_inputs.items()
        }

features["labels"] = [
    tokenized_targets[idx]["input_ids"]
    for idx in range(bs)
]
features["labels_attention_mask"] = [
    tokenized_targets[idx]["attention_mask"]
    for idx in range(bs)
]
features["targets"] = [
    choices_text[idx].index(t)
    for idx, t in enumerate(target_text)
]

In [127]:
pd.DataFrame(examples).head()

Unnamed: 0,premise,hypothesis,idx,label
0,"Dana Reeve, the widow of the actor Christopher...",Christopher Reeve had an accident.,0,1
1,"Yet, we now are discovering that antibiotics a...",Bacteria is winning the war against antibiotics.,1,0
2,Cairo is now home to some 15 million people - ...,15 million tonnes of rubbish are produced dail...,2,1
3,"The Amish community in Pennsylvania, which num...",Pennsylvania has the biggest Amish community i...,3,1
4,Security forces were on high alert after an el...,Security forces were on high alert after a cam...,4,0


In [68]:
pd.DataFrame(features).head()

Unnamed: 0,input_ids,attention_mask,labels,labels_attention_mask,targets
0,"[[2744, 9, 419, 15, 162, 6, 8, 23428, 13, 8, 7...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[2163, 1], [465, 1]]","[[1, 1], [1, 1]]",1
1,"[[5201, 6, 62, 230, 33, 17452, 24, 14458, 7, 3...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[2163, 1], [465, 1]]","[[1, 1], [1, 1]]",0
2,"[[28600, 19, 230, 234, 12, 128, 627, 770, 151,...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[2163, 1], [465, 1]]","[[1, 1], [1, 1]]",1
3,"[[37, 736, 1273, 573, 16, 8913, 6, 84, 2302, 8...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[2163, 1], [465, 1]]","[[1, 1], [1, 1]]",1
4,"[[3684, 3859, 130, 30, 306, 5685, 227, 46, 435...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[2163, 1], [465, 1]]","[[1, 1], [1, 1]]",0


In [106]:
def apply_prompt(e):
    i, o = prompt.apply(e)
    return i, o, prompt.get_answer_choices_list(e)

In [134]:
ex_pd = pd.DataFrame(examples)
# apply the prompt to the each row of ex_pd and break it into 2 columns
ex_pd = ex_pd.apply(lambda x: pd.Series(apply_prompt(x)), axis=1)

# rename the columns 0 to input_text, 1 to target_text, 2 to choices_text
ex_pd.columns = ["input_text", "target_text", "choices_text"]
ex_pd


Unnamed: 0,input_text,target_text,choices_text
0,"Dana Reeve, the widow of the actor Christopher...",No,"[Yes, No]"
1,"Yet, we now are discovering that antibiotics a...",Yes,"[Yes, No]"
2,Cairo is now home to some 15 million people - ...,No,"[Yes, No]"
3,"The Amish community in Pennsylvania, which num...",No,"[Yes, No]"
4,Security forces were on high alert after an el...,Yes,"[Yes, No]"
5,"In 1979, the leaders signed the Egypt-Israel p...",Yes,"[Yes, No]"
6,"singer and actress Britney Spears, 24, has fil...",Yes,"[Yes, No]"
7,Following the successful bid to bring the 2010...,Yes,"[Yes, No]"


In [124]:
out = tokenizer(
    ex_pd.input_text.tolist(),
    padding="max_length",
    max_length=1024,
    truncation=True,
    add_special_tokens=False,
)

# make dataframe from out with two columns: input_ids and attention_mask
out_pd = pd.DataFrame(dict(out))
out_pd


Unnamed: 0,input_ids,attention_mask
0,"[2744, 9, 419, 15, 162, 6, 8, 23428, 13, 8, 75...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[5201, 6, 62, 230, 33, 17452, 24, 14458, 7, 33...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[28600, 19, 230, 234, 12, 128, 627, 770, 151, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[37, 736, 1273, 573, 16, 8913, 6, 84, 2302, 81...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[3684, 3859, 130, 30, 306, 5685, 227, 46, 4356...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5,"[86, 15393, 6, 8, 2440, 3814, 8, 10438, 18, 30...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,"[7634, 11, 15676, 12188, 3186, 180, 29892, 6, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,"[6851, 8, 1574, 6894, 12, 830, 8, 2735, 12749,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [125]:
label_out = tokenizer(
        ex_pd.choices_text.tolist(),
        padding=False,
        max_length=1024,
        truncation=True,
    )
pd.DataFrame(dict(label_out)).rename(columns={"input_ids": "labels", "attention_mask": "labels_attention_mask"})

Unnamed: 0,labels,labels_attention_mask
0,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
1,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
2,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
3,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
4,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
5,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
6,"[2163, 1, 465, 1]","[1, 1, 1, 1]"
7,"[2163, 1, 465, 1]","[1, 1, 1, 1]"


In [150]:
ae = pd.DataFrame(features).to_dict(orient="records")[0]
ae['input_ids'] = torch.tensor(ae['input_ids']).cuda()
ae['attention_mask'] = torch.tensor(ae['attention_mask']).cuda()

# delete the labels and labels_attention_mask and targets 
del ae['labels']
del ae['labels_attention_mask']
del ae['targets']

ae

{'input_ids': tensor([[2744,    9,  419,  ...,    0,    0,    0],
         [2744,    9,  419,  ...,    0,    0,    0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [151]:
model(**ae)

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds