In [1]:
!pip install openai
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.2-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Col

In [2]:
import os
import openai
import torch
from datasets import load_dataset
class BoolQADataset(torch.utils.data.Dataset):
    """
    Map-style torch dataset over parallel sequences of BoolQ passages,
    questions and answers. Each item is tokenized on access with the
    tokenizer supplied at construction time.
    """

    def __init__(self, passages, questions, answers, tokenizer, max_len):
        self.passages, self.questions, self.answers = passages, questions, answers
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is defined by the number of answers.
        return len(self.answers)

    def __getitem__(self, index):
        """
        Build the model inputs for one example (called by the DataLoader).

        :param index: position of the example in the underlying sequences
        :return: dict with 'input_ids' and 'attention_mask' (first row of the
                 tokenizer output) and 'labels' (the answer as a long tensor)
        """
        passage_text = str(self.passages[index])
        question_text = self.questions[index]
        label = self.answers[index]

        # The question goes first so that, when the passage is too long,
        # truncation cuts the passage rather than the question.
        model_input = " [SEP] ".join((question_text, passage_text))

        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_len,
            "return_token_type_ids": False,
            "return_attention_mask": True,
            "return_tensors": "pt",
            "padding": "max_length",
            "truncation": True,
        }
        encoded = self.tokenizer.encode_plus(model_input, **tokenizer_options)

        # The tokenizer returns a batch holding a single example; take row 0.
        # The attention mask tells the model which positions are padding.
        item = {field: encoded[field][0] for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(label, dtype=torch.long)  # answer (yes/no) as an integer label
        return item

In [3]:
# Download the BoolQ dataset and shuffle it.
print("Loading the dataset ...")
dataset = load_dataset("boolq")
# Shuffle with a fixed seed: later cells pick examples by position, so an
# unseeded shuffle would select different examples on every kernel restart.
dataset = dataset.shuffle(seed=42)

Loading the dataset ...


Downloading builder script:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading and preparing dataset boolq/default to /root/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Dataset boolq downloaded and prepared to /root/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [43]:
import json
import requests
# Read the Hugging Face API token from the HF_API_TOKEN environment variable
# instead of hardcoding it: a token written into a notebook is exposed to
# everyone who can read the file.
# NOTE: the token that was previously committed on this line must be treated
# as leaked and revoked in the Hugging Face account settings.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")

headers = {"Content-Type": "application/json"}
if HF_API_TOKEN:
    headers["Authorization"] = f"Bearer {HF_API_TOKEN}"
else:
    # No token available: leave the Authorization header out rather than
    # sending a malformed "Bearer " value, and tell the reader why requests
    # may be refused.
    print("WARNING: HF_API_TOKEN is not set; Inference API requests will be sent without authentication.")

API_URL = "https://api-inference.huggingface.co/models/bigscience/bloomz"

def query(payload, timeout=120):
    """
    POST `payload` as JSON to API_URL and return the decoded JSON response.

    :param payload: JSON-serializable request body
    :param timeout: seconds to wait for the server before giving up; without
                    a timeout a stalled connection would block the notebook
                    indefinitely
    :return: the parsed JSON body of the response, whatever its HTTP status
    :raises RuntimeError: if the response body is not valid JSON
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    try:
        return response.json()
    except ValueError as exc:
        # Report the status and the start of the body instead of a bare
        # JSON decoding error, so the cause of the failure is visible.
        raise RuntimeError(
            f"Inference API returned a non-JSON response "
            f"(HTTP {response.status_code}): {response.text[:200]!r}"
        ) from exc


# Select the 8 training examples
# NOTE: these labelled examples used to be POSTed to the Inference API one by
# one with an extra "targets" field, and the responses were thrown away. The
# endpoint only generates text for the "inputs" it receives, so those requests
# could not train or otherwise change the model; they have been removed.
# TODO: to actually condition the model on these examples, prepend them to
# each test prompt as demonstrations (check the endpoint's input-length limit
# first).
train_data_1 = dataset['train'][:5]
train_data_2 = dataset['train'][11:14]

# Select the 100 examples used for testing.
test_data = dataset['train'][15:115]

# NOTE(review): `answer` is never filled in this cell; it is kept only in case
# a later cell refers to it.
answer = []

# Raw API responses, in the same order as the examples in `test_data`.
responses = []
for passage, question in zip(test_data['passage'], test_data['question']):
    prompt = "Context:" + passage + "Question：" + question
    data = query({"inputs": prompt})
    responses.append(data)
    print(data)



[{'generated_text': 'Context:The center contact of the bulb typically connects to the medium-power filament, and the ring connects to the low-power filament. Thus, if a 3-way bulb is screwed into a standard light socket that has only a center contact, only the medium-power filament operates. In the case of the 50 W / 100 W / 150 W bulb, putting this bulb in a regular lamp socket will result in it behaving like a normal 100W bulb.Question：do 3 way light bulbs work in any lamp socket that has only a center contact yes'}]
[{'generated_text': "Context:Gun show loophole, gun law loophole, Brady law loophole (or Brady bill loophole), private sale loophole, and private sale exemption in the United States is the sale of firearms by private sellers, including those done at gun shows, dubbed the ``secondary market''. A loophole in federal law exists, under which ``any person may sell a firearm to an unlicensed resident of the state where they reside, as long as they do not know or have reasonabl