In [4]:
import json
from datasets import load_dataset
import numpy as np

dataset_id = "rag-datasets/rag-mini-bioasq"

dataset = load_dataset(dataset_id, "question-answer-passages",trust_remote_code=True)["test"]
corpus = load_dataset(dataset_id, "text-corpus",trust_remote_code=True)["passages"]
print(dataset)
print(corpus)
dataset[0], corpus[0]

Dataset({
    features: ['question', 'answer', 'relevant_passage_ids', 'id'],
    num_rows: 4719
})
Dataset({
    features: ['passage', 'id'],
    num_rows: 40221
})


({'question': 'Is Hirschsprung disease a mendelian or a multifactorial disorder?',
  'answer': "Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model.",
  'relevant_passage_ids': '[20598273, 6650562, 15829955, 15617541, 23001136, 8896569, 21995290, 12239580, 15858239]',
  'id': 0},
 {'passage': 'New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Characteristic morphological, cytological, some physico-chemical \nand biological features of the isolated viruses are described. A possible role \nof these viruses in human and animal health disorders is discussed. The isolated \nviruses remain unclass

In [7]:
id_to_corpus = {e["id"]: e["passage"] for e in corpus}

In [17]:
def build_context(ex):
  contexts = []
  for each in eval(ex["relevant_passage_ids"]):
    ctx = id_to_corpus[each]
    if ctx != "nan":
      contexts.append(ctx)
  #contexts = corpus.filter(lambda x: x["id"] in eval(ex["relevant_passage_ids"]))["passage"]
  return {"context": "\n".join(contexts)}
new_dataset = dataset.map(build_context)
new_dataset[0], new_dataset

({'question': 'Is Hirschsprung disease a mendelian or a multifactorial disorder?',
  'answer': "Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model.",
  'relevant_passage_ids': '[20598273, 6650562, 15829955, 15617541, 23001136, 8896569, 21995290, 12239580, 15858239]',
  'id': 0,
  'context': 'The major gene for Hirschsprung disease (HSCR) encodes the receptor tyrosine \nkinase RET. In a study of 690 European- and 192 Chinese-descent probands and \ntheir parents or controls, we demonstrate the ubiquity of a >4-fold \nsusceptibility from a C-->T allele (rs2435357: p = 3.9 x 10(-43) in European \nancestry; p = 1.1 x 10(-21) in Chinese sampl

In [18]:
from collections import Counter


len(Counter(new_dataset["context"]))

4297

In [19]:
from collections import Counter

n_contexts_for_raft = 100
unique_contexts = Counter(new_dataset["context"])
count = 0
raft_train_ctx = []
raft_test_ctx = []
for k, v in unique_contexts.items():
  if v > 0 and count < n_contexts_for_raft:
    count += 1
    raft_train_ctx.append(k)
  else:
    raft_test_ctx.append(k)

print(len(raft_test_ctx), len(raft_train_ctx))

#raft_train_ctx = unique_contexts[:100]
#raft_test_ctx = unique_contexts[100:]

4197 100


In [16]:
test_dataset = new_dataset.filter(lambda x: x["context"] in raft_test_ctx)
test_dataset

Filter: 100%|██████████| 4719/4719 [00:00<00:00, 21606.32 examples/s]


Dataset({
    features: ['question', 'answer', 'relevant_passage_ids', 'id', 'context'],
    num_rows: 4288
})

In [20]:
from datasets import Dataset

test_dataset_unique_ctx = test_dataset.to_pandas()
test_dataset_unique_ctx = test_dataset_unique_ctx.groupby("context").head(1).reset_index()
test_dataset_unique_ctx = Dataset.from_pandas(test_dataset_unique_ctx)
test_dataset_unique_ctx

Dataset({
    features: ['index', 'question', 'answer', 'relevant_passage_ids', 'id', 'context'],
    num_rows: 4197
})

In [21]:
# Dataset for evaluation raft

# 1070
# 
uploaded_dataset_id = "phatvo/rag-mini-bioasq-raft-test"

test_dataset_unique_ctx.map(lambda x: {"context": ' '.join(x["context"].split())}).push_to_hub(uploaded_dataset_id, split="test")
#test_dataset_unique_ctx.push_to_hub(uploaded_dataset_id, split="test_unique_ctx", private=True)

Map: 100%|██████████| 4197/4197 [00:00<00:00, 5001.89 examples/s]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 27.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.02s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/phatvo/rag-mini-bioasq-raft-test/commit/8530525f217851d1392cbde0ddc04c8b42c3ca49', commit_message='Upload dataset', commit_description='', oid='8530525f217851d1392cbde0ddc04c8b42c3ca49', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
def save_context_to_file(data:list, filepath):
  with open(filepath, "w") as f:
    _data = [each.replace("\n", "") for each in data]
    _data = [' '.join(each.split()) for each in data]
    f.write("\n".join(_data) + "\n")

save_context_to_file(raft_train_ctx, "../data/context_for_generate_bioasq.txt")

In [23]:
with open("../data/context_for_generate_bioasq.txt", "r") as f:
  print(len(f.readlines()))

100


In [16]:
"""

python3 raft.py \
    --datapath "data/context_for_generate_bioasq.txt" \
    --output "tmp/bioasq-test-100-raft" \
    --distractors 3 \
    --doctype txt \
    --chunk_size 512 \
    --questions 1 \
    --openai_key $OPENAI_KEY \
    --completion_model gpt-4o \
    --splitter breakline --p 0.95

"""

'\n\npython3 raft.py     --datapath "data/context_for_generate_cuad.txt"     --output "tmp/cuad-test-50-raft"     --distractors 3     --doctype txt     --chunk_size 512     --questions 1     --openai_key $OPENAI_KEY     --completion_model gpt-4o     --splitter breakline --p 0.95\n\n'

In [None]:
# $env:OPENAI_KEY=

In [24]:
from datasets import load_dataset

ds = load_dataset("json", data_files="../tmp/bioasq-test-100-raft.jsonl", )
ds = ds.map(lambda x: {"text": f"{x['instruction']}\nCoT Answer: {x['cot_answer']}"})
ds

Generating train split: 99 examples [00:00, 3992.46 examples/s]
Map: 100%|██████████| 99/99 [00:00<00:00, 1511.06 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction', 'text'],
        num_rows: 99
    })
})

In [25]:
ds.push_to_hub("phatvo/rag-mini-bioasq-100-raft")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 18.64ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.06s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/phatvo/rag-mini-bioasq-100-raft/commit/c7aa7e752af3ec1310fde2182a3f507512071839', commit_message='Upload dataset', commit_description='', oid='c7aa7e752af3ec1310fde2182a3f507512071839', pr_url=None, pr_revision=None, pr_num=None)