## Setup

In [1]:
from datasets import load_dataset
from typing import List
import pandas as pd

### 1. Download HotpotQA from HuggingFace

In [2]:
df = load_dataset("hotpot_qa", "fullwiki")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [19]:
df['train'][32221]

{'id': '5a8eb1f05542995a26add4e8',
 'question': 'While Trachycarpus is a genus of palms, what is Nemesia a genus of?',
 'answer': 'annuals, perennials and sub-shrubs',
 'type': 'comparison',
 'level': 'easy',
 'supporting_facts': {'title': ['Trachycarpus', 'Nemesia (plant)'],
  'sent_id': [0, 0]},
 'context': {'title': ['Nemesia (plant)',
   'Trachycarpus',
   'Climbing palm',
   'Calyptrocalyx',
   'Trachycarpus oreophilus',
   'Phytelephas',
   'Roystonea',
   'Attalea (palm)',
   'Trachycarpus martianus',
   'Bentinckia'],
  'sentences': [['Nemesia is a genus of annuals, perennials and sub-shrubs which are native to sandy coasts or disturbed ground in South Africa.',
    ' Numerous hybrids have been selected, and the annual cultivars are popular with gardeners as bedding plants.',
    ' In temperate regions the annual cultivars are usually treated as half-hardy bedding plants, sown from seed in heat and planted out after all danger of frost has passed.'],
   ['Trachycarpus is a genu

### 2. Transform JSON format to table 

In [4]:
def generate_chunks(title: List[str], sentences: List[List[str]]) -> dict:
    chunk_dict = {}
    for i, title_text in enumerate(title):
        chunks = [f"{title_text} {sentence}" for sentence in sentences[i]]
        if title_text not in chunk_dict:
            chunk_dict[title_text] = chunks
        else:
            chunk_dict[title_text].extend(chunks)
    return chunk_dict

In [5]:
validation_data = {
    'question_id': [],
    'question': [],
    'answer': [],
    'type': [],
    'level': [],
    'chunks': []  
}

for item in df['validation']:
    try:
        chunks_dict = generate_chunks(
            item['context']['title'],
            item['context']['sentences']
        )
        
        validation_data['question_id'].append(item['id'])
        validation_data['question'].append(item['question'])
        validation_data['answer'].append(item['answer'])
        validation_data['type'].append(item['type'])
        validation_data['level'].append(item['level'])
        validation_data['chunks'].append(chunks_dict)
        
    except KeyError as e:
        print(f"KeyError for question ID {item['id']}: {str(e)}")

validation_df = pd.DataFrame(validation_data)

In [17]:
validation_df.to_csv('validation_data.csv', index=False)