# Download Datasets

In [1]:
from itertools import count
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from pyarrow.dataset import dataset

## Load all QA pairs

In [2]:
# Fetch the "qa" configuration (train split) of the cardiffnlp/databench dataset.
all_qa = load_dataset(
    "cardiffnlp/databench",
    name="qa",
    split="train",
)

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

In [3]:
# Dimensions of the loaded QA split as (rows, columns).
all_qa.shape

(1308, 7)

In [4]:
# Display the Dataset summary: its feature names and row count.
all_qa

Dataset({
    features: ['question', 'answer', 'type', 'columns_used', 'column_types', 'sample_answer', 'dataset'],
    num_rows: 1308
})

## Save the QA pairs as **all_qa.csv**

In [17]:
# Convert the loaded Dataset to a pandas DataFrame
all_qa_df = pd.DataFrame(all_qa)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (e.g. on a fresh checkout).
all_qa_csv_path = Path("../data/raw/all_qa.csv")
all_qa_csv_path.parent.mkdir(parents=True, exist_ok=True)

# Save as a CSV file, leaving out the positional index column
all_qa_df.to_csv(all_qa_csv_path, index=False)

In [28]:
# Preview the converted DataFrame (pandas renders a truncated view of large frames).
all_qa_df

Unnamed: 0,question,answer,type,columns_used,column_types,sample_answer,dataset
0,Is the person with the highest net worth self-...,True,boolean,"[finalWorth, selfMade]","[number[uint32], boolean]",False,001_Forbes
1,Does the youngest billionaire identify as male?,True,boolean,"[age, gender]","[number[UInt8], category]",True,001_Forbes
2,Is the city with the most billionaires in the ...,True,boolean,"[city, country]","[category, category]",True,001_Forbes
3,Is there a non-self-made billionaire in the to...,True,boolean,"[rank, selfMade]","[number[uint16], boolean]",False,001_Forbes
4,Does the oldest billionaire have a philanthrop...,False,boolean,"[age, philanthropyScore]","[number[UInt8], number[UInt8]]",False,001_Forbes
...,...,...,...,...,...,...,...
1303,Which 3 countries made the least transactions?,"['Saudi Arabia', 'Bahrain', 'Czech Republic']",list[category],[Country],[category],"['United Kingdom', 'Australia']",065_RFM
1304,What are the invoice numbers of the top 5 tran...,"['581483', '541431', '578841', '542504', '5730...",list[number],"[InvoiceNo, Quantity]","[number[UInt32], number[int32]]","['548005', '555200', '573399', '577076', '5689...",065_RFM
1305,What are the stock codes of the bottom 4 trans...,"[23843, 21366, 23005, 23005, 84347]",list[number],"[InvoiceNo, Quantity]","[number[UInt32], number[int32]]","['22465', '21109', '82484', '22799', '82600']",065_RFM
1306,What are the six most commonly ordered quantit...,"[1, 2, 12, 6, 4, 3]",list[number],[Quantity],[number[int32]],"[1, 4, 2, 12, 24, 3]",065_RFM


In [15]:
# List the column names of the QA DataFrame.
all_qa_df.columns

Index(['question', 'answer', 'type', 'columns_used', 'column_types',
       'sample_answer', 'dataset'],
      dtype='object')

## Download dependent datasets

In [38]:
# Collect the distinct dataset identifiers referenced by the QA pairs.
# Series.unique() keeps values in order of first appearance, which the
# download cells rely on when slicing this list.
dependent_datasets = all_qa_df["dataset"].unique().tolist()

### 1. Download the first 5 datasets: '001_Forbes', '002_Titanic', '003_Love', '004_Taxi', '005_NYC'

In [34]:
# Download the first five dependent datasets and store each one as a CSV
# under ../data/raw/datasets/.
datasets_dir = Path("../data/raw/datasets")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before the first write.
datasets_dir.mkdir(parents=True, exist_ok=True)

for ds_id in dependent_datasets[0:5]:
    # Each dataset is published as a single parquet file named all.parquet.
    df = pd.read_parquet(f"hf://datasets/cardiffnlp/databench/data/{ds_id}/all.parquet")
    df.to_csv(datasets_dir / f"{ds_id}.csv", index=False)