In [2]:
import os

import pandas as pd

from esg_data_pipeline.components.nq_curator import NQCurator
from esg_data_pipeline.components.nq_extractor import NQExtractor
from esg_data_pipeline.config import nq_config as config

## Extraction

The extraction stage converts the raw NQ file from JSONL format to CSV, split into multiple files because the dataset is very large.
<br>The dataset contains both text and table data.
<br>Refer to the following link for information about the NQ dataset.
https://ai.google.com/research/NaturalQuestions

For this demo, we use a sample of the original dataset with 100 samples.
<br> The full dataset can be downloaded from here.
https://ai.google.com/research/NaturalQuestions/download


In [3]:
# Input sample and output folder for the extraction stage, both taken from the project config.
sample_nq_json_file = config.RAW_INPUT_FILE_PATH
out_dir = config.EXTRACTED_FILES_OUTPUT_FOLDER

# Create the output folder first so the clean-up below always has a directory to list.
os.makedirs(out_dir, exist_ok=True)

# Remove stale outputs left by a previous run. This is done in Python rather than
# with `!rm <dir>/*` so it also works for paths containing spaces and on platforms
# without a POSIX shell. Like the shell glob, dot-files and sub-directories are
# left untouched. Unlike the silenced shell command, a file that cannot be deleted
# raises here instead of being silently kept.
for stale_name in os.listdir(out_dir):
    stale_path = os.path.join(out_dir, stale_name)
    is_real_dir = os.path.isdir(stale_path) and not os.path.islink(stale_path)
    if stale_name.startswith(".") or is_real_dir:
        continue
    os.remove(stale_path)

In [4]:
# Build the extractor for the sample file and run it, writing its output under `out_dir`.
nq_extractor = NQExtractor(
    raw_nq_json_file=sample_nq_json_file,
    out_dir=out_dir,
)
nq_extractor.run()

100it [00:00, 2052.55it/s]


In [5]:
!ls $config.EXTRACTED_FILES_OUTPUT_FOLDER

Parsed_NQ_0.csv


Inspecting the CSV file

In [6]:
# Load the first extracted chunk and preview two rows.
# The path is built with os.path.join (as in the curation cells below) instead of a
# hard-coded "/" separator.
df = pd.read_csv(os.path.join(config.EXTRACTED_FILES_OUTPUT_FOLDER, "Parsed_NQ_0.csv"))
df.head(2)

Unnamed: 0.1,Unnamed: 0,document_text,question_text,example_id,yes_no_answer,long_answer_start,long_answer_end,short_answer_start,short_answer_end,other_long_answer_candidates
0,0,Email marketing - Wikipedia <H1> Email marketi...,which is the most common use of opt-in e-mail ...,5655493461695504401,NONE,1952,2019,1960,1969,"[{'start_token': 14, 'top_level': True, 'end_t..."
1,1,The Mother ( How I Met Your Mother ) - wikiped...,how i.met your mother who is the mother,5328212470870865242,NONE,212,310,213,215,"[{'start_token': 28, 'top_level': True, 'end_t..."


## Curation Stage

This stage takes the extracted CSV files from the previous stage as input and curates a dataset comprising positive and negative examples.
<br> The text and table data are separated and saved in two different final CSV files.

In [7]:
# Input (extraction output) and output folders for the curation stage, taken from the project config.
extraction_dir = config.EXTRACTED_FILES_OUTPUT_FOLDER
curation_dir = config.CURATED_FILES_OUTPUT_FOLDER

# Create the output folder first so the clean-up below always has a directory to list.
os.makedirs(curation_dir, exist_ok=True)

# Remove stale outputs left by a previous run. This is done in Python rather than
# with `!rm <dir>/*` so it also works for paths containing spaces and on platforms
# without a POSIX shell. Like the shell glob, dot-files and sub-directories are
# left untouched. Unlike the silenced shell command, a file that cannot be deleted
# raises here instead of being silently kept.
for stale_name in os.listdir(curation_dir):
    stale_path = os.path.join(curation_dir, stale_name)
    is_real_dir = os.path.isdir(stale_path) and not os.path.islink(stale_path)
    if stale_name.startswith(".") or is_real_dir:
        continue
    os.remove(stale_path)

In [8]:
# Build the curator over the extraction output, with both text and table extraction
# enabled, and run it.
nq_curator = NQCurator(
    input_dir=extraction_dir,
    output_dir=curation_dir,
    extract_text=True,
    extract_tables=True,
)
nq_curator.run()

100%|██████████| 54/54 [00:00<00:00, 2324.00it/s]
100%|██████████| 54/54 [00:01<00:00, 48.24it/s] 


In [9]:
!ls $curation_dir

NQ_table_relevance_balanced.csv  NQ_text_relevance_balanced.csv


Inspecting the curated text and table CSV files

In [10]:
# Load the curated text CSV and preview two rows.
text_csv_path = os.path.join(curation_dir, "NQ_text_relevance_balanced.csv")
df_text = pd.read_csv(text_csv_path)
df_text.head(2)

Unnamed: 0.1,Unnamed: 0,question,text,label
0,0,who sold manhattan to the dutch in 1626,Minuit is credited with purchasing the island ...,1
1,1,which is the most common use of opt-in e-mail ...,A common example of permission marketing is a ...,1


In [11]:
# Load the curated table CSV and preview two rows.
table_csv_path = os.path.join(curation_dir, "NQ_table_relevance_balanced.csv")
df_table = pd.read_csv(table_csv_path)
df_table.head(2)

Unnamed: 0.1,Unnamed: 0,question,text,label
0,0,when do the eclipse supposed to take place,<Table> <Tr> <Th> Sr.No . </Th> <Th> Name </Th...,0
1,1,when did the first lego movie come out,<Table> <Tr> <Th> Year </Th> <Th> Title </Th> ...,0
