In [None]:
from transformers import T5Model, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup, AdamW
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook
import jsonlines

In [None]:
# %%time
# !python -m language.totto.baseline_preprocessing.preprocess_data_main --input_path="./datasets/totto_data/totto_train_data.jsonl" --output_path="./datasets/processed_train.jsonl"

In [None]:
# %%time
# !python -m language.totto.baseline_preprocessing.preprocess_data_main --input_path="./datasets/totto_data/totto_dev_data.jsonl" --output_path="./datasets/processed_dev.jsonl"

In [None]:
# %%time
# !python -m language.totto.baseline_preprocessing.preprocess_data_main --input_path="./datasets/totto_data/unlabeled_totto_test_data.jsonl" --output_path="./datasets/processed_test.jsonl"

In [None]:
# The tokenizer and the seq2seq model used below to generate questions are both
# restored from the same local checkpoint directory, `text_qgen/`.
multi_qgen_model = T5ForConditionalGeneration.from_pretrained('text_qgen/')
t5_tokenizer = T5Tokenizer.from_pretrained('text_qgen/')

In [None]:
%%capture
# Move the model onto the GPU. `.to()` is the last expression of the cell, so its
# return value would normally be echoed as cell output; the %%capture cell magic
# (which has to remain the first line of the cell) keeps that output hidden.
multi_qgen_model.to('cuda')

In [None]:
def encode_(text_to_encode:str):
    """Tokenize a single string with the notebook-level `t5_tokenizer`.

    Args:
        text_to_encode: Raw text to run through the tokenizer.

    Returns:
        The `input_ids` attribute of the tokenizer output, requested in
        PyTorch format (`return_tensors='pt'`).
    """
    encoded = t5_tokenizer(text_to_encode, return_tensors='pt')
    return encoded.input_ids

In [None]:
# Column names used for the CSV exports, plus the two row accumulators that the
# generation loops fill with [subtable text, generated question] pairs.
cols = ['subtable_text', 'label']
train_data, test_data = [], []

In [None]:
# Generate one question per annotated sentence of the processed training split,
# writing an enriched copy of every example to TabQGen_train.jsonl and collecting
# [subtable text, question] rows in `train_data`.
#
# The output file is opened with 'w', i.e. rewritten from scratch on every run, so
# the in-memory rows are reset here as well. Without this, re-running the cell
# would append a second copy of every row to `train_data`, and the CSV built from
# it would no longer match the JSONL file.
train_data = []
ct = 0  # number of examples processed; stays 0 if the input file is empty

# Fields copied unchanged from each processed example into the output record
# (the order here is the key order of the written JSON objects).
train_fields = (
    'table',
    'table_webpage_url',
    'table_page_title',
    'table_section_title',
    'table_section_text',
    'highlighted_cells',
    'example_id',
)

# The tokenized input must sit on the same device as the model. Reading the
# device from the model avoids a mismatch when the model is not on plain 'cuda'.
device = multi_qgen_model.device

with jsonlines.open('./datasets/processed_train.jsonl') as inpf, jsonlines.open('./datasets/TabQGen_train.jsonl', 'w') as outf:
    # `total` only sizes the progress bar; 120761 is presumably the number of
    # training examples -- TODO confirm. Progress is reported by the bar itself.
    for ct, obj in enumerate(tqdm_notebook(inpf, total=120761), start=1):
        wr_obj = {field: obj[field] for field in train_fields}
        wr_obj['sentence_annotations'] = []
        subtable_txt = obj['subtable_str']

        for s in obj['sentence_annotations']:
            # Shallow copy so the added 'question' key does not touch the input record.
            ann_obj = s.copy()
            # The final sentence is passed as both the answer and the context of the prompt.
            inp_txt = f"qgen answer: {s['final_sentence']} context: {s['final_sentence']}"
            generated = multi_qgen_model.generate(encode_(inp_txt).to(device), num_beams=4, max_length=512)
            # A single input is generated at a time, so take the only decoded string.
            q = t5_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
            train_data.append([subtable_txt, q])
            ann_obj['question'] = q
            wr_obj['sentence_annotations'].append(ann_obj)
        outf.write(wr_obj)

In [None]:
# Turn the accumulated [subtable text, question] rows into a two-column frame
# and persist it; the default row index is written along with the data.
train_tabqgen = pd.DataFrame(train_data, columns=cols)
train_tabqgen.to_csv(path_or_buf='./datasets/tabqgen_train.csv')

In [None]:
# Generate one question per annotated sentence of the processed dev split. The
# enriched examples are written to TabQGen_test.jsonl and the
# [subtable text, question] rows are collected in `test_data`, i.e. the dev
# split serves as this notebook's test set.
#
# The output file is opened with 'w', i.e. rewritten from scratch on every run, so
# the in-memory rows are reset here as well. Without this, re-running the cell
# would append a second copy of every row to `test_data`, and the CSV built from
# it would no longer match the JSONL file.
test_data = []
ct = 0  # number of examples processed; stays 0 if the input file is empty

# Fields copied unchanged from each processed example into the output record
# (the order here is the key order of the written JSON objects).
dev_fields = (
    'table',
    'table_webpage_url',
    'table_page_title',
    'table_section_title',
    'table_section_text',
    'highlighted_cells',
    'example_id',
    'overlap_subset',
)

# The tokenized input must sit on the same device as the model. Reading the
# device from the model avoids a mismatch when the model is not on plain 'cuda'.
device = multi_qgen_model.device

with jsonlines.open('./datasets/processed_dev.jsonl') as inpf, jsonlines.open('./datasets/TabQGen_test.jsonl', 'w') as outf:
    # `total` only sizes the progress bar; 7700 is presumably the number of dev
    # examples -- TODO confirm. Progress is reported by the bar itself.
    for ct, obj in enumerate(tqdm_notebook(inpf, total=7700), start=1):
        wr_obj = {field: obj[field] for field in dev_fields}
        wr_obj['sentence_annotations'] = []
        subtable_txt = obj['subtable_str']

        for s in obj['sentence_annotations']:
            # Shallow copy so the added 'question' key does not touch the input record.
            ann_obj = s.copy()
            # The final sentence is passed as both the answer and the context of the prompt.
            inp_txt = f"qgen answer: {s['final_sentence']} context: {s['final_sentence']}"
            generated = multi_qgen_model.generate(encode_(inp_txt).to(device), num_beams=4, max_length=512)
            # A single input is generated at a time, so take the only decoded string.
            q = t5_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
            test_data.append([subtable_txt, q])
            ann_obj['question'] = q
            wr_obj['sentence_annotations'].append(ann_obj)
        outf.write(wr_obj)

In [None]:
# Turn the accumulated [subtable text, question] rows into a two-column frame
# and persist it; the default row index is written along with the data.
test_tabqgen = pd.DataFrame(test_data, columns=cols)
test_tabqgen.to_csv(path_or_buf='./datasets/tabqgen_test.csv')