In [72]:
import os
os.environ["HTTP_PROXY"] = "http://http.proxy.fmr.com:8000"
os.environ["HTTPS_PROXY"] = "http://http.proxy.fmr.com:8000"
os.environ["https_proxy"] = "http://http.proxy.fmr.com:8000"
import sys
sys.path.append("..")
import tiktoken
import pandas as pd
import pickle
from typing import List
from pathlib import Path
import datasets
from textwrap import dedent
from tqdm.notebook import tqdm
import re
from utils.normalizer import prepare_df_for_neuraldb_from_table

In [67]:
def preprocess_hybridqa_table(table: dict) -> dict:
    """Preprocesses wikitq headers to make them easier to parse in text-to-SQL task.
    TODO: This is causing some encoding issues
    """
    preprocessed_table = {"header": [], "rows": []}
    for v in table["header"]:
        preprocessed_table["header"].append(re.sub(r"(\'|\")", "", v))
    for v in table["rows"]:
        preprocessed_table["rows"].append([re.sub(r"(\'|\")", "", item) for item in v])
    return preprocessed_table

def load_formatted_dataset(dataset_split) -> List[str]:
    formatted_dataset = []
    for item in tqdm(dataset_split):
        document_context = item['passages']['rows']
        table_context = prepare_df_for_neuraldb_from_table(
            preprocess_hybridqa_table(item['table']), add_row_id=False
        )
        question_context = item['question']
        intro = dedent("""This is a hybrid question answering task. The goal of this task is to answer the question given a table (`w`) and corresponding passages (`documents`).
        Be as succinct as possible in answering the given question, do not include explanation.
        """)
        formatted_dataset.append(
            {
                "intro": intro,
                "documents": document_context,
                "table": table_context,
                "question": question_context
            }
        )
    return formatted_dataset

In [68]:
dataset = datasets.load.load_dataset(
    path="../datasets/hybridqa",
)
encoding = tiktoken.get_encoding("cl100k_base")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [69]:
if not Path("hybridqa-validation-formatted.pkl").is_file():
    formatted_dataset = load_formatted_dataset(dataset['validation'])
    with open("hybridqa-validation-formatted.pkl", "wb") as f:
        pickle.dump(formatted_dataset, f)
else:
    with open("hybridqa-validation-joined.pkl", "r") as f:
        formatted_dataset = pickle.load(f)

  0%|          | 0/3466 [00:00<?, ?it/s]

In [75]:
num_tokens = 0
for item in tqdm(formatted_dataset):
    document_df = pd.DataFrame(item['documents'], columns=["title", "content"])
    num_tokens += len(
        encoding.encode(
            f"""
            {item['intro']}

            Context:\n{item['table'].to_string()}\n{document_df.to_string()}\n
            Question: {item['question']}
            Answer:\n
            """
        )
    )
print(f"{num_tokens / len(dataset['validation'])} average, no truncation")

  0%|          | 0/3466 [00:00<?, ?it/s]

7510.8770917484135 average, no truncation


In [77]:
num_tokens = 0
for item in tqdm(formatted_dataset):
    documents = [(i[0], i[1][:400]) for i in item['documents']]
    document_df = pd.DataFrame(documents, columns=["title", "content"])
    num_tokens += len(
        encoding.encode(
            f"""
            {item['intro']}

            Context:\n{item['table'].to_string()}\n{document_df.to_string()}\n
            Question: {item['question']}
            Answer:\n
            """
        )
    )
print(f"{num_tokens / len(dataset['validation'])} average, 400 character document truncation")

  0%|          | 0/3466 [00:00<?, ?it/s]

3691.7642815926138 average, 400 character document truncation
