In [1]:
# Input selection. DATA_LOCATION may be any one of:
#   - a folder path,            e.g. "wiki_pages"
#   - a single file path,       e.g. "wiki_pages/wiki_000.jsonl"
#   - a list of file paths      (the form used here)
# The expression below expands to wiki_000.jsonl ... wiki_004.jsonl.
DATA_LOCATION = [
    "wiki_pages/wiki_{:03d}.jsonl".format(file_index) for file_index in range(5)
]

# Folder that receives the converted files.
OUTPUT_FOLDER = "wiki_pages_anserini"

# Link handling for markup of the form [[Page_(disambiguation)|Page Title]]:
#   SIMPLIFY_PAGE_LINKS and not PREFER_URL_FOR_LINKS -> "Page Title"
#   SIMPLIFY_PAGE_LINKS and PREFER_URL_FOR_LINKS     -> "Page (disambiguation)"
# The URL basename sometimes carries extra information, but it can also
# redirect to a page that differs from the visible label.
SIMPLIFY_PAGE_LINKS = False
PREFER_URL_FOR_LINKS = False

# Separators used when flattening structured elements into a single string.
SUBSECTION_SEP = "|"
SUBSUBSECTION_SEP = " "

In [2]:
import glob
import os

# Resolve DATA_LOCATION into the concrete list of input files.
# Reset first so that a failed re-run of this cell cannot leave a stale list
# from an earlier execution in the kernel.
to_process = None

if isinstance(DATA_LOCATION, (list, tuple)):  # A collection of file paths
    to_process = list(DATA_LOCATION)
elif os.path.isfile(DATA_LOCATION):  # A file path
    to_process = [DATA_LOCATION]
elif os.path.isdir(DATA_LOCATION):  # A folder
    # glob returns matches in arbitrary order; sort them so that every run
    # processes the files in the same sequence.
    to_process = sorted(glob.glob(os.path.join(DATA_LOCATION, "*.jsonl")))
else:
    raise ValueError("Data location is not a valid file or folder.")

print("{} file(s) will be processed.".format(len(to_process)))

5 file(s) will be processed.


In [3]:
import json
import re


def clean_value(value):
    """Collapse every run of whitespace in a string into one space.

    Leading and trailing whitespace is dropped; tabs, newlines and similar
    characters count as whitespace.

    Args:
        value (str): The text to normalise.

    Returns:
        str: The text with its whitespace-separated tokens joined by single
        spaces.
    """
    tokens = value.split()  # no-argument split drops empty tokens
    return " ".join(tokens)


def final_clean(
    field,
    simplify_page_links=SIMPLIFY_PAGE_LINKS,
    prefer_url_for_links=PREFER_URL_FOR_LINKS,
):
    """Normalises spacing and optionally simplifies page links in a field.

    Leading and trailing whitespace is stripped and every run of space
    characters is replaced with a single space. Links written as
    ``[[URL_basename|Label]]`` are then simplified according to the
    parameters.

    Args:
        field (str): A field of the processed page data.
        simplify_page_links (Boolean): A Boolean indicating whether the page
            links must be simplified. By default, it is set to
            SIMPLIFY_PAGE_LINKS.
        prefer_url_for_links (Boolean): A Boolean indicating whether the page
            links must be simplified to the URL basename (with underscores
            turned into spaces) instead of the link's label. Only used when
            simplify_page_links is true. By default, it is set to
            PREFER_URL_FOR_LINKS.

    Returns:
        str: The simplified version of the provided string.
    """
    # Only literal spaces are collapsed: tabs and newlines inside the field
    # are left alone.
    field = re.sub(" +", " ", field.strip())

    if not simplify_page_links:
        return field

    if prefer_url_for_links:
        # Replace each complete [[basename|label]] link with its basename.
        # The pattern is anchored on "[[" so that "|" characters outside of
        # a link are never consumed.
        return re.sub(
            r"\[\[(.*?)\|.*?\]\]",
            lambda link: link.group(1).replace("_", " "),
            field,
        )

    # Keep the label: drop the "[[basename|" prefix and the closing "]]".
    return re.sub(r"(\[\[).*?(\|)|]]", "", field)


def _flatten_page(page):
    """Converts one parsed page record into its flattened output form.

    The page's "order" list drives the traversal. Each element is reduced to
    a single string: sentences and section titles are whitespace-cleaned,
    table rows are joined with SUBSUBSECTION_SEP between cells and
    SUBSECTION_SEP between rows (an optional caption is added as a last row),
    and list entries are joined with SUBSECTION_SEP.

    Args:
        page (dict): A page record containing "title", "order" and one entry
            per element named in "order".

    Returns:
        dict: A dictionary with the keys "id" (the page title), "text" (all
        element strings separated by spaces) and "lines" (one
        "<element id>\\t<element string>" line per element), the last two
        passed through final_clean.

    Raises:
        ValueError: If an element name has an unknown prefix.
    """
    text_parts = []
    line_parts = []

    for item in page["order"]:
        if item.startswith("sentence_"):  # Sentence
            element_text = clean_value(page[item])
        elif item.startswith("table_"):  # Table
            table_rows = [
                SUBSUBSECTION_SEP.join(clean_value(cell["value"]) for cell in row)
                for row in page[item]["table"]
            ]
            if "caption" in page[item]:
                table_rows.append(clean_value(page[item]["caption"]))
            element_text = SUBSECTION_SEP.join(table_rows)
        elif item.startswith("section_"):  # Section
            element_text = clean_value(page[item]["value"])
        elif item.startswith("list_"):  # List
            element_text = SUBSECTION_SEP.join(
                clean_value(list_item["value"]) for list_item in page[item]["list"]
            )
        else:  # All alternatives must be handled and the code must not reach here
            raise ValueError("Unidentified page element found.")

        # Every element, lists included, is separated from its neighbours by
        # a space in "text" (the parts are joined with " " below).
        text_parts.append(element_text)
        line_parts.append("{}\t{}\n".format(item, element_text))

    return {
        "id": page["title"],
        "text": final_clean(" ".join(text_parts)),
        "lines": final_clean("".join(line_parts)),
    }


# Creates the output folder (and any missing parents) if it does not exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# NOTE(review): counter is never updated in this cell; kept in case a later
# cell reads it - confirm and remove if unused.
counter = 0
for file_path in to_process:
    output_path = os.path.join(OUTPUT_FOLDER, os.path.basename(file_path))

    # Parse the whole input first so that a malformed line is reported before
    # the output file is created or overwritten.
    with open(file_path, "r", encoding="utf-8") as source_file:
        pages = [json.loads(line) for line in source_file]

    # One JSON document per line; the file is opened once and overwritten.
    with open(output_path, "w", encoding="utf-8") as output_file:
        for page in pages:
            json.dump(_flatten_page(page), output_file, ensure_ascii=False)
            output_file.write("\n")