# Creating contextual data for RAG

##### Set up environment

In [92]:
import json
from datetime import datetime

import requests
from dotenv import load_dotenv

from helper_functions import parsing

load_dotenv()

True

---

### Load downloaded data

In [5]:
# Load the previously downloaded documents. Later cells read the "text",
# "company", "reporting_period", "filing_type", "section" and "id" keys of each
# entry. The encoding is pinned so that parsing does not depend on the platform's
# locale default.
with open("data/documents_with_ids.json", encoding="utf-8") as f:
    documents = json.load(f)

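The cell below defines the chunking helpers:
- Each table (the content between `##TABLE_START` and `##TABLE_END`), together with the text that precedes it, becomes one chunk
- If a chunk is longer than 3000 characters, it is cut at the last sentence boundary before the 3000-character limit and continued in a new chunk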

In [142]:
import re


def split_into_sentences(text):
    """
    Break ``text`` into sentences, keeping the terminating period on each one.

    A period counts as a sentence end only when it is followed by whitespace or
    the end of the string and is not directly preceded by a digit, so decimals
    such as ``3.5`` stay intact. Surrounding whitespace is trimmed and empty
    pieces are discarded.
    """
    fragments = re.split(r"(?<!\d)\.(?=\s|$)", text)

    # re.split consumes the periods: every fragment except the final one was
    # terminated by a period, so put it back on those.
    *terminated, remainder = fragments
    candidates = [fragment + "." for fragment in terminated]
    candidates.append(remainder)

    result = []
    for candidate in candidates:
        cleaned = candidate.strip()
        if cleaned:
            result.append(cleaned)
    return result


def separate_tables_from_text(document: str, chunk_size_limit: int = 3000) -> list:
    """
    Split a document into chunks built around its ``##TABLE_START`` /
    ``##TABLE_END`` markers and cap the chunk length at sentence boundaries.

    Each table is emitted as one chunk, prefixed with the text that precedes it.
    Text that follows the last table, or a document without any table, is
    emitted as a chunk of its own instead of being discarded. A table that is
    never closed is flushed at the end of the document.

    Newlines and non-breaking spaces are replaced with plain spaces. Chunks
    longer than ``chunk_size_limit`` are re-packed sentence by sentence so that
    each sub-chunk stays within the limit; a single sentence that is itself
    longer than the limit is kept whole. Empty chunks are never returned.

    Args:
        document: Raw document text, optionally containing table markers.
        chunk_size_limit: Maximum chunk length in characters.

    Returns:
        List of chunk strings in document order.
    """
    sections = re.split(r"(##TABLE_START|##TABLE_END)", document)
    chunks = []
    current_table = []
    last_text = ""
    in_table = False
    for section in sections:
        section = section.strip()
        if section == "##TABLE_START":
            # A start marker inside an open table continues that table.
            if not in_table:
                in_table = True
                current_table = []
        elif section == "##TABLE_END":
            # An end marker without a matching start is ignored.
            if in_table:
                in_table = False
                # NOTE(review): text and table are joined without a separator.
                chunks.append(last_text.strip() + "".join(current_table))
                last_text = ""
        elif in_table:
            current_table.append(section)
        else:
            last_text += section

    if in_table:
        # Unterminated table: flush what was collected so far.
        chunks.append(last_text.strip() + "".join(current_table))
    elif last_text.strip():
        # Trailing text after the last table (or a document with no tables).
        chunks.append(last_text.strip())

    output = []
    for chunk in chunks:
        chunk = chunk.replace("\n", " ")  # Replace newlines with spaces
        chunk = chunk.replace("\xa0", " ")  # Replace non-breaking spaces

        if not chunk.strip():
            continue

        if len(chunk) <= chunk_size_limit:
            output.append(chunk)
            continue

        # Too long: pack whole sentences greedily, counting the joining space.
        packed = []
        packed_length = 0
        for sentence in split_into_sentences(chunk):
            projected = packed_length + len(sentence) + (1 if packed else 0)
            if packed and projected > chunk_size_limit:
                output.append(" ".join(packed))
                packed = [sentence]
                packed_length = len(sentence)
            else:
                packed.append(sentence)
                packed_length = projected
        if packed:
            output.append(" ".join(packed))

    return output

In [143]:
# Smoke test: run the chunker on a single document (index 2) with the default
# chunk size limit so the resulting chunks can be inspected.
chunks = separate_tables_from_text(documents[2]["text"])

Below, the documents are re-organized into one record per newly created chunk; each record keeps the metadata of the document it came from.

In [144]:
# Flatten the corpus into one record per chunk. Every record carries the
# metadata of its source document; the id is derived from the source id plus
# the last 100 characters of the chunk.
new_documents = []
for source_doc in documents:
    chunks = separate_tables_from_text(source_doc["text"])
    new_documents.extend(
        {
            "company": source_doc["company"],
            "reporting_period": source_doc["reporting_period"],
            "filing_type": source_doc["filing_type"],
            "section": source_doc["section"],
            "text": chunk,
            "id": parsing.generate_document_id(source_doc["id"] + chunk[-100:]),
        }
        for chunk in chunks
    )

In [145]:
import json

# Persist the 3000-character chunks. The encoding is pinned so the file does
# not depend on the platform's locale default.
with open("data/documents_chunked_3000.json", "w", encoding="utf-8") as f:
    json.dump(new_documents, f, indent=2)

In [146]:
!head data/documents_chunked_3000.json

[
  {
    "company": "pltr",
    "reporting_period": "2023-12-31",
    "filing_type": "10k",
    "section": "md_a",
    "text": "ITEM 7. MANAGEMENT\u2019S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS   The following discussion and analysis of our financial condition and results of operations should be read in conjunction with our consolidated financial statements and the accompanying notes thereto included elsewhere in this Annual Report on Form 10-K. This discussion contains forward-looking statements based upon current plans, expectations, and beliefs, involving risks and uncertainties. Our actual results may differ materially from those anticipated in these forward-looking statements. You should review the section titled \u201cSpecial Note Regarding Forward-Looking Statements\u201d for a discussion of forward-looking statements and the section titled \u201cRisk Factors\u201d for a discussion of factors that could cause actual results to differ materially 

----

Here below:
- Chunk out tables (same as above)
- If text is longer than 500 characters, stop at the last sentence before exceeding the 500th character and start a new chunk

In [138]:
# Same flattening as for the 3000-character chunks, but with a 500-character
# limit. Empty chunks are skipped and the id is derived from the chunk's
# position within its document followed by the document id.
new_documents_500 = []
for source_doc in documents:
    chunks = separate_tables_from_text(source_doc["text"], chunk_size_limit=500)
    for position, chunk in enumerate(chunks):
        if not chunk:
            continue
        record = {
            "company": source_doc["company"],
            "reporting_period": source_doc["reporting_period"],
            "filing_type": source_doc["filing_type"],
            "section": source_doc["section"],
            "text": chunk,
            "id": parsing.generate_document_id(str(position) + source_doc["id"]),
        }
        new_documents_500.append(record)

In [139]:
import json

# Persist the 500-character chunks. The encoding is pinned so the file does
# not depend on the platform's locale default.
with open("data/documents_chunked_500.json", "w", encoding="utf-8") as f:
    json.dump(new_documents_500, f, indent=2)

In [141]:
!head data/documents_chunked_500.json

[
  {
    "company": "pltr",
    "reporting_period": "2023-12-31",
    "filing_type": "10k",
    "section": "md_a",
    "text": "ITEM 7. MANAGEMENT\u2019S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS   The following discussion and analysis of our financial condition and results of operations should be read in conjunction with our consolidated financial statements and the accompanying notes thereto included elsewhere in this Annual Report on Form 10-K. This discussion contains forward-looking statements based upon current plans, expectations, and beliefs, involving risks and uncertainties.",
    "id": "d404ce24"
  },
  {


----

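Here below:
- Remove the table markers and split the text into sentences
- Build overlapping chunks with a sliding window of 5 consecutive sentences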

In [None]:
# Scratch check on one document: split on the table markers, then discard the
# markers themselves and any section that is exactly a blank line ("\n\n").
tags = ["##TABLE_START", "##TABLE_END", "\n\n"]
ss = list(
    filter(
        lambda section: section not in tags,
        re.split(r"(##TABLE_START|##TABLE_END)", documents[2]["text"]),
    )
)
# Possible follow-up: [i.replace("\n\n", "") for i in ss]

In [245]:
def split_into_sentences_by_period(text):
    """
    Split ``text`` into sentences at periods, protecting common abbreviations.

    A space followed by a blank line (``" \\n\\n"``) is treated as a sentence
    boundary. Periods that belong to a listed abbreviation (``Inc.``, ``e.g.``,
    ``U.S.``, ...) are shielded from the split, and a period directly preceded
    by a digit or not followed by whitespace / end of string (as in ``3.5``) is
    never a boundary.

    Args:
        text: Text to split.

    Returns:
        List of sentences with surrounding whitespace removed and WITHOUT their
        terminating period; empty pieces are dropped.
    """
    # NUL is used as the stand-in for protected periods. A printable marker
    # such as "|||" can occur in the text itself (e.g. pipe-delimited tables)
    # and would then be rewritten to "." when the periods are restored.
    placeholder = "\x00"

    text = text.replace(" \n\n", ". ")

    # \b keeps the abbreviations from matching the tail of a longer word
    # (e.g. "Ms." inside "OEMs."). " v" and " al" carry their own leading space.
    abbreviations = re.compile(
        r"(?:\b(?:Inc|Ltd|Dr|Mr|Ms|Mrs|Jr|Sr|No|e\.g|U\.S|U\.K|PCAOB|BOD|etc)"
        r"| v| al)\."
    )

    # Shield every period inside an abbreviation match, including inner ones
    # such as the first period of "e.g.".
    text = abbreviations.sub(lambda m: m.group(0).replace(".", placeholder), text)

    # A sentence-ending period is not preceded by a digit and is followed by
    # whitespace or the end of the string; that lookahead also rules out a
    # digit right after the period, so decimals are left alone.
    sentence_endings = re.compile(r"(?<!\d)\.(?=\s|$)")
    sentences = sentence_endings.split(text)

    stripped = []
    for sentence in sentences:
        restored = sentence.replace(placeholder, ".").strip()
        if restored:
            stripped.append(restored)

    return stripped

In [246]:
import itertools


def create_overlapping_chunks(text: str, overlap: int = 5) -> list:
    """
    Build sliding-window chunks of consecutive sentences.

    The table markers are removed, every remaining section is split into
    sentences with ``split_into_sentences_by_period``, and a window of
    ``overlap`` sentences is moved over the result one sentence at a time.
    Within a chunk each sentence is followed by ``". "``.

    Every sentence, including the last one, ends up in at least one chunk. A
    text with fewer than ``overlap`` sentences yields a single chunk holding
    all of them.

    Args:
        text: Document text, optionally containing ``##TABLE_START`` /
            ``##TABLE_END`` markers.
        overlap: Number of sentences per chunk (the window size); consecutive
            chunks share ``overlap - 1`` sentences.

    Returns:
        List of chunk strings; empty when the text contains no sentences.

    Raises:
        ValueError: If ``overlap`` is smaller than 1.
    """
    if overlap < 1:
        raise ValueError("overlap must be at least 1")

    tags = ("##TABLE_START", "##TABLE_END", "\n\n")
    tag_splits = [
        part for part in re.split(r"(##TABLE_START|##TABLE_END)", text)
        if part not in tags
    ]
    sentences = list(
        itertools.chain.from_iterable(
            split_into_sentences_by_period(part) for part in tag_splits
        )
    )
    if not sentences:
        return []

    # The last window starts at len - overlap (inclusive), or at 0 when there
    # are fewer sentences than the window size.
    last_start = max(len(sentences) - overlap, 0)
    return [
        "".join(sentence + ". " for sentence in sentences[start : start + overlap])
        for start in range(last_start + 1)
    ]

In [249]:
# Smoke test: build sliding-window chunks of 5 sentences for a single
# document (index 2) so the result can be inspected.
test_doc = create_overlapping_chunks(documents[2]["text"], overlap=5)

In [251]:
# One record per 5-sentence sliding-window chunk. Empty chunks are skipped and
# the id is derived from the chunk's position within its document followed by
# the document id.
new_documents_5s = []
for source_doc in documents:
    chunks = create_overlapping_chunks(source_doc["text"], overlap=5)
    for position, chunk in enumerate(chunks):
        if not chunk:
            continue
        new_documents_5s.append(
            dict(
                company=source_doc["company"],
                reporting_period=source_doc["reporting_period"],
                filing_type=source_doc["filing_type"],
                section=source_doc["section"],
                text=chunk,
                id=parsing.generate_document_id(str(position) + source_doc["id"]),
            )
        )

In [256]:
# Persist the 5-sentence sliding-window chunks. The encoding is pinned so the
# file does not depend on the platform's locale default.
with open("data/documents_chunked_5s.json", "w", encoding="utf-8") as f:
    json.dump(new_documents_5s, f, indent=2)

---

In [215]:
# Locate a known sentence inside document 2. `str.find` returns the character
# offset of the first match, or -1 when the sentence is not present.
idx = documents[2]["text"].find(
    "To date, the Company has not been required to make any payment resulting from"
)

In [None]:
# Scratch strings, presumably excerpts used to experiment with date/number
# formatting (judging by the variable name). Each assignment overwrites the
# previous one, so only the last string stays bound to the name.
date_formatting_example = "December 31, 2023, 2022, and 2021 was $ 526.1 million, $ 453.2 million, and $ 421.0 million, respectively"  # noqa: E501
date_formatting_example = "As of December 31, 2023 As of December 31, 2022 Authorized Issued and Outstanding Authorized Issued and Outstanding Class A Common Stock 20,000,000 \xa0 2,096,982 \xa0 20,000,000 \xa0 1,995,414 \xa0 Class B Common Stock 2,700,000 \xa0 102,141 \xa0 2,700,000 \xa0 102,656 \xa0 Class F Common Stock 1,005 \xa0 1,005 \xa0 1,005 \xa0 1,005 \xa0 Total 22,701,005 \xa0 2,200,128 \xa0 22,701,005 \xa0 2,099,075'"  # noqa: E501
date_formatting_example = "'Years Ended December 31, 2023 2022 Cash paid for operating lease liabilities $ 63,374 \xa0 $ 53,772 ,Lease liabilities arising from obtaining right-of-use assets,'$ 28,112 \xa0 $ 28,169'"  # noqa: E501