# Creating contextual data for RAG

##### Set up environment

In [92]:
import json
from datetime import datetime

import requests
from dotenv import load_dotenv

from helper_functions import parsing

# Load settings from a local .env file, if one is present (python-dotenv)
load_dotenv()

True

---

### Load downloaded data

In [5]:
# Read the previously downloaded documents. The encoding is given explicitly
# because open() otherwise falls back to the platform's locale encoding, which
# is not UTF-8 everywhere and would mis-decode non-ASCII text in the JSON file.
with open("data/documents_with_ids.json", encoding="utf-8") as f:
    documents = json.load(f)

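The cell below defines the chunking helpers:
- Split each document at its table markers, so that every table forms a chunk together with the text that precedes it
- If a chunk is longer than 3000 characters, break it at the last sentence that still fits within 3000 characters and start a new chunk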

In [112]:
import re


def split_into_sentences(text):
    """
    Break ``text`` into sentences at full stops.

    A period counts as a sentence boundary only when it is followed by
    whitespace or the end of the string and is not directly preceded by a
    digit. Decimal numbers such as ``3.5`` are therefore left intact; a
    sentence whose last character before the period is a digit is not split
    from the one that follows it.

    Returns the sentences in order, each stripped of surrounding whitespace
    and with its closing period restored. Empty pieces are dropped.
    """
    boundary = re.compile(r"(?<!\d)\.(?=\s|$)")

    # re.split always yields at least one piece; only the final piece was not
    # terminated by a boundary period, so every other piece gets its "." back.
    *terminated, remainder = boundary.split(text)
    pieces = [piece + "." for piece in terminated]
    pieces.append(remainder)

    cleaned = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned


def separate_tables_from_text(document, chunk_size_limit: int = 3000):
    """
    Split a document into chunks around its table markers.

    The document is cut at the ``##TABLE_START`` / ``##TABLE_END`` markers.
    Each table is emitted as one chunk together with the text that precedes
    it. Text that follows the last table (or a document without any table)
    becomes a chunk of its own. A ``##TABLE_START`` seen while a table is
    already open continues that table, a ``##TABLE_END`` without an open table
    is ignored, and a table still open at the end of the document is emitted
    as well. Newlines and non-breaking spaces inside a chunk are replaced with
    regular spaces.

    Chunks longer than ``chunk_size_limit`` characters are re-split on
    sentence boundaries (via ``split_into_sentences``) into sub-chunks that
    stay within the limit. A single sentence that is itself longer than the
    limit is kept whole as its own sub-chunk.

    Args:
        document: Full document text, possibly containing table markers.
        chunk_size_limit: Maximum chunk length in characters.

    Returns:
        List of non-empty chunk strings in document order.
    """
    sections = re.split(r"(##TABLE_START|##TABLE_END)", document)

    def merge(parts):
        # Every section is stripped before it is collected, so the pieces are
        # re-joined with a single space; gluing them directly would fuse the
        # last word of the text with the first word of the table.
        return " ".join(part for part in parts if part)

    chunks = []
    text_parts = []
    table_parts = []
    in_table = False
    for section in sections:
        section = section.strip()
        if section == "##TABLE_START":
            # A repeated start marker continues the table that is already open
            if not in_table:
                in_table = True
                table_parts = []
        elif section == "##TABLE_END":
            # A stray end marker (no open table) is ignored
            if in_table:
                in_table = False
                chunks.append(merge(text_parts + table_parts))
                text_parts = []
                table_parts = []
        elif in_table:
            table_parts.append(section)
        else:
            text_parts.append(section)

    # Flush the remainder: an unterminated table keeps its leading text, and
    # text after the last table (or a table-less document) must not be lost.
    if in_table:
        chunks.append(merge(text_parts + table_parts))
    else:
        chunks.append(merge(text_parts))

    output = []
    for chunk in chunks:
        chunk = chunk.replace("\n", " ")  # Replace newlines with spaces
        chunk = chunk.replace("\xa0", " ")  # Replace non-breaking spaces with regular spaces
        if not chunk:
            continue

        if len(chunk) <= chunk_size_limit:
            output.append(chunk)
            continue

        # Too long: pack whole sentences greedily into sub-chunks
        sub_chunk = ""
        for sentence in split_into_sentences(chunk):
            if not sub_chunk:
                # Never flush an empty sub-chunk, even if this sentence alone is over the limit
                sub_chunk = sentence
            elif len(sub_chunk) + 1 + len(sentence) > chunk_size_limit:
                # +1 accounts for the joining space
                output.append(sub_chunk)
                sub_chunk = sentence
            else:
                sub_chunk += " " + sentence
        if sub_chunk:
            output.append(sub_chunk)

    return output

In [79]:
# Try the chunker on a single document (index 2) to inspect its output
chunks = separate_tables_from_text(documents[2]["text"])

The cell below rebuilds the document list so that each chunk becomes its own record, carrying over the metadata of its source document.

In [106]:
# Flatten every document into one record per chunk, copying the source metadata
new_documents = []
for i in documents:
    chunks = separate_tables_from_text(i["text"])
    for chunk in chunks:
        # Skip empty chunks, as the 500-character cell does. An empty chunk
        # contributes nothing to the id input (chunk[-100:] is ""), so every
        # empty chunk of a document would be written with the same id.
        if not chunk:
            continue
        new_documents.append(
            {
                "company": i["company"],
                "reporting_period": i["reporting_period"],
                "filing_type": i["filing_type"],
                "section": i["section"],
                "text": chunk,
                # The id input is the source document id plus the chunk's last 100 characters
                "id": parsing.generate_document_id(i["id"] + chunk[-100:]),
            }
        )

In [107]:
import json

# Persist the 3000-character chunk records as indented JSON
with open("data/documents_chunked_3000.json", mode="w") as out_file:
    json.dump(new_documents, fp=out_file, indent=2)

In [109]:
# Show the first lines of the file just written to spot-check its structure
!head data/documents_chunked_3000.json

[
  {
    "company": "pltr",
    "reporting_period": "2023-12-31",
    "filing_type": "10k",
    "section": "md_a",
    "text": "ITEM 7. MANAGEMENT\u2019S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS   The following discussion and analysis of our financial condition and results of operations should be read in conjunction with our consolidated financial statements and the accompanying notes thereto included elsewhere in this Annual Report on Form 10-K. This discussion contains forward-looking statements based upon current plans, expectations, and beliefs, involving risks and uncertainties. Our actual results may differ materially from those anticipated in these forward-looking statements. You should review the section titled \u201cSpecial Note Regarding Forward-Looking Statements\u201d for a discussion of forward-looking statements and the section titled \u201cRisk Factors\u201d for a discussion of factors that could cause actual results to differ materially 

----

The cells below repeat the chunking with a smaller limit:
- Chunk out tables (same as above)
- If text is longer than 500 characters, stop at the last sentence before exceeding the 500th character and start a new chunk

In [138]:
# One record per non-empty 500-character chunk, carrying the source document's metadata
new_documents_500 = []
for doc in documents:
    chunks = separate_tables_from_text(doc["text"], chunk_size_limit=500)
    shared_fields = {
        field: doc[field]
        for field in ("company", "reporting_period", "filing_type", "section")
    }
    new_documents_500.extend(
        {
            **shared_fields,
            "text": piece,
            # The index counts empty chunks too, so it matches the chunk's position in `chunks`
            "id": parsing.generate_document_id(str(position) + doc["id"]),
        }
        for position, piece in enumerate(chunks)
        if piece
    )

In [139]:
import json

# Persist the 500-character chunk records as indented JSON
with open("data/documents_chunked_500.json", mode="w") as out_file:
    json.dump(new_documents_500, fp=out_file, indent=2)

In [141]:
# Show the first lines of the file just written to spot-check its structure
!head data/documents_chunked_500.json

[
  {
    "company": "pltr",
    "reporting_period": "2023-12-31",
    "filing_type": "10k",
    "section": "md_a",
    "text": "ITEM 7. MANAGEMENT\u2019S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS   The following discussion and analysis of our financial condition and results of operations should be read in conjunction with our consolidated financial statements and the accompanying notes thereto included elsewhere in this Annual Report on Form 10-K. This discussion contains forward-looking statements based upon current plans, expectations, and beliefs, involving risks and uncertainties.",
    "id": "d404ce24"
  },
  {
