### Parse and Analyze Markdown Input ###

In [1]:
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
import tiktoken
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append("..")

from config import set_environment
set_environment()

In [3]:
input_directory = "F:/rag_sdk/datasets/cmp_ka/files/md/"
chunked_file = "F:/rag_sdk/evaluations/diagnostics/cmp_ka/CMP_KA_chunked.md"
analysis_file = "F:/rag_sdk/evaluations/diagnostics/cmp_ka/CMP_KA_chunk_analysis.xlsx"

In [4]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

#loader = TextLoader(input_file)

loader = DirectoryLoader(input_directory, glob="**/*.md", loader_cls=TextLoader)
text_data = loader.load()
len(text_data)

41

In [5]:
page_contents = [item.page_content for item in text_data]
text_concatenated = "\n\n ".join(page_contents)

In [6]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("####", "Header 5")
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(text_concatenated)

In [7]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [8]:
import re

def replace_title_in_text(text, title):
    # Create a regex pattern to match one or more # followed by the title
    pattern = re.compile(rf'#+\s+{re.escape(title)}')
    # Replace all matches with the title
    result = pattern.sub(title, text)
    return result

In [9]:
# Initialize the documents dictionary
documents = {}
i = 0

# Populate the data
for i, chunk in enumerate(md_header_splits):
    
    doc_id = i + 1
    doc_length = len(chunk.page_content)
    tokens = num_tokens_from_string(chunk.page_content,"cl100k_base")
    
    header_1 = chunk.metadata.get("Header 1", "")
    header_2 = chunk.metadata.get("Header 2", "")
    header_3 = chunk.metadata.get("Header 3", "")
    header_4 = chunk.metadata.get("Header 4", "")
    header_5 = chunk.metadata.get("Header 5", "")

    headers = [header_1, header_2, header_3, header_4, header_5]
    parents = []

    for header in headers:
        if header == "":
            break
        parents.append(header)
    
    # Identify the title as the last non-empty header
    title = parents[-1] if parents else "Untitled"
    text = replace_title_in_text(chunk.page_content, title)
   
    
    # Create the document dictionary
    document = {
        "doc_id": doc_id,
        "doc_length": doc_length,
        "tokens": tokens,
        "header_1": header_1,
        "header_2": header_2,
        "header_3": header_3,
        "header_4": header_4,
        "header_5": header_5,
        "text": text,
        "parents": parents,
        "title": title
    }
    
    # Add the document to the documents dictionary
    documents[title] = document


In [10]:
with open(chunked_file, 'w') as file:
   for title, document in documents.items():
      file.write(f"## Document - {document['doc_id']}")
      file.write(f"\n**Tokens - {document['tokens']}**")
      file.write(f"\n**Text of this document:**\n\n{document['text']}")
      file.write(f"\n\n**Title of this document:**\n{document['title']}")
      file.write(f"\n\n**This document is contained under the following titles:**\n{','.join(document['parents'])}\n")

In [11]:
# Create a DataFrame from the documents dictionary
df = pd.DataFrame.from_dict(documents, orient='index')

In [12]:
token_count_percentiles = np.percentile(df["tokens"], [50, 95, 99])

# Creating the first DataFrame for token count percentiles
percentiles_df = pd.DataFrame({
    'Percentile': [50, 95, 99],
    'Token Count': token_count_percentiles
})

# Calculating the percentage of chunks with token count <= 128, 256, 512
total_tokens = len(df["tokens"])
tokens = df["tokens"].values
percentiles = np.array([128, 256, 512])
percentile_values = [(np.sum(tokens <= p) / total_tokens * 100) for p in percentiles]

# Creating the second DataFrame for percentage of chunks with token count <= 128, 256, 512
percentile_chunks_df = pd.DataFrame({
    'Token Count Threshold': [128, 256, 512],
    'Percentage of Chunks': percentile_values
})

# Displaying the DataFrames
print(percentiles_df)
print(percentile_chunks_df)

   Percentile  Token Count
0          50       139.00
1          95       579.90
2          99       964.92
   Token Count Threshold  Percentage of Chunks
0                    128             45.161290
1                    256             80.645161
2                    512             94.044665


In [13]:
from openpyxl import Workbook
from openpyxl.drawing.image import Image

In [14]:
# Create a new Excel writer object
excel_writer = pd.ExcelWriter(analysis_file, engine='openpyxl')

# Exclude the "content" field
df_excluded = df.drop(columns=["text"])

df_excluded.to_excel(excel_writer, sheet_name='Chunks', index=False)


# Plotting a histogram of the values in the "tokens" column
plt.figure()
plt.hist(df["tokens"], bins=10, edgecolor='black')
plt.xlabel("Tokens")
plt.ylabel("Frequency")
plt.title("Histogram of Tokens")
plt.savefig('histogram.png')  # Save the histogram to a file
plt.close()

# Open the workbook and add a new sheet for the histogram
wb = excel_writer.book
ws = wb.create_sheet('Histogram')

# Insert the image into the histogram sheet
img = Image('histogram.png')
ws.add_image(img, 'A1')

# Write the percentiles to the 'Token Data' sheet

percentiles_df.to_excel(excel_writer, sheet_name='Token Data', index=False, startrow=0)
percentile_chunks_df.to_excel(excel_writer, sheet_name='Token Data', index=False, startrow=len(percentiles_df) + 2)


# Save the Excel file
excel_writer._save()  # Correct method to save the file