### Parse and Analyze Input Using Fixed-Size Chunking ###
- LlamaIndex Version

In [1]:
from llama_index.core import Settings

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
import tiktoken
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor
from llama_index.postprocessor.cohere_rerank import CohereRerank

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding

from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Project-local helper from config.py; presumably sets the environment
# variables (e.g. API keys) needed by the LLM/embedding clients imported
# above — TODO confirm against config.py.
from config import set_environment
set_environment()

In [3]:
# Input corpus and output locations for the chunking diagnostics.
input_directory = "F:/rag_sdk/datasets/cmp_leave/files/pdf"
#input_file = "F:/rag_sdk/datasets/cmp_leave/files/md/KAI_NW_PLAN.md"
chunked_file = "F:/rag_sdk/evaluations/diagnostics/cmp_leave/CMP_LEAVE_fixed_chunked_LIDX.md"
analysis_file = "F:/rag_sdk/evaluations/diagnostics/cmp_leave/CMP_LEAVE_fixed_chunk_analysis_LIDX.xlsx"

# Chunking parameters handed to SentenceSplitter.
chunk_size = 512
# Overlap is 10% of the chunk size. Integer division keeps it a whole number:
# `0.1 * chunk_size` would yield the float 51.2, which is not a valid value
# for the splitter's integer `chunk_overlap` setting.
chunk_overlap = chunk_size // 10

In [4]:
# Load every file found under the input directory.
reader = SimpleDirectoryReader(input_directory)
documents = reader.load_data()

# Split the loaded documents into nodes using the configured chunk size and overlap.
node_parser = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
nodes = node_parser.get_nodes_from_documents(documents)

# Replace each node's id with a position-based one ("node-0", "node-1", ...).
for position, current_node in enumerate(nodes):
    current_node.id_ = f"node-{position}"

In [5]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens produced by encoding ``string`` with a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the encoded token sequence.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [6]:
# Build one record per chunk, keyed by its zero-padded 1-based sequence number.
documents = {}

for doc_id, chunk in enumerate(nodes, start=1):
    content = chunk.get_content()
    title = str(doc_id).zfill(6)

    # Field order matters: it becomes the column order of the DataFrame
    # later built from this dictionary.
    documents[title] = {
        "doc_id": doc_id,
        "doc_length": len(content),
        "tokens": num_tokens_from_string(content, "cl100k_base"),
        "text": content,
        "title": title,
    }


In [7]:
# Dump every chunk to a Markdown file for manual inspection.
# The encoding is set explicitly: without it open() falls back to the
# platform's locale encoding, which can raise UnicodeEncodeError on
# characters outside that code page.
with open(chunked_file, 'w', encoding='utf-8') as file:
    for document in documents.values():
        file.write(f"## Document - {document['title']}")
        file.write(f"\n**Tokens - {document['tokens']}**")
        file.write(f"\n**Text of this document:**\n\n{document['text']}\n")
      

In [8]:
# Create a DataFrame from the documents dictionary: with orient='index' each
# key of `documents` becomes a row label and each record's fields become columns.
df = pd.DataFrame.from_dict(documents, orient='index')

In [9]:
# Token counts at the 50th, 95th and 99th percentiles across all chunks.
percentile_ranks = [50, 95, 99]
token_count_percentiles = np.percentile(df["tokens"], percentile_ranks)

percentiles_df = pd.DataFrame({
    'Percentile': percentile_ranks,
    'Token Count': token_count_percentiles,
})

# Percentage of chunks whose token count is <= 128, 256 and 512.
thresholds = [128, 256, 512]
tokens = df["tokens"].values
total_tokens = len(tokens)  # one token count per chunk, so this is the chunk count
percentiles = np.array(thresholds)

percentile_values = []
for limit in percentiles:
    chunks_within_limit = np.sum(tokens <= limit)
    percentile_values.append(chunks_within_limit / total_tokens * 100)

percentile_chunks_df = pd.DataFrame({
    'Token Count Threshold': thresholds,
    'Percentage of Chunks': percentile_values,
})

In [10]:
from openpyxl import Workbook
from openpyxl.drawing.image import Image

In [11]:
# Chunk table without the "text" column, so the workbook holds only the
# per-chunk metadata (doc_id, doc_length, tokens, title).
df_excluded = df.drop(columns=["text"])

# Plot a histogram of the values in the "tokens" column and save it as a PNG.
# The file is written before the workbook is opened so it already exists
# when the image is attached below.
plt.figure()
plt.hist(df["tokens"], bins=10, edgecolor='black')
plt.xlabel("Tokens")
plt.ylabel("Frequency")
plt.title("Histogram of Tokens")
plt.savefig('histogram.png')  # Save the histogram to a file
plt.close()

# Use the writer as a context manager: leaving the block saves the workbook
# and closes the underlying file handle. This replaces the private
# ExcelWriter._save() call, which saved the workbook but left the handle open.
with pd.ExcelWriter(analysis_file, engine='openpyxl') as excel_writer:
    # Sheet 1: per-chunk metadata.
    df_excluded.to_excel(excel_writer, sheet_name='Chunks', index=False)

    # Sheet 2: the histogram image, added directly through the openpyxl workbook.
    wb = excel_writer.book
    ws = wb.create_sheet('Histogram')
    img = Image('histogram.png')
    ws.add_image(img, 'A1')

    # Sheet 3: both summary tables on 'Token Data'; the second table starts
    # two rows after the end of the first table's data rows.
    percentiles_df.to_excel(excel_writer, sheet_name='Token Data', index=False, startrow=0)
    percentile_chunks_df.to_excel(
        excel_writer,
        sheet_name='Token Data',
        index=False,
        startrow=len(percentiles_df) + 2,
    )