### Test with advanced chunking

In [9]:
import logging
import re
from tqdm import tqdm
import array
import numpy as np
import pandas as pd
import time

# to generate id from text
import hashlib

import oci

import tokenizers
from tokenizers import Tokenizer
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter

import oracledb
import ads

# This is the wrapper for GenAI Embeddings
from ads.llm import GenerativeAIEmbeddings

from oci_utils import load_oci_config

# credentials and endpoints are kept in config_private so they are not shown or shared with the notebook
from config_private import (
    DB_USER,
    DB_PWD,
    DB_SERVICE,
    DB_HOST_IP,
    COMPARTMENT_OCID,
    ENDPOINT,
)

#
# Configs
#
from config import (
    INPUT_FILES,
    EMBED_MODEL,
    EMBEDDINGS_BITS,
    ID_GEN_METHOD,
    TOKENIZER,
    MAX_CHUNK_SIZE,
)

# batch size to use when creating embeddings in batches
# NOTE(review): not referenced in the cells visible here; presumably consumed further down, confirm
BATCH_SIZE = 20

In [10]:
# file that the cells below load and split into chunks
# NOTE(review): INPUT_FILES is also imported from config but is not used in the cells visible here
INPUT_FILE = "high-availability-23c.pdf"

In [11]:
def preprocess_text(text):
    """Clean raw extracted text so it reads as one continuous line.

    The following substitutions are applied, in this order:
      1. tabs become single blanks;
      2. a hyphen at the end of a line (optionally preceded by a blank) is
         dropped together with the newline, gluing the two halves of a word
         that was split across lines;
      3. remaining newlines become blanks;
      4. every run of whitespace is collapsed into a single blank.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Leading/trailing blanks are NOT stripped.
    """
    # the order is significant: tabs must be turned into blanks before the
    # " -\n" pattern is searched, and hyphenated line breaks must be handled
    # before plain newlines are replaced
    ordered_substitutions = (
        ("\t", " "),
        (" -\n", ""),
        ("-\n", ""),
        ("\n", " "),
    )
    for old, new in ordered_substitutions:
        text = text.replace(old, new)

    # remove repeated blanks
    return re.sub(r"\s+", " ", text)

In [12]:
# load the input file into a list of llama_index documents
# (presumably one document per PDF page, hence the name; TODO confirm)
pages = SimpleDirectoryReader(input_files=[INPUT_FILE]).load_data()

# clean the text of every document in place before splitting
for doc in pages:
    doc.text = preprocess_text(doc.text)

# splitter configured with MAX_CHUNK_SIZE (from config) as chunk size and a chunk_overlap of 20
node_parser = SentenceSplitter(chunk_size=MAX_CHUNK_SIZE, chunk_overlap=20)

# split the cleaned documents into nodes (chunks); a progress bar is shown while parsing
nodes = node_parser.get_nodes_from_documents(pages, show_progress=True)

Parsing nodes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 561/561 [00:00<00:00, 910.47it/s]


#### Count the number of tokens per chunk

In [13]:
# load the tokenizer identified by TOKENIZER (from config); it is used below to count tokens per chunk
cohere_tokenizer = Tokenizer.from_pretrained(TOKENIZER)

In [14]:
# For every node produced by the splitter record:
#   - list_pages:  its 1-based position in `nodes` (despite the name this is a
#                  chunk index, not a PDF page number)
#   - list_tokens: the number of tokens the tokenizer produces for its text
list_pages = []
list_tokens = []

# enumerate(..., start=1) replaces the hand-maintained counter; tqdm wraps the
# list itself so the progress bar still knows the total
for i, node in enumerate(tqdm(nodes), start=1):
    list_pages.append(i)
    list_tokens.append(len(cohere_tokenizer.encode(node.text)))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1003/1003 [00:00<00:00, 2816.73it/s]


In [15]:
# column-oriented data: one entry per chunk, with its index and its token count
dict_pages = dict(pages=list_pages, tokens=list_tokens)

# tabular view used for the statistics below
df_tokens = pd.DataFrame(data=dict_pages)

In [16]:
# summary statistics (count, mean, std, min, quartiles, max) for the chunk index and token count columns
df_tokens.describe()

Unnamed: 0,pages,tokens
count,1003.0,1003.0
mean,502.0,263.057827
std,289.685462,110.712187
min,1.0,15.0
25%,251.5,170.0
50%,502.0,299.0
75%,752.5,358.0
max,1003.0,489.0
