### Test with advanced chunking

In [57]:
import logging
import re
from tqdm import tqdm
import array
import numpy as np
import pandas as pd
import time

# to generate id from text
import hashlib

import oci

import tokenizers
from tokenizers import Tokenizer
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter

import oracledb
import ads

# This is the wrapper for GenAI Embeddings
from ads.llm import GenerativeAIEmbeddings

from oci_utils import load_oci_config

# this way we don't show & share
from config_private import (
    DB_USER,
    DB_PWD,
    DB_SERVICE,
    DB_HOST_IP,
    COMPARTMENT_OCID,
    ENDPOINT,
)

#
# Configs
#
from config import INPUT_FILES, EMBED_MODEL, EMBEDDINGS_BITS, ID_GEN_METHOD, TOKENIZER

# Batch size to use when creating embeddings in batches
BATCH_SIZE = 20

In [78]:
# PDF file that the cells below load, clean and split into nodes
INPUT_FILE = "covid19_treatment_guidelines.pdf"

In [84]:
def preprocess_text(text):
    """Flatten extracted text onto a single, whitespace-normalised line.

    Steps, applied in this order:
      1. tabs become spaces;
      2. a hyphen directly followed by a newline (optionally preceded by a
         space) is dropped together with the newline, joining the fragments;
      3. remaining newlines become spaces;
      4. every run of whitespace collapses to one space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: tabs are converted first so that "\t-\n" is caught by the
    # " -\n" rule, and hyphen rules must run before newlines are replaced.
    substitutions = (
        ("\t", " "),
        (" -\n", ""),
        ("-\n", ""),
        ("\n", " "),
    )

    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # remove repeated blanks
    return re.sub(r"\s+", " ", cleaned)

In [85]:
# Chunking parameters passed to SentenceSplitter (named so they can be tuned in one place)
CHUNK_SIZE = 512
CHUNK_OVERLAP = 20

# Load the input file as a list of documents
pages = SimpleDirectoryReader(input_files=[INPUT_FILE]).load_data()

# Clean each document's text in place before splitting
for doc in pages:
    doc.text = preprocess_text(doc.text)

node_parser = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split the cleaned documents into nodes (chunks)
nodes = node_parser.get_nodes_from_documents(pages, show_progress=True)

Parsing nodes:   0%|          | 0/469 [00:00<?, ?it/s]

#### Count the number of tokens per node (chunk)

In [86]:
# Load the tokenizer identified by TOKENIZER (from config); used below to count tokens per node
cohere_tokenizer = Tokenizer.from_pretrained(TOKENIZER)

In [90]:
# One entry per node produced by the splitter.
# NOTE: despite its name, list_pages holds the 1-based position of each node
# in `nodes`, not a page number of the source PDF.
list_pages = []
list_tokens = []

# enumerate(..., start=1) replaces the hand-maintained counter
for i, node in enumerate(tqdm(nodes), start=1):
    list_pages.append(i)
    # the length of the encoding is taken as the node's token count
    list_tokens.append(len(cohere_tokenizer.encode(node.text)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 993/993 [00:00<00:00, 1802.75it/s]


In [91]:
# Gather the per-node index and token count into a two-column DataFrame
dict_pages = dict(pages=list_pages, tokens=list_tokens)
df_tokens = pd.DataFrame(data=dict_pages)

In [92]:
# Summary statistics for the node index and token-count columns
df_tokens.describe()

Unnamed: 0,pages,tokens
count,993.0,993.0
mean,497.0,369.526687
std,286.79871,120.007155
min,1.0,26.0
25%,249.0,303.0
50%,497.0,423.0
75%,745.0,460.0
max,993.0,502.0
