In [58]:
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
import glob
import os
from tqdm import tqdm
from typing import Generator, Tuple, Iterator

import faiss
import numpy as np
import pickle

In [59]:
def pickle_read(filename: str):
    with open("data.pkl", "rb") as f:
        loaded_data = pickle.load(f)
    return loaded_data
def pickle_write(data, filename: str):
    with open(filename + ".pkl", "wb") as f:
        pickle.dump(data, f)

In [4]:
def index_documents(knowledge_base_path: str = "raw-kb") -> Iterator[Tuple[str, str, int]]:
    for filepath in glob.glob(os.path.join(knowledge_base_path, "*.pdf")):
        filename = os.path.basename(filepath)
        with pdfplumber.open(filepath) as pdf:
            for idx, page in enumerate(pdf.pages):
                yield page.extract_text(), filename, idx

In [51]:
fine_splitter = SentenceTransformersTokenTextSplitter(
    model_name="all-mpnet-base-v2", #"sentence-transformers/all-MiniLM-L6-v2",
    tokens_per_chunk=1024,
    chunk_overlap=50
)

model = fine_splitter._model

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [69]:
try:
    #1/0
    index = faiss.read_index("basic_rag.faiss")
    meta = pickle_read("meta")
    print("Index readed")
except:
    index = faiss.IndexFlatL2(fine_splitter._model[1].word_embedding_dimension)
    meta = {}
    print("Index created")

Index created


In [96]:
%%time
document_index = index.ntotal
meta = {}
for text, filename, page_index in tqdm(index_documents()):
    for chunk in fine_splitter.split_text(text):
        embeddings = model.encode([chunk])
        meta[document_index] = {"filename": filename, "page_index": page_index, "text": chunk}
        index.add(embeddings)
        document_index += 1
faiss.write_index(index, "basic_rag.faiss")
pickle_write(meta, "meta")

20it [00:11,  2.32it/s]Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 

CPU times: user 14min 18s, sys: 4min 33s, total: 18min 52s
Wall time: 3min 13s


In [97]:
chunk

', like search, ads, chrome, cloud, youtube and android. contact investor relations : investor - relations @ abc. xyz media : press @ abc. xyz privacy terms about google google products https : / / abc. xyz / 2025 - 0812 / 1 / 1'

In [98]:
embeddings[0].tolist()[:10]

[0.020318258553743362,
 0.022291693836450577,
 -0.0035374928265810013,
 0.0019163294928148389,
 0.020228112116456032,
 -0.004878796171396971,
 0.0748341977596283,
 -0.0152121651917696,
 0.05519242212176323,
 -0.04612845182418823]

In [99]:
model.encode([chunk])[0].tolist()[:10]

[0.020318258553743362,
 0.022291693836450577,
 -0.0035374928265810013,
 0.0019163294928148389,
 0.020228112116456032,
 -0.004878796171396971,
 0.0748341977596283,
 -0.0152121651917696,
 0.05519242212176323,
 -0.04612845182418823]

In [111]:
scores, indexies = index.search(model.encode(["Philipp Schindler"]), k=10)
print(indexies[0])
#scores, indexies = index.search(embeddings, k=10)

[2014 1044 1271 1661 1250 1313  113  114  115 1145]


In [112]:
for score, idx in list(zip(scores[0], indexies[0])):
    if not idx in ii and idx > 1000:
        print(score, idx, meta[idx])    

1.6580353 1271 {'filename': '2024-q2-earnings-transcript.pdf', 'page_index': 11, 'text': ". to prevent any background noise, we ask that you please mute your line once your question has been stated. your first question comes from brian nowak with morgan stanley. your line is now open. brian nowak ( morgan stanley ) : thanks for taking my questions. first, thank you ruth for all of the help and significant impact over the past decade. the first one, it ' s a little bit of a jump ball, i guess, for sundar, philipp or ruth. i guess we ' re sort 12"}
1.7279496 1250 {'filename': '2024-q2-earnings-transcript.pdf', 'page_index': 4, 'text': "s seat. before i close, i want to acknowledge that today is ruth ' s final earnings call. let me take a moment to thank her for all she has done for google and alphabet as our longest - serving cfo. i ' m excited to continue to work with her in her new role. and i look forward to welcoming our newly appointed cfo, anat ashkenazi. she starts next week, and 

In [114]:
ii = []
for idx, m in meta.items():
    if 'krisp' in m['text'].lower():
        print(m)