In [1]:
import os
from chunker import PDFChunker
from vectorizer import TextVectorizer
from retriever import Retriever
from tqdm import tqdm
import time
import numpy as np

# Chunker

In [2]:
# Source PDF that the rest of the notebook chunks and indexes.
pdf_path = '../data/PDM_Porto_Aviso n.º 1934_2023.pdf'
chunker = PDFChunker(pdf_path)

In [3]:
# Split the PDF into sections keyed on the 'Artigo' pattern
# (presumably one chunk per article heading — confirm in PDFChunker).
section_pattern = 'Artigo'
chunks = chunker.split_pdf_by_sections(pattern=section_pattern)

In [4]:
# Number of chunks returned by split_pdf_by_sections.
len(chunks)

171

In [5]:
# Print the index of every chunk longer than 4096 whitespace-separated words
# (presumably the embedding input limit — TODO confirm).
for i, chunk in enumerate(chunks):
    word_count = len(chunk.split())
    if word_count <= 4096:
        continue
    print(i)

170


In [6]:
# Persist the chunks to a text file alongside the source data.
chunks_path = '../data/artigos.txt'
chunker.save_chunks_to_file(chunks, chunks_path)

Chunks saved to ../data/artigos.txt


# Vectorizer

In [6]:
# Create the TextVectorizer with the key read from the MISTRAL_API_KEY
# environment variable. os.getenv returns None when the variable is unset;
# NOTE(review): confirm how TextVectorizer behaves with a missing key.
vectorizer = TextVectorizer(api_key=os.getenv('MISTRAL_API_KEY'))

In [9]:
# The free API tier has a very low rate limit, so pause for two seconds
# after each embedding request in the loop below.
# chunks[:-1] leaves out the final chunk, which the scan above reported as
# the only one longer than 4096 words.
embeddings = []

for chunk in tqdm(chunks[:-1]):
    vector = vectorizer.get_text_embedding(chunk)
    embeddings.append(vector)
    time.sleep(2)

100%|██████████| 170/170 [06:18<00:00,  2.23s/it]


In [12]:
# Stack the per-chunk embeddings into one array and write them to the index file.
embedding_matrix = np.array(embeddings)
vectorizer.save_embeddings_to_db(embedding_matrix, '../data/artigos_embeddings.faiss')

# Retriever

In [7]:
# Build the retriever from the saved embeddings file and the same vectorizer.
index_path = '../data/artigos_embeddings.faiss'
retriever = Retriever(index_path, vectorizer)

In [11]:
# Ask the retriever for k=5 chunks matching the query.
# NOTE(review): the embeddings were built from chunks[:-1] while the full
# `chunks` list is passed here — confirm the retriever maps positions as expected.
query = 'Corredor verde'
relevant_chunks = retriever.retrieve(query, chunks, k=5)

In [12]:
# Total whitespace-separated word count across the retrieved chunks.
sum(len(text.split()) for text in relevant_chunks)

792