In [3]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    'https://www.podatki.gov.pl/pcc-sd/rozliczenie-podatku-pcc-od-kupna-samochodu/',
    'https://www.podatki.gov.pl/pcc-sd/rozliczenie-podatku-pcc-od-pozyczki/',
    'https://www.podatki.gov.pl/pcc-sd/rozliczenie-podatku-pcc-od-innych-czynnosci/',
    'https://www.podatki.gov.pl/pcc-sd/abc-pcc/przedmiot-opodatkowania-pcc/',
    'https://www.podatki.gov.pl/pcc-sd/abc-pcc/zwolnienia-pcc/',
    'https://www.podatki.gov.pl/wyszukiwarki/wyszukiwarka-teleadresowa-jednostek-kas/',
    'https://www.podatki.gov.pl/pcc-sd/abc-pcc/organy-podatkowe-pcc/',
]

loader = WebBaseLoader(urls)
documents = loader.load()


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=30 
)

chunked_docs = []
for doc in documents:
    chunks = text_splitter.split_text(doc.page_content)
    for chunk in chunks:
        chunked_docs.append(Document(page_content=chunk, metadata=doc.metadata))

len(chunked_docs)

255

In [5]:
def preprocess_chunk(chunk):
    chunk.page_content = chunk.page_content.replace('\n', '')
    return chunk


In [6]:
preprocessed_docs = [preprocess_chunk(chunk) for chunk in chunked_docs]

# VECTOR STORE

In [8]:
import time
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

api_key = "c385a16f-bd88-45e7-9cfa-cd44ae86358a"

pc = Pinecone(api_key=api_key)

index_name = "hackyeah-3" 

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

model_name = "sdadas/st-polish-paraphrase-from-distilroberta"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [9]:
vector_store.add_documents(preprocessed_docs)

['f4bbb642-33c4-4bfd-8e62-199cd9032221',
 '9f7a61ce-dc68-4dbb-bf04-4e285e031617',
 '1dcf2eeb-41c1-430b-82da-4325c9a8e2dd',
 'f4390ab4-d86a-4739-8c1b-15d33922d709',
 '1e6d1904-f267-4f75-a813-62fd6e07d176',
 '425ecf7a-b242-42f1-a7e5-d7728a108a01',
 'bfc85c33-2fc6-4b39-b511-fab3f5dceebf',
 'eda22a63-ec9d-4bc5-8469-c2b14d9f5446',
 '00b23b91-a61f-4c9c-aa0a-928f58de19c8',
 'c306fbad-13e8-490e-9ccd-a836c9ddc01b',
 'e33ccb1a-4615-41c7-ad5f-273a109b8cba',
 '1e8fe373-91ff-47af-abb8-6141db625b74',
 '3061678e-3a4f-426c-9df2-f687dbf547a1',
 'd91bd9b6-c1df-41f5-a759-0297eb75ac4b',
 'd6659feb-d8d4-445b-88c3-f05fc53e1168',
 '517dc250-ca44-49ca-80e1-b868c02e5847',
 '70ea62ce-4df9-46e6-9b12-a6950e6ea024',
 'cb1bf2f6-59e4-41cb-a16e-1dbe02af5a67',
 'b6ecebf4-7f45-4205-acad-00253f319991',
 '4e289a5e-7560-4ca2-8bb9-f8851c22829b',
 'dd53d4e9-8192-434f-90d3-e49e5a0e4ecc',
 '19676039-72a8-43dd-82ff-1312b9c45d88',
 'c7fbf7db-3489-4eef-8687-99945a4e19b5',
 '09084b9b-1b24-4e5c-97a1-9efc042bafff',
 '959d9375-2363-