In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import faiss
from langchain_chroma import Chroma
from uuid import uuid4
from langchain_core.documents import Document



In [2]:
def tokenize_text(text):
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
    )
    texts = []
    for page in text:
        page_texts = text_splitter.split_text(page.page_content)
        texts.extend(page_texts)
    
    return texts
def load_pdf_with_langchain(pdf_path):
    loader = PyPDFLoader(pdf_path)
    document = loader.load()
    return document

pdf= load_pdf_with_langchain('Data/2023 EU-wide stress test - Methodological Note.pdf')
texts= tokenize_text(pdf)



id= 0
documents_1= []
for i in texts:
    document= Document(
        page_content= i,
        metadata={"source": "Stress Test"},
       )
    id += 1
    documents_1.append(document)
        


# pdf_2= load_pdf_with_langchain('Data/sample.pdf')
# texts_2= tokenize_text(pdf_2)

# concatenated_text_2 = "\n\n".join(texts_2)

# # Create a single Document from the concatenated text
# document_2 = Document(
#     page_content= concatenated_text_2,
#     metadata={"source": "sample"},
#     id=2,
# )



In [3]:
embeddings_model= HuggingFaceEmbeddings(model_name= 'sentence-transformers/all-MiniLM-L6-v2')
vector_store = Chroma(
    collection_name="stress_test",
    embedding_function=embeddings_model,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not neccesary
)

  from tqdm.autonotebook import tqdm, trange


In [4]:
uuids = [str(uuid4()) for _ in range(len(documents_1))]

vector_store.add_documents(documents=documents_1, ids=uuids)

['67d7d2cb-f734-4036-8c43-1bbb707eef07',
 'c1085f72-6293-4616-8665-9acf54bee8d3',
 '49e8f145-8725-41c1-b0a3-263b918f36dd',
 'a7a69aca-7e81-4e01-a4cf-4e1025fdc3ae',
 'f5fcd1b2-4e97-4f1f-8c36-b26e8378b59e',
 '496560fd-5e6c-4347-90eb-ddda59928734',
 '72e136e4-a60f-49c4-bbaa-8df934f059ff',
 '8ec5a080-5a31-41eb-a65c-70cedecce394',
 '5bc2b42c-9422-474e-a335-3f55106986df',
 '5d01b9b7-e708-4acd-94ff-034a5ceba2fa',
 '7330807d-a4b8-4676-a5b2-ddf85513434a',
 '941b705c-5ea9-4b0d-8dbe-1c2c766f36a7',
 'a8c3601f-b092-4af3-9609-5babf4b38ffc',
 '703a3636-de0c-4397-b5a9-62348cd18c44',
 'cf213b90-ced0-4380-9706-769c93aa5a12',
 '28761a1d-ab07-429e-b4e2-3a3e6e059158',
 '459fa09b-ea7f-49e3-ae74-dcbf65f88d07',
 'edc27fa0-d0a3-4038-8245-0f917efe675a',
 '07cf62e2-39ea-4455-bbc4-5294e2d95a77',
 '21de25fc-9f09-4f31-b638-059a0b83a9f9',
 '58ec4224-4af3-4042-9a39-75a999d6f1e3',
 'd79f8816-67f7-4e52-a629-727173d49952',
 'dc1ed39f-213d-497d-b5d9-a7db85901d3d',
 '9e9f68a7-b451-4370-b019-448ad5840cdb',
 '4cd1759f-8cad-

In [18]:
results = vector_store.similarity_search_with_score(
    "What is Location and country of residence of the counterparty", k=10, filter={"source": "Stress Test"}
)


In [19]:
for i in results:
    print (i)

(Document(metadata={'source': 'Stress Test'}, page_content='2023 EU-WIDE STRESS TEST  – METHODOLOGICAL NOTE   \n \n \n45 \n \ncounterparty.32 For the starting points, this breakdown follows NACE sections (1 -digit) with \nsome targeted o/w positions at NACE division level (2 -digit), while projections should be \nprovided for NACE sections as well as for two sub -components of the manufacturing sector \n(NACE section C).  \n117. Additional data will be collected in the CSV_CR_COVID19 template for the sub -portfolios of \nexposures subject to COVID -19 PGS. The template guidance includes specific instructions for \nthe report of this information. Banks are required to provide information in the explanatory \nnote regarding the exposures reported in the template CSV_CR_COVID19 that are treated \nunder the securitisation framework.  \n118. In the template CSV_CR_COVID19, the breakdown by country of the counterparty will only be \nreported for countries where exposures under PGS are materi