In [1]:
# %pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
# %pip install -qU langchain-mistralai
# %pip install -qU langchain-chroma
# %pip install ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Load the API keys from the local .env file
import os
from dotenv import load_dotenv, find_dotenv

# Read variables from a local .env file (if present) into the process environment.
load_dotenv()
# os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
# Fail early with an actionable message. The previous self-assignment
# (os.environ[k] = os.getenv(k)) raised an opaque "str expected, not NoneType"
# TypeError when the key was missing, and did nothing when it was present.
if not os.getenv("MISTRAL_API_KEY"):
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; define it in your .env file or in the environment."
    )
# os.environ["MISTRALAI_API_KEY"] = os.getenv("MISTRAL_API_KEY")
# os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")

In [2]:
import time
import re
from langchain_mistralai import MistralAIEmbeddings
from langchain_chroma import Chroma
import xml.etree.ElementTree as ET
from langchain.text_splitter import RecursiveCharacterTextSplitter
from uuid import uuid4


In [6]:
# Chunking parameters; they are also encoded in the persistence directory name
# so that each (size, overlap) combination gets its own on-disk store.
chunk_size = 2000
chunk_overlap = 100
persist_dir = f"./embed_s{chunk_size}_o{chunk_overlap}"

# Embedding model used both to index chunks and (later) to embed queries.
embeddings = MistralAIEmbeddings(model="mistral-embed")

vector_store = Chroma(
    persist_directory=persist_dir,
    collection_name="statpearls_articles",
    embedding_function=embeddings,
)

# Report how many entries the collection already holds.
stored_count = vector_store._collection.count()
print("Number of stored documents:", stored_count)

Number of stored documents: 0


In [7]:
def preprocess(text: str):
    """Return ``text`` with every non-breaking space (U+00A0) replaced by a regular space."""
    non_breaking_space = "\xa0"
    return re.sub(non_breaking_space, " ", text)


In [8]:
def _paragraph_text(element):
    """Return the text of ``element`` including inline markup, excluding nested ``<p>`` subtrees."""
    parts = [element.text or ""]
    for child in element:
        # Nested <p> elements are yielded separately by findall(".//p"),
        # so skipping them here avoids emitting their text twice.
        if child.tag != "p":
            parts.append(_paragraph_text(child))
        # Text that follows the child's closing tag still belongs to `element`.
        parts.append(child.tail or "")
    return "".join(parts)


def parse_and_add_document(filepath:str):
    """Parse one NXML article and add its sections to the vector store.

    For every ``<sec>`` under the article body, the text of its paragraphs is
    joined, passed through ``preprocess``, split into chunks with the
    module-level ``chunk_size`` / ``chunk_overlap``, and added to the
    module-level ``vector_store`` with one freshly generated UUID per chunk.

    Args:
        filepath: Path to the XML file to ingest.

    Raises:
        ValueError: If the file has no article title or no body element.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(filepath).getroot()
    title_el = root.find("./book-part/book-part-meta/title-group/title")
    body = root.find("./book-part/body")
    if title_el is None or body is None:
        raise ValueError(f"{filepath}: missing article title or body")
    # itertext() keeps titles that contain inline markup (e.g. <italic>).
    title = "".join(title_el.itertext()).strip()

    # The splitter does not depend on the section, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # NOTE(review): ".//sec" also returns nested sections and ".//p" returns the
    # paragraphs of nested sections, so text inside a sub-section is indexed under
    # both the parent and the child section. Confirm whether that is intended.
    for section in body.findall(".//sec"):
        heading = section.find("./title")
        heading_text = "".join(heading.itertext()).strip() if heading is not None else ""
        meta = {
            "book_title": title,
            # Fall back to the sec-type attribute, then to a placeholder, so the
            # metadata value is always a string.
            "section": heading_text or section.get("sec-type", "unknown"),
        }

        # Element.text alone stops at the first child element, which dropped
        # everything after inline markup; collect the full paragraph text instead.
        paragraphs = [
            text
            for text in (_paragraph_text(p).strip() for p in section.findall(".//p"))
            if text
        ]
        if not paragraphs:
            continue

        content = preprocess(' '.join(paragraphs))
        split_texts = text_splitter.split_text(content)
        if not split_texts:
            continue

        vector_store.add_texts(
            texts=split_texts,
            # The keyword is `ids`; the previous `doc_id=` was not a recognised
            # parameter, so the generated UUIDs were never used.
            ids=[str(uuid4()) for _ in split_texts],
            metadatas=[{"chunk_index": idx, **meta} for idx in range(len(split_texts))],
        )
        # Pause between sections to stay under the embedding API rate limit.
        time.sleep(2)

In [9]:
import os

def find_files_with_word(folder_path, word):
    """Return the paths of files in ``folder_path`` whose text contains ``word``.

    The match is case-insensitive. Only regular files directly inside
    ``folder_path`` are inspected; sub-directories are skipped. Files that
    cannot be opened or decoded as UTF-8 are reported on stdout and skipped.

    Args:
        folder_path: Directory to scan (non-recursive).
        word: Word or phrase to look for.

    Returns:
        List of matching file paths, in sorted file-name order.
    """
    needle = word.lower()  # loop-invariant, so lower-case it once
    matching_files = []
    # Sort the entries so the result does not depend on the order os.listdir happens to return.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        # Directories cannot be read as text; skip them instead of reporting a failure.
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().lower()
        except (OSError, UnicodeError) as e:
            print(f"fail {file_path}: {e}")
            continue
        if needle in content:
            matching_files.append(file_path)

    return matching_files

# Directory scanned for article files and the phrase used to select them.
folder = "./data/articles"
word_to_search = "ruptured aneurysm"
# Paths of the files in `folder` whose content contains the phrase (case-insensitive match).
result = find_files_with_word(folder, word_to_search)
# Last expression of the cell: displays how many files matched.
len(result)


31

In [None]:
# Ingest every matching article, printing each path with its position as a progress trace.
for i, file in enumerate(result):
    print(f"{file} {i}")
    parse_and_add_document(file)
# After a complete pass the counter equals the number of files processed.
i = len(result)
i



./data/articles\article-135845.nxml 0


In [17]:
# Re-check the collection size after the ingestion cell (reads Chroma's private `_collection` attribute).
print("Number of stored documents:", vector_store._collection.count())


Number of stored documents: 0
