1. Extracting text

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup
import torch

In [4]:
! git clone https://github.com/naomi-helmbold/deeplearningproject.git #to load the data on google colab

fatal: destination path 'deeplearningproject' already exists and is not an empty directory.


In [5]:
files = glob.glob('deeplearningproject/data/*.html')

['deeplearningproject/data/Notice patient - EFFERALGAN 1000 mg, comprimé effervescent - Base de données publique des médicaments.html', 'deeplearningproject/data/Notice patient - ISIMIG 2,5 mg, comprimé pelliculé - Base de données publique des médicaments.html', 'deeplearningproject/data/Fiche info - SPASFON LYOC 160 mg, lyophilisat oral - Base de données publique des médicaments.html', 'deeplearningproject/data/Résumé des caractéristiques du produit - AUGMENTIN 1 g_125 mg, poudre pour suspension buvable en sachet-dose (rapport amoxicilline_acide clavulanique _ 8_1) - Base de données publique des médicaments.html', 'deeplearningproject/data/Résumé des caractéristiques du produit - LAMALINE, gélule - Base de données publique des médicaments.html', 'deeplearningproject/data/Résumé des caractéristiques du produit - IXPRIM 37,5 mg_325 mg, comprimé pelliculé - Base de données publique des médicaments.html', 'deeplearningproject/data/Résumé des caractéristiques du produit - EFFERALGAN 1000 m

In [6]:
text_lines = []

for file in files:
    with open(file, 'r', encoding='ISO-8859-1') as f:
        print(f"Processing: {file}")
        raw_html = f.read()

        # Clean HTML to plain text
        soup = BeautifulSoup(raw_html, "html.parser")
        clean_text = soup.get_text(separator=" ", strip=True)
        clean_text = " ".join(clean_text.split())

        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=80)
        chunks = text_splitter.split_text(clean_text)

        text_lines.extend(chunks)


print(text_lines[:5])  # Printing for verification

Processing: deeplearningproject/data/Notice patient - EFFERALGAN 1000 mg, comprimé effervescent - Base de données publique des médicaments.html
Processing: deeplearningproject/data/Notice patient - ISIMIG 2,5 mg, comprimé pelliculé - Base de données publique des médicaments.html
Processing: deeplearningproject/data/Fiche info - SPASFON LYOC 160 mg, lyophilisat oral - Base de données publique des médicaments.html
Processing: deeplearningproject/data/Résumé des caractéristiques du produit - AUGMENTIN 1 g_125 mg, poudre pour suspension buvable en sachet-dose (rapport amoxicilline_acide clavulanique _ 8_1) - Base de données publique des médicaments.html
Processing: deeplearningproject/data/Résumé des caractéristiques du produit - LAMALINE, gélule - Base de données publique des médicaments.html
Processing: deeplearningproject/data/Résumé des caractéristiques du produit - IXPRIM 37,5 mg_325 mg, comprimé pelliculé - Base de données publique des médicaments.html
Processing: deeplearningproject

2. Embedding a sentence

In [7]:
from sentence_transformers import SentenceTransformer

In [8]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
s = text_lines[0]
e = embedding_model.encode([s])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


3. Creating a Milvus data collection


In [9]:
! pip install pymilvus # installing pymilvus for google colab



In [10]:
from pymilvus import MilvusClient

In [11]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def emb_text (text_lines):
    embeddings = embedding_model.encode(text_lines, batch_size=32, show_progress_bar=False)
    return embeddings

In [12]:
milvus_client = MilvusClient(uri="./my_milvus_db.db")
collection_name = "rag_collection"
milvus_client.create_collection(
 collection_name=collection_name,
 dimension= 384, 
 metric_type="IP", 
 consistency_level="Strong",
)

In [13]:
data = []
for i, line in enumerate(text_lines):
    data.append({"id": i, "vector": emb_text(line), "text": line})


In [14]:
insert_res = milvus_client.insert(collection_name=collection_name, data=data)

4. Retrieving data for a query

In [15]:
question = "Can I take doliprane pregnant?"
search_res = milvus_client.search(
 collection_name=collection_name,
 data=[
 emb_text(question)
 ],
 limit=3, # Return top 3 results
 search_params={"metric_type": "IP", "params": {}},
 output_fields=["text"], 
)

data: ["[{'id': 24759, 'distance': 0.5623675584793091, 'entity': {'text': 'DOLIPRANE 500 mg, comprimé ? Tenir ce médicament hors de la vue et de la portée des enfants. N\\x92utilisez pas ce médicament après'}}, {'id': 24688, 'distance': 0.5539152026176453, 'entity': {'text': 'Si vous oubliez de prendre DOLIPRANE 500 mg, comprimé Ne prenez pas de dose double pour compenser la dose simple que vous avez'}}, {'id': 18095, 'distance': 0.5175432562828064, 'entity': {'text': 'service médical rendu par les spécialités DOLIPRANE, GELUPRANE et PARACETAMOL ZENTIVA reste important dans les indications de'}}]"]


In [16]:
import re

In [17]:
context = [result['entity']['text'] for result in search_res[0]]
print(context)
context_str = " ".join(context)  # turn list into a single string
context = re.sub(r"{.*?}", "", context_str)



['DOLIPRANE 500 mg, comprimé ? Tenir ce médicament hors de la vue et de la portée des enfants. N\x92utilisez pas ce médicament après', 'Si vous oubliez de prendre DOLIPRANE 500 mg, comprimé Ne prenez pas de dose double pour compenser la dose simple que vous avez', 'service médical rendu par les spécialités DOLIPRANE, GELUPRANE et PARACETAMOL ZENTIVA reste important dans les indications de']


5. Create a prompt

In [18]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
prompt = f"""
You are a helpful assistant. Use the following context in French to answer the question in English. Do short, complete answers for the patient.

Context:
{context}

Question:
{question}

Answer (in English):
"""

In [20]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
from transformers import pipeline
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

output = pipe(prompt,max_new_tokens=256, do_sample=True)[0]['generated_text']
print(output)

Device set to use cuda:0


Do not take a double dose to make up for missed dose for DOLIPRANE 500 mg, tablet The medical professional provided by Diloprane, GELUPRANE, and Paracetamol Zentiva remains important in the indications of


6. Making an interface

In [23]:
! pip install gradio

Collecting gradio
  Downloading gradio-5.24.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [None]:
import gradio as gr

In [24]:
def chatbot_interface(question): # making a function of all the steps
    search_res = milvus_client.search(
    collection_name=collection_name,
    data=[
    emb_text(question)
    ],
    limit=3, # Return top 3 results
    search_params={"metric_type": "IP", "params": {}}
    output_fields=["text"],
    )

    context = [result['entity']['text'] for result in search_res[0]]
    context_str = " ".join(context) 
    context = re.sub(r"{.*?}", "", context_str)

    prompt = f"""
    You are a helpful assistant. Use the following context in French to answer the question in English. Do short, complete answers for the patient.

    Context:
    {context}

    Question:
    {question}

    Answer (in English):
    """
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

    output = pipe(prompt,max_new_tokens=256, do_sample=True)[0]['generated_text']
    return output

# Create a gradio interface
interface = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(lines=2, label="Enter your question (English or French)"),
    outputs="text",
    title="French Medical Chatbot",
    description="Ask a medical question. The model searches relevant French info and answers in English."
)

# Launch
interface.launch(inline=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dd6adce6d13c892a9a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


