In [4]:
!pip install transformers pymilvus sentence-transformers huggingface-hub langchain_community langchain-text-splitters pypdf

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting pymilvus
  Downloading pymilvus-2.5.5-py3-none-any.whl.metadata (5.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.4-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-macosx_11

1. Extracting text

In [1]:
import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
path = '/Users/naomi/Desktop/EM/Deep learning/deeplearningproject/data/sample/'
files = glob.glob(path +'*.html')

In [3]:
text_lines = []
for file in files:
    with open(file, 'r',encoding='ISO-8859-1') as f:
        print(file)
        text = f.read()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_text(text)
        text_lines += [chunk for chunk in chunks]

/Users/naomi/Desktop/EM/Deep learning/deeplearningproject/data/sample/Résumé des caractéristiques du produit - VENTOLINE 100 microgrammes_dose, suspension pour inhalation en flacon pressurisé - Base de données publique des médicaments.html
/Users/naomi/Desktop/EM/Deep learning/deeplearningproject/data/sample/Fiche info - ANTARENE 100 mg, comprimé pelliculé - Base de données publique des médicaments.html
/Users/naomi/Desktop/EM/Deep learning/deeplearningproject/data/sample/Fiche info - AMOXICILLINE VIATRIS 1 g, comprimé dispersible - Base de données publique des médicaments.html


2. Embedding a sentence

In [4]:
from sentence_transformers import SentenceTransformer

In [5]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") 
s = text_lines[0]
e = embedding_model.encode([s])

3. Creating a Milvus data collection


In [6]:
from pymilvus import MilvusClient

In [7]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2") 

def emb_text (text_lines):
    embeddings = embedding_model.encode(text_lines)
    return embeddings

In [8]:
milvus_client = MilvusClient(uri="./my_milvus_db.db")
collection_name = "rag_collection"
milvus_client.create_collection(
 collection_name=collection_name,
 dimension= 384, # The size of the embedding
 metric_type="IP", # Inner product distance
 consistency_level="Strong", # Strong consistency level
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
data = []
# In the following example, emb_text is a function that needs to be written, based on
# an embedding model
for i, line in enumerate(text_lines):
    data.append({"id": i, "vector": emb_text(line), "text": line})


In [10]:
insert_res = milvus_client.insert(collection_name=collection_name, data=data)

4. Retrieving data for a query

In [11]:
question = "What is the best practice mentionned in the document?"
search_res = milvus_client.search(
 collection_name=collection_name,
 data=[
 emb_text(question)
 ],
 limit=2, # Return top 3 results
 search_params={"metric_type": "IP", "params": {}}, # Inner product distance
 output_fields=["text"], # Return the text field
)

In [12]:
context = [result['entity']['text'] for result in search_res[0]]
print(context)

['<!-- ***************************************** -->\n\t\t\t\t\t\t<!-- Docs de bon usage -->\n\t\t\t\t\t\t<!-- ***************************************** -->\n\t\t\t\t\t\t<a name="DocumentsBonUsage"></a>\n\t\t\t\t\t\t<h2 class="ficheInfo">Documents de bon usage du médicament</h2>\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t<ul>\n\t\t\t\t\t\t\t<li>\n\t\t\t\t\t\t\t\t<a class="lien_bon_usage" href=" https://www.has-sante.fr/jcms/p_3372966" target="_blank">Bon usage des inhibiteurs de la pompe à protons (IPP)</a>\n\t\t\t\t\t\t\t</li>\n\t\t\t\t\t\t\t<br/>\n\t\t\t\t\t\t\tAuteur : Haute autorité de santé<br/>\n\t\t\t\t\t\t\tType : Fiche Bon Usage du Médicament<br/>\n\t\t\t\t\t\t\tDate de mise à jour : Octobre 2022<br/>\n\t\t\t\t\t\t\t<br/>\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t</ul>\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t<ul>\n\t\t\t\t\t\t\t<li>\n\t\t\t\t\t\t\t\t<a class="lien_bon_usage" href=" https://www.has-sante.fr/jcms/p_3372966" target="_blank">Bon usage des inhibiteurs d

5. Create a prompt 

In [28]:
from huggingface_hub import notebook_login
notebook_login()

from transformers import AutoTokenizer, AutoModelForCausalLM



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [18]:
PROMPT = """
Use the information enclosed in <context> tags to provide an answer to the
question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>" \
"""

In [19]:
prompt = PROMPT.format(question=question, context=context)

In [24]:
'''input_text = prompt
encoded_input = tokenizer(input_text, return_tensors="pt")

output = model.generate(
    input_ids=encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    temperature=0.8,
    pad_token_id=tokenizer.pad_token_id
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)'''

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [33]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.1" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

prompt = """
Use the information enclosed in <context> tags to provide an answer to the
question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>" \
"""


inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=100)


response = tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(response)

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the disk.


AssertionError: Torch not compiled with CUDA enabled