In [None]:
%pip install langchain

In [None]:
%pip install pypdf==5.6.1 langchain-pinecone==0.2.8 langchain-community==0.3.26

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Chargement des données

In [3]:
# Extract the text of the PDF files found in a directory
def load_pdf_files(directory_path):
    """Load the PDF files of ``directory_path`` as LangChain documents.

    Args:
        directory_path: Directory searched with the ``*.pdf`` glob pattern.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matching files,
        each file being read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(directory_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Load the medical PDFs from the Google Drive folder.
# NOTE(review): absolute Colab path; assumes Drive is already mounted at /content/drive.
extrated_data = load_pdf_files('/content/drive/MyDrive/data_medical')

In [None]:
# Number of documents returned by the loader.
len(extrated_data)

637

In [5]:
from typing import List
from langchain.schema import Document

# Keep only the text and the 'source' metadata of each document
def filter_to_minimal_docs(documents: List[Document]) -> List[Document]:
    """
    Build a stripped-down copy of every document.

    Each returned Document keeps the original 'page_content' and a metadata
    dict holding a single 'source' key (None when the original metadata has
    no 'source' entry). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={'source': original.metadata.get("source")},
        )
        for original in documents
    ]

In [6]:
# Reduce every loaded document to its text and its 'source' metadata.
minimal_docs = filter_to_minimal_docs(extrated_data)

## Phase de Chunking

In [7]:
# Split the documents into chunks
def text_split(documents, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split (handed to ``split_documents``).
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [8]:
# Chunk the filtered documents and print how many chunks were produced.
texts_chunks = text_split(minimal_docs)
print(len(texts_chunks)) # number of chunks

5859


## Embedding Model

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

# Download the HuggingFace embedding model
def download_embeddings():
    """Instantiate the sentence-transformers MiniLM embedding model.

    The device is 'cuda' when ``torch.cuda.is_available()`` is true and
    'cpu' otherwise.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        "sentence-transformers/all-MiniLM-L6-v2".
    """
    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': target_device},
    )

In [14]:
# Build the embedding model once; the vector store cells pass it as `embedding`.
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [15]:
# Embedding sanity check: embed a short query and display the vector length.
query_result = embeddings.embed_query("Hello world")
len(query_result)

384

## Vector Database with Pinecone

In [7]:
from dotenv import load_dotenv
import os
from google.colab import userdata
# Load variables from a .env file when one is present.
load_dotenv()

# Read the Pinecone key from the Colab secrets instead of hard-coding it.
# Cells that use langchain-pinecone copy it into os.environ["PINECONE_API_KEY"].
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
# An OpenAI key can be read the same way if needed:
# OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [8]:
from google.colab import userdata
# NOTE(review): same secret as PINECONE_API_KEY in the previous cell, bound again under a lower-case name.
pinecone_api_key = userdata.get('PINECONE_API_KEY')

In [9]:
from pinecone import Pinecone
from google.colab import userdata

# The API key is expected in the Colab secrets under the name 'PINECONE_API_KEY'.
pinecone_api_key = userdata.get('PINECONE_API_KEY')


# Pinecone client used by the following cells to create and open the index.
pc = Pinecone(
    api_key=pinecone_api_key,
    #environment="us-west1-gcp" # Uncomment and set your environment if not using serverless
    )
#pinecone_env = "us-west1-gcp"

In [10]:
from pinecone import ServerlessSpec

# Name of the Pinecone index that stores the chunks.
index_name = "medicalchatbot"
# Create the index only when it does not exist yet.
if not pc.has_index(index_name):
  pc.create_index(
      name=index_name,
      dimension=384,  # matches the vector length displayed by the embed_query check (384)
      metric="cosine", # cosine similarity
      spec = ServerlessSpec(cloud = "aws", region="us-east-1")
  )
# Handle on the index.
index = pc.Index(index_name)

In [11]:
from pinecone import Pinecone
from google.colab import userdata

# NOTE(review): this cell repeats the earlier client-creation cell and rebinds `pc`
# to a second client built from the same secret.
# The API key is expected in the Colab secrets under the name 'PINECONE_API_KEY'.
pinecone_api_key = userdata.get('PINECONE_API_KEY')


pc = Pinecone(
    api_key=pinecone_api_key,
    #environment="us-west1-gcp" # Uncomment and set your environment if not using serverless
    )
#pinecone_env = "us-west1-gcp"

In [None]:
from langchain_pinecone import PineconeVectorStore
import os
from google.colab import userdata

# langchain-pinecone reads the API key from the PINECONE_API_KEY environment variable.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# from_documents() embeds and uploads every chunk under newly generated ids, so an
# unguarded re-run of this cell stores the whole corpus a second time and the
# retriever then returns duplicated chunks. Ingest only while the index is empty;
# otherwise just connect to the vectors already stored.
# To re-ingest after changing the PDFs, delete the index (or its vectors) first.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunks,
        embedding=embeddings,
        index_name=index_name
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings
    )

In [16]:
# Use this cell when the vector store has to be loaded from the existing Pinecone index
from langchain_pinecone import PineconeVectorStore
# Ensure the API key is set as an environment variable for langchain-pinecone
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# Connect to the index by name; no documents are passed here.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

### Add more data to the existing pinecone index

In [21]:
# Sample document used to demonstrate adding data to the existing index.
dswith = Document(
    page_content = "A document to add to the index",
    metadata = {"source": "test"}
)

In [22]:
# Add the sample document to the index; the displayed list holds the id it was given.
# NOTE(review): this writes a placeholder, non-medical document into the same index the retriever queries.
docsearch.add_documents([dswith])

['8ee37454-5096-48d2-92ec-1b69d6ac0e28']

## Create the Retriever

In [17]:
# Build a similarity-search retriever configured with k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [18]:
# Retrieval test: fetch the chunks for a sample question and display them.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='5eeecc7e-6000-4262-aa9e-c548c5f7b18e', metadata={'source': '/content/drive/MyDrive/data_medical/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='fee33be0-fc91-44f3-a53e-3a1514d15bc1', metadata={'source': '/content/drive/MyDrive/data_medical/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='89bedffe-540c-411b-8975-728fe57d9c80', metadata={'source': '/content/drive/MyDrive/data_medical/Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npim

### Connect the LLM

In [29]:
# HuggingFaceEndpoint replaces the deprecated `langchain.HuggingFaceHub` wrapper.
import os

from google.colab import userdata
# Imported here because this cell uses these names before the dedicated import
# cell further down runs; without them a top-to-bottom run raises NameError.
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

# The Hugging Face token is read from the Colab secret named 'HF_API_TOKEN'.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_API_TOKEN')

# Replace "google/flan-t5-large" with the desired free model.
chatModel = HuggingFaceEndpoint(
    repo_id="google/flan-t5-large",
    task = "text2text-generation",
    temperature=0.3,
)

# System instructions; {context} is the slot that receives the retrieved chunks.
system_prompt = (
    "You are a Medical assistant for question-answering tasks in simple vocabulary. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# The user question is supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

# Chain that places the documents into the prompt and calls the model.
question_answer_chain = create_stuff_documents_chain(
    chatModel,
    prompt=prompt
)
# Full RAG chain: retrieve with `retriever`, then answer with the chain above.
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

# Smoke test on a sample question.
response = rag_chain.invoke({"input":"What is Acne?"})
print(response["answer"])

In [30]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [31]:
# NOTE(review): same system prompt and template as the ones built in the "Connect the LLM" cell.
# System instructions; {context} is the slot that receives the retrieved chunks.
system_prompt = (
    "You are a Medical assistant for question-answering tasks in simple vocabulary. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# The user question is supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [32]:
# Chain that places the documents into the prompt and calls `chatModel`.
question_answer_chain = create_stuff_documents_chain(
    chatModel,
    prompt=prompt
)
# Full RAG chain: retrieve with `retriever`, then answer with the chain above.
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [None]:
from google import genai
from google.colab import userdata

# The Gemini key is read from the Colab secret named 'GOOGLE_API_KEY' instead of
# being written in the notebook. A key that was ever saved in clear text in a
# notebook must be treated as leaked: revoke it and create a new one.
client = genai.Client(api_key=userdata.get('GOOGLE_API_KEY'))

# Smoke test of the Gemini API.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?"
)
print(response.text)

In [63]:
# NOTE: this rebinds `genai`, which an earlier cell bound to the `google.genai` package.
import google.generativeai as genai
# Aliased on purpose: importing this class under the bare name `Pinecone` rebinds the
# client class imported from the `pinecone` package, so re-running the client
# creation cell would instantiate the wrong class.
from langchain_community.vectorstores import Pinecone as LangchainPinecone

# 1. Gemini configuration
# The key is read from the Colab secret named 'GOOGLE_API_KEY' instead of being
# written in the notebook. A key that was ever saved in clear text must be treated
# as leaked: revoke it and create a new one.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
model = genai.GenerativeModel('gemini-1.5-flash')

# 2. Vector store and retriever
from langchain_pinecone import PineconeVectorStore
# Ensure the API key is set as an environment variable for langchain-pinecone
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)


retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}  # k=3 results per query
)

# 3. Retrieval + Gemini answer function
def get_rag_response(question: str) -> str:
    """Answer ``question`` from retrieved chunks, phrased by the Gemini model.

    The chunks returned by the module-level ``retriever`` are joined into one
    context string, inserted into a prompt and sent to the module-level
    ``model``.

    Args:
        question: The user question.

    Returns:
        The text of the model response. When retrieval fails or yields no
        text, a fixed "nothing found" message is returned. When the model
        call fails, the first 300 characters of the first retrieved chunk
        followed by "..." are returned.
    """
    no_result_message = "Je n'ai pas trouvé d'informations pertinentes."

    # Step 1: fetch the chunks and merge their text into a single context
    try:
        hits = retriever.invoke(question)
        context = "\n\n".join(hit.page_content for hit in hits)
    except Exception as err:
        print(f"Erreur de recherche RAG: {err}")
        return no_result_message

    # Nothing usable was retrieved: do not call the model
    if not context:
        return no_result_message

    # Step 2: ask the model to phrase an answer from the context
    gemini_prompt = f"""
    [ROLE]
    You are a Medical assistant for question-answering tasks in simple vocabulary.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.

    [CONTEXTE]
    {context}

    [QUESTION]
    {question}

    [INSTRUCTIONS]
    - 2-3 sentences maximum
    - langage for patients
    - Base on the context
    - If you don't know the answer, just say that you don't know

    [REPONSE]
    """

    try:
        return model.generate_content(gemini_prompt).text
    except Exception as err:
        print(f"Erreur Gemini: {err}")
        # Fallback: raw beginning of the first retrieved chunk
        if hits:
            return hits[0].page_content[:300] + "..."
        return context[:300]



In [67]:
# 4. Test: run the full retrieval + Gemini pipeline on a sample question

question = "What is the Treatment of Acne?"  # Or "Qu'est-ce que l'acné ?"
response = get_rag_response(question)
print(response)

Treatment depends on how severe your acne is.  Mild acne may be treated with creams like tretinoin or benzoyl peroxide.  For more severe acne, or acne with inflammation,  your doctor may prescribe other medications.

