In [1]:
from langchain_core.documents import Document
import os
import pandas as pd
from google.cloud import storage
from ingest import (
    list_files_in_bucket,
    download_file_from_bucket,
    create_cloud_sql_database_connection,
    create_table_if_not_exists,
    get_embeddings,
    get_vector_store,
)
from config import (
    TABLE_NAME,
    BUCKET_NAME,
)

In [17]:
# Interactive Google Cloud CLI login (opens a browser window for the OAuth flow).
# NOTE(review): this cell's execution count (17) is out of sequence with the cells
# around it — confirm the notebook still works on a fresh kernel with Run All.
# NOTE(review): storage.Client() presumably relies on Application Default
# Credentials (`gcloud auth application-default login`) — verify this is the
# login the rest of the notebook actually needs.
!gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=o5d4sasl76xwYlta10apdltNFw546X&access_type=offline&code_challenge=B3s3UMvp9UE0mh7VKhGdcgTTqSJB8Dkuga3lmXiR7AI&code_challenge_method=S256


You are now logged in as [malekhlouf@gmail.com].
Your current project is [medichat-451909].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update



In [2]:
# Print every entry that list_files_in_bucket reports for the configured bucket.
# NOTE(review): the header text below is hard-coded and does not follow BUCKET_NAME.
client, bucket_name = storage.Client(), BUCKET_NAME
blobs = list_files_in_bucket(client, bucket_name)

print("Files in 'medichat-bucket/data':")
for blob_entry in blobs:
    print(blob_entry)

Files in 'medichat-bucket/data':
data/
data/medquad.csv


In [3]:
# Fetch the object's metadata from the bucket and print a short summary of it.
file_path = "data/medquad.csv"
bucket = client.get_bucket(bucket_name)
blob = bucket.get_blob(file_path)

if not blob:
    # Nothing usable came back for this path.
    print(f"File '{file_path}' not found in the bucket.")
else:
    print(f"Information for '{file_path}':")
    print(f"Size: {blob.size} bytes")
    print(f"Content Type: {blob.content_type}")
    # Render the last-update timestamp as "YYYY-MM-DD HH:MM:SS".
    formatted_updated_on = blob.updated.strftime("%Y-%m-%d %H:%M:%S")
    print(f"Updated On: {formatted_updated_on}")
    print(f"Blob name: {blob.name}")

Information for 'data/medquad.csv':
Size: 22835609 bytes
Content Type: text/csv
Updated On: 2025-02-26 11:55:38
Blob name: data/medquad.csv


In [4]:
# Local folder that receives files pulled from the bucket (created if missing).
DOWNLOADED_LOCAL_DIRECTORY = "./downloaded_files"
os.makedirs(DOWNLOADED_LOCAL_DIRECTORY, exist_ok=True)

# Download the dataset object through the project's ingest helper.
file_path = "data/medquad.csv"
local_filepath = download_file_from_bucket(bucket, file_path, DOWNLOADED_LOCAL_DIRECTORY)

Downloaded 'data/medquad.csv' to 'medquad.csv'


In [5]:
# Read the dataset from the local download directory into a DataFrame.
# NOTE(review): assumes the download step saved the file as "medquad.csv" inside
# DOWNLOADED_LOCAL_DIRECTORY; the `local_filepath` returned by that step is not
# used here — confirm the two paths agree.
CSV_FILE_PATH = os.path.join(DOWNLOADED_LOCAL_DIRECTORY, "medquad.csv")
df = pd.read_csv(CSV_FILE_PATH)

In [6]:
# Placeholder text substituted for missing values, keyed by column name.
missing_value_defaults = {
    "answer": "No answer provided",
    "source": "Unknown source",
    "focus_area": "Not specified",
}
for column_name, placeholder in missing_value_defaults.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Fail fast if a NaN remains anywhere in the frame, including columns not filled above.
has_missing_values = df.isnull().values.any()
assert not has_missing_values, "NaN values found in the DataFrame"

In [7]:
# Per-column count of missing values.
null_counts = df.isnull().sum()
print(null_counts)

question      0
answer        0
source        0
focus_area    0
dtype: int64


In [8]:
# CSV lines -> Langchain Documents
documents = [
    Document(
        page_content=row["question"],
        metadata={
            "answer": row["answer"],
            "source": row["source"],
            "focus_area": row["focus_area"],
        },
    )
    for _, row in df.iterrows()
]
print(f"{len(documents)} Convertions succesful!")

16412 Convertions succesful!


In [9]:
# Show the first document
print("\nFirst Doc:")
first_doc = documents[0]
print(first_doc)  # prints the whole Document object

# Show only its content and its metadata fields
first_meta = first_doc.metadata
print("\nQuestion :", first_doc.page_content)
print("Answer :", first_meta["answer"])
print("Source :", first_meta["source"])
print("Focus Area :", first_meta["focus_area"])


First Doc:
page_content='What is (are) Glaucoma ?' metadata={'answer': "Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma, for still unknown reasons, the fluid drains too slowly out of the eye. As the fluid builds up, the pressure inside the eye rises. Unless this pressure is controlled, it may cause damage to the optic nerve and other parts of the eye and result i

In [10]:
engine = create_cloud_sql_database_connection()
create_table_if_not_exists(table_name=TABLE_NAME, engine=engine)

<coroutine object create_table_if_not_exists at 0x1062d8ad0>

In [11]:
# Embedding object from the project's ingest helper; passed to get_vector_store
# as its `embedding` argument.
embeddings = get_embeddings()

In [12]:
# Vector store handle over TABLE_NAME, built by the project's ingest helper.
vector_store = get_vector_store(
    engine=engine, table_name=TABLE_NAME, embedding=embeddings
)

# One-time ingestion: convert the documents to embeddings and store them in
# Cloud SQL. This must only be done once, so it is off by default; set the flag
# to True deliberately when the table has to be populated.
RUN_INGESTION = False

if RUN_INGESTION:
    # `documents` already holds one Document per DataFrame row (question as
    # text; answer/source/focus_area as metadata), so it is reused here
    # instead of rebuilding the same list from `df`.
    vector_store.add_documents(documents)
    print("Les embeddings des questions ont été générés et stockés dans la table !")

## Test that retrieval from the vector store works

In [13]:
# Sample question used to try out retrieval.
query = "What is Alzheimer ? "

In [14]:
# Retriever using the "similarity_score_threshold" search type with a 0.5 threshold.
retriever_search_kwargs = {"score_threshold": 0.5}
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retriever_search_kwargs,
)

In [15]:
# Run the sample query through the retriever; the results are printed in the next cell.
docs = retriever.invoke(query)

In [16]:
# Afficher les résultats
if docs:
    print(f"\n {len(docs)} documents trouvés pour la requête : {query}")
    for i, doc in enumerate(docs):
        print("-" * 50)
        print(f" Résultat {i+1} ")
        print(" Question trouvée: ", doc.page_content)
        print(" Réponse:", doc.metadata["answer"])
        print(" Source:", doc.metadata["source"])
        print(" Focus Area:", doc.metadata["focus_area"])
else:
    print("Aucun document pertinent trouvé pour cette requête.")


 4 documents trouvés pour la requête : What is Alzheimer ? 
--------------------------------------------------
 Résultat 1 
 Question trouvée:  What is (are) Alzheimer disease ?
 Réponse: Alzheimer disease is a degenerative disease of the brain that causes dementia, which is a gradual loss of memory, judgment, and ability to function. This disorder usually appears in people older than age 65, but less common forms of the disease appear earlier in adulthood.  Memory loss is the most common sign of Alzheimer disease. Forgetfulness may be subtle at first, but the loss of memory worsens over time until it interferes with most aspects of daily living. Even in familiar settings, a person with Alzheimer disease may get lost or become confused. Routine tasks such as preparing meals, doing laundry, and performing other household chores can be challenging. Additionally, it may become difficult to recognize people and name objects. Affected people increasingly require help with dressing, eating,