In [None]:
from langchain_core.documents import Document
import os
import pandas as pd
from google.cloud import storage
from medichat.ingest import (
    list_files_in_bucket,
    download_file_from_bucket,
    create_cloud_sql_database_connection,
    create_table_if_not_exists,
    get_embeddings,
    get_vector_store,
)
from medichat.config import (
    TABLE_NAME,
    BUCKET_NAME,
)

In [None]:
# Interactive Google Cloud sign-in through the gcloud CLI (shell escape).
# NOTE(review): storage.Client() further down is created without explicit
# credentials; if it relies on Application Default Credentials, this may need
# to be `gcloud auth application-default login` instead — confirm.
!gcloud auth login

In [None]:
# List all files in the 'data' directory of the configured bucket.
client = storage.Client()
bucket_name = BUCKET_NAME
blobs = list_files_in_bucket(client, bucket_name)
# Build the label from the bucket actually queried instead of a hard-coded
# name, so the printed header cannot drift from BUCKET_NAME.
print(f"Files in '{bucket_name}/data':")
for blob in blobs:
    print(blob)

In [None]:
# Fetch one object from the bucket and print a few of its attributes.
file_path = "data/medquad.csv"
bucket = client.get_bucket(bucket_name)
blob = bucket.get_blob(file_path)

if not blob:
    # A falsy lookup result is reported as "not found".
    print(f"File '{file_path}' not found in the bucket.")
else:
    print(f"Information for '{file_path}':")
    print(f"Size: {blob.size} bytes")
    print(f"Content Type: {blob.content_type}")
    # Render the last-update timestamp in a compact, readable form.
    last_update = blob.updated
    formatted_updated_on = last_update.strftime("%Y-%m-%d %H:%M:%S")
    print(f"Updated On: {formatted_updated_on}")
    print(f"Blob name: {blob.name}")

In [None]:
# Object to fetch and the local folder that receives it.
file_path = "data/medquad.csv"
DOWNLOADED_LOCAL_DIRECTORY = "./downloaded_files"

# Create the target folder up front; exist_ok keeps re-runs from failing.
os.makedirs(DOWNLOADED_LOCAL_DIRECTORY, exist_ok=True)

# Delegate the transfer to the project helper and keep whatever it returns.
local_filepath = download_file_from_bucket(
    bucket,
    file_path,
    DOWNLOADED_LOCAL_DIRECTORY,
)

In [None]:
# Read the downloaded CSV from the local download folder into a DataFrame.
CSV_FILE_PATH = os.path.join(
    DOWNLOADED_LOCAL_DIRECTORY,
    "medquad.csv",
)
df = pd.read_csv(filepath_or_buffer=CSV_FILE_PATH)

In [None]:
# Placeholder text substituted for missing values, per column.
fill_defaults = {
    "answer": "No answer provided",
    "source": "Unknown source",
    "focus_area": "Not specified",
}
for column_name, placeholder in fill_defaults.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Every cell of the frame (including columns not filled above) must now be non-null.
assert df.notna().values.all(), "NaN values found in the DataFrame"

In [None]:
# Per-column count of missing values.
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# CSV rows -> LangChain Documents.
# Each row's question becomes the Document's page_content; answer, source and
# focus_area are carried along as metadata.
# `to_dict(orient="records")` yields one plain dict per row, which avoids the
# per-row Series that `iterrows()` builds (slower, and it upcasts mixed dtypes).
metadata_columns = ("answer", "source", "focus_area")
documents = [
    Document(
        page_content=record["question"],
        metadata={column: record[column] for column in metadata_columns},
    )
    for record in df.to_dict(orient="records")
]
print(f"{len(documents)} Convertions succesful!")

In [None]:
# Show the first document
print("\nFirst Doc:")
first_doc = documents[0]
print(first_doc)  # prints the whole Document object

# Show only its content and its metadata fields
print("\nQuestion :", first_doc.page_content)
first_meta = first_doc.metadata
print("Answer :", first_meta["answer"])
print("Source :", first_meta["source"])
print("Focus Area :", first_meta["focus_area"])

In [None]:
# Obtain the database engine from the project helper, then run the helper that
# (per its name) creates the target table when it is missing.
engine = create_cloud_sql_database_connection()
create_table_if_not_exists(
    engine=engine,
    table_name=TABLE_NAME,
)

In [None]:
# Embedding object from the project helper; it is handed to the vector store
# in the next cell.
embeddings = get_embeddings()

In [None]:
# Vector store bound to the table, using the engine and embeddings created above.
vector_store = get_vector_store(
    engine=engine, table_name=TABLE_NAME, embedding=embeddings
)

# Convert the documents to embeddings and store them in Cloud SQL.
# This must only be done ONCE, so it is disabled by default: set RUN_INGESTION
# to True for the initial load, then switch it back to False.
# An explicit flag replaces the previous trick of parking the code inside an
# unused string, and the already-built `documents` list is reused instead of
# being rebuilt from the DataFrame.
RUN_INGESTION = False
if RUN_INGESTION:
    vector_store.add_documents(documents)
    print("Les embeddings des questions ont été générés et stockés dans la table !")

## Test if it works

In [None]:
# Sample question passed to the retriever in the cells below.
query = "What is Glaucoma ?"

In [None]:
# Retriever over the vector store, configured for score-threshold similarity
# search with a threshold of 0.5.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=0.5),
)

In [None]:
# Run the sample query through the retriever; the result is printed in the next cell.
docs = retriever.invoke(query)

In [None]:
# Display the results
if not docs:
    # Nothing came back from the retriever for this query.
    print("Aucun document pertinent trouvé pour cette requête.")
else:
    print(f"\n {len(docs)} documents trouvés pour la requête : {query}")
    for i, doc in enumerate(docs):
        # Separator line, then a 1-based result header.
        print("-" * 50)
        print(f" Résultat {i+1} ")
        print(" Question trouvée: ", doc.page_content)
        hit_meta = doc.metadata
        print(" Réponse:", hit_meta["answer"])
        print(" Source:", hit_meta["source"])
        print(" Focus Area:", hit_meta["focus_area"])