In [None]:
import pandas as pd


# Read the merged chunk table from disk.
merged_chunks_df = pd.read_csv('merged_chunks.csv')

# Flatten every column into a single list of non-empty, stripped strings.
# A cell may be a plain string or a list of strings; anything else
# (missing values, numbers, ...) is skipped.
all_texts = []
for column_name in merged_chunks_df.columns:
    for cell in merged_chunks_df[column_name]:
        # Treat a scalar cell as a one-element list so both shapes share one path.
        candidates = cell if isinstance(cell, list) else [cell]
        all_texts.extend(
            candidate.strip()
            for candidate in candidates
            if isinstance(candidate, str) and candidate.strip()
        )

# all_texts is now a flat Python list of chunk strings.
print(f"Total number of chunk strings: {len(all_texts)}")


Build an Index for Retrieval

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load an embedding model. You can choose something more advanced if you want.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

# Fail early with a clear message: with no chunks the embedding matrix has no
# second dimension to read and the index could never return a result.
if not all_texts:
    raise ValueError("all_texts is empty; there are no chunks to embed and index.")

# Embed all the chunk texts as unit-length vectors. With normalised document
# vectors, ranking by inner product is the same as ranking by cosine
# similarity, whatever the norm of the query vector is.
chunk_embeddings = embedder.encode(
    all_texts,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)
# Hand FAISS an explicit C-contiguous float32 matrix.
chunk_embeddings = np.ascontiguousarray(chunk_embeddings, dtype=np.float32)

# Build a flat (exhaustive) inner-product index sized to the embedding width.
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# Add embeddings to the index
index.add(chunk_embeddings)
print("FAISS index built. Number of vectors in the index:", index.ntotal)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick the compute device up front: CUDA when torch reports it, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# "gpt2" is a small generative model used here for demonstration only; for
# real usage choose a model that can handle a longer context.
gen_model_name = "gpt2"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)

# Load the weights and place the model on the selected device in one step.
gen_model = AutoModelForCausalLM.from_pretrained(gen_model_name).to(device)

def rag_answer(query, top_k=3, max_new_tokens=128):
    """
    Answer ``query`` with retrieval-augmented generation.

    1) Embed the query with the module-level ``embedder``.
    2) Retrieve up to ``top_k`` most similar chunks from the FAISS ``index``.
    3) Construct a prompt with the retrieved text + query.
    4) Generate a continuation with ``gen_model`` (no sampling) and return it.

    Args:
        query: The user question, as a string.
        top_k: Maximum number of chunks to retrieve; must be >= 1. It is
            clamped to the number of vectors stored in the index.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated continuation only (the prompt is not included), with
        surrounding whitespace stripped.

    Raises:
        ValueError: If ``top_k`` is below 1, or if ``max_new_tokens`` leaves no
            room for any prompt token within the model's position limit.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # 1) Embed
    q_embed = embedder.encode([query], convert_to_numpy=True)

    # 2) Retrieve top-k
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which as a Python list index would silently select the
    # last chunk instead of "no result".
    k = min(top_k, index.ntotal)
    relevant_chunks = []
    if k > 0:
        _, neighbor_ids = index.search(q_embed, k)
        relevant_chunks = [all_texts[i] for i in neighbor_ids[0] if i >= 0]

    # 3) Construct a prompt
    # Below is a naive prompt. In real usage, you might want more refined instructions.
    context_text = "\n\n".join(relevant_chunks)
    prompt = (
        f"Here are some relevant pieces of context:\n\n"
        f"{context_text}\n\n"
        f"Answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Answer:"
    )

    # 4) Generate answer
    inputs = gen_tokenizer(prompt, return_tensors="pt").to(device)

    # Keep prompt + generated tokens within the model's position limit when the
    # config exposes one. Trimming from the left drops the oldest context and
    # keeps the question and the trailing "Answer:" cue intact.
    max_positions = getattr(gen_model.config, "max_position_embeddings", None)
    if max_positions is not None:
        prompt_budget = max_positions - max_new_tokens
        if prompt_budget < 1:
            raise ValueError(
                f"max_new_tokens={max_new_tokens} leaves no room for the prompt "
                f"within the model limit of {max_positions} positions"
            )
        inputs = {name: tensor[:, -prompt_budget:] for name, tensor in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic decoding; sampling knobs such as top_p do not apply
            pad_token_id=gen_tokenizer.eos_token_id,  # explicit pad id for tokenizers without one
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" would cut the answer short whenever the model itself emits that
    # marker again.
    generated_ids = outputs[0][prompt_length:]
    return gen_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Smoke test: send one sample question through retrieval and generation,
# then show the question next to the generated answer.
test_query = "Quels sont les signes cliniques d'une exanthème subit ?"
response = rag_answer(query=test_query, max_new_tokens=64, top_k=3)
print("User Query:", test_query)
print("RAG-based answer:", response)
