In [5]:
import pandas as pd
import numpy as np
import ast
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document

# === Read the chunk table; the 'embedding' column holds stringified Python literals ===
csv_path = 'papers_adhd_csvs/all_papers_chunked_embeddings.csv'  # Modify if needed
df = pd.read_csv(csv_path)
# Parse every cell back into a Python object. literal_eval only accepts
# literals, so it never executes arbitrary code from the CSV.
df['embedding'] = df['embedding'].map(ast.literal_eval)

# === Stack the per-row embeddings into a single NumPy array ===
embedding_matrix = np.array(list(df['embedding']))

# === Choose the device, then load the ClinicalBERT tokenizer & model onto it ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")
model.to(device)
model.eval()  # evaluation mode: the model is only used for inference below

# === Function to Embed Prompt ===
@torch.no_grad()
def get_embedding(text):
    """Embed ``text`` and return the hidden state at sequence position 0.

    The input is tokenized with truncation and fixed-length padding to 512
    tokens, moved to the module-level ``device``, and run through the
    module-level ``model`` with gradient tracking disabled.

    Args:
        text: Input handed unchanged to the module-level ``tokenizer``.

    Returns:
        A CPU NumPy array taken from ``last_hidden_state[:, 0, :]`` (the slot
        the original code labels the CLS embedding), with every size-1
        dimension squeezed away.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    encoded = encoded.to(device)
    hidden_states = model(**encoded).last_hidden_state
    # Keep only the first token's vector for each item in the batch.
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze().cpu().numpy()


# === Load the .docx file ===
# doc_path = 'adhd_sample_child_prompts\profile1.docx'  # Update this path
# doc = Document(doc_path)

# # === Combine all paragraphs into a single string ===
# prompt_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])

# prompt_embedding = get_embedding(prompt_text).reshape(1, -1)

# # === Compute Cosine Similarity and Retrieve Top-k Chunks ===
# similarities = cosine_similarity(prompt_embedding, embedding_matrix).flatten()
# top_k = 5  # Change this to any number of top results
# top_k_indices = similarities.argsort()[-top_k:][::-1]

# # === Display Top-k Chunks ===
# top_chunks = df.iloc[top_k_indices][['chunk', 'source_file', 'page_number']]
# top_chunks['similarity'] = similarities[top_k_indices]

# # === Print Results ===
# print("\nTop-K Most Similar Chunks:\n")
# print(top_chunks.to_string(index=False))


In [6]:
# === Load the .docx file ===
# Forward slash instead of a backslash: "\p" is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
doc_path = 'adhd_sample_child_prompts/profile1.docx'  # Update this path
doc = Document(doc_path)

# === Combine all non-blank paragraphs into a single string ===
prompt_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())

# Reshape the prompt vector into a single row so it can be compared against
# every row of embedding_matrix in one call.
prompt_embedding = get_embedding(prompt_text).reshape(1, -1)

# === Compute Cosine Similarity and Retrieve Top-k Chunks ===
similarities = cosine_similarity(prompt_embedding, embedding_matrix).flatten()
top_k = 5  # Change this to any number of top results
# argsort is ascending: reverse first, then truncate. Slicing with [-top_k:]
# would return *every* row when top_k is 0, because [-0:] is the whole array.
top_k_indices = similarities.argsort()[::-1][:top_k]

# === Display Top-k Chunks ===
# .assign builds a new frame with the extra column instead of writing into a
# slice of df, which can raise SettingWithCopyWarning.
top_chunks = (
    df.iloc[top_k_indices][['chunk', 'source_file', 'page_number']]
    .assign(similarity=similarities[top_k_indices])
)

# === Print Results ===
print("\nTop-K Most Similar Chunks:\n")
print(top_chunks.to_string(index=False))



Top-K Most Similar Chunks:

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [7]:
# === Load the .docx file ===
# Forward slash instead of a backslash: "\p" is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
doc_path = 'adhd_sample_child_prompts/profile2.docx'  # Update this path
doc = Document(doc_path)

# === Combine all non-blank paragraphs into a single string ===
prompt_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())

# Reshape the prompt vector into a single row so it can be compared against
# every row of embedding_matrix in one call.
prompt_embedding = get_embedding(prompt_text).reshape(1, -1)

# === Compute Cosine Similarity and Retrieve Top-k Chunks ===
similarities = cosine_similarity(prompt_embedding, embedding_matrix).flatten()
top_k = 5  # Change this to any number of top results
# argsort is ascending: reverse first, then truncate. Slicing with [-top_k:]
# would return *every* row when top_k is 0, because [-0:] is the whole array.
top_k_indices = similarities.argsort()[::-1][:top_k]

# === Display Top-k Chunks ===
# .assign builds a new frame with the extra column instead of writing into a
# slice of df, which can raise SettingWithCopyWarning.
top_chunks = (
    df.iloc[top_k_indices][['chunk', 'source_file', 'page_number']]
    .assign(similarity=similarities[top_k_indices])
)

# === Print Results ===
print("\nTop-K Most Similar Chunks:\n")
print(top_chunks.to_string(index=False))



Top-K Most Similar Chunks:

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [8]:
# === Load the .docx file ===
# Forward slash instead of a backslash: "\p" is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
doc_path = 'adhd_sample_child_prompts/profile3.docx'  # Update this path
doc = Document(doc_path)

# === Combine all non-blank paragraphs into a single string ===
prompt_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())

# Reshape the prompt vector into a single row so it can be compared against
# every row of embedding_matrix in one call.
prompt_embedding = get_embedding(prompt_text).reshape(1, -1)

# === Compute Cosine Similarity and Retrieve Top-k Chunks ===
similarities = cosine_similarity(prompt_embedding, embedding_matrix).flatten()
top_k = 5  # Change this to any number of top results
# argsort is ascending: reverse first, then truncate. Slicing with [-top_k:]
# would return *every* row when top_k is 0, because [-0:] is the whole array.
top_k_indices = similarities.argsort()[::-1][:top_k]

# === Display Top-k Chunks ===
# .assign builds a new frame with the extra column instead of writing into a
# slice of df, which can raise SettingWithCopyWarning.
top_chunks = (
    df.iloc[top_k_indices][['chunk', 'source_file', 'page_number']]
    .assign(similarity=similarities[top_k_indices])
)

# === Print Results ===
print("\nTop-K Most Similar Chunks:\n")
print(top_chunks.to_string(index=False))



Top-K Most Similar Chunks:

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           