In [7]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from src.embeddings import EmbeddingModel
from src.data_loader import load_processed_data

# Notebook-wide plotting defaults: seaborn's "whitegrid" style, and a 12x8
# default figure size for any figure that does not pass its own `figsize`.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Build the project's embedding model with its default constructor arguments.
# Every later cell goes through this object (`similarity`, `encode`,
# `batch_similarity`).
# NOTE(review): the recorded output of the imports cell shows
# "No module named 'sentence_transformers'"; presumably that package has to be
# installed before this cell can run — confirm against src/embeddings.
model = EmbeddingModel()

In [None]:
print("Experiment 1: Semantic Similarity\n")

# Each entry: (first text, second text, hand-written expectation for the score).
examples = [
    ("What is diabetes?", "Tell me about diabetes", "High similarity (same question)"),
    ("diabetes symptoms", "signs of diabetes", "High similarity (synonyms)"),
    ("diabetes treatment", "diabetes symptoms", "Medium similarity (related topics)"),
    ("diabetes", "cancer", "Low similarity (different diseases)"),
    ("diabetes", "chocolate cake recipe", "Very low similarity (unrelated)")
]

# Score one pair at a time, echoing each result as soon as it is available and
# collecting one record per pair for the summary table.
results = []
for first, second, expectation in examples:
    score = model.similarity(first, second)
    # `.<30` left-aligns the text and pads it with dots to 30 characters.
    print(f"{first:.<30} vs {second:.<30} = {score:.3f} | {expectation}")
    record = {
        'Text 1': first,
        'Text 2': second,
        'Similarity': score,
        'Expected': expectation,
    }
    results.append(record)

# One row per example pair, columns in the order of the record keys.
results_df = pd.DataFrame(results)

In [None]:
# NOTE(review): `combinations` is not used in any cell visible here. It is kept
# in case another cell relies on it; otherwise remove it or move it to the
# imports cell at the top of the notebook.
from itertools import combinations

medical_terms = [
    "diabetes",
    "heart disease",
    "cancer",
    "flu",
    "headache",
    "fever",
    "insulin",
    "medication"
]

# Encode every term in a single call; row i of the result is used below as the
# embedding of medical_terms[i].
embeddings = model.encode(medical_terms, show_progress=False)

# Pairwise cosine similarity, vectorized: scale each embedding to unit length
# once, then a single matrix product yields every pairwise dot product. This
# replaces a Python double loop that recomputed both norms for each pair.
n_terms = len(medical_terms)
embedding_matrix = np.asarray(embeddings, dtype=float)
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
unit_embeddings = embedding_matrix / row_norms
sim_matrix = unit_embeddings @ unit_embeddings.T  # shape (n_terms, n_terms)

# Annotated heatmap of the matrix. The colour scale is pinned to [0, 1], so any
# negative similarity would be drawn with the lowest colour.
plt.figure(figsize=(10, 8))
sns.heatmap(
    sim_matrix,
    xticklabels=medical_terms,
    yticklabels=medical_terms,
    annot=True,
    fmt='.2f',
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Cosine Similarity'}
)
plt.title('Semantic Similarity Matrix - Medical Terms', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Hand-written expectations for the reader; they are not derived from
# sim_matrix, so check them against the heatmap above.
print("\nInsights:")
print("  • Diagonal = 1.0 (each term is identical to itself)")
print("  • 'insulin' and 'diabetes' have high similarity")
print("  • 'fever' and 'flu' are related")
print("  • 'cancer' and 'diabetes' are less similar (different diseases)")

In [None]:
print("\nExperiment 2: Visualizing Embeddings in 2D\n")

# Four short texts for each of four medical topics; the order must stay in
# sync with `categories` below.
sample_texts = [
    # diabetes
    "diabetes type 1", "diabetes type 2", "insulin treatment", "blood sugar levels",
    # heart
    "heart attack", "heart disease", "cardiovascular health", "blood pressure",
    # cancer
    "breast cancer", "lung cancer", "chemotherapy", "cancer treatment",
    # infections
    "flu symptoms", "cold symptoms", "viral infection", "bacterial infection"
]

categories = ['Diabetes']*4 + ['Heart']*4 + ['Cancer']*4 + ['Infections']*4
# The boolean masks used for plotting are only meaningful if every text has
# exactly one label.
assert len(categories) == len(sample_texts), "categories must align with sample_texts"

# generate embeddings
sample_embeddings = model.encode(sample_texts, show_progress=False)

# Reduce to 2D using t-SNE. The seed is fixed for a reproducible layout, and
# perplexity must stay below the number of samples (16 here).
print("📉 Reducing dimensions with t-SNE...")
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
embeddings_2d = tsne.fit_transform(sample_embeddings)

plt.figure(figsize=(12, 8))
colors = {'Diabetes': 'red', 'Heart': 'blue', 'Cancer': 'green', 'Infections': 'orange'}

# Convert the labels to an array once so each category's mask is a cheap
# element-wise comparison instead of rebuilding the array per iteration.
category_labels = np.array(categories)

# One scatter call per category so each gets its own colour and legend entry.
for category, color in colors.items():
    mask = category_labels == category
    plt.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=color,
        label=category,
        s=200,
        alpha=0.7,
        edgecolors='black'
    )

# Label every point with its source text.
for i, txt in enumerate(sample_texts):
    plt.annotate(
        txt,
        (embeddings_2d[i, 0], embeddings_2d[i, 1]),
        fontsize=8,
        ha='center'
    )

plt.xlabel('t-SNE Dimension 1', fontsize=12)
plt.ylabel('t-SNE Dimension 2', fontsize=12)
plt.title('Medical Terms Embeddings Visualization (t-SNE)', fontsize=16, fontweight='bold')
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Hand-written expectations for the reader; they are not computed from the
# projection, so check them against the plot above.
print("\nInsights:")
print("  • Similar topics cluster together")
print("  • Different medical domains are separated")
print("  • Embeddings capture semantic relationships")

In [None]:
print("\nExperiment 3: Dataset Embeddings\n")

# load data
df = load_processed_data()

# Sample up to 1000 documents for speed. The size is clamped to the dataset
# length because DataFrame.sample raises ValueError when asked for more rows
# than exist (sampling without replacement).
sample_size = min(1000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)  # fixed seed: same sample every run

# Embed the 'answer' text of every sampled row, in batches of 32.
print(f"Generating embeddings for {sample_size} documents...")
doc_embeddings = model.encode(
    df_sample['answer'].tolist(),
    batch_size=32,
    show_progress=True
)

# Summary of the result. `.shape` and `.nbytes` assume `encode` returns a
# NumPy array — TODO confirm against EmbeddingModel.encode.
print(f"\nGenerated {len(doc_embeddings)} embeddings")
print(f"   Shape: {doc_embeddings.shape}")
print(f"   Memory: {doc_embeddings.nbytes / 1024 / 1024:.2f} MB")

In [None]:
query = "What are the symptoms of diabetes type 2?"

print("\nTesting Retrieval")
print(f"Query: {query}\n")

# Score the query against the answer text of every sampled document.
answer_texts = df_sample['answer'].tolist()
similarities = model.batch_similarity(query, answer_texts)

# Order positions from most to least similar and keep the first `top_k`.
top_k = 5
descending_order = np.argsort(similarities)[::-1]
top_indices = descending_order[:top_k]

print(f"Top {top_k} most relevant documents:\n")
for rank, position in enumerate(top_indices, start=1):
    # `position` is a positional index into df_sample (and into similarities),
    # so the row is fetched with .iloc rather than by index label.
    row = df_sample.iloc[position]
    doc_id = row['id']  # looked up but not shown in the output below
    source = row['source']
    answer = row['answer']
    question = row['question']
    sim = similarities[position]

    print(f"{rank}. [Similarity: {sim:.3f}] [Source: {source}]")
    print(f"   Question: {question}")
    # Only the first 200 characters of the answer are shown.
    print(f"   Answer preview: {answer[:200]}...")
    print()

In [None]:
print("\nEmbedding Statistics:\n")

# L2 norm of each document embedding (one value per row of doc_embeddings),
# plus its mean and standard deviation across documents.
norms = np.linalg.norm(doc_embeddings, axis=1)
avg_norm = np.mean(norms)
std_norm = np.std(norms)

# The plus/minus sign was previously mis-encoded and printed as two garbage
# characters.
print(f"Average embedding norm: {avg_norm:.3f} ± {std_norm:.3f}")
print(f"Min norm: {np.min(norms):.3f}")
print(f"Max norm: {np.max(norms):.3f}")

# Histogram of the norms, with a dashed red line marking the mean.
plt.figure(figsize=(10, 5))
plt.hist(norms, bins=50, edgecolor='black', alpha=0.7)
plt.axvline(avg_norm, color='red', linestyle='--', linewidth=2, label=f'Mean: {avg_norm:.3f}')
plt.xlabel('Embedding Norm', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Embedding Norms', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()