# NHANES Variable Embeddings with BioBERT

This notebook demonstrates:
1. Loading NHANES 2017-2018 variable metadata
2. Encoding variable descriptions with BioBERT
3. Visualizing variables in 3D (and 2D) semantic space using UMAP
4. Searching for semantically similar variables with cosine similarity

The metadata includes all variables from 5 categories:
- Demographics
- Dietary
- Examination
- Laboratory
- Questionnaire

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys

# Import our simple NHANES fetcher
from simple_nhanes_fetcher import SimpleNHANESFetcher

# BioBERT and embeddings
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from umap import UMAP

print("Imports successful!")

## 2. Fetch NHANES 2017-2018 Variable List

In [None]:
# Instantiate the metadata fetcher and pull the variable list for one cycle.
fetcher = SimpleNHANESFetcher()
df_variables = fetcher.fetch_all_for_cycle("2017-2018")

# Overview of the result: row count, then column names, then a header for
# the preview table rendered below.
print(
    f"\nTotal variables: {len(df_variables)}",
    f"\nColumns: {df_variables.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)

# Last expression of the cell, so the notebook displays the first rows.
df_variables.head()

In [None]:
# How many variables does each component contribute?
print("Variables per component:")
component_counts = df_variables['component'].value_counts()
print(component_counts)

# Peek at the first ten rows, restricted to the name, description and
# component columns.
print("\nSample variables:")
preview_columns = ['variable_name', 'variable_description', 'component']
print(df_variables[preview_columns].head(10))

## 3. Load BioBERT Model

We'll use BioBERT, a biomedical language model pre-trained on PubMed and PMC articles.

In [None]:
# Checkpoint identifier of the BioBERT base model
model_name = "dmis-lab/biobert-v1.1"

print(f"Loading BioBERT model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the weights, place them on the chosen device and switch the module
# to evaluation mode (this notebook only runs inference).
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

print(f"Model loaded on device: {device}")

## 4. Encode Variable Descriptions with BioBERT

In [None]:
def encode_text(texts, model, tokenizer, device, batch_size=32):
    """
    Encode texts using BioBERT.

    Each text is represented by the final hidden state of its first token
    ([CLS]). Texts are tokenized and run through the model in batches.

    Args:
        texts: Sequence of text strings (list, tuple, pandas Series, ...).
            A single string is treated as one text.
        model: BioBERT model
        tokenizer: BioBERT tokenizer
        device: torch device the model lives on
        batch_size: Batch size for encoding (must be >= 1)

    Returns:
        numpy array of embeddings (n_texts, embedding_dim). For empty input
        an array with zero rows is returned; its width is taken from
        ``model.config.hidden_size`` when available, otherwise 0.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Normalise the input to a plain list: slicing a bare string would split
    # it into characters, and the tokenizer is only ever handed list batches.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # np.vstack([]) raises, so answer the empty case explicitly.
    if not texts:
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        # Tokenize; padding makes all sequences in the batch the same length
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        # Move to device
        encoded = {k: v.to(device) for k, v in encoded.items()}

        # Get embeddings without tracking gradients (inference only)
        with torch.no_grad():
            outputs = model(**encoded)
            # Use [CLS] token embedding (first token)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        embeddings.append(batch_embeddings)

        # Report progress every 10 batches
        if (i // batch_size + 1) % 10 == 0:
            print(f"Processed {i + len(batch_texts)}/{len(texts)} texts")

    return np.vstack(embeddings)

print("Encoding function defined")

print("Encoding function defined")

In [None]:
# Prepare texts for encoding
# Combine variable name and description for richer context.
# The two columns are walked in lockstep instead of using iterrows(), which
# builds a Series per row. Missing descriptions become an empty string so the
# literal text "nan" is never embedded.
texts = [
    f"{name}: {description}"
    for name, description in zip(
        df_variables['variable_name'],
        df_variables['variable_description'].fillna(''),
    )
]

print(f"Encoding {len(texts)} variable descriptions...")
embeddings = encode_text(texts, model, tokenizer, device, batch_size=32)

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")

# Persist the embedding matrix as a separate .npy file rather than adding it
# to the dataframe as a column; row i corresponds to row i of df_variables.
np.save('nhanes_2017_2018_embeddings.npy', embeddings)
print("\nSaved embeddings to: nhanes_2017_2018_embeddings.npy")

## 5. Dimensionality Reduction with UMAP

Reduce the embeddings from 768 dimensions (the BioBERT embedding size) to 3 dimensions with UMAP for visualization.

In [None]:
# Project the embeddings down to three UMAP components for plotting
print("Reducing dimensions with UMAP...")

# Cosine distance on the embeddings; a fixed seed is passed as random_state
reducer_3d = UMAP(
    random_state=42,
    metric='cosine',
    min_dist=0.1,
    n_neighbors=15,
    n_components=3,
)

coords_3d = reducer_3d.fit_transform(embeddings)

print(f"3D coordinates shape: {coords_3d.shape}")

# Store one dataframe column per UMAP component
for axis_index, axis_name in enumerate(('x', 'y', 'z')):
    df_variables[axis_name] = coords_3d[:, axis_index]

print("\nDimensionality reduction complete!")

## 6. Interactive 3D Visualization

In [None]:
# Interactive 3D scatter: one point per variable, coloured by component.
# The hover_data columns are what the hover template below reads back as
# customdata[0..2].
fig = px.scatter_3d(
    df_variables,
    x='x', y='y', z='z',
    color='component',
    hover_data=['variable_name', 'variable_description', 'data_file_name'],
    title='NHANES 2017-2018 Variables in 3D Semantic Space (BioBERT + UMAP)',
    labels={'component': 'Component'},
    width=1000, height=800,
)

# Small, slightly transparent markers and a custom hover box:
# bold variable name, description, source file; <extra></extra> hides the
# secondary trace-name box.
fig.update_traces(
    marker={'size': 3, 'opacity': 0.7},
    hovertemplate=(
        '<b>%{customdata[0]}</b><br>'
        '%{customdata[1]}<br>'
        'File: %{customdata[2]}<br>'
        '<extra></extra>'
    ),
)

# Axis titles and legend anchored in the top-left corner of the figure
fig.update_layout(
    scene={
        'xaxis_title': 'UMAP 1',
        'yaxis_title': 'UMAP 2',
        'zaxis_title': 'UMAP 3',
    },
    legend={
        'yanchor': "top",
        'y': 0.99,
        'xanchor': "left",
        'x': 0.01,
    },
)

# Write a standalone HTML copy, then render inline
fig.write_html('nhanes_2017_2018_variables_3d.html')
print("Saved visualization to: nhanes_2017_2018_variables_3d.html")

fig.show()

## 7. 2D Visualization (Alternative)

In [None]:
# A flat, two-component projection that is easier to read than the 3D view
print("Creating 2D projection...")

# Same UMAP settings as the 3D reduction, only with two output components
reducer_2d = UMAP(
    random_state=42,
    metric='cosine',
    min_dist=0.1,
    n_neighbors=15,
    n_components=2,
)

coords_2d = reducer_2d.fit_transform(embeddings)

# Keep the 2D coordinates next to the 3D ones under separate column names
for axis_index, column_name in enumerate(('x_2d', 'y_2d')):
    df_variables[column_name] = coords_2d[:, axis_index]

# Scatter plot coloured by component, with name and description on hover
fig_2d = px.scatter(
    df_variables,
    x='x_2d', y='y_2d',
    color='component',
    hover_data=['variable_name', 'variable_description'],
    title='NHANES 2017-2018 Variables in 2D Semantic Space (BioBERT + UMAP)',
    labels={'component': 'Component'},
    width=1200, height=800,
)

fig_2d.update_traces(marker={'size': 5, 'opacity': 0.6})

# Write a standalone HTML copy, then render inline
fig_2d.write_html('nhanes_2017_2018_variables_2d.html')
print("Saved 2D visualization to: nhanes_2017_2018_variables_2d.html")

fig_2d.show()

## 8. Semantic Search Example

Find variables semantically similar to a query.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_variables(query, df_variables, embeddings, model, tokenizer, device, top_k=10):
    """
    Rank variables by cosine similarity between their embedding and the
    embedding of a free-text query.

    Args:
        query: Query string to search for
        df_variables: Dataframe of variables; rows are selected by position,
            so row i is expected to correspond to embeddings[i]
        embeddings: Matrix of variable embeddings, one row per variable
        model: Model passed through to encode_text
        tokenizer: Tokenizer passed through to encode_text
        device: torch device passed through to encode_text
        top_k: Maximum number of rows to return

    Returns:
        Dataframe with columns variable_name, variable_description,
        component and similarity, ordered from most to least similar.
    """
    output_columns = ['variable_name', 'variable_description', 'component', 'similarity']

    # Embed the query as a one-row batch
    query_vector = encode_text([query], model, tokenizer, device, batch_size=1)

    # Similarity of the query against every variable (first and only row)
    scores = cosine_similarity(query_vector, embeddings)[0]

    # Positions sorted by descending score, cut to the best top_k
    ranking = np.argsort(scores)[::-1]
    best = ranking[:top_k]

    # Attach the scores to a copy of the matching rows
    hits = df_variables.iloc[best].assign(similarity=scores[best])

    return hits[output_columns]

# Example query: look up the ten variables closest to a cholesterol phrase
print("Searching for variables similar to 'cholesterol'...\n")
results = find_similar_variables(
    query="cholesterol levels in blood",
    df_variables=df_variables,
    embeddings=embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=10,
)

# Print the full table without the dataframe index
print(results.to_string(index=False))

In [None]:
# Second example query: the ten variables closest to diabetes-related terms
print("Searching for variables similar to 'diabetes'...\n")
results = find_similar_variables(
    query="diabetes glucose blood sugar",
    df_variables=df_variables,
    embeddings=embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=10,
)

# Print the full table without the dataframe index
print(results.to_string(index=False))

## 9. Save Processed Data

In [None]:
# Write the dataframe, now including the UMAP coordinate columns, to CSV
# without the index column.
df_variables.to_csv('nhanes_2017_2018_variables_with_coords.csv', index=False)
print("Saved enriched variable list to: nhanes_2017_2018_variables_with_coords.csv")

# Recap of every artifact written by this notebook, one per line
print(
    "\nFiles created:",
    "  - nhanes_2017_2018_embeddings.npy (BioBERT embeddings)",
    "  - nhanes_2017_2018_variables_with_coords.csv (Variables + 2D/3D coordinates)",
    "  - nhanes_2017_2018_variables_3d.html (Interactive 3D visualization)",
    "  - nhanes_2017_2018_variables_2d.html (Interactive 2D visualization)",
    sep="\n",
)

## Summary

This notebook demonstrated:
1. ✅ Fetching NHANES 2017-2018 variable metadata from CDC
2. ✅ Encoding ~2000-3000 variables with BioBERT
3. ✅ Reducing to 2D/3D using UMAP
4. ✅ Creating interactive visualizations
5. ✅ Semantic search for similar variables

### Next Steps:
- Use `nhanes_data_loader_all_years.py` to download actual NHANES data (not just metadata)
- Merge data across years by SEQN
- Export to CSV or Google Drive
- Use embeddings for variable selection in hypothesis generation