# GatorTron ICD-11 Embeddings Generation

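This notebook generates sentence embeddings with the GatorTron-base model for ICD-11 code descriptions, and for two comparison corpora (encyclopedia definitions and non-medical prompts), and saves each set of embeddings to a CSV file.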

In [None]:
# Install required packages
!pip install -q transformers torch pandas numpy scikit-learn
!pip install -q datasets tqdm

# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import TruncatedSVD
from tqdm.auto import tqdm
import re
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when a GPU is visible, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Report which GPU was selected (only meaningful on CUDA)
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cpu


In [None]:
def load_gatortron_model():
    """
    Fetch the pretrained GatorTron-base checkpoint and its tokenizer.

    The weights are loaded as float16 when the global ``device`` is CUDA and
    as float32 otherwise, then moved to ``device`` and switched to eval mode.
    A short summary (hidden size, parameter count) is printed.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    print("Loading GatorTron-base model...")
    checkpoint = "UFNLP/gatortron-base"

    # Half precision only on GPU; full precision on CPU
    if device.type == 'cuda':
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    tok = AutoTokenizer.from_pretrained(checkpoint)
    net = AutoModel.from_pretrained(checkpoint, torch_dtype=weights_dtype)

    # Place the network on the target device and disable training behaviour
    net = net.to(device)
    net.eval()

    param_count = sum(p.numel() for p in net.parameters())
    print(f"GatorTron model loaded successfully")
    print(f"Vector dimension: {net.config.hidden_size}")
    print(f"Model parameters: {param_count:,}")

    return tok, net

# Load the GatorTron tokenizer and model once at notebook level;
# gatortron() below reads these two globals when generating embeddings.
tokenizer, model = load_gatortron_model()

Loading GatorTron-base model...
GatorTron model loaded successfully
Vector dimension: 1024
Model parameters: 355,267,584


In [None]:
def generate_gatortron_embeddings(texts, model, tokenizer, batch_size=8):
    """
    Generate sentence embeddings for texts using a GatorTron model.

    Each batch is tokenized (padded, truncated to 512 tokens), passed through
    the model, and the last hidden states are mean-pooled over the
    non-padding tokens to give one vector per text.

    Args:
        texts: Sequence of strings to embed (any iterable is converted to a list).
        model: Encoder whose forward output exposes ``last_hidden_state`` and
            whose ``config.hidden_size`` gives the embedding width.
        tokenizer: Tokenizer matching ``model``; called with a list of strings.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        np.ndarray of shape ``(len(texts), hidden_size)`` and dtype float32.
        An empty input yields an array of shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Accept any iterable (e.g. a pandas Series); slicing below needs a list
    texts = list(texts)

    # np.vstack([]) raises, so handle the empty case explicitly
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Send inputs to wherever the model actually lives instead of relying on
    # a notebook-level global
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    all_embeddings = []

    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i+batch_size]

        # Tokenize
        encoded_input = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(model_device)

        # Generate embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

            # Pool in float32: summing up to 512 half-precision vectors loses
            # precision when the model runs in float16
            last_hidden_state = model_output.last_hidden_state.float()

            # (batch, seq, 1) mask as float so the product and the token
            # count share the hidden states' dtype
            attention_mask = encoded_input['attention_mask'].unsqueeze(-1).float()

            # Zero out padding positions, then sum over the token axis
            sum_embeddings = torch.sum(last_hidden_state * attention_mask, dim=1)
            token_counts = torch.sum(attention_mask, dim=1)

            # Avoid division by zero
            token_counts = torch.clamp(token_counts, min=1e-9)

            # Mean over the real (non-padding) tokens
            embeddings = sum_embeddings / token_counts

        # Move to CPU and convert to numpy
        all_embeddings.append(embeddings.cpu().numpy())

    # Concatenate all batches
    return np.vstack(all_embeddings)

In [None]:
def gatortron(file_in, file_out, target_dim=768):
    """
    Embed the texts of a preprocessed CSV with GatorTron and save them as CSV.

    Reads ``file_in``, embeds the ``vectorization_text`` column with the
    notebook-level ``model`` and ``tokenizer``, optionally reduces the vectors
    with TruncatedSVD, and writes a two-column CSV (``ICD11_code``,
    ``Vector``) to ``file_out``. Rows whose ``code`` is missing are dropped
    from the output. Progress and a summary are printed along the way.

    Args:
        file_in: Path of the input CSV; must contain the columns
            ``vectorization_text`` and ``code``.
        file_out: Path of the output CSV.
        target_dim: Desired vector dimension after reduction. ``None``
            disables reduction. Reduction is also skipped (with a warning)
            when the input has fewer rows than ``target_dim``: an SVD cannot
            yield more components than there are samples, so the output
            would otherwise silently have ``n_rows`` columns instead of
            ``target_dim``.

    Returns:
        None.

    Raises:
        KeyError: If a required column is missing from ``file_in``.

    Note:
        The SVD is fitted independently on every call, so reduced vectors
        produced from different input files do not share a common basis.
    """
    df = pd.read_csv(file_in)

    # Fail early with a clear message if the expected columns are absent
    missing_columns = [col for col in ("vectorization_text", "code") if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{file_in} is missing required column(s): {missing_columns}")

    # Extract texts for embedding. Missing texts are read by pandas as float
    # NaN, which the tokenizer rejects; embed them as empty strings instead.
    text_column = df['vectorization_text']
    missing_text_count = int(text_column.isna().sum())
    if missing_text_count:
        print(f"⚠️ {missing_text_count} rows have no vectorization_text; embedding them as empty strings")
    texts = text_column.fillna('').astype(str).tolist()
    print(f"Generating embeddings for {len(texts)} ICD-11 codes...")

    if not texts:
        # Nothing to embed: still write a well-formed, header-only output file
        pd.DataFrame({"ICD11_code": [], "Vector": []}).to_csv(file_out, index=False)
        print(f"⚠️ No rows found in {file_in}; wrote empty output to: {file_out}")
        return

    # Generate embeddings
    embeddings = generate_gatortron_embeddings(texts, model, tokenizer)

    print(f"Generated embeddings shape: {embeddings.shape}")
    print(f"Original vector dimension: {embeddings.shape[1]}")

    # Apply dimensionality reduction to target_dim dimensions if needed
    n_samples, n_features = embeddings.shape
    if target_dim is None or n_features <= target_dim:
        print(f"No dimensionality reduction needed. Current dimension: {n_features}")
    elif n_samples < target_dim:
        # With fewer samples than components the SVD output is capped at
        # n_samples columns; keep the model's native vectors instead.
        print(f"⚠️ Skipping dimensionality reduction: {n_samples} samples cannot support "
              f"{target_dim} components. Keeping dimension {n_features}.")
    else:
        print(f"Reducing dimensionality from {n_features} to {target_dim}...")
        svd = TruncatedSVD(n_components=target_dim, random_state=42)
        embeddings = svd.fit_transform(embeddings)
        print(f"Explained variance ratio: {svd.explained_variance_ratio_.sum():.4f}")

    print(f"Final embeddings shape: {embeddings.shape}")

    # np.array2string abbreviates arrays above `threshold` elements (default
    # 1000) with "..."; raise the threshold so every component is written out.
    full_print_threshold = embeddings.shape[1] + 1

    # Format output according to specification
    df_embeddings = pd.DataFrame({
        "ICD11_code": df["code"],
        "Vector": [np.array2string(vec, separator=',', precision=6, suppress_small=True,
                                   threshold=full_print_threshold)
                   for vec in embeddings]
    })

    # Remove rows with NaN codes if any
    initial_count = len(df_embeddings)
    df_embeddings = df_embeddings.dropna(subset=['ICD11_code'])
    final_count = len(df_embeddings)

    if initial_count != final_count:
        print(f"Removed {initial_count - final_count} rows with missing ICD codes")

    # Display summary statistics
    print(f"\n=== EMBEDDING GENERATION SUMMARY ===")
    print(f"Total embeddings generated: {len(df_embeddings)}")
    print(f"Vector dimension: {embeddings.shape[1]}")
    print(f"Sample vector shape: {embeddings[0].shape}")

    # Show first few rows
    print(f"\nFirst few rows of output:")
    display(df_embeddings.head(3))

    # Save to CSV
    df_embeddings.to_csv(file_out, index=False)
    print(f"\n✅ Embeddings saved to: {file_out}")

    # Verify the saved file by reading it back
    saved_df = pd.read_csv(file_out)
    print(f"✅ Verification: Saved file contains {len(saved_df)} rows and {len(saved_df.columns)} columns")

    # Display final sample (guarded: every row may have been dropped above)
    if len(saved_df) > 0:
        print(f"\nSample from saved file:")
        print(f"ICD Code: {saved_df.iloc[0]['ICD11_code']}")
        print(f"Vector (first 5 elements): {saved_df.iloc[0]['Vector'][:50]}...")

    print(f"\n🎉 GatorTron embedding generation completed successfully!")
    print(f"📁 Output file: {file_out}")
    print(f"📊 Total embeddings: {len(saved_df)}")

In [None]:
# Embeddings for ICD-11 codes: reads ICD11_preprocessed.csv and writes
# gatortron_ICD11_embeddings.csv
gatortron("ICD11_preprocessed.csv", "gatortron_ICD11_embeddings.csv")

In [None]:
# Embeddings for Merriam-Webster definitions
# NOTE(review): the input file is named "encyclopedia_sd" — confirm it holds
# the Merriam-Webster definitions this comment refers to.
gatortron("encyclopedia_sd_preprocessed.csv", "gatortron_encyclopedia_embeddings.csv")

Generating embeddings for 378 ICD-11 codes...


Generating embeddings:   0%|          | 0/48 [00:00<?, ?it/s]

Generated embeddings shape: (378, 1024)
Original vector dimension: 1024
Reducing dimensionality from 1024 to 768...
Explained variance ratio: 1.0000
Final embeddings shape: (378, 378)

=== EMBEDDING GENERATION SUMMARY ===
Total embeddings generated: 378
Vector dimension: 378
Sample vector shape: (378,)

First few rows of output:


Unnamed: 0,ICD11_code,Vector
0,1F57,"[ 8.627036, 3.052382, 0.372213,-0.268565, 0.39..."
1,1C32,"[ 8.354035, 3.54344 , 0.728892, 0.296024, 0.47..."
2,1B91,"[ 8.453274, 3.558011,-0.150812,-0.084735, 0.60..."



✅ Embeddings saved to: gatortron_encyclopedia_embeddings.csv
✅ Verification: Saved file contains 378 rows and 2 columns

Sample from saved file:
ICD Code: 1F57
Vector (first 5 elements): [ 8.627036, 3.052382, 0.372213,-0.268565, 0.391229...

🎉 GatorTron embedding generation completed successfully!
📁 Output file: gatortron_encyclopedia_embeddings.csv
📊 Total embeddings: 378


In [None]:
# Embeddings for non-medical prompts: reads Non_medical_prompts_preprocessed.csv
# and writes gatortron_non_medical_prompts_embeddings.csv
gatortron("Non_medical_prompts_preprocessed.csv", "gatortron_non_medical_prompts_embeddings.csv")

Generating embeddings for 10 ICD-11 codes...


Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings shape: (10, 1024)
Original vector dimension: 1024
Reducing dimensionality from 1024 to 768...
Explained variance ratio: 1.0000
Final embeddings shape: (10, 10)

=== EMBEDDING GENERATION SUMMARY ===
Total embeddings generated: 10
Vector dimension: 10
Sample vector shape: (10,)

First few rows of output:


Unnamed: 0,ICD11_code,Vector
0,0,"[ 9.043072, 2.787691, 0.806538, 0.712946,-1.24..."
1,0,"[10.049998,-1.566575,-1.286537, 0.262282,-0.64..."
2,0,"[ 9.674391,-1.248284, 1.101108,-0.23801 ,-0.44..."



✅ Embeddings saved to: gatortron_non_medical_prompts_embeddings.csv
✅ Verification: Saved file contains 10 rows and 2 columns

Sample from saved file:
ICD Code: 0
Vector (first 5 elements): [ 9.043072, 2.787691, 0.806538, 0.712946,-1.24706 ...

🎉 GatorTron embedding generation completed successfully!
📁 Output file: gatortron_non_medical_prompts_embeddings.csv
📊 Total embeddings: 10
