<a href="https://colab.research.google.com/github/mhernandezlordui/Final-Project-LLM/blob/main/inferences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading, Model Modification and Vector Extraction

In [1]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModel #FOR MODERNBERT
from scipy.spatial.distance import cosine

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# --- PARAMETERS ---
ENGLISH_MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT'
BEST_MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/project 2 LLM/clinicalbert_final_epoch_model.pt'
NUM_LABELS = 36
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VECTOR_DIMENSION = 768 # The standard output of Pooler Output from ClinicalBERT

# --- LOAD TOKENIZER AND BASE MODEL ---
print(f"Loading model onto: {DEVICE}")

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(ENGLISH_MODEL_NAME)

# Load the FULL model (including the temporary classification layer)
try:
    model_full = AutoModelForSequenceClassification.from_pretrained(
        ENGLISH_MODEL_NAME,
        num_labels=NUM_LABELS,
        problem_type="multi_label_classification"
    )

    # 🔑 LOAD TRAINED WEIGHTS
    model_full.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=DEVICE))
    model_full.to(DEVICE)
    model_full.eval() # Set to evaluation mode

    print("✅ Trained ClinicalBERT weights loaded successfully.")

except Exception as e:
    print(f"❌ ERROR: Could not load the model or the weights. Check the path: {BEST_MODEL_PATH}. Error: {e}")
    exit()

Loading model onto: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Trained ClinicalBERT weights loaded successfully.


In [3]:
# --- REMOVING THE CLASSIFICATION LAYER ---
# We create a new model (or reference) that is only the BERT body.
# The Hugging Face structure for this model is `model.bert`.
model_embedding_extractor = model_full.bert
print("✅ Final classification layer removed. Model ready to extract embeddings (768D).")


# --- 3. VECTOR EXTRACTION FUNCTION ---

def get_clinical_vector(narrative: str) -> np.ndarray:
    """
    Tokenizes the narrative, passes it through the ClinicalBERT body (without the final layer),
    and returns the Pooler Output (the 768-dimensional vector).
    """

    # 1. Tokenization and preparation
    inputs = tokenizer(
        narrative,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Move to GPU/CPU
    input_ids = inputs['input_ids'].to(DEVICE)
    attention_mask = inputs['attention_mask'].to(DEVICE)

    # 2. Inference
    with torch.no_grad():
        # We use the modified extractor
        outputs = model_embedding_extractor(input_ids, attention_mask=attention_mask)

    # The 'pooler_output' is the 768-dimensional vector that summarizes the meaning.
    # The flatten ensures a 1-D array
    vector = outputs.pooler_output.cpu().numpy().flatten()

    return vector

✅ Final classification layer removed. Model ready to extract embeddings (768D).


# Differential Diagnosis Function (Cosine Similarity)
Logic for comparing the vectors and generating the percentages.

In [4]:
def calculate_differential_diagnosis(user_narrative, ref_asd, ref_adhd):
    """
    Compares the user's vector with the reference vectors using Cosine Similarity
    and normalizes the results into percentages.
    """
    # Get the user's clinical vector (768D) from the narrative
    user_vector = get_clinical_vector(user_narrative)

    # Flatten the user vector to ensure it is 1-D for the cosine function
    user_vector = user_vector.flatten()

    # 1. Calculate the Cosine Distance (scipy)
    # Note: Similarity = 1 - Distance
    similarity_asd = 1 - cosine(user_vector, ref_asd)
    similarity_adhd = 1 - cosine(user_vector, ref_adhd)

    # 2. Softmax Normalization to obtain percentages
    # This assigns a differentiated probability to each disorder

    # Convert similarities to PyTorch tensors to apply Softmax
    similarities = torch.tensor([similarity_asd, similarity_adhd], dtype=torch.float32)

    # Apply Softmax
    probabilities = torch.nn.functional.softmax(similarities, dim=0).numpy()

    # 3. Return the results
    results = {
        # Similarity Keys (for printing with .4f format)
        "Similarity_ASD": similarity_asd,
        "Similarity_ADHD": similarity_adhd,

        # Percentage Keys (for printing with .2f format)
        "ASD_Porcentage": probabilities[0] * 100,
        "ADHD_Porcentage": probabilities[1] * 100,

        # Key for the Principal Diagnosis
        "Principal Diagnosis": "ASD" if probabilities[0] > probabilities[1] else "ADHD"
    }

    return results

# GENERATING THE REFERENCE CORPUS (DSM-5)

# --- ASD Reference Corpus (DSM-5 + Pure Narratives) ---

In [5]:
'''# Pure Narratives ASD(Contextual)
ref_asd_corpus = [

    "I find it very hard to keep a conversation going that flows back and forth.",
    "I often realize I tend to monologue, and I don't know how to adjust what I say for the person I'm talking to.",
    #"People have commented that my body language is a bit rigid or seems atypical.",
    #"I'm not good at using hand gestures and sometimes I don't understand what other people mean by theirs.",
    #"I have a lot of difficulty grasping unwritten social rules, like the tone I should be using or the level of formality.",
    "Honestly, I don't have much interest in, or I find it very difficult to interact with, my peers.",
    "I was never very good at pretend play or engaging in social games with other.",
    "I generally find it very hard to form and maintain relationships that feel appropriate.",
    #"Sometimes I rock or flap my hands, especially when I feel nervous or really excited.",
    "I feel the need to line up my possessions or use objects in a repetitive way.",
    #"I catch myself repeating phrases I hear (echolalia) or using language that people consider strange.",
    "I have a very strong and negative reaction to the smallest change in my routine or environment.",
    "I feel like I must follow certain specific verbal or behavioral rituals.",
    "It's very difficult for me to switch from one activity to the next or move to a new location.",
    "I have adverse or excessive reactions to certain sensory stimuli, like loud noises or bright lights.",
    #"I have an unusual fascination with certain sensory input, or I am indifferent to things that should bother me.",
    "My interests are extremely intense and completely absorb me, much more than they do other people.",
]'''


'''# 1. DSM-5 Criteria ASD (Conceptual Purity)
ref_asd_corpus = [

    "Difficulty maintaining the social give and take flow.",
    "Tendency toward monologuing or lack of conversational adjustment.",
    "Atypical body language, rigidity.",
    "Deficits in the use and understanding of gestures.",
    "Difficulty adjusting to implicit social rules (e.g., tone, formality).",
    "Absence of interest or difficulty in interacting with peers.",
    "Deficits in imaginative or social play.",
    "General difficulty establishing and maintaining appropriate relationships.",
    "Motor stereotypies (flapping, rocking, etc.).",
    "Repetitive use or lining up of objects.",
    "Echolalia, repetitive phrases, or idiosyncratic language.",
    "Negative reaction to small changes in environment or routine.",
    "Need to follow verbal or behavioral rituals.",
    "Difficulty transitioning between activities or places.",
    "Excessively intense, circumscribed, or absorbing interests (Asperger focus).",
    "Adverse or excessive reactions to sensory stimuli.",
    "Unusual indifference or fascination with sensory stimuli.",
    "Restricted and fixed interests, with unusual attachment to inanimate objects.",
]'''

# Reference values ​​summarized by area ASD (short)
ref_asd_corpus = [

    # 1. DSM-5 Criteria (Conceptual Purity)
    "Deficit in Social Cognition (not understanding social subtleties/unwritten rules).",
    "Interests are restricted, intense, and obsessive; can sustain intense hyperfocus.",
    "Extreme rigidity; high resistance to change in routines and rituals.",
    "Difficulties understanding non-literal language (sarcasm, metaphors, irony).",
    "Defined by repetitive behaviors or restricted interests (stimming may be present).",

]

# --- ADHD Reference Corpus (DSM-5 + Pure Narratives) ---

In [6]:
'''# Pure Narratives ADHD (Contextual)
ref_adhd_corpus = [

    "I struggle to keep my attention focused when I'm working on a task or even when I'm trying to play a game.",
    "I often miss tiny details and end up making careless mistakes, especially on schoolwork.",
    "It looks like I don't listen when someone talks right to me, but my mind is just elsewhere.",
    #"I have a lot of trouble following through on instructions and rarely finish a task once I start it.",
    #"I find it really difficult to organize my chores, my schoolwork, or any activity I have to do.",
	  "I actively try to avoid tasks that I know will require me to concentrate hard for a long time.",
	  "I constantly lose important items that I need for school or for my daily routine.",
    #"I get easily distracted by almost any noise or random thing happening around me.",
    "I am very forgetful about things I need to do every single day.",
    #"I'm always fidgeting with my hands or tapping my feet, and I can never sit still in my chair."
	  #"I frequently get up and leave my seat even when I'm supposed to stay put."
	  #"I often feel the urge to run around or climb things, even if it's not the right time or place."
	  #"I can't seem to play or take part in hobbies without making a lot of noise or disrupting others."
    "on the go, acting as if driven by a motor."	"I feel like I'm constantly 'on the go,' almost like I'm being driven by an internal motor."
	  "I tend to talk way too much, sometimes without even realizing it."
	  "I often blurt out the answer before the person can even finish asking the question."
    "It's really hard for me to wait my turn, whether it's in a line or during a game."
	  "I frequently interrupt what other people are doing or intrude on their conversations or games."
]'''


'''# Reference values DSM-5 Criteria ADHD (pure conceptual)
ref_adhd_corpus = [

    "Fails to give close attention to details or makes careless mistakes.",
    "Difficulty sustaining attention in tasks or play.",
    "Does not seem to listen when spoken to directly.",
    "Does not follow through on instructions and fails to finish duties.",
    "Difficulty organizing tasks and activities.",
    "Avoids or is reluctant to engage in tasks requiring sustained mental effort.",
    "Loses things necessary for tasks or activities.",
    "Is often easily distracted by extraneous stimuli.",
    "Is often forgetful in daily activities",
    "Fidgets with or taps hands or feet or squirms in seat.",
    "Often leaves seat in situations when remaining seated is expected.",
    "Often interrupts or intrudes on others."
    "Often runs about or climbs in situations where it is inappropriate (or feels restless).",
    "Often unable to play or engage in leisure activities quietly.",
    "Is often on the go, acting as if driven by a motor.",
    "Often talks excessively.",
    "Often blurts out an answer before a question has been completed.",
    "Often has difficulty waiting his or her turn.",
] '''

# Reference values ​​summarized by area ADHD(short)
ref_adhd_corpus = [

    # 1. DSM-5 Criteria  (poor conceptual)
    "Impulsivity and inattention to the interaction flow.",
    "Broad interests that change quickly; difficulty sustaining focus.",
    "Difficulty adhering to or maintaining routines and schedules.",
    "Generally intact, but may interrupt or talk excessively.",
    "Defined by repetitive behaviors or restricted interests (stimming may be present).",

]

In [7]:
# Assuming that model_embedding_extractor, tokenizer, and DEVICE are loaded
# The variables (model_embedding_extractor, tokenizer, DEVICE) are global

# ----------------------------------------------------
# 1. ASD REFERENCE VECTOR
# ----------------------------------------------------
print("Generating ASD Reference Vector...")
# Generate a vector for each pure ASD narrative and store them
asd_vectors = [get_clinical_vector(n) for n in ref_asd_corpus]
# Calculate the mean vector, which represents the pure core of the disorder
reference_vector_asd = np.mean(asd_vectors, axis=0)

print(f"✅ ASD Vector generated. Dimension: {reference_vector_asd.shape}")


# ----------------------------------------------------
# 2. ADHD REFERENCE VECTOR
# ----------------------------------------------------
print("Generating ADHD Reference Vector...")
# Generate a vector for each pure ADHD narrative
adhd_vectors = [get_clinical_vector(n) for n in ref_adhd_corpus]
# Calculate the mean vector
reference_vector_adhd = np.mean(adhd_vectors, axis=0)

print(f"✅ ADHD Vector generated. Dimension: {reference_vector_adhd.shape}")

Generating ASD Reference Vector...
✅ ASD Vector generated. Dimension: (768,)
Generating ADHD Reference Vector...
✅ ADHD Vector generated. Dimension: (768,)


In [8]:
# --- PATIENT NARRATIVE ---
patient_narrative = "I forget to close the doors, I forget to turn off the stove, but I'm able to focus on my algorithms for 12 hours. I find it difficult to start a conversation because I don't know what to say."

print("--- ANALYZING PATIENT NARRATIVE ---")
print(f"Narrative: {patient_narrative}")

# Execute the diagnostic function
diagnosis = calculate_differential_diagnosis(
    patient_narrative,
    reference_vector_asd,
    reference_vector_adhd
)

print("\n--- DIFFERENTIAL DIAGNOSIS RESULT ---")
# The following keys must match those defined in the 'calculate_differential_diagnosis' return dictionary.
print(f"Similarity (ASD): {diagnosis.get('Similarity_ASD'):.4f}")
print(f"Similarity (ADHD): {diagnosis.get('Similarity_ADHD'):.4f}")
print("\n-------------------------------------------")
print(f"Principal Diagnosis: **{diagnosis['Principal Diagnosis']}**")
print(f"ASD Probability: **{diagnosis['ASD_Porcentage']:.2f}%**")
print(f"ADHD Probability: **{diagnosis['ADHD_Porcentage']:.2f}%**")
print("-------------------------------------------")

--- ANALYZING PATIENT NARRATIVE ---
Narrative: I forget to close the doors, I forget to turn off the stove, but I'm able to focus on my algorithms for 12 hours. I find it difficult to start a conversation because I don't know what to say.

--- DIFFERENTIAL DIAGNOSIS RESULT ---
Similarity (ASD): 0.9039
Similarity (ADHD): 0.8998

-------------------------------------------
Principal Diagnosis: **ASD**
ASD Probability: **50.10%**
ADHD Probability: **49.90%**
-------------------------------------------


# Preparing ModernBERT for comparison.

In [9]:
# --- PARAMETERS AND LOADING OF MODERNBERT (BERT Base) ---
MODERN_BERT_NAME = 'bert-base-uncased' # The original general-purpose BERT model
VECTOR_DIMENSION = 768

# Load the ModernBERT Tokenizer
modern_bert_tokenizer = AutoTokenizer.from_pretrained(MODERN_BERT_NAME)

# Load only the body of the ModernBERT model (without the classification layer)
try:
    modern_bert_model = AutoModel.from_pretrained(MODERN_BERT_NAME)
    modern_bert_model.to(DEVICE)
    modern_bert_model.eval() # Set to evaluation mode
    print(f"✅ ModernBERT ({MODERN_BERT_NAME}) successfully loaded for comparison.")
except Exception as e:
    print(f"❌ ERROR: Could not load ModernBERT. Error: {e}")
    exit()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

✅ ModernBERT (bert-base-uncased) successfully loaded for comparison.


In [10]:
def get_modern_bert_vector(narrative: str) -> np.ndarray:
    """
    Tokenizes the narrative and passes it through the body of ModernBERT (BERT Base)
    to return the Pooler Output (a 768-dimensional vector).
    """

    # Use the ModernBERT tokenizer
    inputs = modern_bert_tokenizer(
        narrative,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Move to the GPU/CPU
    input_ids = inputs['input_ids'].to(DEVICE)
    attention_mask = inputs['attention_mask'].to(DEVICE)

    # Inference with the ModernBERT model
    with torch.no_grad():
        # Use the ModernBERT extractor
        outputs = modern_bert_model(input_ids, attention_mask=attention_mask)

    # The 'pooler_output' is the 768-dimensional vector
    vector = outputs.pooler_output.cpu().numpy().flatten()

    return vector

In [11]:
def calculate_modern_bert_diagnosis(user_narrative, ref_asd, ref_adhd):

    """
    Compares the user's vector (generated by ModernBERT) with the reference vectors
    (the same ones you used for ClinicalBERT).
    """
    # ModernBERT extractor
    user_vector = get_modern_bert_vector(user_narrative)

    user_vector = user_vector.flatten()

    # Calculate the Cosine Distance (scipy)
    similarity_asd = 1 - cosine(user_vector, ref_asd)
    similarity_adhd = 1 - cosine(user_vector, ref_adhd)

    # Softmax Normalization
    similarities = torch.tensor([similarity_asd, similarity_adhd], dtype=torch.float32)
    probabilities = torch.nn.functional.softmax(similarities, dim=0).numpy()


    results = {
        "Similarity_ASD": similarity_asd,
        "Similarity_ADHD": similarity_adhd,
        "ASD_Porcentage": probabilities[0] * 100,
        "ADHD_Porcentage": probabilities[1] * 100,
        "Principal Diagnosis": "ASD" if probabilities[0] > probabilities[1] else "ADHD"
    }

    return results

In [12]:
# --- TEST NARRATIVE ---
patient_narrative = "I forget to close the doors, I forget to turn off the stove, but I'm able to focus on my algorithms for 12 hours. I find it difficult to start a conversation because I don't know what to say."
print("--- NARRATIVE ANALYSIS ---")
print(f"Narrative: {patient_narrative}\n")

# A. Diagnosis with YOUR MODEL (ClinicalBERT, Specialized)
diag_clinical = calculate_differential_diagnosis(
    patient_narrative, reference_vector_asd, reference_vector_adhd
)

# B. Diagnosis with GENERALIST MODEL (ModernBERT)
diag_modern = calculate_modern_bert_diagnosis(
    patient_narrative, reference_vector_asd, reference_vector_adhd
)

print("\n## ⚔️ ACTUAL RESULTS COMPARISON TABLE ⚔️ ##")
print("-" * 105)

from tabulate import tabulate

data = [
    # Table Headers:
    ["METHOD / BASE", "SIMILARITY", "ASD PROBABILITY", "ADHD PROBABILITY", "DIAGNOSIS"],
    [
        "1. ClinicalBERT (Project Model)",
        f"{diag_clinical.get('Similarity_ADHD'):.4f}",
        f"{diag_clinical['ASD_Porcentage']:.2f}%",
        f"{diag_clinical['ADHD_Porcentage']:.2f}%",
        f"**{diag_clinical['Principal Diagnosis']}**"
    ],
    [
        "2. ModernBERT (Generalist)",
        f"{diag_modern.get('Similarity_ADHD'):.4f}",
        f"{diag_modern['ASD_Porcentage']:.2f}%",
        f"{diag_modern['ADHD_Porcentage']:.2f}%",
        f"**{diag_modern['Principal Diagnosis']}**"
    ]
]

print(tabulate(data, headers="firstrow", tablefmt="fancy_grid"))
print("-" * 105)

--- NARRATIVE ANALYSIS ---
Narrative: I forget to close the doors, I forget to turn off the stove, but I'm able to focus on my algorithms for 12 hours. I find it difficult to start a conversation because I don't know what to say.


## ⚔️ ACTUAL RESULTS COMPARISON TABLE ⚔️ ##
---------------------------------------------------------------------------------------------------------
╒═════════════════════════════════╤══════════════╤═══════════════════╤════════════════════╤═════════════╕
│ METHOD / BASE                   │   SIMILARITY │ ASD PROBABILITY   │ ADHD PROBABILITY   │ DIAGNOSIS   │
╞═════════════════════════════════╪══════════════╪═══════════════════╪════════════════════╪═════════════╡
│ 1. ClinicalBERT (Project Model) │       0.8998 │ 50.10%            │ 49.90%             │ **ASD**     │
├─────────────────────────────────┼──────────────┼───────────────────┼────────────────────┼─────────────┤
│ 2. ModernBERT (Generalist)      │       0.0547 │ 50.06%            │ 49.94%           