In [31]:
# text to text
# tokenize the text => index the tokens => train/teste the model
# In many NLP tasks, indexing (or creating a vocabulary mapping) after tokenization is important, but when using modern transformer models like T5, BERT, or GPT, you don't need to create your own index because:

In [32]:
import numpy as np
import pandas as pd

# 1. Generate synthetic clinical notes and summaries
def generate_data(sample_size=100):
    clinical_notes = []  # like X
    summaries = []  # like y
    
    conditions = ['diabetes', 'hypertension', 'asthma']  # Fixed typo in variable name
    symptoms = ['fatigue', 'headache', 'dizziness', 'chest pain']
    medications = ['metformin', 'lisinopril', 'albuterol']
    
    for i in range(sample_size):
        condition = np.random.choice(conditions)
        symptom1, symptom2 = np.random.choice(symptoms, 2, replace=False)
        medication = np.random.choice(medications)
        
        note = f"""Patient presents with {condition}. Chief complaints include {symptom1} and {symptom2}.
        Current medications include {medication}. Patient reports symptoms began 2 weeks ago.
        Vital signs are stable. Blood pressure 120/80, heart rate 72, temperature 98.6F.
        Patient advised to continue current medications and follow up in 2 weeks."""
        
        summary = f"Patient with {condition} presenting with {symptom1} and {symptom2}. Continuing {medication}."
        
        clinical_notes.append(note)
        summaries.append(summary)
    
    return pd.DataFrame({
        'clinical_note': clinical_notes,
        'summary': summaries
    })

In [33]:
# actually this doesn't summarize it, it gives me the whole text (fix it later)

import torch
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

# 2. Prepare the data
df = generate_data(100)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # note that we don't split them to X and y here yet

# 3. Initialize tokenizer and model
model_name = "t5-small"   # Using a smaller model for demonstration
tokenizer = T5Tokenizer.from_pretrained(model_name)  #  loads a pre-trained tokenizer for the T5 model
model = T5ForConditionalGeneration.from_pretrained(model_name)  #  loads a pre-trained T5 (Text-to-Text Transfer Transformer) model for conditional generation, such as text summarization, translation, and question answering.
# we usually use the same etokanizer and model for better compatibility

# 4. Prepare datasets
# data tokenizer function
def preprocess_data(examples):
    inputs = tokenizer(examples['clinical_note'], max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(examples['summary'], max_length=128, truncation=True, padding='max_length')
    
    return {
        'input_ids': inputs['input_ids'],  # input_ids are vocabulary indices (token IDs)
        # We return IDs instead of exact inputs because neural networks can't process text directly - they need numbers to perform calculations. 
        'attention_mask': inputs['attention_mask'],  # gives you a binary array (1s and 0s) that tells the model which tokens are real content (1) and which are padding (0).
        'labels': labels['input_ids']
    }

# convert pandas DataFrame into Hugging Face Dataset format (a special ML-ready format (HuggingFace Dataset) that's optimized for training deep learning models).
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_data, remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, remove_columns=test_dataset.column_names)
# .map(preprocess_data): 
# Takes each row of the dataset
# Runs it through the preprocess_data function
# This function converts text to token IDs and creates attention masks

# remove_columns=train_dataset.column_names:
# Removes the original text columns after preprocessing
# We don't need the original text anymore, just the IDs
# Saves memory by keeping only the processed data

# 5. Set up training arguments
training_args = TrainingArguments(
    output_dir="./clinical_summarizer",  # specifies the directory path where the model will save
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# 6. Create trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# 7. Train the model
trainer.train()

# 8. create function to use the model
def generate_summary(text):
    inputs = tokenizer(text, max_length=512, truncation=True,  padding='max_length', return_tensors='pt') 
    summary_ids = model.generate(inputs['input_ids'], max_length=128, min_length=30, num_beams=4)
    
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# 9. Example usage
test_note = """Patient presents with diabetes. Chief complaints include fatigue and dizziness.
Current medications include metformin. Patient reports symptoms began 2 weeks ago.
Vital signs are stable. Blood pressure 120/80, heart rate 72, temperature 98.6F.
Patient advised to continue current medications and follow up in 2 weeks."""

generated_summary = generate_summary(test_note)
print("Generated Summary:", generated_summary)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss


Generated Summary: fatigue and dizziness. Chief complaints include fatigue and dizziness. Patient presents with diabetes. Chief complaints include fatigue and dizziness. Current medications include metformin. Patient reports symptoms started 2 weeks ago. Vital signs are stable. Blood pressure 120/80, heart rate 72, temperature 98.6F. Patient advised to continue current medications and follow up in 2 weeks.
