In [None]:
import pandas as pd

RAW_CSV_PATH = 'clinical_data.csv'
PIPE_CSV_PATH = 'clinical_data_pipe.csv'

# Load the original comma-separated export.
clinical_data = pd.read_csv(RAW_CSV_PATH)

# Write the same table back out with "|" as the field separator instead of ",".
# Rationale: free-text notes often contain commas, so a pipe is less likely to
# collide with the content of a field.
clinical_data.to_csv(PIPE_CSV_PATH, sep='|', index=False)

# Continue working from the pipe-delimited copy.
clinical_data = pd.read_csv(PIPE_CSV_PATH, sep='|')

# Data cleaning
import re
def deidentify(text):
    """Replace MRNs, written-out dates and phone numbers with placeholder tags.

    Parameters
    ----------
    text : str
        A free-text clinical note.

    Returns
    -------
    str
        The note with every match replaced by ``[MRN]``, ``[DATE]`` or
        ``[PHONE]``. A non-string input (e.g. ``None`` or the NaN that an
        empty CSV field becomes) is returned unchanged.
    """
    # re.sub() raises TypeError on anything that is not a string, so a single
    # missing note would abort a whole-column .apply(); pass such values through.
    if not isinstance(text, str):
        return text

    # Medical record numbers, e.g. "Patient MRN: 12345" -> "Patient [MRN]"
    #   MRN  - the literal characters "MRN"
    #   :?   - an optional colon
    #   \s*  - zero or more whitespace characters
    #   \d+  - one or more digits
    text = re.sub(r'MRN:?\s*\d+', '[MRN]', text)

    # Written-out dates, e.g. "Jan 1, 2023" or "February 14 2022" -> "[DATE]"
    #   \b        - word boundary, so the match cannot start or end inside a longer word
    #   (?:...)   - non-capturing group: groups the month alternatives without
    #               storing the matched month as a numbered group
    #   [a-z]*    - the rest of a full month name ("Feb" + "ruary")
    #   \d{1,2}   - one or two digits for the day
    #   ,?        - an optional comma after the day
    #   \d{4}     - a four-digit year
    text = re.sub(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b', '[DATE]', text)

    # Phone numbers, e.g. "My number is 555.123.4567" -> "My number is [PHONE]"
    #   [-.\s]?   - an optional separator: "-", "." or whitespace. A character
    #               class is used rather than the alternation (-|\.|\s), which
    #               would also create a capturing group.
    text = re.sub(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}', '[PHONE]', text)

    return text

# Run every note in the column through deidentify() and store the scrubbed
# text back in the same column.
scrubbed_notes = clinical_data['clinical_notes'].map(deidentify)
clinical_data['clinical_notes'] = scrubbed_notes

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set, with a fixed seed so the split is repeatable.
# stratify=<label column> keeps the share of each specialty the same in both parts:
# if the full dataset is 50% Cardiology, 30% Neurology and 20% Dermatology,
# the training set and the test set each keep those proportions.
# This matters for training, because otherwise a rare specialty could end up
# almost entirely on one side of the split.
specialty_labels = clinical_data['specialities']
train_data, test_data = train_test_split(
    clinical_data,
    test_size=0.2,
    random_state=42,
    stratify=specialty_labels,
)

In [None]:
# NOTE: I have not studied the material from this point onward,
# i.e. nothing after Classifying_Document.ipynb in the 8th folder.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a medical terminology list (simplified example).
# NOTE(review): this list is not passed to the vectorizer below; it is kept
# only as a reference vocabulary.
medical_terms = ['arrhythmia', 'hypertension', 'tachycardia', 'lesion', 'melanoma', 
                 'seizure', 'migraine', 'gastritis', 'reflux', 'fracture', 'arthritis']

# TF-IDF features over unigrams, bigrams and trigrams (multi-word phrases are
# important in medical text). stop_words='english' is scikit-learn's generic
# English list, not a medical-specific one.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),  # Include phrases up to 3 words
    stop_words='english'
)

# The column names must match the ones used when cleaning and splitting the
# data: 'clinical_notes' holds the note text and 'specialities' the label.
# Fit on the training notes only, then reuse the fitted vocabulary for the test notes.
X_train = vectorizer.fit_transform(train_data['clinical_notes'])
X_test = vectorizer.transform(test_data['clinical_notes'])

y_train = train_data['specialities']
y_test = test_data['specialities']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Handle imbalanced classes (some specialties might have fewer notes):
# SMOTE resamples the training set only; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train model. random_state is fixed, like the split and SMOTE seeds, so a
# re-run of the notebook reproduces the same forest and the same report.
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train_balanced, y_train_balanced)

# Evaluate on the held-out notes
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Class names in a fixed order; a name's position in this list is the integer
# id the transformer is trained to predict.
specialties = sorted(clinical_data['specialities'].unique())
label2id = {name: idx for idx, name in enumerate(specialties)}
id2label = {idx: name for name, idx in label2id.items()}

# Convert data to HuggingFace format
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Use BioBERT, which is pretrained on biomedical literature.
# The network is bound to `bert_model`, not `model`, so that the scikit-learn
# classifier held in `model` (whose predict / predict_proba are still called
# on TF-IDF features elsewhere in this notebook) is not overwritten.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-v1.1", 
    num_labels=len(specialties),
    id2label=id2label,
    label2id=label2id
)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of notes and attach the integer class id of each note.

    `examples` is a batch from Dataset.map(batched=True): a mapping from
    column name to a list of values. The result carries the tokenizer output
    plus a "labels" list, which the Trainer needs in order to compute a loss.
    """
    encoded = tokenizer(examples["clinical_notes"], padding="max_length", truncation=True, max_length=512)
    encoded["labels"] = [label2id[name] for name in examples["specialities"]]
    return encoded

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Configure training
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./medical_classifier_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch"
)

# Train the model
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

trainer.train()

In [None]:
# Get examples where model was most confident but wrong
from scipy.special import softmax

def get_critical_errors(texts, true_labels, predicted_labels, probabilities, top_n=10, from_logits=False):
    """Return the misclassified examples the model was most confident about.

    Parameters
    ----------
    texts, true_labels, predicted_labels : iterables
        Parallel sequences; position i describes example i.
    probabilities : iterable of per-class score rows
        One row per example. Rows are taken to be class probabilities
        (e.g. the output of ``predict_proba``) unless ``from_logits`` is set.
    top_n : int, default 10
        Maximum number of errors to return.
    from_logits : bool, default False
        Set to True when the rows are raw logits; they are then passed through
        softmax first. Probabilities must NOT be softmaxed again, because that
        flattens them and under-reports the confidence.

    Returns
    -------
    list of (confidence, text, true_label, predicted_label)
        Only examples where the prediction differs from the true label,
        sorted from highest to lowest confidence, at most ``top_n`` long.
    """
    errors = []
    for text, true, pred, probs in zip(texts, true_labels, predicted_labels, probabilities):
        if true != pred:
            scores = softmax(probs) if from_logits else probs
            # Confidence is the score of the winning class.
            errors.append((float(max(scores)), text, true, pred))

    # Sort by confidence (most confident mistakes first)
    return sorted(errors, key=lambda err: err[0], reverse=True)[:top_n]

# These would be reviewed by clinical experts.
# The note text lives in the 'clinical_notes' column (the column that was
# de-identified when the data was loaded).
# `model` must be the scikit-learn classifier here: predict_proba is called on
# the TF-IDF matrix X_test.
critical_errors = get_critical_errors(
    test_data['clinical_notes'],
    y_test, 
    predictions,
    model.predict_proba(X_test)
)

# Print each mistake with the first 100 characters of its note
for confidence, text, true, pred in critical_errors:
    print(f"Confidence: {confidence:.2f}")
    print(f"Text: {text[:100]}...")
    print(f"True: {true}, Predicted: {pred}")
    print("-" * 80)

In [None]:
from flask import Flask, request, jsonify
import secrets
import logging

import os

app = Flask(__name__)

# Configure secure logging (no PHI)
logging.basicConfig(filename='classifier_audit.log', level=logging.INFO)

# Authentication for internal systems.
# A key that is freshly generated at every start-up and never shown to anyone
# cannot be presented by any client, so the key is read from the
# INTERNAL_EHR_API_KEY environment variable when it is set. If it is unset or
# empty, a random key is generated as before.
API_KEYS = {
    "internal_ehr_system": os.environ.get("INTERNAL_EHR_API_KEY") or secrets.token_hex(32)
}

@app.route('/classify_note', methods=['POST'])
def classify_note():
    """Classify one clinical note sent as JSON and return the predicted specialty.

    Request: header ``X-API-Key`` plus a JSON object with a string
    ``clinical_note`` and an optional ``note_id``.

    Responses:
      401 -- missing or unknown API key
      400 -- body is not a JSON object, or ``clinical_note`` is missing / not a string
      200 -- ``note_id``, ``predicted_specialty``, ``confidence`` and
             ``recommended_review`` (True when confidence is below 0.7)
    """
    # Verify API key. compare_digest takes the same time whether or not the
    # values match, so response timing does not reveal how much of a key was
    # guessed; bytes are compared so a non-ASCII header cannot raise.
    api_key = request.headers.get('X-API-Key')
    authorized = api_key is not None and any(
        secrets.compare_digest(api_key.encode('utf-8'), known_key.encode('utf-8'))
        for known_key in API_KEYS.values()
    )
    if not authorized:
        logging.warning(f"Unauthorized access attempt from {request.remote_addr}")
        return jsonify({"error": "Unauthorized"}), 401
    
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request gets the same explicit 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not isinstance(data.get('clinical_note'), str):
        return jsonify({"error": "Request body must be a JSON object with a string 'clinical_note' field"}), 400

    text = data['clinical_note']
    note_id = data.get('note_id', 'unknown')  # For audit trail
    
    # Deidentify before processing
    text = deidentify(text)
    
    # Process and predict
    features = vectorizer.transform([text])
    specialty = model.predict(features)[0]
    confidence = float(max(model.predict_proba(features)[0]))
    
    # Log the prediction (without PHI): only the id, the label and the score
    logging.info(f"Note ID: {note_id}, Predicted: {specialty}, Confidence: {confidence:.2f}")
    
    return jsonify({
        'note_id': note_id,
        'predicted_specialty': specialty,
        'confidence': confidence,
        'recommended_review': confidence < 0.7  # Flag low confidence predictions
    })

if __name__ == '__main__':
    # Listen on every network interface, port 5000, over HTTPS.
    # 'adhoc' uses a self-signed certificate generated at start-up; a
    # production deployment should supply a real certificate instead.
    server_options = {'host': '0.0.0.0', 'port': 5000, 'ssl_context': 'adhoc'}
    app.run(**server_options)

In [None]:
# Documentation for FDA compliance (if used for clinical decision support)
def generate_model_card():
    """Write a static model card to ``model_documentation.json`` in the working directory.

    The card's contents (name, purpose, training-data description, metrics,
    limitations, validation method, date) are hard-coded below; nothing is
    computed from a trained model. An existing file of the same name is
    overwritten. Returns None.
    """
    # Imported here because the notebook has no module-level `import json`.
    import json

    model_info = {
        "model_name": "Clinical Specialty Classifier v1.0",
        "purpose": "Assist in routing clinical notes to appropriate specialists",
        "training_data": "10,000 deidentified clinical notes from 2020-2023",
        "performance": {
            "overall_accuracy": 0.89,
            "cardiology_precision": 0.92,
            "cardiology_recall": 0.88,
            # Add metrics for all specialties
        },
        "limitations": "Not intended for emergency triage. Clinical oversight required.",
        "validation_method": "5-fold cross validation + expert review",
        "last_updated": "2024-02-25"
    }
    
    # Explicit encoding so the output does not depend on the platform default.
    with open("model_documentation.json", "w", encoding="utf-8") as f:
        json.dump(model_info, f, indent=2)

# Regular retraining schedule
def schedule_retraining():
    """Start a background scheduler that calls ``retrain_model`` about every three months.

    Returns
    -------
    BackgroundScheduler
        The running scheduler, so the caller can keep a reference to it and
        call ``shutdown()`` when retraining should stop.

    NOTE(review): ``retrain_model`` is not defined in this part of the
    notebook; it must exist before this function is called.
    """
    from apscheduler.schedulers.background import BackgroundScheduler
    
    scheduler = BackgroundScheduler()
    # Retrain every 3 months with new clinical data.
    # The 'interval' trigger accepts weeks/days/hours/minutes/seconds only --
    # there is no `months` argument -- so the period is given as 13 weeks.
    scheduler.add_job(retrain_model, 'interval', weeks=13)
    scheduler.start()
    return scheduler