# Patient Diagnosis Pattern Analysis

This notebook demonstrates efficient methods to analyze patterns in patient diagnoses from large datasets, focusing on ICD code categories and comorbidity patterns.

In [1]:
from collections import Counter, defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

KeyboardInterrupt: 

## Approach 1: Chunked Processing

Process the large diagnoses file in manageable chunks to avoid memory issues.

In [None]:
# File paths
# Gzip-compressed diagnoses CSV, given relative to the notebook's working directory.
diagnoses_file = "Data/diagnoses_icd.csv.gz"

# Process in chunks and extract categories
def process_diagnoses_in_chunks(file_path, chunk_size=100000, compression='gzip'):
    """Stream a diagnoses CSV in chunks and index patients by ICD category.

    A diagnosis *category* is the first three characters of its ``icd_code``.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the ``subject_id`` and ``icd_code`` columns.
    chunk_size : int, default 100000
        Number of rows read per chunk.
    compression : str, default 'gzip'
        Forwarded to ``pandas.read_csv``.

    Returns
    -------
    patient_diagnoses : defaultdict(set)
        subject_id -> set of diagnosis categories.
    category_patients : defaultdict(set)
        category -> set of subject_ids.
    category_counts : collections.Counter
        category -> number of diagnosis rows in that category.

    Notes
    -----
    Rows with a missing ``subject_id`` or ``icd_code`` are skipped, so no NaN
    category is ever produced.
    """
    patient_diagnoses = defaultdict(set)  # subject_id -> set of diagnosis categories
    category_patients = defaultdict(set)  # category -> set of subject_ids
    category_counts = Counter()  # number of diagnosis rows per category

    reader = pd.read_csv(
        file_path,
        compression=compression,
        chunksize=chunk_size,
        # Only these two columns are used; skipping the rest saves memory.
        usecols=['subject_id', 'icd_code'],
        # Force text: a chunk holding only numeric codes would otherwise be
        # parsed as integers, which breaks `.str` and drops leading zeros.
        dtype={'icd_code': str},
    )

    for chunk in reader:
        chunk = chunk.dropna(subset=['subject_id', 'icd_code'])

        # Plain Python lists are far cheaper to walk than DataFrame.iterrows().
        subject_ids = chunk['subject_id'].tolist()
        categories = chunk['icd_code'].str[:3].tolist()

        category_counts.update(categories)
        for subject_id, category in zip(subject_ids, categories):
            patient_diagnoses[subject_id].add(category)
            category_patients[category].add(subject_id)

    return patient_diagnoses, category_patients, category_counts

In [None]:
# We'll run this when needed - it's commented out to avoid unnecessary processing
#patient_diagnoses, category_patients, category_counts = process_diagnoses_in_chunks(diagnoses_file)

## Approach 2: Comorbidity Network Analysis

Create a network graph to visualize and analyze common comorbidities.

In [None]:
def build_comorbidity_network(patient_diagnoses, min_comorbidity=50):
    """Count co-occurring diagnosis categories and build a weighted graph.

    Parameters
    ----------
    patient_diagnoses : mapping
        subject_id -> collection of diagnosis categories for that patient.
    min_comorbidity : int, default 50
        Minimum number of patients sharing a pair of categories for that pair
        to become an edge in the graph.

    Returns
    -------
    G : networkx.Graph
        One edge per category pair meeting the threshold; the edge attribute
        ``weight`` holds the number of patients with both categories.
    comorbidity_counts : defaultdict(int)
        (category_a, category_b) -> patient count for every observed pair,
        with ``category_a <= category_b``.
    """
    comorbidity_counts = defaultdict(int)

    for categories in patient_diagnoses.values():
        # Sort once per patient so every pair comes out in a canonical order
        # and the result does not depend on set iteration order. Missing
        # values are skipped: they are not diagnoses and cannot be ordered
        # against strings. The set guarantees one count per patient and pair.
        present = sorted({cat for cat in categories if pd.notna(cat)})
        for pair in combinations(present, 2):
            comorbidity_counts[pair] += 1

    G = nx.Graph()
    for (cat1, cat2), count in comorbidity_counts.items():
        if count >= min_comorbidity:
            # add_edge creates any endpoint that is not in the graph yet.
            G.add_edge(cat1, cat2, weight=count)

    return G, comorbidity_counts

def visualize_comorbidity_network(G, target_category="F31", top_n=20, edge_width_scale=500):
    """Plot a target category together with its most frequent comorbidities.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose edges carry a ``weight`` attribute.
    target_category : str, default "F31"
        Node whose neighbourhood is drawn.
    top_n : int, default 20
        Maximum number of neighbours kept, ranked by edge weight.
    edge_width_scale : float, default 500
        Edge weights are divided by this value to obtain line widths.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If ``target_category`` is not
        a node of ``G`` a message is printed and nothing is drawn.
    """
    # Without the target node there is nothing to draw.
    if target_category not in G:
        print(f"Category {target_category} not found in graph")
        return

    # Rank the target's edges by weight, strongest first.
    ranked_edges = sorted(
        G.edges(target_category, data=True),
        key=lambda edge: edge[2]['weight'],
        reverse=True,
    )
    top_edges = ranked_edges[:top_n]

    # Keep the target plus the far end of each retained edge.
    subgraph_nodes = {target_category} | {neighbor for _, neighbor, _ in top_edges}
    subgraph = G.subgraph(subgraph_nodes)

    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(subgraph, k=0.5, seed=42)

    # One width per subgraph edge, in the order draw_networkx iterates them.
    edge_widths = [
        weight / edge_width_scale for _, _, weight in subgraph.edges(data='weight')
    ]

    nx.draw_networkx(
        subgraph, pos,
        node_size=1000,
        node_color='lightblue',
        font_size=10,
        width=edge_widths,
        edge_color='gray',
        with_labels=True
    )

    # Draw the target again on top so it stands out.
    nx.draw_networkx_nodes(
        subgraph, pos,
        nodelist=[target_category],
        node_color='red',
        node_size=1200
    )

    # Report how many neighbours are actually shown; it can be below top_n.
    plt.title(f"Top {len(top_edges)} Comorbidities with {target_category}")
    plt.axis('off')
    plt.tight_layout()
    plt.show()

## Approach 3: Machine Learning for Diagnosis Prediction

Use each patient's diagnosis patterns to predict the likelihood of a specific condition.

In [None]:
def prepare_ml_dataset(patient_diagnoses, target_category="F31", min_diagnoses=3,
                       include_target_feature=False):
    """Build a sparse patient-by-category matrix and a binary target vector.

    Parameters
    ----------
    patient_diagnoses : mapping
        subject_id -> collection of diagnosis categories.
    target_category : str, default "F31"
        Category to predict; ``y`` is 1 for patients who have it.
    min_diagnoses : int, default 3
        Patients with fewer categories than this are dropped. The target
        category itself counts towards this number.
    include_target_feature : bool, default False
        When False the target category is left out of the feature columns.
        Keeping it would let a model read the label straight from the
        features (target leakage). Pass True only to reproduce that layout.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Shape (n_patients, n_features); 1 where the patient has the category.
    y : list of int
        1 if the patient has ``target_category``, else 0.
    feature_names : list
        Sorted category labels, one per column of ``X``.
    patients : list
        subject_ids, one per row of ``X``.
    """
    # Columns come from every patient, including those filtered out below.
    all_categories = set()
    for categories in patient_diagnoses.values():
        all_categories.update(categories)

    # Missing values are not categories and cannot be sorted next to strings.
    feature_names = sorted(
        cat for cat in all_categories
        if pd.notna(cat) and (include_target_feature or cat != target_category)
    )
    cat_to_idx = {cat: idx for idx, cat in enumerate(feature_names)}

    # Keep only patients with enough recorded categories.
    filtered_patients = {
        patient: categories
        for patient, categories in patient_diagnoses.items()
        if len(categories) >= min_diagnoses
    }
    patients = list(filtered_patients.keys())

    # Collect the coordinates of the non-zero cells.
    rows, cols = [], []
    for i, patient in enumerate(patients):
        for category in filtered_patients[patient]:
            j = cat_to_idx.get(category)
            if j is None:
                # Target category (when excluded) or a missing value.
                continue
            rows.append(i)
            cols.append(j)

    data = [1] * len(rows)
    X = csr_matrix((data, (rows, cols)), shape=(len(patients), len(feature_names)))

    # The label is read from the full category set, not from the features.
    y = [1 if target_category in filtered_patients[patient] else 0 for patient in patients]

    return X, y, feature_names, patients

def train_and_evaluate_models(X, y, feature_names):
    """Fit a random forest and a KNN classifier, print reports, plot importances.

    The data is split 75/25 with stratification on ``y``. Both models are
    trained on the larger part and a classification report for each is
    printed from the held-out part. The 20 largest random-forest feature
    importances are shown as a bar chart labelled with ``feature_names``.

    Returns the fitted ``(random_forest, knn)`` pair.
    """
    # Hold out a quarter of the patients, keeping the class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.25, random_state=42
    )

    # Fit both classifiers on the same training split.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    neighbours = KNeighborsClassifier(n_neighbors=15)
    neighbours.fit(X_train, y_train)

    # Predict first, then report, one model after the other.
    forest_predictions = forest.predict(X_test)
    neighbour_predictions = neighbours.predict(X_test)

    reports = (
        ("Random Forest Results:", forest_predictions),
        ("\nKNN Results:", neighbour_predictions),
    )
    for heading, predictions in reports:
        print(heading)
        print(classification_report(y_test, predictions))

    # Only the forest exposes importances; keep the 20 largest, descending.
    importances = forest.feature_importances_
    top_features = np.argsort(importances)[::-1][:20]
    bar_positions = range(len(top_features))

    plt.figure(figsize=(12, 6))
    plt.title('Feature Importance for Diagnosis Prediction')
    plt.bar(bar_positions, importances[top_features], color='b', align='center')
    plt.xticks(
        bar_positions,
        [feature_names[idx] for idx in top_features],
        rotation=90,
    )
    plt.tight_layout()
    plt.show()

    return forest, neighbours

## Running the Analysis

Demonstration of the full workflow.

In [None]:
# Process data (uncomment to run)
# print("Processing diagnoses data...")
# patient_diagnoses, category_patients, category_counts = process_diagnoses_in_chunks(diagnoses_file)

# print("Number of patients:", len(patient_diagnoses))
# print("Number of diagnosis categories:", len(category_counts))

# # Top diagnostic categories
# print("\nTop 10 diagnostic categories:")
# for category, count in category_counts.most_common(10):
#     print(f"{category}: {count} occurrences")

# # Build and visualize comorbidity network
# print("\nBuilding comorbidity network...")
# comorbidity_graph, comorbidity_counts = build_comorbidity_network(patient_diagnoses, min_comorbidity=100)
# visualize_comorbidity_network(comorbidity_graph, target_category="F31", top_n=15)

# # Prepare data for ML
# print("\nPreparing machine learning dataset...")
# X, y, feature_names, patients = prepare_ml_dataset(
#     patient_diagnoses, target_category="F31", min_diagnoses=3
# )

# print(f"Dataset shape: {X.shape}, Positive cases: {sum(y)}, Negative cases: {len(y) - sum(y)}")

# # Train and evaluate models
# print("\nTraining models...")
# rf_model, knn_model = train_and_evaluate_models(X, y, feature_names)

## Interpretation and Insights

1. **Chunked Processing** reads the large diagnoses file in fixed-size chunks, so the raw CSV never has to be loaded into memory all at once.

2. **Comorbidity Network** helps visualize which conditions commonly co-occur with F31 (Bipolar Disorder)

3. **Machine Learning Models**:
   - Random Forest provides insight into which diagnosis categories are most predictive
   - KNN can identify patients with similar diagnosis patterns
   - Feature importance shows which comorbidity categories the Random Forest relies on most when predicting F31 (an association, not evidence of causation)

4. **Clinical Applications**:
   - Identify patients with high likelihood of undiagnosed bipolar disorder
   - Prioritize diagnostic screenings based on risk factors
   - Understand common comorbidity patterns to improve treatment planning