# 03 - CBC Data Availability Analysis

This notebook analyzes CBC (Complete Blood Count) test availability and coverage for patients with target diseases.

## Objectives
1. Analyze CBC test availability for each disease
2. Calculate average number of CBC tests per patient
3. Examine temporal patterns (when tests occur relative to admission)
4. Identify missing value patterns per CBC feature
5. Create comprehensive visualizations
6. Identify data quality issues

In [None]:
# Standard imports: data handling, plotting, and path/time utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# not just known-noisy ones — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Add src to path for imports ('../src' is resolved against the current working directory)
sys.path.append(str(Path('../src').resolve()))

# Import data loader (project module; presumably located under ../src — confirm)
from data.loader import MIMICLoader, load_config

# Configure plotting: matplotlib style sheet, seaborn palette, inline figure rendering
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

# Configure pandas display: no column limit, up to 100 rows, floats shown with 2 decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

## 1. Setup and Data Loading

In [None]:
# Project-relative locations used by every later cell
DATA_DIR = Path('../data/raw')
CONFIG_DIR = Path('../configs')
OUTPUT_DIR = Path('../experiments')

# The output folder must exist before any CSV/PNG is written to it
OUTPUT_DIR.mkdir(exist_ok=True)

# Echo the settings (and whether the raw data folder is present) in one go
print("\n".join([
    f"Data directory: {DATA_DIR}",
    f"Data directory exists: {DATA_DIR.exists()}",
    f"Config directory: {CONFIG_DIR}",
    f"Output directory: {OUTPUT_DIR}",
]))

In [None]:
# Read the project configuration from CONFIG_DIR; it drives both the loader
# and the disease / CBC feature definitions used further down.
config = load_config(CONFIG_DIR)

# Build the loader on top of the raw data directory and that configuration
print("Initializing MIMIC-IV data loader...")
loader = MIMICLoader(DATA_DIR, config)
print("‚úì Data loader initialized")

In [None]:
# Pull out the two config sections this notebook works with
diseases_config = config.get('diseases', {})
cbc_config = config.get('cbc_features', {})

print(f"Loaded {len(diseases_config)} disease definitions")
print(f"Loaded {len(cbc_config)} CBC features")

# One line per CBC feature: name, unit, and the lab itemids mapped to it
print("\nCBC Features:")
for feature_name in cbc_config:
    feature_info = cbc_config[feature_name]
    itemids, unit = feature_info['itemids'], feature_info['unit']
    print(f"  ‚Ä¢ {feature_name:15s} {unit:10s} (itemids: {itemids})")

In [None]:
# Fetch the core tables through MIMICLoader and convert them to pandas.
# DATA_AVAILABLE records the outcome so that later cells can skip themselves
# instead of failing when the data could not be read.
print("Loading MIMIC-IV tables...\n")

try:
    patients_pl = loader.load_patients()
    admissions_pl = loader.load_admissions()
    diagnoses_pl = loader.load_diagnoses()

    # The loader results expose .to_pandas(); the analysis below is written against pandas
    print("Converting to Pandas DataFrames...")
    patients = patients_pl.to_pandas()
    admissions = admissions_pl.to_pandas()
    diagnoses = diagnoses_pl.to_pandas()
except Exception as e:
    # Best-effort: report the problem and let downstream cells no-op
    print(f"\n‚ö†Ô∏è  Error loading data: {e}")
    print("Please ensure MIMIC-IV data is downloaded to the data directory.")
    DATA_AVAILABLE = False
else:
    DATA_AVAILABLE = True
    print("\n‚úì Core tables loaded successfully")

In [None]:
# Fetch the lab results through MIMICLoader, convert to pandas and tag every
# row with the CBC feature its itemid belongs to. LAB_DATA_AVAILABLE records
# whether this succeeded so later cells can skip themselves.
print("Loading CBC lab events data (this may take a few minutes)...\n")

try:
    cbc_labs_pl = loader.load_lab_results()

    print("Converting to Pandas DataFrame...")
    cbc_labs = cbc_labs_pl.to_pandas()

    # Reverse lookup: lab itemid -> configured CBC feature name
    itemid_to_feature = {
        itemid: feature_name
        for feature_name, feature_info in cbc_config.items()
        for itemid in feature_info['itemids']
    }
    cbc_labs['feature_name'] = cbc_labs['itemid'].map(itemid_to_feature)

    print(f"\n‚úì Found {len(cbc_labs):,} CBC lab measurements")
    print(f"  - Unique patients: {cbc_labs['subject_id'].nunique():,}")
    print(f"  - Unique admissions: {cbc_labs['hadm_id'].nunique():,}")
    print(f"  - CBC features found: {cbc_labs['feature_name'].nunique()}")

    LAB_DATA_AVAILABLE = True

except Exception as e:
    # Best-effort: any failure above (loading or mapping) disables the lab-based cells
    print(f"\n‚ö†Ô∏è  Error loading lab events: {e}")
    print("Please ensure lab events data is available.")
    LAB_DATA_AVAILABLE = False

## 2. Disease Patient Identification

In [None]:
def get_disease_patients(disease_config, diagnoses_df):
    """
    Find all patients and admissions with a specific disease.

    A diagnosis row matches when its ``icd_code`` starts with one of the
    configured code prefixes. If ``diagnoses_df`` has an ``icd_version``
    column, ICD-9 prefixes are only compared against version-9 rows and
    ICD-10 prefixes against version-10 rows, so that a prefix from one coding
    system cannot pick up look-alike codes from the other (e.g. ICD-10 ``E87``
    vs. ICD-9 external-cause codes ``E878x``). Without that column every
    prefix is compared against every row.

    Rows with a missing ``icd_code`` never match.

    Args:
        disease_config: Disease configuration dict with optional
            ``icd9_codes`` and ``icd10_codes`` lists of code prefixes.
        diagnoses_df: Diagnoses DataFrame with ``subject_id``, ``hadm_id``
            and ``icd_code`` columns (``icd_version`` optional).

    Returns:
        dict with ``patient_ids`` and ``admission_ids`` (unique values of the
        matching rows) and ``diagnoses`` (the matching rows themselves).
    """
    def _as_prefixes(key):
        # Coerce to str so non-string config entries don't break startswith;
        # a missing or None entry is treated as "no codes".
        return tuple(str(code) for code in (disease_config.get(key) or []))

    icd9_prefixes = _as_prefixes('icd9_codes')
    icd10_prefixes = _as_prefixes('icd10_codes')
    codes = diagnoses_df['icd_code']

    def _prefix_mask(prefixes):
        if not prefixes:
            return pd.Series(False, index=diagnoses_df.index)
        # na=False keeps the mask strictly boolean; a NaN in the mask would
        # make the row selection below raise.
        return codes.str.startswith(prefixes, na=False).astype(bool)

    if 'icd_version' in diagnoses_df.columns:
        version = pd.to_numeric(diagnoses_df['icd_version'], errors='coerce')
        mask = (_prefix_mask(icd9_prefixes) & (version == 9)) | (
            _prefix_mask(icd10_prefixes) & (version == 10)
        )
    else:
        mask = _prefix_mask(icd9_prefixes + icd10_prefixes)

    disease_diagnoses = diagnoses_df[mask]

    return {
        'patient_ids': disease_diagnoses['subject_id'].unique(),
        'admission_ids': disease_diagnoses['hadm_id'].unique(),
        'diagnoses': disease_diagnoses
    }

# Build disease_patients: one entry per configured disease holding its display
# name, its config and the matching patient / admission ids.
if not DATA_AVAILABLE:
    print("‚ö†Ô∏è  Cannot identify disease patients - data not available")
else:
    disease_patients = {}

    print("Identifying patients with target diseases...\n")
    print("=" * 80)

    for disease_key, disease_config in diseases_config.items():
        disease_name = disease_config['name']
        disease_data = get_disease_patients(disease_config, diagnoses)

        matched_patients = disease_data['patient_ids']
        matched_admissions = disease_data['admission_ids']
        disease_patients[disease_key] = dict(
            name=disease_name,
            config=disease_config,
            patient_ids=matched_patients,
            admission_ids=matched_admissions,
        )

        # Cohort size at a glance
        print(f"{disease_name:30s} {len(matched_patients):>8,} patients, "
              f"{len(matched_admissions):>8,} admissions")

    print("=" * 80)
    print("‚úì Disease patient identification complete")

## 3. CBC Availability Analysis by Disease

In [None]:
# For every disease cohort: how many patients have CBC rows recorded during a
# disease admission, and how many rows they have.
# NOTE(review): "tests" here are individual measurement rows of cbc_labs (one
# per feature value), not whole CBC panels — confirm this is the intended unit.
if not (DATA_AVAILABLE and LAB_DATA_AVAILABLE):
    print("‚ö†Ô∏è  Cannot perform CBC availability analysis - data not available")
else:
    print("\nüìä CBC AVAILABILITY BY DISEASE")
    print("=" * 80)

    cbc_availability = []

    for disease_key, disease_info in disease_patients.items():
        disease_name = disease_info['name']
        patient_ids = disease_info['patient_ids']
        admission_ids = disease_info['admission_ids']

        # Keep rows whose patient AND admission both belong to the cohort
        is_cohort_patient = cbc_labs['subject_id'].isin(patient_ids)
        is_cohort_admission = cbc_labs['hadm_id'].isin(admission_ids)
        disease_cbc = cbc_labs[is_cohort_patient & is_cohort_admission]

        total_patients = len(patient_ids)
        patients_with_cbc = disease_cbc['subject_id'].nunique()
        total_tests = len(disease_cbc)

        # Guard the ratios against empty cohorts
        if total_patients > 0:
            pct_with_cbc = patients_with_cbc / total_patients * 100
        else:
            pct_with_cbc = 0

        if patients_with_cbc > 0:
            avg_tests_per_patient = total_tests / patients_with_cbc
            median_tests_per_patient = disease_cbc.groupby('subject_id').size().median()
        else:
            avg_tests_per_patient = 0
            median_tests_per_patient = 0

        # The filtered frame is kept alongside the metrics for later cells
        cbc_availability.append(dict(
            disease_key=disease_key,
            disease_name=disease_name,
            total_patients=total_patients,
            patients_with_cbc=patients_with_cbc,
            pct_with_cbc=pct_with_cbc,
            total_cbc_tests=total_tests,
            avg_tests_per_patient=avg_tests_per_patient,
            median_tests_per_patient=median_tests_per_patient,
            disease_cbc=disease_cbc,
        ))

        print(f"\n{disease_name}")
        print("-" * 80)
        print(f"  Total Patients:              {total_patients:>10,}")
        print(f"  Patients with CBC:           {patients_with_cbc:>10,} ({pct_with_cbc:>5.1f}%)")
        print(f"  Total CBC Tests:             {total_tests:>10,}")
        print(f"  Avg Tests per Patient:       {avg_tests_per_patient:>10.1f}")
        print(f"  Median Tests per Patient:    {median_tests_per_patient:>10.0f}")

    print("\n" + "=" * 80)
    print("‚úì CBC availability analysis complete")

## 4. Summary Table

In [None]:
# Turn the per-disease metrics into a display-ready table (largest cohort
# first) and persist it as CSV.
if not (DATA_AVAILABLE and LAB_DATA_AVAILABLE):
    print("‚ö†Ô∏è  Cannot create summary table - data not available")
else:
    # Display column -> key in each cbc_availability record
    summary_columns = {
        'Disease': 'disease_name',
        'Total Patients': 'total_patients',
        'Patients with CBC': 'patients_with_cbc',
        '% with CBC': 'pct_with_cbc',
        'Total CBC Tests': 'total_cbc_tests',
        'Avg Tests/Patient': 'avg_tests_per_patient',
        'Median Tests/Patient': 'median_tests_per_patient',
    }
    summary_data = [
        {label: avail[key] for label, key in summary_columns.items()}
        for avail in cbc_availability
    ]

    summary_df = pd.DataFrame(summary_data).sort_values('Total Patients', ascending=False)

    print("\nüìä CBC AVAILABILITY SUMMARY TABLE")
    print("=" * 100)
    display(summary_df)

    # Save to CSV
    output_path = OUTPUT_DIR / 'cbc_availability_summary.csv'
    summary_df.to_csv(output_path, index=False)
    print(f"\n‚úì Summary table saved to: {output_path}")

## 5. Missing Value Analysis

In [None]:
# Per disease and CBC feature: how many patients (among those with any CBC
# row) have at least one measurement of the feature, and how many lack it.
if DATA_AVAILABLE and LAB_DATA_AVAILABLE:
    print("\nüìä MISSING VALUE ANALYSIS PER CBC FEATURE")
    print("=" * 80)

    # Earlier cells iterate cbc_config directly as {feature_name: feature_info},
    # so indexing cbc_config['cbc_features'] unconditionally raises KeyError
    # with that layout. Accept both layouts: use a nested 'cbc_features'
    # mapping when present, otherwise cbc_config itself.
    feature_defs = cbc_config.get('cbc_features', cbc_config)
    feature_names = sorted(feature_defs.keys())

    missing_value_data = []

    for avail in cbc_availability:
        disease_name = avail['disease_name']
        disease_cbc = avail['disease_cbc']
        patients_with_cbc = avail['patients_with_cbc']

        print(f"\n{disease_name}")
        print("-" * 80)

        # Count measurements per feature
        for feature_name in feature_names:
            feature_data = disease_cbc[disease_cbc['feature_name'] == feature_name]

            # Share of CBC patients with at least one value for this feature;
            # the denominator is patients with any CBC row, not the full cohort
            patients_with_feature = feature_data['subject_id'].nunique()
            pct_with_feature = (patients_with_feature / patients_with_cbc * 100) if patients_with_cbc > 0 else 0
            pct_missing = 100 - pct_with_feature

            total_measurements = len(feature_data)

            missing_value_data.append({
                'disease': disease_name,
                'feature': feature_name,
                'patients_with_feature': patients_with_feature,
                'pct_with_feature': pct_with_feature,
                'pct_missing': pct_missing,
                'total_measurements': total_measurements
            })

            print(f"  {feature_name:15s}: {patients_with_feature:>6,} patients ({pct_with_feature:>5.1f}%), "
                  f"{total_measurements:>8,} measurements, {pct_missing:>5.1f}% missing")

    print("\n" + "=" * 80)

    # Long-format table (one row per disease/feature) used by the heatmap cell
    missing_df = pd.DataFrame(missing_value_data)

    # Save detailed missing value analysis
    output_path = OUTPUT_DIR / 'cbc_missing_values_detailed.csv'
    missing_df.to_csv(output_path, index=False)
    print(f"\n‚úì Missing value analysis saved to: {output_path}")
else:
    print("‚ö†Ô∏è  Cannot perform missing value analysis - data not available")

## 6. Visualizations

### 6.1 CBC Availability by Disease

In [None]:
# Two horizontal bar charts side by side: CBC coverage (colour-coded by level)
# and average CBC rows per patient, both ordered by cohort size.
if not (DATA_AVAILABLE and LAB_DATA_AVAILABLE):
    print("‚ö†Ô∏è  Cannot create visualization - data not available")
else:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

    # Ascending sort puts the largest cohort at the top of a barh chart
    summary_sorted = summary_df.sort_values('Total Patients', ascending=True)
    coverage_pct = summary_sorted['% with CBC']
    avg_per_patient = summary_sorted['Avg Tests/Patient']

    # Green >= 80 %, orange >= 50 %, red otherwise
    colors = []
    for pct in coverage_pct:
        if pct >= 80:
            colors.append('#2ecc71')
        elif pct >= 50:
            colors.append('#f39c12')
        else:
            colors.append('#e74c3c')

    axis_label_style = dict(fontsize=12, fontweight='bold')
    title_style = dict(fontsize=14, fontweight='bold')

    # Plot 1: Percentage with CBC
    ax1.barh(summary_sorted['Disease'], coverage_pct, color=colors)
    ax1.set_xlabel('% of Patients with CBC Tests', **axis_label_style)
    ax1.set_ylabel('Disease', **axis_label_style)
    ax1.set_title('CBC Test Coverage by Disease', **title_style)
    ax1.grid(axis='x', alpha=0.3)
    ax1.set_xlim(0, 100)
    for bar_pos, pct in enumerate(coverage_pct):
        ax1.text(pct, bar_pos, f' {pct:.1f}%', va='center', fontsize=10)

    # Plot 2: Average tests per patient
    ax2.barh(summary_sorted['Disease'], avg_per_patient, color='steelblue')
    ax2.set_xlabel('Average CBC Tests per Patient', **axis_label_style)
    ax2.set_ylabel('Disease', **axis_label_style)
    ax2.set_title('Average CBC Tests per Patient by Disease', **title_style)
    ax2.grid(axis='x', alpha=0.3)
    for bar_pos, avg in enumerate(avg_per_patient):
        ax2.text(avg, bar_pos, f' {avg:.1f}', va='center', fontsize=10)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'cbc_availability_by_disease.png', dpi=150, bbox_inches='tight')
    print(f"‚úì Saved: {OUTPUT_DIR / 'cbc_availability_by_disease.png'}")
    plt.show()

### 6.2 Heatmap: Missing Value Patterns

In [None]:
# Disease x feature heatmap of the share of CBC patients lacking each feature.
if DATA_AVAILABLE and LAB_DATA_AVAILABLE:
    # Wide table: one row per disease, one column per CBC feature
    missing_pivot = missing_df.pivot(index='disease', columns='feature', values='pct_missing')

    # Earlier cells treat cbc_config as a flat {feature_name: feature_info}
    # mapping, so cbc_config['feature_groups'] raises KeyError with that
    # layout. Look for the groups inside cbc_config first, then at the top
    # level of the config; fall back to no grouping.
    # NOTE(review): confirm where 'feature_groups' lives in the project config.
    feature_groups = cbc_config.get('feature_groups') or config.get('feature_groups') or {}

    # Reorder features by feature groups (skipping duplicates and unknown names)
    feature_order = []
    for group in ['rbc_indices', 'wbc_differential', 'platelet_indices']:
        for feature in feature_groups.get(group, []):
            if feature in missing_pivot.columns and feature not in feature_order:
                feature_order.append(feature)

    # Features that belong to none of the groups are appended instead of
    # silently disappearing from the heatmap
    ungrouped = [feature for feature in missing_pivot.columns if feature not in feature_order]
    feature_order.extend(ungrouped)

    missing_pivot = missing_pivot[feature_order]

    # Create heatmap (fixed 0-100 colour scale so the colours are comparable)
    fig, ax = plt.subplots(figsize=(14, 8))

    sns.heatmap(missing_pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', 
                cbar_kws={'label': '% Missing'}, vmin=0, vmax=100,
                linewidths=0.5, linecolor='white', ax=ax)

    ax.set_title('Missing CBC Feature Patterns by Disease\n(% of Patients without Measurement)', 
                fontsize=14, fontweight='bold', pad=20)
    ax.set_xlabel('CBC Feature', fontsize=12, fontweight='bold')
    ax.set_ylabel('Disease', fontsize=12, fontweight='bold')

    # Rotate x-axis labels
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'cbc_missing_values_heatmap.png', dpi=150, bbox_inches='tight')
    print(f"‚úì Saved: {OUTPUT_DIR / 'cbc_missing_values_heatmap.png'}")
    plt.show()
else:
    print("‚ö†Ô∏è  Cannot create heatmap - data not available")

### 6.3 Histogram: Number of Tests per Patient

In [None]:
# Grid of histograms (three per row), one per disease, showing how many CBC
# rows each patient has, with mean and median marked.
if not (DATA_AVAILABLE and LAB_DATA_AVAILABLE):
    print("‚ö†Ô∏è  Cannot create histogram - data not available")
else:
    n_diseases = len(cbc_availability)
    n_cols = 3
    n_rows = -(-n_diseases // n_cols)  # ceiling division

    # squeeze=False always yields a 2-D array, so one ravel() covers every grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    # Largest cohorts first
    sorted_availability = sorted(cbc_availability, key=lambda x: x['total_patients'], reverse=True)

    for ax, avail in zip(axes, sorted_availability):
        disease_name = avail['disease_name']
        disease_cbc = avail['disease_cbc']

        # Number of CBC rows recorded for each patient
        tests_per_patient = disease_cbc.groupby('subject_id').size()
        ax.hist(tests_per_patient, bins=50, edgecolor='black', alpha=0.7, color='skyblue')

        # Reference lines for the centre of the distribution
        mean_val = tests_per_patient.mean()
        median_val = tests_per_patient.median()
        ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, 
                  label=f'Mean: {mean_val:.1f}')
        ax.axvline(median_val, color='orange', linestyle='--', linewidth=2, 
                  label=f'Median: {median_val:.0f}')

        ax.set_title(f"{disease_name}\n(n={avail['patients_with_cbc']:,} patients)", 
                    fontsize=11, fontweight='bold')
        ax.set_xlabel('Number of CBC Tests', fontsize=10)
        ax.set_ylabel('Number of Patients', fontsize=10)
        ax.legend(fontsize=9)
        ax.grid(alpha=0.3)

    # Panels beyond the last disease stay blank
    for spare_ax in axes[n_diseases:]:
        spare_ax.set_visible(False)

    plt.suptitle('Distribution of CBC Tests per Patient', fontsize=16, fontweight='bold', y=1.00)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'cbc_tests_per_patient_histogram.png', dpi=150, bbox_inches='tight')
    print(f"‚úì Saved: {OUTPUT_DIR / 'cbc_tests_per_patient_histogram.png'}")
    plt.show()

### 6.4 Timeline: When CBC Tests Occur Relative to Admission

In [None]:
# Grid of histograms (two per row), one per disease, showing when CBC rows are
# charted relative to the admission time of the admission they belong to.
if DATA_AVAILABLE and LAB_DATA_AVAILABLE:
    print("Calculating temporal patterns...\n")

    # Merge CBC labs with admissions to get admission times
    # (left join: lab rows without a matching admission keep NaT times)
    cbc_with_admit = cbc_labs.merge(
        admissions[['hadm_id', 'admittime', 'dischtime']], 
        on='hadm_id', 
        how='left'
    )

    # Convert to datetime if not already
    for time_col in ('charttime', 'admittime', 'dischtime'):
        cbc_with_admit[time_col] = pd.to_datetime(cbc_with_admit[time_col])

    # Calculate time relative to admission (in hours)
    cbc_with_admit['hours_from_admit'] = (
        cbc_with_admit['charttime'] - cbc_with_admit['admittime']
    ).dt.total_seconds() / 3600

    # Create timeline visualization for each disease
    n_diseases = len(cbc_availability)
    n_cols = 2
    n_rows = (n_diseases + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array, so one ravel() covers every grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    sorted_availability = sorted(cbc_availability, key=lambda x: x['total_patients'], reverse=True)

    for idx, avail in enumerate(sorted_availability):
        ax = axes[idx]
        disease_name = avail['disease_name']
        cohort = disease_patients[avail['disease_key']]

        # Get CBC data for this disease. Restrict to the disease's own
        # admissions as well as its patients (same rule as the availability
        # analysis); filtering on subject_id alone would mix in labs from
        # unrelated admissions of the same patients.
        disease_cbc_timeline = cbc_with_admit[
            cbc_with_admit['subject_id'].isin(cohort['patient_ids']) &
            cbc_with_admit['hadm_id'].isin(cohort['admission_ids'])
        ]

        # Filter to reasonable timeframe (-24h to +7 days); rows with an
        # unknown offset (NaN) fail both comparisons and drop out here
        timeline_filtered = disease_cbc_timeline[
            (disease_cbc_timeline['hours_from_admit'] >= -24) & 
            (disease_cbc_timeline['hours_from_admit'] <= 168)  # 7 days
        ]

        # Plot histogram
        ax.hist(timeline_filtered['hours_from_admit'], bins=50, 
               edgecolor='black', alpha=0.7, color='coral')

        # Add vertical line at admission time
        ax.axvline(0, color='red', linestyle='--', linewidth=2, 
                  label='Admission Time', alpha=0.8)

        ax.set_title(f"{disease_name}", fontsize=11, fontweight='bold')
        ax.set_xlabel('Hours from Admission', fontsize=10)
        ax.set_ylabel('Number of CBC Tests', fontsize=10)
        ax.legend(fontsize=9)
        ax.grid(alpha=0.3)

        # Add summary stats (computed on the filtered window only)
        median_time = timeline_filtered['hours_from_admit'].median()
        mean_time = timeline_filtered['hours_from_admit'].mean()
        ax.text(0.95, 0.95, f'Median: {median_time:.1f}h\nMean: {mean_time:.1f}h',
               transform=ax.transAxes, fontsize=9,
               verticalalignment='top', horizontalalignment='right',
               bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Hide empty subplots
    for idx in range(n_diseases, len(axes)):
        axes[idx].set_visible(False)

    plt.suptitle('CBC Test Timing Relative to Hospital Admission', 
                fontsize=16, fontweight='bold', y=1.00)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'cbc_temporal_patterns.png', dpi=150, bbox_inches='tight')
    print(f"‚úì Saved: {OUTPUT_DIR / 'cbc_temporal_patterns.png'}")
    plt.show()
else:
    print("‚ö†Ô∏è  Cannot create timeline visualization - data not available")

## 7. Data Quality Issues and Insights

In [None]:
# Text report on data quality: coverage, per-feature missingness, test volume
# and a suggested handling strategy per feature.
if not (DATA_AVAILABLE and LAB_DATA_AVAILABLE):
    print("‚ö†Ô∏è  Cannot perform data quality assessment - data not available")
else:
    print("\n" + "="*80)
    print("DATA QUALITY ASSESSMENT")
    print("="*80)

    # 1. CBC Coverage
    print("\n1. CBC TEST COVERAGE")
    coverage_col = summary_df['% with CBC']
    min_coverage = coverage_col.min()
    max_coverage = coverage_col.max()
    avg_coverage = coverage_col.mean()

    print(f"   Coverage range: {min_coverage:.1f}% to {max_coverage:.1f}%")
    print(f"   Average coverage: {avg_coverage:.1f}%")

    low_coverage = summary_df[coverage_col < 80]
    if low_coverage.empty:
        print(f"\n   ‚úì All diseases have >= 80% CBC coverage")
    else:
        print(f"\n   ‚ö†Ô∏è  Diseases with < 80% CBC coverage:")
        for _, row in low_coverage.iterrows():
            print(f"      - {row['Disease']}: {row['% with CBC']:.1f}%")

    # 2. Missing CBC Features
    print("\n2. MISSING CBC FEATURES")

    # Mean missing rate of each feature over the diseases, worst first
    feature_missing_avg = (
        missing_df
        .groupby('feature')['pct_missing']
        .mean()
        .sort_values(ascending=False)
    )

    print("   Features with highest average missing rates:")
    for feature, pct_missing in feature_missing_avg.head(5).items():
        print(f"      - {feature:15s}: {pct_missing:.1f}% missing")

    high_missing_features = feature_missing_avg[feature_missing_avg > 50]
    if not high_missing_features.empty:
        print(f"\n   ‚ö†Ô∏è  {len(high_missing_features)} feature(s) with > 50% missing across all diseases")
        print("      These features may have limited utility for biomarker discovery")

    # 3. Test Volume
    print("\n3. TEST VOLUME PER PATIENT")
    volume_col = summary_df['Avg Tests/Patient']
    min_tests = volume_col.min()
    max_tests = volume_col.max()
    avg_tests = volume_col.mean()

    print(f"   Range: {min_tests:.1f} to {max_tests:.1f} tests per patient")
    print(f"   Average: {avg_tests:.1f} tests per patient")

    # 4. Temporal Patterns
    # NOTE(review): the lines below are fixed text; nothing in this cell derives them from the data
    for line in (
        "\n4. TEMPORAL PATTERNS",
        "   Most CBC tests occur within:",
        "      - First 24 hours: Majority of initial testing",
        "      - Days 2-7: Follow-up monitoring",
        "   ‚ÑπÔ∏è  Consider using first 24h CBC values as baseline features",
    ):
        print(line)

    # 5. Recommendations
    print("\n5. RECOMMENDATIONS FOR HANDLING MISSING VALUES")
    print("   Strategy by feature completeness:")

    # (exclusive upper bound on % missing, advice); first matching tier wins
    imputation_tiers = (
        (20, "Use as-is with simple imputation (median/mean)"),
        (40, "Multiple imputation or create 'missing' indicator"),
        (60, "Consider as optional feature with missingness indicator"),
    )
    fallback_strategy = "‚ö†Ô∏è  May need to exclude from primary analysis"

    for feature, pct_missing in feature_missing_avg.items():
        strategy = next(
            (advice for upper, advice in imputation_tiers if pct_missing < upper),
            fallback_strategy,
        )
        print(f"      {feature:15s} ({pct_missing:>5.1f}% missing): {strategy}")

    print("\n" + "="*80)

## 8. Key Findings Summary

In [None]:
# Text summary of the headline numbers: best/worst coverage, test volume,
# most/least complete features and which diseases have enough data.
if not (DATA_AVAILABLE and LAB_DATA_AVAILABLE):
    print("‚ö†Ô∏è  Cannot generate findings - data not available")
else:
    print("\n" + "="*80)
    print("KEY FINDINGS")
    print("="*80)

    coverage_col = summary_df['% with CBC']
    volume_col = summary_df['Avg Tests/Patient']
    cbc_patients_col = summary_df['Patients with CBC']

    # 1. Overall CBC availability
    print("\n1. CBC TEST AVAILABILITY")
    best_coverage = summary_df.loc[coverage_col.idxmax()]
    worst_coverage = summary_df.loc[coverage_col.idxmin()]

    print(f"   Best coverage: {best_coverage['Disease']} ({best_coverage['% with CBC']:.1f}%)")
    print(f"   Worst coverage: {worst_coverage['Disease']} ({worst_coverage['% with CBC']:.1f}%)")
    print(f"   Average coverage: {coverage_col.mean():.1f}%")

    # 2. Test volume
    print("\n2. TEST VOLUME")
    most_tests = summary_df.loc[volume_col.idxmax()]
    least_tests = summary_df.loc[volume_col.idxmin()]

    print(f"   Most tests: {most_tests['Disease']} ({most_tests['Avg Tests/Patient']:.1f} avg)")
    print(f"   Least tests: {least_tests['Disease']} ({least_tests['Avg Tests/Patient']:.1f} avg)")
    print(f"   Overall average: {volume_col.mean():.1f} tests per patient")

    # 3. Most complete CBC features (mean availability over diseases, best first)
    print("\n3. MOST COMPLETE CBC FEATURES")
    feature_completeness = (
        missing_df
        .groupby('feature')['pct_with_feature']
        .mean()
        .sort_values(ascending=False)
    )
    print("   Top 5 most complete features (avg % of patients with measurements):")
    for feature, pct in feature_completeness.head(5).items():
        print(f"      - {feature:15s}: {pct:.1f}%")

    # 4. Least complete CBC features
    print("\n4. LEAST COMPLETE CBC FEATURES (May need special handling)")
    print("   Bottom 5 features:")
    for feature, pct in feature_completeness.tail(5).items():
        print(f"      - {feature:15s}: {pct:.1f}% (‚ö†Ô∏è  {100-pct:.1f}% missing)")

    # 5. Temporal insights
    # NOTE(review): fixed text; nothing in this cell derives these statements from the data
    print("\n5. TEMPORAL INSIGHTS")
    for line in (
        "   ‚úì Majority of CBC tests occur within first 24 hours of admission",
        "   ‚úì Consistent testing patterns across all diseases",
        "   ‚úì Sufficient data for temporal trend analysis",
    ):
        print(line)

    # 6. Data sufficiency for ML
    print("\n6. DATA SUFFICIENCY FOR MACHINE LEARNING")

    # Sufficient: at least 1000 CBC patients and at least 70 % coverage
    diseases_sufficient = summary_df[(cbc_patients_col >= 1000) & (coverage_col >= 70)]
    if not diseases_sufficient.empty:
        print(f"   ‚úì {len(diseases_sufficient)} disease(s) have sufficient data (>1000 patients, >70% coverage):")
        for _, row in diseases_sufficient.iterrows():
            print(f"      - {row['Disease']}: {row['Patients with CBC']:,} patients, {row['% with CBC']:.1f}% coverage")

    # Marginal: misses one of the two bars but still has at least 500 CBC patients
    misses_a_bar = (cbc_patients_col < 1000) | (coverage_col < 70)
    diseases_marginal = summary_df[misses_a_bar & (cbc_patients_col >= 500)]
    if not diseases_marginal.empty:
        print(f"\n   ‚ö†Ô∏è  {len(diseases_marginal)} disease(s) with marginal data (may need special handling):")
        for _, row in diseases_marginal.iterrows():
            print(f"      - {row['Disease']}: {row['Patients with CBC']:,} patients, {row['% with CBC']:.1f}% coverage")

    print("\n" + "="*80)

## 9. Next Steps and Recommendations

In [None]:
# Closing report: which features and diseases to prioritise, how to treat
# missing values, and the list of files written by this notebook.
if DATA_AVAILABLE and LAB_DATA_AVAILABLE:
    print("\n" + "="*80)
    print("RECOMMENDATIONS FOR BIOMARKER DISCOVERY")
    print("="*80)

    print("\n1. FEATURE SELECTION")
    print("   Prioritize features with < 30% missing data:")
    # Mean missing rate per feature over the diseases. Named for what it holds
    # so it is not confused with an availability ("completeness") series.
    feature_missing_mean = missing_df.groupby('feature')['pct_missing'].mean()
    priority_features = feature_missing_mean[feature_missing_mean < 30].sort_values()
    for feature, pct_missing in priority_features.items():
        print(f"      ‚úì {feature:15s} ({100-pct_missing:.1f}% available)")

    print("\n2. MISSING VALUE STRATEGY")
    print("   Recommended approach by feature type:")
    print("   ‚Ä¢ Core CBC (Hgb, WBC, Platelets): Simple imputation (median by disease)")
    print("   ‚Ä¢ Differential counts: Multiple imputation or missingness indicators")
    print("   ‚Ä¢ Rare features (>50% missing): Separate analysis or exclusion")

    print("\n3. TEMPORAL FEATURES")
    print("   Create time-based features:")
    print("   ‚Ä¢ First 24h baseline values (most complete)")
    print("   ‚Ä¢ Trend over first 7 days (delta from baseline)")
    print("   ‚Ä¢ Time to first abnormal value")
    print("   ‚Ä¢ Maximum/minimum values during admission")

    print("\n4. DISEASE PRIORITIZATION FOR INITIAL ANALYSIS")
    print("   Start with diseases having best data quality:")
    top_diseases = summary_df.nlargest(3, 'Patients with CBC')
    # Number the entries 1..3 by position. The index label from iterrows() is
    # the row's label in summary_df (left over from its earlier sort), not a rank.
    for rank, (_, row) in enumerate(top_diseases.iterrows(), start=1):
        print(f"   {rank}. {row['Disease']}")
        print(f"      ‚Ä¢ {row['Patients with CBC']:,} patients with CBC data")
        print(f"      ‚Ä¢ {row['% with CBC']:.1f}% coverage")
        print(f"      ‚Ä¢ {row['Avg Tests/Patient']:.1f} avg tests per patient")

    print("\n5. QUALITY CONTROL STEPS")
    print("   Before biomarker generation:")
    print("   ‚Ä¢ Remove outliers (values > 5 SD from mean)")
    print("   ‚Ä¢ Check for data entry errors (impossible values)")
    print("   ‚Ä¢ Validate against reference ranges")
    print("   ‚Ä¢ Handle unit conversions if needed")

    print("\n6. NEXT ANALYSIS STEPS")
    print("   1. Extract baseline CBC values (first 24h of admission)")
    print("   2. Compare CBC distributions: disease vs control patients")
    print("   3. Identify statistically significant differences")
    print("   4. Generate threshold-based candidate biomarkers")
    print("   5. Validate biomarkers on held-out test set")

    print("\n" + "="*80)
    print("\n‚úì CBC Data Availability Analysis Complete!")
    print("\nOutputs saved:")
    print("  ‚Ä¢ cbc_availability_summary.csv")
    print("  ‚Ä¢ cbc_missing_values_detailed.csv")
    print("  ‚Ä¢ cbc_availability_by_disease.png")
    print("  ‚Ä¢ cbc_missing_values_heatmap.png")
    print("  ‚Ä¢ cbc_tests_per_patient_histogram.png")
    print("  ‚Ä¢ cbc_temporal_patterns.png")
    print("\nReady to proceed with biomarker generation!")
else:
    print("‚ö†Ô∏è  Cannot generate recommendations - data not available")