In [None]:
# %% [markdown]
# # Sepsis Prediction Model Training with MIMIC-III on PhysioNet
# 
# This notebook trains sepsis prediction models using MIMIC-III data directly from PhysioNet.
# 
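# **Credentials Required**: MIMIC-III is a credentialed-access dataset. You need a PhysioNet account (https://physionet.org/) that has been approved for MIMIC-III, which requires completing the required human-subjects research training and signing the data use agreement.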

# %% [markdown]
# ## Step 1: Setup and Authentication

# %%
# Install required packages
!pip install wget pandas numpy scikit-learn xgboost shap imbalanced-learn plotly seaborn matplotlib scipy
!pip install psycopg2-binary sqlalchemy  # Optional: if you want to connect to PostgreSQL

# %%
# Import libraries
import os
import wget
import pandas as pd
import numpy as np
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, confusion_matrix, 
                           classification_report, roc_curve, auc)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Explainability
import shap

# Progress bar
from tqdm import tqdm

print("✓ Libraries imported")

# %% [markdown]
# ## Step 2: Configure PhysioNet Credentials

# %%
# Configure PhysioNet credentials.
# Credentials are never stored in the notebook: they are read from the
# PHYSIONET_USERNAME / PHYSIONET_PASSWORD environment variables, falling back
# to an interactive prompt (the password prompt does not echo).
# NOTE(review): a real username/password used to be hardcoded in this cell.
# Treat that password as leaked and change it on physionet.org.
from getpass import getpass

PHYSIONET_USERNAME = os.environ.get("PHYSIONET_USERNAME") or input("PhysioNet username: ")
PHYSIONET_PASSWORD = os.environ.get("PHYSIONET_PASSWORD") or getpass("PhysioNet password: ")

# Create .netrc content for authentication
netrc_content = f"""
machine physionet.org
login {PHYSIONET_USERNAME}
password {PHYSIONET_PASSWORD}
"""

# Save credentials in the current user's home directory (this is /root when
# running as root) instead of a hardcoded path.
netrc_path = os.path.join(os.path.expanduser('~'), '.netrc')

# Create the file readable/writable by the owner only, since it holds a password.
netrc_fd = os.open(netrc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(netrc_fd, 'w') as f:
    f.write(netrc_content)
# The mode passed to os.open only applies to newly created files; tighten a
# pre-existing file as well.
os.chmod(netrc_path, 0o600)

print("✓ PhysioNet credentials configured")

# %% [markdown]
# ## Step 3: Download Required MIMIC-III Files

# %%
# Define which files to download (we'll start with essential tables).
# Downloads are served from the /files/ endpoint; /content/ is the project
# landing page and answers direct file requests with "403 Forbidden".
import subprocess

MIMIC_BASE_URL = "https://physionet.org/files/mimiciii/1.4"
MIMIC_FILES = {
    "PATIENTS": f"{MIMIC_BASE_URL}/PATIENTS.csv.gz",
    "ADMISSIONS": f"{MIMIC_BASE_URL}/ADMISSIONS.csv.gz",
    "DIAGNOSES_ICD": f"{MIMIC_BASE_URL}/DIAGNOSES_ICD.csv.gz",
    #"D_ICD_DIAGNOSES": "https://physionet.org/files/mimiciii/1.4/D_ICD_DIAGNOSES.csv.gz",
    # Optional: add more tables as needed
    # "CHARTEVENTS": "https://physionet.org/files/mimiciii/1.4/CHARTEVENTS.csv.gz",
    # "LABEVENTS": "https://physionet.org/files/mimiciii/1.4/LABEVENTS.csv.gz",
    # "D_ITEMS": "https://physionet.org/files/mimiciii/1.4/D_ITEMS.csv.gz",
    # "D_LABITEMS": "https://physionet.org/files/mimiciii/1.4/D_LABITEMS.csv.gz",
}

os.makedirs('mimic_data', exist_ok=True)

# Download files (single authenticated pass).
failed_downloads = []
print("Downloading MIMIC-III files from PhysioNet...")
for file_name, url in MIMIC_FILES.items():
    output_path = f"mimic_data/{file_name}.csv.gz"

    # `wget -O` creates the output file before the request succeeds, so a failed
    # run leaves a zero-byte file behind. Only a non-empty file counts as cached.
    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
        print(f"✓ {file_name} already exists")
        continue

    print(f"Downloading {file_name}...")
    try:
        # Argument list (no shell): credentials are passed verbatim and are not
        # subject to shell interpretation of characters such as '#' or '$'.
        completed = subprocess.run(
            ["wget", "-nv",
             "--user", PHYSIONET_USERNAME,
             "--password", PHYSIONET_PASSWORD,
             "-O", output_path, url],
            capture_output=True,
            text=True,
        )
        succeeded = completed.returncode == 0
        error_detail = "" if succeeded else completed.stderr.strip()
    except OSError as e:
        # Raised when the wget executable itself cannot be started.
        succeeded = False
        error_detail = str(e)

    if succeeded and os.path.exists(output_path) and os.path.getsize(output_path) > 0:
        print(f"✓ {file_name} downloaded")
    else:
        # Remove the partial/empty file so the next run retries the download
        # instead of reporting it as already present.
        if os.path.exists(output_path):
            os.remove(output_path)
        failed_downloads.append(file_name)
        print(f"✗ Error downloading {file_name}: {error_detail or 'empty response'}")

# Check the folder contents
print("Files now in mimic_data:", os.listdir('mimic_data'))

# Stop here with a clear message rather than failing later while parsing an
# empty CSV.
if failed_downloads:
    raise RuntimeError(
        "Could not download: " + ", ".join(failed_downloads)
        + ". Check your PhysioNet credentials and that your account has access to MIMIC-III."
    )

print("\n✓ All files downloaded")

# %% [markdown]
# ## Step 4: Load and Process Data

# %%
# Read the three core tables from the gzip-compressed CSVs in mimic_data/
print("Loading data...")

_read_options = dict(compression='gzip', engine='python')
patients = pd.read_csv('mimic_data/PATIENTS.csv.gz', **_read_options)
admissions = pd.read_csv('mimic_data/ADMISSIONS.csv.gz', **_read_options)
diagnoses = pd.read_csv('mimic_data/DIAGNOSES_ICD.csv.gz', **_read_options)
# icd_diagnoses = pd.read_csv('mimic_data/D_ICD_DIAGNOSES.csv.gz', compression='gzip', engine='python')

# (display label, frame) pairs, reported in load order
_loaded_tables = (
    ("Patients", patients),
    ("Admissions", admissions),
    ("Diagnoses", diagnoses),
)

# Report the (rows, columns) shape of every table
for table_label, table in _loaded_tables:
    print(f"{table_label}: {table.shape}")
# print(f"ICD Diagnoses Dictionary: {icd_diagnoses.shape}")

# Show column names
print("\nColumn names:")
for table_label, table in _loaded_tables:
    print(f"{table_label}:", table.columns.tolist())

# %% [markdown]
# ## Step 5: Create Sepsis Labels

# %%
# Define sepsis ICD-9 codes (standard sepsis codes), in the usual dotted notation
SEPSIS_ICD_CODES = [
    '038',       # Septicemia
    '038.0',     # Streptococcal septicemia
    '038.1',     # Staphylococcal septicemia
    '038.2',     # Pneumococcal septicemia
    '038.3',     # Septicemia due to anaerobes
    '038.4',     # Septicemia due to other gram-negative organisms
    '038.40',    # Septicemia due to gram-negative organism, unspecified
    '038.41',    # Septicemia due to hemophilus influenzae
    '038.42',    # Septicemia due to escherichia coli
    '038.43',    # Septicemia due to pseudomonas
    '038.44',    # Septicemia due to serratia
    '038.49',    # Septicemia due to other gram-negative organisms
    '038.8',     # Other specified septicemias
    '038.9',     # Unspecified septicemia
    '785.52',    # Septic shock
    '995.91',    # Sepsis
    '995.92'     # Severe sepsis
]

# MIMIC-III's DIAGNOSES_ICD stores ICD9_CODE without the decimal point
# (e.g. '99592', '0389'), so the dotted codes above never match the raw column.
# Compare on a dot-free form of both sides instead.
sepsis_codes_exact = {code.replace('.', '') for code in SEPSIS_ICD_CODES}
# Entries written without a decimal part (e.g. '038') name a whole category and
# are matched as prefixes, which also covers sub-codes not listed individually.
sepsis_code_prefixes = [code for code in SEPSIS_ICD_CODES if '.' not in code]

icd_codes = (
    diagnoses['ICD9_CODE']
    .astype(str)
    .str.strip()
    .str.replace('.', '', regex=False)
)
is_sepsis_code = icd_codes.isin(sepsis_codes_exact)
for prefix in sepsis_code_prefixes:
    is_sepsis_code |= icd_codes.str.startswith(prefix)

# Filter sepsis diagnoses
sepsis_diagnoses = diagnoses[is_sepsis_code]

# Create sepsis label for admissions.
# An admission is positive when its (SUBJECT_ID, HADM_ID) pair has at least one
# sepsis diagnosis. A single left merge replaces the per-row scan over all
# sepsis rows, which was quadratic in practice.
sepsis_keys = sepsis_diagnoses[['SUBJECT_ID', 'HADM_ID']].drop_duplicates()
admission_matches = admissions[['SUBJECT_ID', 'HADM_ID']].merge(
    sepsis_keys,
    on=['SUBJECT_ID', 'HADM_ID'],
    how='left',
    indicator=True,
)

admissions_with_sepsis = admissions.copy()
# The right side has unique keys, so the left merge keeps one row per admission
# in the original order; assign positionally.
admissions_with_sepsis['SEPSIS_LABEL'] = (
    (admission_matches['_merge'] == 'both').astype(int).to_numpy()
)

print(f"Total admissions: {len(admissions_with_sepsis)}")
print(f"Sepsis admissions: {admissions_with_sepsis['SEPSIS_LABEL'].sum()}")
print(f"Sepsis percentage: {admissions_with_sepsis['SEPSIS_LABEL'].mean()*100:.2f}%")

# Merge with patient data
admissions_patients = pd.merge(
    admissions_with_sepsis,
    patients[['SUBJECT_ID', 'GENDER', 'DOB']],
    on='SUBJECT_ID',
    how='left'
)

print(f"\nMerged data shape: {admissions_patients.shape}")

# %% [markdown]
# ## Step 6: Feature Engineering

# %%
print("Engineering features...")

# Convert dates
for date_col in ('ADMITTIME', 'DISCHTIME', 'DOB'):
    admissions_patients[date_col] = pd.to_datetime(admissions_patients[date_col])

# Calculate age (in years) from calendar components instead of subtracting the
# timestamps. MIMIC-III shifts the DOB of patients older than 89 to roughly 300
# years before admission; ADMITTIME - DOB then exceeds the range of a
# nanosecond Timedelta (about 292 years) and overflows.
admit_time = admissions_patients['ADMITTIME']
birth_date = admissions_patients['DOB']
admissions_patients['AGE'] = (
    (admit_time.dt.year - birth_date.dt.year)
    + (admit_time.dt.dayofyear - birth_date.dt.dayofyear) / 365.25
)
# Cap unrealistic ages (the shifted ~300-year values end up at 100).
# NOTE(review): the lower bound also lifts every patient younger than 18 to 18 —
# confirm this is intended rather than excluding those admissions.
admissions_patients['AGE'] = admissions_patients['AGE'].clip(18, 100)

# Calculate length of stay (whole days between admission and discharge)
admissions_patients['LOS_DAYS'] = (admissions_patients['DISCHTIME'] - admissions_patients['ADMITTIME']).dt.days

# Extract features from existing columns: one 0/1 flag per admission type
admissions_patients['IS_EMERGENCY'] = (admissions_patients['ADMISSION_TYPE'] == 'EMERGENCY').astype(int)
admissions_patients['IS_URGENT'] = (admissions_patients['ADMISSION_TYPE'] == 'URGENT').astype(int)
admissions_patients['IS_ELECTIVE'] = (admissions_patients['ADMISSION_TYPE'] == 'ELECTIVE').astype(int)

# Encode categorical variables as integer codes; the fitted encoders are kept
# so the same mapping can be reused at inference time.
label_encoders = {}
categorical_cols = ['GENDER', 'ADMISSION_LOCATION', 'INSURANCE', 'LANGUAGE', 
                    'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DISCHARGE_LOCATION']

for col in categorical_cols:
    if col in admissions_patients.columns:
        le = LabelEncoder()
        # Missing values become their own 'Unknown' category
        admissions_patients[col + '_ENCODED'] = le.fit_transform(admissions_patients[col].fillna('Unknown'))
        label_encoders[col] = le

print(f"Features created. Total columns: {len(admissions_patients.columns)}")

# %% [markdown]
# ## Step 7: Prepare Training Data

# %%
# Assemble the feature list: fixed numeric/flag columns first, followed by every
# label-encoded categorical column created during feature engineering.
# NOTE(review): LOS_DAYS is derived from DISCHTIME, and HOSPITAL_EXPIRE_FLAG /
# DISCHARGE_LOCATION_ENCODED appear to describe the outcome of the stay —
# confirm these are meant to be predictors.
base_features = ['AGE', 'LOS_DAYS', 'IS_EMERGENCY', 'IS_URGENT', 'IS_ELECTIVE', 'HOSPITAL_EXPIRE_FLAG']
encoded_features = [name for name in admissions_patients.columns if '_ENCODED' in name]

# Drop any requested column that is not present in the frame
available_columns = set(admissions_patients.columns)
feature_columns = [name for name in base_features + encoded_features if name in available_columns]

print(f"Selected {len(feature_columns)} features:")
print(feature_columns)

# Feature matrix and target vector (copies, so the source frame stays untouched)
X = admissions_patients.loc[:, feature_columns].copy()
y = admissions_patients.loc[:, 'SEPSIS_LABEL'].copy()

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Sepsis percentage: {y.mean()*100:.2f}%")

# Median-impute missing values.
# NOTE(review): imputer and scaler are fitted on all rows here, i.e. before any
# train/test split performed later in the notebook.
print("\nHandling missing values...")
missing_before = X.isna().sum().sum()
imputer = SimpleImputer(strategy='median').fit(X)
X_imputed = imputer.transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)
missing_after = X.isna().sum().sum()

print(f"Missing values: {missing_before} → {missing_after}")

# Standardise every feature to zero mean / unit variance
print("\nScaling features...")
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

print("✓ Data preparation complete")

# %% [markdown]
# ## Step 8: Handle Class Imbalance

# %%
print(f"Original class distribution: {pd.Series(y).value_counts().to_dict()}")

# Hold out the test set BEFORE oversampling. Running SMOTE on the full data set
# and splitting afterwards puts synthetic points interpolated from test rows
# into the training set, and synthetic rows into the test set, which inflates
# every reported metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Use SMOTE for balanced training — applied to the training portion only
smote = SMOTE(random_state=42, sampling_strategy=0.5)  # Balance to 1:2 ratio
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

print(f"\nAfter SMOTE:")
print(f"  X shape: {X_resampled.shape}")
print(f"  Class distribution: {pd.Series(y_resampled).value_counts().to_dict()}")
print(f"  Sepsis percentage: {y_resampled.mean()*100:.2f}%")

# %% [markdown]
# ## Step 9: Train-Test Split

# %%
# The split itself was made above (before SMOTE). The models train on the
# oversampled training portion and are evaluated on the untouched hold-out
# rows, which keep the real class balance.
X_train, y_train = X_resampled, y_resampled

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
print(f"\nTraining sepsis: {y_train.sum()} ({y_train.mean()*100:.1f}%)")
print(f"Testing sepsis: {y_test.sum()} ({y_test.mean()*100:.1f}%)")

# %% [markdown]
# ## Step 10: Model Training with GPU Acceleration

# %%
# Enable GPU for XGBoost if available
import sys

# Base hyper-parameters; the device / tree-method settings are added below.
xgboost_params = {
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'logloss'
}

# XGBoost 2.0 replaced tree_method='gpu_hist' / predictor='gpu_predictor' with a
# single device='cuda' setting and no longer uses `use_label_encoder`, so pick
# the parameter names that match the installed version.
try:
    import xgboost
    xgboost_major = int(xgboost.__version__.split('.')[0])
except (ImportError, ValueError):
    xgboost_major = None  # unknown version: fall back to the legacy names
uses_device_api = xgboost_major is not None and xgboost_major >= 2

if not uses_device_api:
    xgboost_params['use_label_encoder'] = False

# Check for GPU. torch is used only for CUDA detection; a missing or broken
# torch install must not stop training, but Ctrl-C / SystemExit should still
# propagate (hence `except Exception`, not a bare `except`).
try:
    import torch
    gpu_available = bool(torch.cuda.is_available())
except Exception:
    gpu_available = None  # the check itself failed

xgboost_params['tree_method'] = 'hist'
if gpu_available:
    if uses_device_api:
        xgboost_params['device'] = 'cuda'
    else:
        xgboost_params['tree_method'] = 'gpu_hist'
        xgboost_params['predictor'] = 'gpu_predictor'
    print("✓ GPU detected. Using GPU acceleration for XGBoost")
elif gpu_available is None:
    print("ℹ GPU check failed. Using CPU for training")
else:
    print("ℹ No GPU detected. Using CPU for training")

# Candidate classifiers, keyed by display name. Insertion order is the order in
# which they are trained and plotted.
random_forest_kwargs = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
gradient_boosting_kwargs = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
)
logistic_regression_kwargs = dict(
    C=0.1,
    penalty='l2',
    solver='liblinear',
    random_state=42,
    max_iter=1000,
)

models = {}
models['Random Forest'] = RandomForestClassifier(**random_forest_kwargs)
models['XGBoost'] = XGBClassifier(**xgboost_params)
models['Gradient Boosting'] = GradientBoostingClassifier(**gradient_boosting_kwargs)
models['Logistic Regression'] = LogisticRegression(**logistic_regression_kwargs)

# Fit every candidate on the training set, score it on the test set and keep
# the one with the highest F1.
print("\nTraining models...")
print("=" * 60)

results = {}
best_model = None
best_score = 0
best_model_name = ''

# Metrics computed from hard class predictions, in reporting order
label_metric_functions = (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1_score', f1_score),
)
# (printed label, metrics key); labels are padded to a common width when printed
report_rows = (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
    ('ROC-AUC:', 'roc_auc'),
)

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    # Hard labels plus the positive-class probability
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    metrics = {key: score_fn(y_test, y_pred) for key, score_fn in label_metric_functions}
    # ROC-AUC is computed from probabilities, not hard labels
    metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba)

    results[name] = dict(
        model=model,
        metrics=metrics,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

    for row_label, key in report_rows:
        print(f"  {row_label:<11}{metrics[key]:.4f}")

    # Strictly-greater comparison: on ties the earlier model is kept
    current_f1 = metrics['f1_score']
    if current_f1 > best_score:
        best_score, best_model, best_model_name = current_f1, model, name

separator = '=' * 60
print("\n" + separator)
print(f"✓ Best model: {best_model_name} with F1-Score: {best_score:.4f}")
print(separator)

# %% [markdown]
# ## Step 11: Model Evaluation and Visualization

# %%
# One confusion-matrix heatmap per trained model, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

for idx, name in enumerate(results):
    result = results[name]
    ax = axes[idx]
    cm = confusion_matrix(y_test, result['predictions'])
    f1_value = result["metrics"]["f1_score"]

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set(
        title=f'{name}\nF1: {f1_value:.3f}',
        xlabel='Predicted',
        ylabel='Actual',
    )

plt.tight_layout()
plt.show()

# %%
# Overlay the ROC curve of every model on a single axes
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

colors = ['blue', 'green', 'red', 'purple']
for idx, name in enumerate(results):
    result = results[name]
    fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
    roc_auc = auc(fpr, tpr)
    curve_label = f'{name} (AUC = {roc_auc:.3f})'
    roc_ax.plot(fpr, tpr, color=colors[idx], lw=2, label=curve_label)

# Diagonal reference line (a classifier with no skill)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - Model Comparison')
roc_ax.legend(loc="lower right")
roc_ax.grid(True, alpha=0.3)
plt.show()

# %% [markdown]
# ## Step 12: Feature Importance Analysis

# %%
# Get feature importance from best model.
# Reset first: if this cell is re-run after a different model has won, a table
# left over from the earlier run must not be reported for a model that exposes
# no `feature_importances_`.
feature_importance = None

if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Plot top 15 features (or fewer when the model has fewer features)
    plt.figure(figsize=(12, 8))
    top_n = min(15, len(feature_importance))
    sns.barplot(x='importance', y='feature', 
                data=feature_importance.head(top_n))
    plt.title(f'Top {top_n} Feature Importance - {best_model_name}')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()
    
    # Save feature importance
    feature_importance.to_csv('feature_importance.csv', index=False)
    print("✓ Feature importance saved to feature_importance.csv")

# %% [markdown]
# ## Step 13: SHAP Explainability

# %%
print("Creating SHAP explanations...")

# Best-effort step: any failure is reported and the notebook continues.
try:
    # Explain only the first 100 training rows to keep this fast
    X_sample = X_train.iloc[:100]

    uses_tree_explainer = best_model_name in ['XGBoost', 'Random Forest', 'Gradient Boosting']

    if not uses_tree_explainer:
        # Model-agnostic fallback for non-tree models; values are computed for
        # five rows only because KernelExplainer is slow.
        explainer = shap.KernelExplainer(best_model.predict_proba, X_sample)
        shap_values = explainer.shap_values(X_sample.iloc[:5])  # Small sample for speed
        print("ℹ Using KernelExplainer (small sample for speed)")
    else:
        explainer = shap.TreeExplainer(best_model)
        shap_values = explainer.shap_values(X_sample)

        # Bar-style summary of the SHAP values per feature
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False)
        plt.title(f'SHAP Feature Importance - {best_model_name}')
        plt.tight_layout()
        plt.show()

        # Persist the explainer next to the notebook for later reuse
        with open('shap_explainer.pkl', 'wb') as f:
            pickle.dump(explainer, f)
        print("✓ SHAP explainer saved")

except Exception as e:
    print(f"ℹ SHAP explanation skipped: {e}")

# %% [markdown]
# ## Step 14: Save Model Artifacts

# %%
# Make sure the output directory exists before writing anything into it
os.makedirs('model_artifacts', exist_ok=True)

# In-memory bundle of everything produced by training (kept for interactive use;
# the individual files written below are what gets persisted)
artifacts = dict(
    best_model=best_model,
    best_model_name=best_model_name,
    scaler=scaler,
    imputer=imputer,
    feature_columns=feature_columns,
    label_encoders=label_encoders,
    results=results,
    feature_importance=feature_importance if 'feature_importance' in locals() else None,
)

# Pickled objects written before the JSON feature list
for pickle_path, payload in (
    ('model_artifacts/best_model.pkl', best_model),
    ('model_artifacts/scaler.pkl', scaler),
    ('model_artifacts/imputer.pkl', imputer),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

# Feature names are plain strings, so they are stored as JSON
with open('model_artifacts/feature_columns.json', 'w') as f:
    json.dump(feature_columns, f)

with open('model_artifacts/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

# Summary of what was written
for summary_line in (
    "✓ Model artifacts saved:",
    "  1. best_model.pkl - Trained model",
    "  2. scaler.pkl - Feature scaler",
    "  3. imputer.pkl - Missing value imputer",
    "  4. feature_columns.json - Feature names",
    "  5. label_encoders.pkl - Categorical encoders",
):
    print(summary_line)

# %% [markdown]
# ## Step 15: Create Inference Script for VS Code

# %%
# Create inference script.
# `inference_script` holds the full source of a stand-alone module that loads
# the saved artifacts and exposes a SepsisPredictor class; it is written to
# sepsis_predictor.py below.
inference_script = '''
import pickle
import json
import pandas as pd
import numpy as np
import os

class SepsisPredictor:
    """Pre-trained sepsis prediction model"""
    
    def __init__(self, model_dir="model_artifacts/"):
        self.model_dir = model_dir
        self.model = None
        self.scaler = None
        self.imputer = None
        self.feature_columns = None
        self.label_encoders = None
        self.is_loaded = False
    
    def load_model(self):
        """Load all model artifacts"""
        # NOTE: pickle.load can execute arbitrary code. Only load artifact
        # files that you created yourself or obtained from a trusted source.
        try:
            # Load model
            with open(os.path.join(self.model_dir, 'best_model.pkl'), 'rb') as f:
                self.model = pickle.load(f)
            
            # Load scaler
            with open(os.path.join(self.model_dir, 'scaler.pkl'), 'rb') as f:
                self.scaler = pickle.load(f)
            
            # Load imputer
            with open(os.path.join(self.model_dir, 'imputer.pkl'), 'rb') as f:
                self.imputer = pickle.load(f)
            
            # Load feature columns
            with open(os.path.join(self.model_dir, 'feature_columns.json'), 'r') as f:
                self.feature_columns = json.load(f)
            
            # Load label encoders
            with open(os.path.join(self.model_dir, 'label_encoders.pkl'), 'rb') as f:
                self.label_encoders = pickle.load(f)
            
            self.is_loaded = True
            print("✓ Sepsis prediction model loaded successfully")
            return True
            
        except Exception as e:
            print(f"Error loading model: {e}")
            return False
    
    def preprocess_input(self, patient_data):
        """Preprocess patient data for prediction"""
        if isinstance(patient_data, dict):
            df = pd.DataFrame([patient_data])
        else:
            df = patient_data.copy()
        
        # Ensure all features exist
        for feature in self.feature_columns:
            if feature not in df.columns:
                # Provide sensible defaults
                if feature == 'AGE':
                    df[feature] = 50
                elif feature == 'LOS_DAYS':
                    df[feature] = 5
                elif feature.startswith('IS_'):
                    df[feature] = 0
                elif feature.endswith('_ENCODED'):
                    # Try to encode categorical values
                    original_col = feature.replace('_ENCODED', '')
                    if original_col in df.columns and original_col in self.label_encoders:
                        try:
                            df[feature] = self.label_encoders[original_col].transform(
                                df[original_col].fillna('Unknown')
                            )
                        except Exception:
                            # Encoding failed (e.g. a category not seen during
                            # training): fall back to code 0. Not a bare except,
                            # so KeyboardInterrupt/SystemExit still propagate.
                            df[feature] = 0
                    else:
                        df[feature] = 0
                else:
                    df[feature] = 0
        
        # Reorder columns to match training
        df = df[self.feature_columns]
        
        # Apply preprocessing pipeline
        df_imputed = self.imputer.transform(df)
        df_scaled = self.scaler.transform(df_imputed)
        
        return df_scaled
    
    def predict(self, patient_data):
        """Make sepsis prediction"""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        
        # Preprocess
        processed_data = self.preprocess_input(patient_data)
        
        # Predict
        prediction = self.model.predict(processed_data)[0]
        probability = self.model.predict_proba(processed_data)[0][1]
        
        # Determine risk level
        risk_level = self._get_risk_level(probability)
        
        return {
            'prediction': int(prediction),
            'probability': float(probability),
            'risk_level': risk_level,
            'recommendation': self._get_recommendation(risk_level),
            'confidence': self._get_confidence(probability)
        }
    
    def _get_risk_level(self, probability):
        if probability >= 0.7:
            return "High"
        elif probability >= 0.4:
            return "Medium"
        else:
            return "Low"
    
    def _get_recommendation(self, risk_level):
        recommendations = {
            "High": "Immediate medical attention required. Consider ICU admission.",
            "Medium": "Close monitoring recommended. Repeat assessment in 6 hours.",
            "Low": "Regular monitoring. Follow clinical protocols."
        }
        return recommendations.get(risk_level, "Consult physician.")
    
    def _get_confidence(self, probability):
        if probability > 0.8 or probability < 0.2:
            return "High"
        elif probability > 0.6 or probability < 0.4:
            return "Medium"
        else:
            return "Low"
    
    def get_feature_importance(self, top_n=10):
        """Get top N important features"""
        if hasattr(self.model, 'feature_importances_'):
            importance = dict(zip(self.feature_columns, self.model.feature_importances_))
            sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
            return sorted_importance[:top_n]
        return None

# Singleton instance for easy access
sepsis_predictor = SepsisPredictor()

if __name__ == "__main__":
    # Example usage
    predictor = SepsisPredictor("model_artifacts/")
    if predictor.load_model():
        # Example patient
        sample_patient = {
            'AGE': 65,
            'LOS_DAYS': 3,
            'IS_EMERGENCY': 1,
            'IS_URGENT': 0,
            'IS_ELECTIVE': 0,
            'HOSPITAL_EXPIRE_FLAG': 0,
            'GENDER_ENCODED': 1,
            'ADMISSION_LOCATION_ENCODED': 2,
            'INSURANCE_ENCODED': 1,
            'ETHNICITY_ENCODED': 3
        }
        
        result = predictor.predict(sample_patient)
        print(f"Prediction: {result}")
'''

# Save inference script.
# The source contains non-ASCII characters ("✓"), so the encoding is set
# explicitly; relying on the platform default fails on non-UTF-8 locales.
with open('sepsis_predictor.py', 'w', encoding='utf-8') as f:
    f.write(inference_script)

print("✓ Inference script created: sepsis_predictor.py")

# %% [markdown]
# ## Step 16: Create Complete Model Package

# %%
# Create a zip file with everything needed
import zipfile
import os

# Files to include in the package
files_to_zip = [
    'model_artifacts/best_model.pkl',
    'model_artifacts/scaler.pkl',
    'model_artifacts/imputer.pkl',
    'model_artifacts/feature_columns.json',
    'model_artifacts/label_encoders.pkl',
    'sepsis_predictor.py',
    'feature_importance.csv'
]

# Filter existing files (e.g. feature_importance.csv is only written for models
# that expose feature importances)
existing_files = [f for f in files_to_zip if os.path.exists(f)]

# Create zip
with zipfile.ZipFile('sepsis_model_package.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for file in existing_files:
        # Keep the relative path inside the archive. sepsis_predictor.py looks
        # for its artifacts under "model_artifacts/" by default, so flattening
        # everything to the archive root would leave the extracted package
        # unable to load the model.
        zipf.write(file, arcname=file)
print("✓ Model package created: sepsis_model_package.zip")
    
   

✓ Libraries imported
✓ PhysioNet credentials configured
Downloading PATIENTS...
--2025-12-15 10:34:31--  https://physionet.org/content/mimiciii/1.4/PATIENTS.csv.gz
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2025-12-15 10:34:32 ERROR 403: Forbidden.

Downloading ADMISSIONS...
--2025-12-15 10:34:32--  https://physionet.org/content/mimiciii/1.4/ADMISSIONS.csv.gz
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2025-12-15 10:34:32 ERROR 403: Forbidden.

Downloading DIAGNOSES_ICD...
--2025-12-15 10:34:32--  https://physionet.org/content/mimiciii/1.4/DIAGNOSES_ICD.csv.gz
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awa

EmptyDataError: No columns to parse from file