In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Make a copy to avoid modifying the original
df = pd.read_csv('final_hf_data.csv')

# --- Step 1: Compute Age ---
df['admittime'] = pd.to_datetime(df['admittime'])
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = (df['admittime'] - df['dob']).dt.days / 365.25
df['age'] = df['age'].clip(0, 120)  # remove weird age outliers

# --- Step 2: Select final features ---
features = [
    'age',
    'gender',
    'ethnicity_group',
    'insurance_type',
    'admit_hour',
    'admit_day',
    'admission_type',
    'avg_lab_value',
    'cpt_code_count',
    'drg_severity',
    'icd9_category'
]

X = df[features]

# --- Step 3: Handle missing values ---
# Separate categorical and numeric columns
cat_cols = ['gender', 'ethnicity_group', 'insurance_type', 'admission_type', 'icd9_category']
num_cols = ['age', 'admit_hour', 'admit_day', 'avg_lab_value', 'cpt_code_count', 'drg_severity']

# Impute
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X[num_cols] = num_imputer.fit_transform(X[num_cols])

# --- Step 4: Encode categoricals and scale numerics ---
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()

X_encoded = encoder.fit_transform(X[cat_cols])
X_scaled = scaler.fit_transform(X[num_cols])

# Combine final feature set
X_final = pd.DataFrame(
    data = np.hstack((X_scaled, X_encoded)),
    columns = (
        [f'scaled_{col}' for col in num_cols] +
        encoder.get_feature_names_out(cat_cols).tolist()
    )
)

# X_final is ready for modeling
print(X_final.head())


   scaled_age  scaled_admit_hour  scaled_admit_day  scaled_avg_lab_value  \
0    0.472772           0.793785          0.166757             -0.047532   
1   -2.234930          -0.154058          1.709025              0.000184   
2    1.323478          -0.312031          1.194936             -0.074927   
3    1.323478          -0.312031          1.194936             -0.074927   
4    0.120410           0.161890         -1.375511             -0.116460   

   scaled_cpt_code_count  scaled_drg_severity  gender_F  gender_M  \
0              -1.158667         1.044721e-15       0.0       1.0   
1              -1.454562         1.044721e-15       0.0       1.0   
2               1.208491         1.044721e-15       0.0       1.0   
3               1.208491         1.255081e+00       0.0       1.0   
4              -1.158667         1.044721e-15       0.0       1.0   

   ethnicity_group_Non-Caucasian  insurance_type_Government  \
0                            1.0                        0.0   
1 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = num_imputer.fit_transform(X[num_cols])


In [8]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 1.9 MB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 1.7 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 1.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.6 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install shap


Collecting shap
  Downloading shap-0.47.2-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.2-cp312-cp312-win_amd64.whl.metadata (2.9 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-extensions (from shap)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Downloading shap-0.47.2-cp312-cp312-win_amd64.whl (545 kB)
   ---------------------------------------- 0.0/545.2 kB ? eta -:--:--
   ---------------------------------------- 545.2/545.2 kB 2.6 MB/s eta 0:00:00
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading numba-0.61.2-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------

In [34]:
pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    roc_auc_score, precision_recall_curve
)

# --- Step 1: Clean feature names ---
X_final.columns = X_final.columns.str.replace(' ', '_')

# --- Step 2: Define target and train-test split ---
y = df['readmitted_within_30_days'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

# --- Step 3: Train Random Forest model ---
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight='balanced',  # helps a bit with imbalance
    random_state=42
)
rf.fit(X_train, y_train)

# --- Step 4: Predict probabilities ---
y_proba = rf.predict_proba(X_test)[:, 1]

# --- Step 5: Find best threshold using F2-score ---
prec, rec, thresh = precision_recall_curve(y_test, y_proba)
f2_scores = (5 * prec * rec) / (4 * prec + rec + 1e-8)
best_idx = np.argmax(f2_scores)
best_threshold = thresh[best_idx]

# --- Step 6: Predict using optimized threshold ---
y_pred = (y_proba >= best_threshold).astype(int)

# --- Step 7: Final evaluation ---
print(f"\n--- Random Forest (Optimized Threshold = {best_threshold:.4f}) ---")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))  # stays same
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))





#best code so far



--- Random Forest (Optimized Threshold = 0.0766) ---
Accuracy: 0.9433258762117822
ROC AUC: 0.7555705691367457
Confusion Matrix:
 [[6248  297]
 [  83   77]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      6545
           1       0.21      0.48      0.29       160

    accuracy                           0.94      6705
   macro avg       0.60      0.72      0.63      6705
weighted avg       0.97      0.94      0.95      6705



In [8]:
import pickle

# Save the trained model to a file
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf, file)

print("Model saved successfully!")


# Save threshold
with open('best_threshold.txt', 'w') as f:
    f.write(str(best_threshold))

Model saved successfully!
