# <center>INFO511: Final Project</center>
### <center>Namig Abbasov</center>

## Import Libraries and Data

In [19]:
import os
import random
import numpy as np
import pandas as pd
from zipfile import ZipFile

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [20]:
### reproducibility
# One seed drives Python's `random`, NumPy's legacy global RNG, and every
# estimator in this notebook that is given `random_state=SEED`.
SEED = 42

# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it
# here only reaches processes spawned afterwards, not the running kernel.
os.environ['PYTHONHASHSEED'] = f"{SEED}"
np.random.seed(SEED)
random.seed(SEED)

In [21]:
# Load the three pre-split CSV files in a fixed order: train, dev, test.
train, dev, test = map(
    pd.read_csv,
    ("data/train.csv", "data/dev.csv", "data/test.csv"),
)

## Data Preprocessing 

In [22]:
### Feature engineering
def add_features(df):
    """Return a copy of ``df`` with six engineered feature columns appended.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and the raw data stays inspectable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``age``, ``has_diabetes``, ``payer_type``,
        ``has_depression``, ``race``, ``ethnicity``, ``total_med_cost``,
        ``num_meds``, ``total_proc_cost``, ``num_procedures``,
        ``has_chronic_pain`` and ``pain_score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus:
        ``age_x_diabetes``, ``uninsured_x_depression``,
        ``race_ethnicity_combo``, ``cost_per_med``, ``cost_per_proc`` and
        ``chronic_pain_x_pain_score``.

    Raises
    ------
    KeyError
        If any of the required input columns is missing.
    """
    # Work on a copy so the caller's DataFrame is never mutated in place.
    out = df.copy()

    # Interaction terms.
    out["age_x_diabetes"] = out["age"] * out["has_diabetes"]
    out["uninsured_x_depression"] = (
        ((out["payer_type"] == "NO_INSURANCE") & (out["has_depression"] == 1)).astype(int)
    )
    out["chronic_pain_x_pain_score"] = out["has_chronic_pain"] * out["pain_score"]

    # Joint demographic category, e.g. "<race>_<ethnicity>".
    out["race_ethnicity_combo"] = out["race"] + "_" + out["ethnicity"]

    # Average cost per item; the +1 keeps the ratio finite when the count is 0.
    out["cost_per_med"] = out["total_med_cost"] / (out["num_meds"] + 1)
    out["cost_per_proc"] = out["total_proc_cost"] / (out["num_procedures"] + 1)
    return out

# Run every split through the same feature-engineering function so that
# train, dev and test end up with the same engineered columns.
train, dev, test = map(add_features, (train, dev, test))

In [23]:
### Features
# Model inputs: the raw columns first, then the columns created by add_features.
feature_cols = (
    [
        'age', 'gender', 'race', 'ethnicity', 'payer_type',
        'has_chronic_pain', 'has_hypertension', 'has_diabetes',
        'has_asthma', 'has_depression', 'encounter_cost',
        'num_meds', 'total_med_cost', 'num_procedures',
        'total_proc_cost', 'pain_score',
    ]
    + [
        'age_x_diabetes', 'uninsured_x_depression', 'race_ethnicity_combo',
        'cost_per_med', 'cost_per_proc', 'chronic_pain_x_pain_score',
    ]
)

# Design matrix and target for each split.
# NOTE(review): assumes the test file also carries the label column — confirm.
X_train, y_train = train[feature_cols], train["readmitted_within_30_days"]
X_dev, y_dev = dev[feature_cols], dev["readmitted_within_30_days"]
X_test, y_test = test[feature_cols], test["readmitted_within_30_days"]

# Row identifiers kept aside for the submission files.
dev_ids, test_ids = dev["encounter_id"], test["encounter_id"]

In [24]:
### Column types
# Columns sent to the categorical (impute + one-hot) branch of the preprocessor.
categorical_cols = [
    'gender',
    'race',
    'ethnicity',
    'payer_type',
    'race_ethnicity_combo',
]
# Every other feature goes to the numeric branch, in feature_cols order.
numeric_cols = list(filter(lambda name: name not in categorical_cols, feature_cols))

In [25]:
### preprocessing
# Numeric branch: fill missing values with a seeded IterativeImputer.
numeric_transformer = IterativeImputer(random_state=SEED)

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown="ignore" stops categories unseen during fit from
# raising at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; columns not listed are not passed on.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

## Model Development 

In [26]:
### XGBoost pipeline
# Preprocessing and classifier live in one Pipeline so that imputation and
# encoding are re-fit on whatever data the pipeline is fit on.
# NOTE(review): `use_label_encoder` is deprecated in newer xgboost releases —
# confirm against the installed version before removing it.
# NOTE(review): n_jobs=-1 here is combined with n_jobs=-1 in RandomizedSearchCV;
# check for CPU oversubscription on the target machine.
xgb_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(
        random_state=SEED,
        n_jobs=-1,
        use_label_encoder=False,
        objective='binary:logistic',
        eval_metric='auc',
    )),
])

## Hyperparameter Optimization 

In [27]:
### hyperparameter space
# Sampling distributions for the "classifier" pipeline step.
# scipy conventions: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_distributions = dict(
    # tree ensemble size and shape
    classifier__n_estimators=randint(300, 700),
    classifier__max_depth=randint(4, 12),
    classifier__learning_rate=uniform(0.01, 0.15),
    # row / column subsampling
    classifier__subsample=uniform(0.6, 0.4),
    classifier__colsample_bytree=uniform(0.6, 0.4),
    # regularisation
    classifier__gamma=uniform(0, 5),
    classifier__reg_lambda=uniform(0, 5),
    classifier__reg_alpha=uniform(0, 5),
)

In [28]:
### RandomizedSearchCV
# Randomized search over the whole pipeline: 100 sampled configurations,
# each scored by 5-fold cross-validated ROC AUC, run in parallel.
search = RandomizedSearchCV(
    xgb_model,
    param_distributions,
    n_iter=100,
    cv=5,
    scoring="roc_auc",
    random_state=SEED,
    n_jobs=-1,
    verbose=2,
)

In [None]:
### Fit search
# Runs the randomized search on the training split only. With n_iter=100 and
# cv=5 this is 500 pipeline fits (see the verbose log in the cell output), so
# this is the expensive cell of the notebook.
# `refit` is not overridden in the search setup, so with the scikit-learn
# default the best configuration is refit on all of X_train afterwards.
print("Tuning XGBoost with RandomizedSearchCV:")
search.fit(X_train, y_train)

Tuning XGBoost with RandomizedSearchCV:
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


In [None]:
### Best model
# The complete pipeline (preprocessor + classifier) for the best-scoring
# candidate; scikit-learn exposes `best_estimator_` when refit is enabled,
# which is the default and is not overridden in the search setup.
best_model = search.best_estimator_

## Predict on Development and Save 

In [None]:
### Dev performance
# Keep column 1 of predict_proba as the readmission score.
# NOTE(review): assumes labels are 0/1 so column 1 is the positive class — confirm.
dev_probas = best_model.predict_proba(X_dev)[:, 1]

# ROC AUC on the held-out dev split, which was not used during the search.
dev_auc = roc_auc_score(y_true=y_dev, y_score=dev_probas)
print(f"\n Dev AUC: {dev_auc:.4f}")

In [None]:
### Save dev predictions
# Two-column table: encounter id and predicted readmission probability.
submission_dev = pd.DataFrame(
    data={
        "encounter_id": dev_ids,
        "readmitted_within_30_days": dev_probas,
    }
)

# Write the CSV, then wrap it in a zip archive next to it.
submission_dev.to_csv(path_or_buf="submission.csv", index=False)
with ZipFile(file="submission.zip", mode="w") as zipf:
    zipf.write(filename="submission.csv")
print("submission.zip ready")

## Predict on Test and Save

In [None]:
### test performance
# Keep column 1 of predict_proba as the readmission score.
# NOTE(review): assumes labels are 0/1 so column 1 is the positive class — confirm.
test_probas = best_model.predict_proba(X_test)[:, 1]

# ROC AUC on the test split.
test_auc = roc_auc_score(y_true=y_test, y_score=test_probas)
print(f"\n Test AUC: {test_auc:.4f}")

In [None]:
### Save test predictions
# Two-column table: encounter id and predicted readmission probability.
submission_test = pd.DataFrame(
    data={
        "encounter_id": test_ids,
        "readmitted_within_30_days": test_probas,
    }
)

# Write the CSV, then wrap it in a zip archive next to it.
submission_test.to_csv(path_or_buf="submission_test.csv", index=False)
with ZipFile(file="submission_test.zip", mode="w") as zipf:
    zipf.write(filename="submission_test.csv")
print("submission_test.zip ready")