In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    precision_recall_curve
)

import xgboost as xgb

In [17]:
# Read the merged case-level CSV. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(
    filepath_or_buffer="../data/interim/opioid_cases_full.csv",
    low_memory=False,
)

print("Data loaded:", df.shape)

# Preview the first five records
df.head(5)

Data loaded: (271206, 133)


Unnamed: 0,PcrKey,eDispatch_01,eDispatch_02,eArrest_14,eArrest_01,eArrest_02,eArrest_05,eArrest_07,eArrest_11,eArrest_16,...,ems_cpr,rosc_achieved,use_flag_count,unique_use_flags,all_use_flags,all_use_flags_str,symptom_count,unique_symptoms,all_symptoms,all_symptoms_str
0,225614082,~2301053 ~,~7701003 ~,~Not Applicable ~,~3001001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,...,False,False,1,1,['~8801015 '],~8801015,1.0,1.0,['~7701003 '],~7701003
1,225614423,~2301053 ~,~2302007 ~,~Not Applicable ~,~3001001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,...,False,False,1,1,['~8801023 '],~8801023,1.0,1.0,['~R11.10 '],~R11.10
2,225614880,~2301079 ~,~2302001 ~,~Not Applicable ~,~3001001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,...,False,False,1,1,['~8801015 '],~8801015,1.0,1.0,['~7701001 '],~7701001
3,225615710,~2301051 ~,~2302007 ~,~Not Applicable ~,~3001001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,...,False,False,1,1,['~8801015 '],~8801015,1.0,1.0,['~R44.3 '],~R44.3
4,225616308,~2301053 ~,~7701003 ~,~Not Applicable ~,~3001001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,~7701001 ~,...,False,False,1,1,['~3117007 '],~3117007,1.0,1.0,['~7701003 '],~7701003


In [18]:
# Strip the "~" delimiters and padding spaces that wrap values in the object
# (text) columns.
cols_to_clean = df.select_dtypes(include="object").columns

for col in cols_to_clean:
    # astype(str) guards against non-string cells, but on its own it would also
    # turn every NaN into the literal string "nan". That hides missing values
    # from isna() and from any later fillna(). The .where() mask restores real
    # NaN wherever the original cell was null.
    df[col] = df[col].astype(str).str.strip(" ~").where(df[col].notna())

print("Cleaned all string columns.")

Cleaned all string columns.


In [19]:
# Null count per column, largest first
missing_counts = df.isnull().sum().sort_values(ascending=False)

# The same counts expressed as a percentage of all rows (two decimals)
missing_percent = missing_counts.div(len(df)).mul(100).round(2)

# Put both measures side by side and keep only columns that have any nulls
missing_summary = pd.concat(
    [
        missing_counts.rename("Missing Count"),
        missing_percent.rename("Missing %"),
    ],
    axis=1,
)
missing_summary = missing_summary[missing_summary["Missing Count"] > 0]

print("Columns with missing values:", len(missing_summary))
missing_summary.head(20)

Columns with missing values: 65


Unnamed: 0,Missing Count,Missing %
BGL_std,260563,96.08
ETCO2_std,227360,83.83
ETCO2_min,218952,80.73
ETCO2_last,218952,80.73
ETCO2_first,218952,80.73
ETCO2_mean,218952,80.73
ETCO2_max,218952,80.73
BGL_mean,124639,45.96
BGL_max,124639,45.96
BGL_min,124639,45.96


In [20]:
# Frequency table (NaN included) for each of the two outcome code columns.
# The second heading starts with a newline so the tables are visually separated.
for heading, column in (
    ("eOutcome_01 value counts:", "eOutcome_01"),
    ("\neOutcome_02 value counts:", "eOutcome_02"),
):
    print(heading)
    print(df[column].value_counts(dropna=False))

eOutcome_01 value counts:
eOutcome_01
7701003    223107
7701001     43323
01           1959
30           1197
09            779
07            271
02            216
65            139
21             85
70             38
20             28
03             19
04             17
06              7
66              7
05              6
62              6
63              1
61              1
Name: count, dtype: int64

eOutcome_02 value counts:
eOutcome_02
7701003    206880
7701001     61818
30           1049
01            915
07            155
65            100
02             77
06             54
70             36
20             29
62             27
03             19
05             14
21             11
66              6
51              5
04              5
50              3
63              2
61              1
Name: count, dtype: int64


In [21]:
# eOutcome codes that count as "admitted / transferred" for the binary target.
# Any code not listed here is treated as a negative by is_admitted().
admit_codes = {
    # Transfers and other placements
    "02",  # Transferred Hospital
    "03",  # Transferred SNF
    "04",  # Transferred ICF
    "05",  # Transferred Other
    "06",  # Home Health
    # Admitted, deceased, custody, still in care
    "09",  # Admitted Hospital
    "20",  # Deceased
    "21",  # Transferred Law Enforcement
    "30",  # Still Patient
    # Hospice
    "50",  # Hospice Home
    "51",  # Hospice Facility
    # Other facilities
    "61",  # Swing Bed
    "62",  # Rehab Facility
    "63",  # Long-Term Care
    "64",  # Medicaid Nursing
    "65",  # Psychiatric Facility
    "66",  # Critical Access Hospital
    "70",  # Other Healthcare Facility
}


def is_admitted(row):
    """Return 1 if either outcome field of ``row`` holds an admit code, else 0.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["eOutcome_01"]`` and ``row["eOutcome_02"]``
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    int
        1 when at least one of the two values, converted to ``str`` and
        stripped of surrounding whitespace, is in ``admit_codes``; otherwise 0.
    """
    # Read both fields up front so a missing key fails regardless of the first value.
    outcome_values = [
        str(row[field]).strip() for field in ("eOutcome_01", "eOutcome_02")
    ]
    if any(value in admit_codes for value in outcome_values):
        return 1
    return 0

# Create binary column: 1 when either outcome field holds a code from
# admit_codes, else 0.
# This is the vectorized equivalent of applying is_admitted row by row
# (same str conversion and whitespace strip). It avoids building a Series per
# row, which is very slow on a frame of this size.
df["Outcome_Admitted"] = (
    df["eOutcome_01"].astype(str).str.strip().isin(admit_codes)
    | df["eOutcome_02"].astype(str).str.strip().isin(admit_codes)
).astype("int64")

# Show counts
print("Binary outcome distribution (0 = Discharged/Refused, 1 = Admitted/Transferred):")
print(df["Outcome_Admitted"].value_counts(dropna=False))


Binary outcome distribution (0 = Discharged/Refused, 1 = Admitted/Transferred):
Outcome_Admitted
0    268520
1      2686
Name: count, dtype: int64


In [22]:
# Placeholder codes that stand in for a missing outcome
missing_codes = {"7701001", "7701003"}

# A record is usable when at least one of the two outcome fields carries
# something other than a placeholder code (i.e. "not both missing").
mask_valid_outcome = (
    ~df["eOutcome_01"].isin(missing_codes) | ~df["eOutcome_02"].isin(missing_codes)
)

# Independent copy so later edits to the subset do not touch df
df_model = df[mask_valid_outcome].copy()

print("Rows with valid outcome data:", df_model.shape)


Rows with valid outcome data: (5197, 134)


In [23]:
# Class balance of the binary target within the filtered subset
print(
    "Outcome distribution among records with valid outcomes:",
    df_model["Outcome_Admitted"].value_counts(dropna=False),
    sep="\n",
)

Outcome distribution among records with valid outcomes:
Outcome_Admitted
1    2686
0    2511
Name: count, dtype: int64


In [24]:
# One row per column of df_model: its dtype and the value it has in the
# first record.
summary = pd.DataFrame({
    "dtype": df_model.dtypes,
    "sample_value": df_model.iloc[0]
}).reset_index().rename(columns={"index": "column"})

# Show all rows of this table only. option_context restores the previous
# display.max_rows on exit, so later cells are not switched to untruncated
# output as a side effect. display() is needed because the table must render
# while the option is still active.
with pd.option_context("display.max_rows", None):
    display(summary)

Unnamed: 0,column,dtype,sample_value
0,PcrKey,int64,225769756
1,eDispatch_01,object,2301053
2,eDispatch_02,object,2302003
3,eArrest_14,object,Not Recorded
4,eArrest_01,object,3001001
5,eArrest_02,object,7701003
6,eArrest_05,object,7701003
7,eArrest_07,object,7701003
8,eArrest_11,object,7701003
9,eArrest_16,object,7701003


In [25]:
# Columns excluded from the modelling frame: the record key first ...
cols_to_drop = ["PcrKey"]

# ... then the "all_*" aggregate columns together with their "_str" twins
cols_to_drop += [
    "all_procedures", "all_procedures_str",
    "all_use_flags", "all_use_flags_str",
    "all_symptoms", "all_symptoms_str",
]

# drop() raises KeyError if any listed column is absent from df_model
df_modeling = df_model.drop(cols_to_drop, axis="columns").copy()

print("After dropping unneeded columns:", df_modeling.shape)

After dropping unneeded columns: (5197, 127)


In [26]:
# Names of every bool-dtype column in the modelling frame
bool_cols = list(df_modeling.select_dtypes(include=["bool"]).columns)
print("Boolean columns:", bool_cols)

# Recast each flag column so True/False become 1/0
for flag_col in bool_cols:
    df_modeling[flag_col] = df_modeling[flag_col].astype(int)

print("Converted booleans to integers.")

Boolean columns: ['naloxone_flag', 'cpr_given', 'bystander_cpr', 'ems_cpr', 'rosc_achieved']
Converted booleans to integers.


In [27]:
# Collect the object-dtype (text) columns still present in the modelling frame
cat_cols = list(df_modeling.select_dtypes(include=["object"]).columns)

# Print them as a bulleted list, one name per line
print("Categorical columns:")
for name in cat_cols:
    print(f"- {name}")

Categorical columns:
- eDispatch_01
- eDispatch_02
- eArrest_14
- eArrest_01
- eArrest_02
- eArrest_05
- eArrest_07
- eArrest_11
- eArrest_16
- eArrest_18
- eDisposition_12
- eDisposition_19
- eDisposition_16
- eDisposition_21
- eDisposition_22
- eDisposition_23
- eOutcome_01
- eOutcome_02
- ePatient_15
- ePatient_16
- ePayment_01
- ePayment_50
- eResponse_05
- eResponse_07
- eResponse_15
- eResponse_23
- eScene_01
- eScene_06
- eScene_07
- eScene_08
- eScene_09
- eSituation_02
- eSituation_07
- eSituation_08
- eSituation_13
- eSituation_01
- eTimes_01
- eTimes_03
- eTimes_05
- eTimes_06
- eTimes_07
- eTimes_09
- eTimes_11
- eTimes_12
- eTimes_13
- eDisposition_17
- first_route
- first_response
- first_procedure


In [28]:
# Timestamp columns to remove: eSituation_01 plus the eTimes_* fields
cols_to_drop_time = ["eSituation_01"] + [
    "eTimes_01", "eTimes_03", "eTimes_05",
    "eTimes_06", "eTimes_07", "eTimes_09",
    "eTimes_11", "eTimes_12", "eTimes_13",
]

# Rebinds df_modeling. Running this cell a second time without rebuilding
# df_modeling raises KeyError, because the columns are already gone.
df_modeling = df_modeling.drop(cols_to_drop_time, axis="columns")

print("Dropped timestamp columns. New shape:", df_modeling.shape)

Dropped timestamp columns. New shape: (5197, 117)


In [29]:
# Refresh the list of object-dtype columns now that the timestamp fields are gone
cat_cols = list(df_modeling.select_dtypes(include=["object"]).columns)
print("Label encoding these columns:", len(cat_cols))

# NOTE(review): eOutcome_01 / eOutcome_02 are not dropped by any earlier cell
# shown here, so they are encoded below like any other feature.
# Outcome_Admitted is derived from them, so confirm they are removed before a
# model is fitted; otherwise this is target leakage.
# NOTE(review): the encoders are fitted on the full frame, before any
# train/test split - confirm that is intended.

# Fitted encoder per column, kept so the integer codes can be mapped back later
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # Nulls become the placeholder category "MISSING" before the encoder sees them
    df_modeling[col] = le.fit_transform(df_modeling[col].fillna("MISSING"))
    label_encoders[col] = le

print("All categorical columns label-encoded.")


Label encoding these columns: 39
All categorical columns label-encoded.
