Plots for each feature

In [14]:
%cd ..
import matplotlib.pyplot as plt
import numpy as np
from data.data_loader import DataLoader
import pandas as pd


c:\Users\lifeh\OneDrive\Desktop\data4good-2026\datathon\med_proj


In [15]:
%cd data/
data_loader = DataLoader()
zip_paths = ["ed2015-sas.sas7bdat.zip", "ed2016_sas.zip", "ed2017_sas.zip", "ed2018_sas.zip",
                "ed2019_sas.zip", "ed2020_sas.zip", "ed2021_sas.zip"]
df = pd.DataFrame()
for zip_path in zip_paths:
    df_temp = data_loader.load_data(zip_path)
    df = pd.concat([df, df_temp])

c:\Users\lifeh\OneDrive\Desktop\data4good-2026\datathon\med_proj\data


In [16]:
half_of_rows = len(df) * 0.5
df_pruned = df.dropna(thresh = half_of_rows, axis=1)
print(df_pruned.shape)
print(df.shape)

#correlations = df_pruned.corr()['RETRNED'].abs().sort_values(ascending=False)
#print(correlations.head(21))

(147427, 446)
(147427, 1031)


In [17]:
correlations = df_pruned.corr(numeric_only=True)['RETRNED'].abs().sort_values(ascending=False)
print(correlations.head(21))

RETRNED      1.000000
REGION       0.113377
TOTHRDIVR    0.106203
NOFU         0.098290
ADMITHOS     0.086705
HDSTAT       0.083240
HDDIAG1R     0.082456
ADISP        0.080142
ADMIT        0.078522
HOSPCODE     0.078226
LOS          0.076490
SUTURE       0.070204
RACERFL      0.065644
RACEUN       0.063709
EOUTINFOE    0.062945
IMMEDR       0.062601
RESPRD       0.061411
EPTRECE      0.061332
ADMTPHYS     0.061204
RESINT       0.060393
IVFLUIDS     0.059568
Name: RETRNED, dtype: float64


In [18]:
# 1. Filter the dataset to ONLY keep rows where TOTHRDIVR is 2, 3, or 4
# This drops all the -9 to -6 values, AND the 5 (data not available)
clean_diversion_df = df[df['TOTHRDIVR'].isin([2, 3, 4])]

# 2. Calculate the correlation between the cleaned column and readmittance
correlation = clean_diversion_df['TOTHRDIVR'].corr(clean_diversion_df['RETRNED'])

print(f"Correlation between Ambulance Diversion Hours and Readmittance: {correlation:.4f}")

Correlation between Ambulance Diversion Hours and Readmittance: 0.2350


In [None]:
import pandas as pd
import numpy as np

# 1. Define your specific features
comstat_cols = [f'COMSTAT{i}' for i in range(1, 31)]
other_cols = [
    'CAUSE1', 'CAUSE2', 'CAUSE3', 
    'AGE', 'AGEDAYS', 'AGER', 
    'CHF', 'ASTHMA', 'CKD', 'CAD', 
    'ADMITHOS', 'BOARDHOS', 
    'BPSYS', 'BPDIAS', 'BPAP',
    'RETRNED' # Don't forget the target variable!
]

all_target_cols = comstat_cols + other_cols

# 2. Keep only the columns that actually exist in your dataframe
# This prevents KeyError if your specific year's dataset is missing a column
existing_cols = [col for col in all_target_cols if col in df.columns]
df_subset = df[existing_cols].copy()

# 3. Clean the CDC's negative "missing data" codes
# We replace -9 (Blank), -8 (Unknown), and -7 (Not Applicable) with NaN
# Note: Blood pressure (BPSYS) can't naturally be negative, so this is safe.
df_subset = df_subset.replace([-9, -8, -7], np.nan)

# 4. Calculate correlation with RETRNED
# numeric_only=True skips the alphanumeric CAUSE codes so it doesn't crash
correlations = df_subset.corr(numeric_only=True)['RETRNED'].abs().sort_values(ascending=False)

# 5. Drop RETRNED from the results (since it correlates 1.0 with itself) 
# and print the top 20 most correlated features
print("--- Top Correlated Clinical Features with Readmittance ---")
print(correlations.drop('RETRNED', errors='ignore').head(20))

(147427,)
--- Top Correlated Clinical Features with Readmittance ---
COMSTAT29    0.670820
COMSTAT23    0.430007
COMSTAT27    0.284463
ADMITHOS     0.086705
COMSTAT26    0.072836
COMSTAT9     0.069184
COMSTAT25    0.063564
COMSTAT12    0.063471
COMSTAT11    0.060895
AGEDAYS      0.060664
COMSTAT10    0.056763
COMSTAT24    0.055972
COMSTAT17    0.055415
COMSTAT7     0.053268
COMSTAT16    0.051019
COMSTAT6     0.049456
COMSTAT22    0.048773
COMSTAT13    0.045633
COMSTAT20    0.043863
COMSTAT19    0.043228
Name: RETRNED, dtype: float64


In [23]:

# 1. Force the column to be numerical. 
# errors='coerce' turns any text or weird byte-strings (like b'710-') directly into NaN
clean_col = pd.to_numeric(df['COMSTAT29'], errors='coerce')

# 2. Filter out the CDC's negative missing codes (keep only values > 0)
# COMSTAT valid values are usually 1, 2, or 3
clean_col = clean_col[clean_col > 0]

# 3. Count how many valid rows are left
remaining_count = clean_col.count()

print(f"Total rows in original dataset: {len(df)}")
print(f"Valid values left in COMSTAT29: {remaining_count}")

with pd.option_context('display.max_rows', None):
    print(clean_col)

Total rows in original dataset: 147427
Valid values left in COMSTAT29: 77
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
17286    1.0
17332    1.0
17898    1.0
17928    1.0
19652    1.0
20886    1.0
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
17286    1.0
17332    1.0
17898    1.0
17928    1.0
19652    1.0
20886    1.0
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
17286    1.0
17332    1.0
17898    1.0
17928    1.0
19652    1.0
20886    1.0
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
17286    1.0
17332    1.0
17898    1.0
17928    1.0
19652    1.0
20886    1.0
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
17286    1.0
17332    1.0
17898    1.0
17928    1.0
19652    1.0
20886    1.0
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
17286    1.0
17332    1.0
17898    1.0
17928    1.0
19652    1.0
20886    1.0
2544     2.0
2655     1.0
7953     2.0
15004    1.0
15010    1.0
172

In [24]:
# 1. Force the column to be numerical. 
# errors='coerce' turns any text or weird byte-strings (like b'710-') directly into NaN
clean_col = pd.to_numeric(df['COMSTAT23'], errors='coerce')

# 2. Filter out the CDC's negative missing codes (keep only values > 0)
# COMSTAT valid values are usually 1, 2, or 3
clean_col = clean_col[clean_col > 0]

# 3. Count how many valid rows are left
remaining_count = clean_col.count()

print(f"Total rows in original dataset: {len(df)}")
print(f"Valid values left in COMSTAT23: {remaining_count}")

with pd.option_context('display.max_rows', None):
    print(clean_col)

Total rows in original dataset: 147427
Valid values left in COMSTAT23: 378
185      1.0
781      1.0
795      1.0
816      1.0
823      1.0
2544     1.0
2564     1.0
2594     1.0
2655     1.0
4049     1.0
5904     1.0
6595     1.0
6614     1.0
6633     1.0
6634     1.0
6646     1.0
6676     1.0
6701     1.0
7953     2.0
8054     1.0
11422    1.0
13761    1.0
13774    1.0
14921    1.0
14936    2.0
14951    2.0
14978    1.0
14985    2.0
15004    1.0
15008    1.0
15010    1.0
15023    1.0
15024    1.0
17283    1.0
17286    1.0
17332    1.0
17891    1.0
17896    1.0
17898    1.0
17928    1.0
17930    1.0
17940    1.0
19626    1.0
19627    1.0
19652    2.0
19659    1.0
19668    1.0
19682    1.0
20707    1.0
20747    1.0
20822    1.0
20859    1.0
20886    1.0
20979    1.0
185      1.0
781      1.0
795      1.0
816      1.0
823      1.0
2544     1.0
2564     1.0
2594     1.0
2655     1.0
4049     1.0
5904     1.0
6595     1.0
6614     1.0
6633     1.0
6634     1.0
6646     1.0
6676     1.0
67

In [25]:
df_age_check = df[['AGE', 'RETRNED']].copy()

# 2. Replace any CDC negative "missing data" codes with NaN so they are ignored in the math
df_age_check = df_age_check.replace([-9, -8, -7], np.nan)

# 3. Calculate the correlation
age_corr = df_age_check['AGE'].corr(df_age_check['RETRNED'])

print(f"Correlation between AGE and Readmittance: {age_corr:.4f}")

Correlation between AGE and Readmittance: -0.0228


In [26]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# 1. Define the top features you found
top_features = [
    'COMSTAT29', 'COMSTAT23', 'COMSTAT27', 'ADMITHOS', 'COMSTAT26', 
    'COMSTAT9', 'COMSTAT25', 'COMSTAT12', 'COMSTAT11', 'AGEDAYS', 
    'COMSTAT10', 'COMSTAT24', 'COMSTAT17', 'COMSTAT7', 'COMSTAT16', 
    'COMSTAT6', 'COMSTAT22', 'COMSTAT13', 'COMSTAT20', 'COMSTAT19'
]

# 2. Prep your Data (RFE algorithms will crash if there are NaNs)
# We fill NaNs with -1 so the Random Forest knows it represents "missing/blank"
X = df[top_features].fillna(-1)

# Ensure your target variable has no missing values either
# We'll drop rows where the target (RETRNED) is unknown
valid_y_index = df['RETRNED'].dropna().index
X = X.loc[valid_y_index]
y = df['RETRNED'].loc[valid_y_index]

# 3. Set up the Base Model
# Random Forest is great here because it handles the -1 "missing" values well
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# 4. Set up and Train RFE
# Let's tell RFE to keep only the top 5 absolute best features
print("Training RFE... this might take a few seconds.")
rfe_selector = RFE(estimator=model, n_features_to_select=5, step=1)
rfe_selector = rfe_selector.fit(X, y)

# 5. Display the Results
results = pd.DataFrame({
    'Feature': X.columns,
    'Kept by RFE': rfe_selector.support_, # True if it's in the top 5
    'Ranking': rfe_selector.ranking_      # 1 means top feature, 2 is next to be dropped, etc.
}).sort_values(by='Ranking')

print("\n--- RFE Results ---")
print(results)

Training RFE... this might take a few seconds.

--- RFE Results ---
      Feature  Kept by RFE  Ranking
3    ADMITHOS         True        1
5    COMSTAT9         True        1
13   COMSTAT7         True        1
9     AGEDAYS         True        1
15   COMSTAT6         True        1
8   COMSTAT11        False        2
10  COMSTAT10        False        3
0   COMSTAT29        False        4
17  COMSTAT13        False        5
7   COMSTAT12        False        6
14  COMSTAT16        False        7
2   COMSTAT27        False        8
19  COMSTAT19        False        9
18  COMSTAT20        False       10
12  COMSTAT17        False       11
1   COMSTAT23        False       12
4   COMSTAT26        False       13
16  COMSTAT22        False       14
11  COMSTAT24        False       15
6   COMSTAT25        False       16


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Define your winning features from RFE
best_features = ['ADMITHOS', 'COMSTAT9', 'COMSTAT7', 'AGEDAYS', 'COMSTAT6']

# 2. Prep the data (fill missing values with -1 so the model doesn't crash)
X = df[best_features].fillna(-1)

# Ensure target variable has no missing values
valid_indices = df['RETRNED'].dropna().index
X = X.loc[valid_indices]
y = df['RETRNED'].loc[valid_indices]

# 3. Split the data (80% for training, 20% for testing)
# stratify=y ensures the 80/20 split keeps the same ratio of readmitted patients
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 4. Train the Random Forest Model
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# 5. Make predictions on the unseen test data
y_pred = model.predict(X_test)

# 6. Grade the results
accuracy = accuracy_score(y_test, y_pred)
print(f"--- Model Accuracy: {accuracy * 100:.2f}% ---\n")

# Print the detailed breakdown
print(classification_report(y_test, y_pred))

--- Model Accuracy: 92.50% ---

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96    190787
         1.0       0.70      0.01      0.03     15611

    accuracy                           0.92    206398
   macro avg       0.81      0.51      0.49    206398
weighted avg       0.91      0.92      0.89    206398



In [28]:
# 1. Get the accuracy on the data the model was TRAINED on
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# 2. You already have the test accuracy from the previous step
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy:  {test_accuracy * 100:.2f}%")

# 3. Calculate the gap
print(f"Difference (Overfit Gap): {(train_accuracy - test_accuracy) * 100:.2f}%")

Training Accuracy: 92.50%
Testing Accuracy:  92.50%
Difference (Overfit Gap): 0.01%


In [29]:
from sklearn.metrics import log_loss

# 1. Ask the model for the PROBABILITY of readmission, not just a 0 or 1
# predict_proba returns two columns: [Probability of 0, Probability of 1]
y_pred_probs = model.predict_proba(X_test)

# 2. Calculate the Log Loss
# A lower log loss means a better, more confident model!
rf_loss = log_loss(y_test, y_pred_probs)

print(f"Random Forest Log Loss: {rf_loss:.4f}")

Random Forest Log Loss: 0.2557


In [None]:
import pandas as pd

# 1. Create a new "unseen" patient record.
# We put the data in a DataFrame so the column names match perfectly.
# Remember: We filled missing data with -1 during training, so do the same here if a value is unknown!
new_patient = pd.DataFrame([{
    'ADMITHOS': 0,    # Example: 1 if they were admitted to the hospital, 0 if not
    'COMSTAT9': 1,   # Example: -1 because they didn't get a 9th medication
    'COMSTAT7': 1,   # Example: -1 because they didn't get a 7th medication
    'AGEDAYS': 364,    # Example: -1 (CDC uses this for infants, usually blank for adults)
    'COMSTAT6': 2     # Example: 1 meaning their 6th drug was a single-entity drug
}])

# 2. Get the "Hard" Prediction (0 or 1)
# 0 = Will NOT return, 1 = WILL return
prediction = model.predict(new_patient)

# 3. Get the "Soft" Prediction (Probability / Risk Score)
# In healthcare, doctors don't just want a 0 or 1; they want a risk percentage!
# predict_proba returns a list: [Probability of 0, Probability of 1]
probabilities = model.predict_proba(new_patient)
risk_score = probabilities[0][1] * 100 # Grabbing the probability of '1' and converting to %

# 4. Print the results clearly
print("--- New Patient Prediction ---")
if prediction[0] == 1:
    print("ðŸš¨ High Risk: The model predicts this patient WILL be readmitted.")
else:
    print("âœ… Low Risk: The model predicts this patient will NOT be readmitted.")

print(f"Exact Readmission Risk Score: {risk_score:.1f}%")

--- New Patient Prediction ---
âœ… Low Risk: The model predicts this patient will NOT be readmitted.
Exact Readmission Risk Score: 4.3%


In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Define our new, robust clinical features
clinical_features = [
    'AGE', 'IMMEDR', 'INJURY', 
    'BPSYS', 'PULSE', 
    'CHF', 'COPD', 'DIABETES', 'CKD', 'ASTHMA'
]
target = 'RETRNED'

# 2. Keep only columns that exist in your specific dataset to prevent errors
existing_features = [col for col in clinical_features if col in df.columns]
df_model = df[existing_features + [target]].copy()

# 3. Clean the CDC negative missing codes
df_model = df_model.replace([-9, -8, -7], np.nan)
df_model = df_model.dropna(subset=[target])

# Fill missing feature data with -1 so the tree can process them as "Unknown"
X = df_model[existing_features].fillna(-1)
y = df_model[target]

# 4. Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 5. Train the Model (Keeping class_weight='balanced')
rf_model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=10, 
    random_state=42, 
    n_jobs=-1,
    class_weight='balanced'
)
rf_model.fit(X_train, y_train)

# 6. Make Predictions & Grade the Model
y_pred = rf_model.predict(X_test)

print("--- Model Performance (Clinical Features) ---")
print(f"Overall Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%\n")
print("--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred))
print("--- Confusion Matrix ---")
print(confusion_matrix(y_test, y_pred))

--- Model Performance (Clinical Features) ---
Overall Accuracy: 80.57%

--- Detailed Classification Report ---
              precision    recall  f1-score   support

         0.0       0.96      0.83      0.89     27256
         1.0       0.20      0.54      0.30      2230

    accuracy                           0.81     29486
   macro avg       0.58      0.68      0.59     29486
weighted avg       0.90      0.81      0.84     29486

--- Confusion Matrix ---
[[22552  4704]
 [ 1024  1206]]


In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# --- STEP 1: DYNAMIC COLUMN CHECK ---
# NHAMCS columns can vary by year. Let's find the right ones.
cols = df.columns.tolist()

admit_col = 'ADMITHOS' if 'ADMITHOS' in cols else ('ADMIT' if 'ADMIT' in cols else None)
return_col = 'RETRNED' if 'RETRNED' in cols else ('RETVISIT' if 'RETVISIT' in cols else None)

# --- STEP 2: SAFE FILTERING ---
if admit_col:
    # Try to filter for discharged patients, but if it wipes out the data, keep all
    temp_filter = df[df[admit_col].isin([2, 3])].copy()
    df_filtered = temp_filter if len(temp_filter) > 0 else df.copy()
else:
    df_filtered = df.copy()

# Ensure we have data
print(f"Rows after filtering: {len(df_filtered)}")

# --- STEP 3: FEATURE MAPPING ---
# Use the comorbidity and age variables you identified
chronic_flags = [c for c in ['CHF', 'ASTHMA', 'CKD', 'CAD'] if c in cols]
comstat_features = [f'COMSTAT{i}' for i in range(1, 11) if f'COMSTAT{i}' in cols]
vitals = [v for v in ['BPSYS', 'PULSE'] if v in cols]

features = ['AGE'] + chronic_flags + comstat_features + vitals

# Clean target
if return_col:
    df_filtered['target'] = (df_filtered[return_col] == 1).astype(int)
else:
    raise ValueError("Could not find the Return/Readmission column in your dataset!")

# Final check to prevent n_samples=0
if len(df_filtered) < 10:
    raise ValueError("Dataset is too small after filtering. Check your ADMITHOS codes.")

# --- STEP 4: TRAIN WITH WEIGHTS ---
X = df_filtered[features].fillna(0) # Fill NaNs to prevent crashes
y = df_filtered['target']
weights = np.where((df_filtered['AGE'] > 65) & (df_filtered['target'] == 1), 15.0, 1.0)

# Unpack carefully
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, weights, test_size=0.2, stratify=y if y.nunique() > 1 else None, random_state=42
)

rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train, sample_weight=w_train)

# --- STEP 5: PREDICT ---
# Create a dummy row for the Senior to ensure columns match exactly
senior_profile = pd.DataFrame(0, index=[0], columns=features)
senior_profile['AGE'] = 82
if 'CHF' in features: senior_profile['CHF'] = 1
if 'BPSYS' in features: senior_profile['BPSYS'] = 175

risk = rf_model.predict_proba(senior_profile)[0][1]
print(f"High-Risk Senior Corrected Risk: {risk*100:.1f}%")

Rows after filtering: 147427
High-Risk Senior Corrected Risk: 1.0%


In [None]:
print("If this risk is above 20%, we would flag this patient for extra care coordination!")
print("This is a simplified example, but in a real hospital setting, we'd use this risk score to trigger interventions like follow-up calls, home health visits, or medication reviews to help prevent readmission.")
print("nothing")
print("something")

If this risk is above 20%, we would flag this patient for extra care coordination!
This is a simplified example, but in a real hospital setting, we'd use this risk score to trigger interventions like follow-up calls, home health visits, or medication reviews to help prevent readmission.


In [68]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt

# 1. Target Preparation
# Standardize RETRNED: 1 = Yes, 0 = No/Other
# In NHAMCS, 1 is Yes, 2 is No. Ensure it's 0 and 1 for the model.
df_model = df.copy()
df_model['target'] = df_model['RETRNED'].map({1: 1, 2: 0})
df_model = df_model.dropna(subset=['target'])

# 2. Comprehensive Leakage & Metadata Removal
# We must remove everything that is determined AFTER the patient is triaged.
# This ensures the model only uses "Entry" data (History, Vitals, Triage).
leaky_prefixes = [
    'EOUT', 'ADISP', 'ADMIT', 'HDDIAG', 'RX', 'MED', 'PROC', 
    'TRAN', 'OBS', 'STAY', 'LOS', 'HDSTAT', 'DOA', 'DIEDED',
    'LWBS', 'LBTC', 'LEFTAMA', 'NODISP', 'NOFU', 'RETREFFU'
]

# Specifically drop ID, year, and weighting columns (Metadata)
metadata_cols = ['PATWT', 'EDWT', 'HOSPCODE', 'PATCODE', 'YEAR', 'CSTRATM', 'CPSUM']

cols_to_drop = ['RETRNED', 'target']
for col in df_model.columns:
    if any(col.startswith(pref) for pref in leaky_prefixes):
        cols_to_drop.append(col)
    if col in metadata_cols:
        cols_to_drop.append(col)

X = df_model.drop(columns=[c for c in cols_to_drop if c in df_model.columns])
y = df_model['target']

# 3. Handle NHAMCS Special Missing Codes (-9, -8, -7)
X = X.replace([-9, -8, -7], np.nan)

# 4. Handle Categorical Features
for col in X.select_dtypes(include=['object', 'string']).columns:
    X[col] = X[col].astype('category')

# 5. Train-Test Split (with stratification to keep return-visit ratio even)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Model Training (Adjusted for Class Imbalance)
# Calculate the ratio of Non-Returns to Returns to help the model "pay attention" to the 1s
ratio = (y == 0).sum() / (y == 1).sum()

model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    scale_pos_weight=ratio, # Crucial for improving Recall (finding the return patients)
    enable_categorical=True,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

print("Training the realistic model...")
model.fit(X_train, y_train)

# 7. Evaluate
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

print(f"\nRealistic ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")
print(classification_report(y_test, y_pred))

# 8. Feature Importance
top_x = 20
importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
importances = importances.sort_values(by='Importance', ascending=False).head(top_x)

print(f"\n--- Top {top_x} Realistic Predictors ---")
print(importances)

  X = X.replace([-9, -8, -7], np.nan)


Training the realistic model...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1.]