In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m40.5 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.2


In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
# Read the cleaned dataset from disk
df = pd.read_csv("../data/cleaned_austin_v1.csv")

# Features are every column except the row identifier and the target itself
X = df.drop(columns=['id', 'survived'])
y = df['survived']

# 80/20 hold-out split; stratifying on y keeps the class balance equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Sanity checks: split sizes, then class proportions for train followed by test
print(X_train.shape, X_test.shape)
print(
    *(target.value_counts(normalize=True) for target in (y_train, y_test)),
    sep="\n",
)

(11494, 152) (2874, 152)
survived
1    0.633722
0    0.366278
Name: proportion, dtype: float64
survived
1    0.633612
0    0.366388
Name: proportion, dtype: float64


In [3]:
# Configure the gradient-boosted classifier, then fit it on the training split
xgb = XGBClassifier(
    # learning task
    objective='binary:logistic',
    eval_metric='logloss',
    # ensemble size and per-tree complexity
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    # row / column subsampling applied per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # reproducibility and parallelism (-1 = use all available cores)
    random_state=42,
    n_jobs=-1,
)
# Kept as the cell's last expression so the fitted estimator is displayed
xgb.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [4]:
# Score the held-out rows with the fitted model.
# Second column of predict_proba -> probability of class 1 (used for ROC AUC)
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
# Hard class labels (used for the threshold-based metrics)
y_pred_xgb = xgb.predict(X_test)

In [5]:
# Evaluate on the test split.
# Label-based metrics use the hard predictions ...
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_cm = confusion_matrix(y_test, y_pred_xgb)
# ... while ROC AUC is computed from the class-1 probabilities
xgb_roc_auc = roc_auc_score(y_test, y_proba_xgb)

# One report line per metric, confusion matrix last
print(
    f"Accuracy: {xgb_accuracy:.4f}",
    f"Precision: {xgb_precision:.4f}",
    f"Recall: {xgb_recall:.4f}",
    f"F1 Score: {xgb_f1:.4f}",
    f"ROC AUC: {xgb_roc_auc:.4f}",
    f"Confusion Matrix:\n{xgb_cm}",
    sep="\n",
)

Accuracy: 0.7550
Precision: 0.7663
Recall: 0.8825
F1 Score: 0.8203
ROC AUC: 0.8135
Confusion Matrix:
[[ 563  490]
 [ 214 1607]]


In [6]:
# Pair each input column with the importance the fitted model assigned to it,
# ranked from most to least important. The positional index is left as-is, so
# each row still shows the feature's original column position in X.
feature_importances_xgb = (
    pd.DataFrame({'feature': X.columns})
    .assign(importance=xgb.feature_importances_)
    .sort_values(by='importance', ascending=False)
)
# Show the 20 highest-ranked features
feature_importances_xgb.head(20)

Unnamed: 0,feature,importance
30,missing_acceptance_rate,0.079421
55,property_type_Entire rental unit,0.051573
33,host_response_time_unknown,0.027425
0,host_is_superhost,0.022343
3,host_total_listings_count,0.018446
29,missing_response_rate,0.014605
76,property_type_Private room in rental unit,0.012926
11,number_of_reviews,0.01291
47,property_type_Entire condo,0.01268
112,neighbourhood_cleansed_78705,0.012523


In [8]:
# Persist the evaluation metrics and the feature-importance ranking
RESULTS_DIR = Path("../results")
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Single-row table: one column per test-set metric
xgb_results = pd.DataFrame({
    "accuracy": [xgb_accuracy],
    "precision": [xgb_precision],
    "recall": [xgb_recall],
    "f1_score": [xgb_f1],
    "roc_auc": [xgb_roc_auc]
})
xgb_results.to_csv(RESULTS_DIR / "xgboost_results.csv", index=False)
feature_importances_xgb.to_csv(RESULTS_DIR / "xgboost_feature_importances.csv", index=False)