In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Read the cleaned dataset from disk.
df = pd.read_csv("../data/cleaned_austin_v1.csv")

# Features are every column except the row identifier and the label;
# the label is the `survived` column.
X = df.drop(['id', 'survived'], axis=1)
y = df['survived']

# Hold out 20% of the rows. Stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity checks: partition shapes, then class proportions for train and test.
print(X_train.shape, X_test.shape)
for labels in (y_train, y_test):
    print(labels.value_counts(normalize=True))

(11494, 152) (2874, 152)
survived
1    0.633722
0    0.366278
Name: proportion, dtype: float64
survived
1    0.633612
0    0.366388
Name: proportion, dtype: float64


In [2]:
# Random forest configuration: 300 trees, a fixed seed for repeatability,
# 'balanced' class weighting, and n_jobs=-1 to fit on all available cores.
rf_params = {
    "n_estimators": 300,
    "random_state": 42,
    "class_weight": "balanced",
    "n_jobs": -1,
}
rf_clf = RandomForestClassifier(**rf_params)

# fit() is left as the cell's last expression so the fitted estimator is
# displayed as the cell output.
rf_clf.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [3]:
# Score the held-out rows: hard class labels first, then the predicted
# probability taken from column index 1 of predict_proba's output.
y_pred_rf = rf_clf.predict(X_test)
test_proba = rf_clf.predict_proba(X_test)
y_proba_rf = test_proba[:, 1]

In [4]:
# Metrics computed from the hard test-set predictions.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
rf_precision = precision_score(y_true=y_test, y_pred=y_pred_rf)
rf_recall = recall_score(y_true=y_test, y_pred=y_pred_rf)
rf_f1 = f1_score(y_true=y_test, y_pred=y_pred_rf)
rf_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

# ROC AUC is computed from the predicted probabilities rather than the labels.
rf_roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba_rf)

# Build the report first, then emit it in a single write; one line per metric
# followed by the confusion matrix.
report_lines = [
    f"Accuracy: {rf_accuracy:.4f}",
    f"Precision: {rf_precision:.4f}",
    f"Recall: {rf_recall:.4f}",
    f"F1 Score: {rf_f1:.4f}",
    f"ROC AUC: {rf_roc_auc:.4f}",
    f"Confusion Matrix:\n{rf_cm}",
]
print("\n".join(report_lines))

Accuracy: 0.7502
Precision: 0.7517
Recall: 0.9044
F1 Score: 0.8210
ROC AUC: 0.8138
Confusion Matrix:
[[ 509  544]
 [ 174 1647]]


In [5]:
# Pair each input column name with the fitted forest's importance score,
# then rank the features from most to least important.
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_clf.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=False)

# Display the 20 highest-ranked features.
feature_importances.head(20)

Unnamed: 0,feature,importance
21,host_tenure_days,0.065244
9,availability_365,0.064371
10,price,0.05762
3,host_total_listings_count,0.050558
12,reviews_per_month,0.047681
11,number_of_reviews,0.047135
19,review_scores_value,0.032645
7,maximum_nights,0.031738
13,review_scores_rating,0.030109
15,review_scores_cleanliness,0.029965


In [6]:
# Save the evaluation metrics and the ranked feature importances
results_dir = Path("../results")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it is already there).
results_dir.mkdir(parents=True, exist_ok=True)

# One-row summary table holding the test-set metrics computed earlier.
rf_result = pd.DataFrame({
    "accuracy": [rf_accuracy],
    "precision": [rf_precision],
    "recall": [rf_recall],
    "f1_score": [rf_f1],
    "roc_auc": [rf_roc_auc]
})
rf_result.to_csv(results_dir / "random_forest_results.csv", index=False)
feature_importances.to_csv(results_dir / "random_forest_feature_importances.csv", index=False)