In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
df = pd.read_csv("../dataset/titanic_processed.csv", index_col=0)

In [3]:
X = df.drop(columns=["survived"])
y = df["survived"]

In [4]:
X = pd.get_dummies(X, drop_first=True)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
lr = LogisticRegression(max_iter=5000, solver='lbfgs', random_state=42)
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)
print("🔹 Logistic Regression Report:\n", classification_report(y_test, y_pred_lr))
print("🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("🔹 ROC AUC Score:", roc_auc_score(y_test, lr.predict_proba(X_test_scaled)[:,1]))

🔹 Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88       110
           1       0.82      0.78      0.80        69

    accuracy                           0.85       179
   macro avg       0.84      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179

🔹 Confusion Matrix:
 [[98 12]
 [15 54]]
🔹 ROC AUC Score: 0.8740447957839264


In [8]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("🔹 Random Forest Report:\n", classification_report(y_test, y_pred_rf))
print("🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("🔹 ROC AUC Score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))

🔹 Random Forest Report:
               precision    recall  f1-score   support

           0       0.82      0.81      0.81       110
           1       0.70      0.71      0.71        69

    accuracy                           0.77       179
   macro avg       0.76      0.76      0.76       179
weighted avg       0.77      0.77      0.77       179

🔹 Confusion Matrix:
 [[89 21]
 [20 49]]
🔹 ROC AUC Score: 0.8390645586297759


In [9]:
cv_lr = cross_val_score(lr, X, y, cv=5, scoring="accuracy")
cv_rf = cross_val_score(rf, X, y, cv=5, scoring="accuracy")

print("🔁 CV Accuracy (Logistic Regression):", np.round(cv_lr.mean(), 4))
print("🔁 CV Accuracy (Random Forest):", np.round(cv_rf.mean(), 4))

🔁 CV Accuracy (Logistic Regression): 0.8092
🔁 CV Accuracy (Random Forest): 0.8171


In [10]:
joblib.dump(rf, "../models/rf_model.pkl")
joblib.dump(X.columns.tolist(), "../models/feature_columns.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']