In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

# Load training and testing datasets
train_df = pd.read_csv("assets/titanic-machine_learning_from_disaster/train.csv")
test_df = pd.read_csv("assets/titanic-machine_learning_from_disaster/test.csv")


# Preprocessing function
def preprocess_data(df, is_train=True):
    df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])  # Drop irrelevant columns
    
    # Convert categorical variables to numeric
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    df["Embarked"] = df["Embarked"].map({"C": 0, "Q": 1, "S": 2})
    
    # Handle missing values
    df = df.fillna(df.mean())  # Fill missing values with column mean
    
    # Drop "Survived" only if it's training data
    if not is_train:
        return df
    return df.drop(columns=["Survived"]), df["Survived"]

# Apply preprocessing
X_train, y_train = preprocess_data(train_df, is_train=True)
X_test = preprocess_data(test_df, is_train=False)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print("Best Parameters:", grid.best_params_)

Best Parameters: {'C': 0.01, 'solver': 'saga'}


In [2]:
# Define models with cross-validation
models = {
    "Simple Logistic Regression": LogisticRegression(solver="liblinear"),
    "L1 (Lasso) Regularized Logistic Regression (C=0.01)": LogisticRegression(penalty="l1", solver="liblinear", C=0.01),
    "L2 (Ridge) Regularized Logistic Regression (C=0.001)": LogisticRegression(penalty="l2", solver="liblinear", C=0.001),
}

# Store results
cv_scores = {}
train_acc = {}
train_auc = {}
conf_matrices = {}

for name, model in models.items():
    # Perform cross-validation (5-fold)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores[name] = scores.mean()

    # Fit model on full training data
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)  # Training predictions for evaluation
    y_pred_test = model.predict(X_test)  # Predictions on test data

    acc_train = accuracy_score(y_train, y_pred_train)
    auc_train = roc_auc_score(y_train, y_pred_train)
    
    train_acc[name] = acc_train
    train_auc[name] = auc_train
    conf_matrices[name] = confusion_matrix(y_train, y_pred_train)

    print(f"\n{name}:")
    print(f"Cross-Validation Accuracy: {cv_scores[name]:.4f}")
    print(f"Train Set Accuracy: {acc_train:.4f}")
    print(f"Train AUC Score: {auc_train:.4f}")
    print("Train Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))
    print("Train Classification Report:\n", classification_report(y_train, y_pred_train))

    # Save test predictions to CSV
    test_predictions_df = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": y_pred_test})
    test_predictions_df.to_csv(f"{name.replace(' ', '_')}_predictions.csv", index=False)


Simple Logistic Regression:
Cross-Validation Accuracy: 0.7845
Train Set Accuracy: 0.8013
Train AUC Score: 0.7826
Train Confusion Matrix:
 [[474  75]
 [102 240]]
Train Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84       549
           1       0.76      0.70      0.73       342

    accuracy                           0.80       891
   macro avg       0.79      0.78      0.79       891
weighted avg       0.80      0.80      0.80       891


L1 (Lasso) Regularized Logistic Regression (C=0.01):
Cross-Validation Accuracy: 0.7867
Train Set Accuracy: 0.7868
Train AUC Score: 0.7669
Train Confusion Matrix:
 [[468  81]
 [109 233]]
Train Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       549
           1       0.74      0.68      0.71       342

    accuracy                           0.79       891
   macro avg       0.78      0.77      0.77   

In [8]:
warnings.filterwarnings("ignore")

ensemble_model = VotingClassifier(estimators=[
    ('logreg', LogisticRegression(C=0.01, solver='saga')),
    ('random_forest', RandomForestClassifier(n_estimators=200, max_depth=6)),
    ('xgb', XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.2, eval_metric="logloss"))
], voting='soft')

ensemble_model.fit(X_train, y_train)
y_train_pred = ensemble_model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")

Ensemble Model Accuracy: 0.9102
