In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
# Read the dataset: "Survived" is the label column, every other column is a feature.
df = pd.read_csv("titanic_modified_dataset.csv")
y_target = df["Survived"].to_numpy()
raw_features = df.drop(columns=["Survived"]).to_numpy()
# Prepend a column of ones so that the first weight acts as the bias term.
bias_column = np.ones((raw_features.shape[0], 1))
X_features = np.hstack((bias_column, raw_features))
# Quick sanity check of the loaded frame and the resulting array shapes.
df.head(), df.shape, X_features.shape, y_target.shape

In [None]:
# Seed shared by both splits here and by the weight initialisation later on.
random_state = 2

# Stage 1: hold out 20% of all rows as the validation set.
X_rest, X_val, y_rest, y_val = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=random_state,
)
# Stage 2: carve the test set out of the remaining 80%
# (0.125 * 0.8 = 0.1, i.e. about 10% of all rows); what is left is for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_rest,
    y_rest,
    test_size=0.125,
    random_state=random_state,
)

X_train.shape, X_val.shape, X_test.shape

In [None]:
normalizer = StandardScaler()
# Learn the scaling statistics from the training split only; column 0 is the
# bias column and is excluded from both fitting and scaling.
normalizer.fit(X_train[:, 1:])
# Apply the same training-derived scaling to every split, in place.
for split in (X_train, X_val, X_test):
    split[:, 1:] = normalizer.transform(split[:, 1:])

In [None]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Only non-positive values are ever exponentiated, so a large |z| can
    underflow to 0 but never overflow (the naive formula overflows in
    ``np.exp(-z)`` for very negative ``z`` and emits a RuntimeWarning).

    Args:
        z: Scalar or array-like of real values.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the shape of
        ``z``, holding values in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1] for every finite z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same value, rearranged.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as arrays.
    return result[()]


def predict(X, theta):
    """Return sigmoid(X . theta), the model's output for each row of X.

    Args:
        X: Feature array passed as the left operand of ``np.dot``.
        theta: Weight vector passed as the right operand of ``np.dot``.
    """
    return sigmoid(np.dot(X, theta))


def compute_loss(y_hat, y):
    """Mean binary cross-entropy between predictions ``y_hat`` and labels ``y``.

    Predictions are clipped to [1e-7, 1 - 1e-7] before taking logarithms so
    that log(0) can never occur.
    """
    eps = 1e-7
    clipped = np.clip(y_hat, eps, 1 - eps)
    per_sample = -(y * np.log(clipped) + (1 - y) * np.log(1 - clipped))
    return per_sample.mean()


def compute_gradient(X, y, y_hat):
    """Return X^T (y_hat - y) divided by the number of labels in ``y``.

    Args:
        X: Feature array for the batch.
        y: Label array; its ``size`` is used as the divisor.
        y_hat: Predictions for the same rows.
    """
    residual = y_hat - y
    return np.dot(X.T, residual) / y.size


def update_theta(theta, gradient, learning_rate):
    """Take one gradient-descent step and return the new weights.

    The result is ``theta - learning_rate * gradient``; ``theta`` itself is
    not modified.
    """
    step = learning_rate * gradient
    return theta - step


def compute_accuracy(y_hat, y):
    """Fraction of predictions that equal the labels after rounding.

    ``y_hat`` is rounded with ``np.round`` and compared element-wise to ``y``;
    the mean of the resulting boolean matches is returned.
    """
    hits = np.round(y_hat) == y
    return np.mean(hits)

In [None]:
# Hyperparameters for mini-batch gradient descent.
lr = 0.01
epochs = 100
batch_size = 16

# Seed NumPy's global RNG so the initial weights are reproducible, then draw
# one weight per feature column (bias column included).
np.random.seed(random_state)
theta = np.random.uniform(size=X_train.shape[1])

# Per-epoch history; one entry is appended to each list per epoch.
train_accs = []
train_losses = []
val_accs = []
val_losses = []


for epoch in range(epochs):
    train_batch_losses = []
    train_batch_accuracies = []
    train_batch_sizes = []

    # Walk over the training rows in consecutive slices of `batch_size`;
    # the last slice may be shorter.
    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[i : i + batch_size]
        y_batch = y_train[i : i + batch_size]

        # Metrics are recorded with the weights as they were before this step.
        y_hat = predict(X_batch, theta)
        train_batch_losses.append(compute_loss(y_hat, y_batch))
        train_batch_accuracies.append(compute_accuracy(y_hat, y_batch))
        train_batch_sizes.append(y_batch.size)

        gradient = compute_gradient(X_batch, y_batch, y_hat)
        theta = update_theta(theta, gradient, lr)

    # Weight each batch by its row count so the epoch figure is a true
    # per-sample mean even when the last batch is shorter than the others.
    train_losses.append(np.average(train_batch_losses, weights=train_batch_sizes))
    train_accs.append(np.average(train_batch_accuracies, weights=train_batch_sizes))

    # Validate once per epoch with the end-of-epoch weights, instead of
    # re-scoring the whole validation set after every mini-batch and averaging
    # over intermediate weights.
    y_val_hat = predict(X_val, theta)
    val_loss = compute_loss(y_val_hat, y_val)
    val_accuracy = compute_accuracy(y_val_hat, y_val)
    val_losses.append(val_loss)
    val_accs.append(val_accuracy)

    print(
        f"Epoch {epoch + 1}/{epochs} - "
        f"Train Loss: {train_losses[-1]:.4f}, "
        f"Train Accuracy: {train_accs[-1]:.4f}, "
        f"Validation Loss: {val_losses[-1]:.4f}, "
        f"Validation Accuracy: {val_accs[-1]:.4f}"
    )

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(12, 10))

# One entry per panel: (axes, per-epoch history, y-axis label, title, line kwargs).
# Training curves use the default line colour; validation curves are orange.
panels = [
    (ax[0, 0], train_losses, "Loss", "Training Loss", {}),
    (ax[0, 1], val_losses, "Loss", "Validation Loss", {"color": "orange"}),
    (ax[1, 0], train_accs, "Accuracy", "Training Accuracy", {}),
    (ax[1, 1], val_accs, "Accuracy", "Validation Accuracy", {"color": "orange"}),
]

for axis, history, y_label, title, line_kwargs in panels:
    axis.plot(history, **line_kwargs)
    axis.set(xlabel="Epoch", ylabel=y_label)
    axis.set_title(title)

plt.show()

In [None]:
# Score the final weights on the validation and test splits.
val_set_acc, test_set_acc = (
    compute_accuracy(predict(features, theta), labels)
    for features, labels in ((X_val, y_val), (X_test, y_test))
)
print("Evaluation on validation and test set:")
print(f"Validation Accuracy: {val_set_acc}")
print(f"Test Accuracy: {test_set_acc}")