In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Load the dataset from titanic.csv; the path is relative to the notebook's working directory.
df = pd.read_csv("titanic.csv")

In [3]:
# Impute missing values: the median for Age and the most frequent value for Embarked.
age_median = df['Age'].median()
embarked_mode = df['Embarked'].mode()[0]
df = df.fillna({'Age': age_median, 'Embarked': embarked_mode})

def get_title(name):
    """Extract the honorific from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period (e.g. ``"Mr"`` in
    ``"Braund, Mr. Owen Harris"``).

    Returns the title without the period, or an empty string when the name
    contains no such token.
    """
    match = re.search(r' ([A-Za-z]+)\.', name)
    return match.group(1) if match else ""

# Derive a Title feature from the raw passenger names.
df['Title'] = df['Name'].apply(get_title)

# Collapse the uncommon titles into a single 'Rare' bucket and fold
# Mlle/Ms into Miss and Mme into Mrs, all in one replacement pass.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {title: 'Rare' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
df['Title'] = df['Title'].replace(title_map)

# Family_Size = SibSp + Parch + 1; a size of exactly 1 is flagged as travelling alone.
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
df['Is_Alone'] = (df['Family_Size'] == 1).astype(int)

# Encode the categoricals: binary Sex, one-hot for the rest (first level dropped).
df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
df = pd.get_dummies(df, columns=['Embarked', 'Title', 'Is_Alone'], drop_first=True)

# Drop the columns that are not used as model inputs from here on.
df = df.drop(columns=['Name', 'Ticket', 'SibSp', 'Parch', 'PassengerId', 'Cabin'])

In [4]:
def split_data(X, y, ts=0.2, rs=42):
    """Shuffle the samples and split them into train and test partitions.

    Parameters
    ----------
    X : array-like, first axis indexes samples
    y : array-like with the same number of samples as ``X``
    ts : float
        Fraction of samples assigned to the test partition.
    rs : int
        Seed for the shuffle.

    Returns
    -------
    (X_train, X_test, y_train, y_test)

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}"
        )

    # Use a private legacy generator instead of np.random.seed(): it yields the
    # same permutation for a given seed but does not reseed the global NumPy
    # RNG that other code in the session may rely on.
    rng = np.random.RandomState(rs)
    idx = np.arange(X.shape[0])
    rng.shuffle(idx)

    X = X[idx]
    y = y[idx]
    sp = int((1 - ts) * X.shape[0])  # index of the first test sample
    return X[:sp], X[sp:], y[:sp], y[sp:]

def norm(X_tr, X_te):
    """Min-max scale both partitions using statistics from the training set only.

    Each column is shifted by the training minimum and divided by the training
    range (plus 1e-8 so a constant column does not divide by zero). Test values
    outside the training range therefore fall outside [0, 1].

    Returns the scaled ``(X_tr, X_te)`` pair.
    """
    lo = X_tr.min(axis=0)
    hi = X_tr.max(axis=0)
    span = hi - lo + 1e-8
    return (X_tr - lo) / span, (X_te - lo) / span

def accuracy(yt, yp):
    """Return the fraction of positions where the prediction equals the true label.

    Parameters
    ----------
    yt, yp : array-like
        True and predicted labels, compared element-wise.
    """
    # Coerce first: comparing two plain lists with == yields a single bool,
    # which would make the score silently collapse to 0.0 or 1.0.
    yt = np.asarray(yt)
    yp = np.asarray(yp)
    return np.mean(yt == yp)

def precision(yt, yp):
    """Return TP / (TP + FP) for binary labels where 1 is the positive class.

    Parameters
    ----------
    yt, yp : array-like of 0/1
        True and predicted labels.

    A 1e-8 term in the denominator keeps the result finite (and equal to 0)
    when nothing was predicted positive.
    """
    # Coerce first: with plain lists `yt == 1` is just False and the score
    # would silently come out as 0.
    yt = np.asarray(yt)
    yp = np.asarray(yp)
    tp = np.sum((yt == 1) & (yp == 1))
    fp = np.sum((yt == 0) & (yp == 1))
    return tp / (tp + fp + 1e-8)

def recall(yt, yp):
    """Return TP / (TP + FN) for binary labels where 1 is the positive class.

    Parameters
    ----------
    yt, yp : array-like of 0/1
        True and predicted labels.

    A 1e-8 term in the denominator keeps the result finite (and equal to 0)
    when there are no true positives in ``yt``.
    """
    # Coerce first: with plain lists `yt == 1` is just False and the score
    # would silently come out as 0.
    yt = np.asarray(yt)
    yp = np.asarray(yp)
    tp = np.sum((yt == 1) & (yp == 1))
    fn = np.sum((yt == 1) & (yp == 0))
    return tp / (tp + fn + 1e-8)

def f1_score(yt, yp):
    """Return the harmonic mean of precision and recall for the given labels.

    Both inputs are forwarded unchanged to ``precision`` and ``recall``. A 1e-8
    term in the denominator keeps the result finite when both scores are zero.
    """
    p, r = precision(yt, yp), recall(yt, yp)
    numerator = 2 * (p * r)
    denominator = p + r + 1e-8
    return numerator / denominator

def confusion_matrix(yt, yp):
    """Return the 2x2 confusion matrix for binary 0/1 labels.

    Parameters
    ----------
    yt, yp : array-like of 0/1
        True and predicted labels.

    Returns
    -------
    numpy.ndarray
        ``[[tn, fp], [fn, tp]]`` -- rows are the true class, columns the
        predicted class, both ordered 0 then 1.
    """
    # Coerce first: with plain lists `yt == 0` is just False and every cell
    # would silently come out as 0.
    yt = np.asarray(yt)
    yp = np.asarray(yp)
    tn = np.sum((yt == 0) & (yp == 0))
    fp = np.sum((yt == 0) & (yp == 1))
    fn = np.sum((yt == 1) & (yp == 0))
    tp = np.sum((yt == 1) & (yp == 1))
    return np.array([[tn, fp], [fn, tp]])

In [5]:
# Extract features and target as NumPy arrays. The feature matrix is cast to
# float64 at extraction: if the frame mixes dtypes (e.g. bool one-hot columns
# next to numeric ones) a plain `.values` gives an object array, and the
# min-max scaling below would then run on Python objects and need casting
# afterwards.
X = df.drop(columns=['Survived']).to_numpy(dtype=np.float64)
y = df['Survived'].to_numpy()

X_tr, X_te, y_tr, y_te = split_data(X, y, ts=0.2, rs=42)

# Scaling statistics come from the training partition only; both outputs are
# float64 because the inputs already are.
X_tr_n, X_te_n = norm(X_tr, X_te)

# The models do float arithmetic on the training labels.
y_tr = np.array(y_tr, dtype=np.float64)

In [7]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Attributes set by ``fit``:
        components: array of shape (n_features, n_components); each column is
            one principal axis, ordered by decreasing eigenvalue.
        mean: per-feature mean of the data passed to ``fit``.
        explained_variance: for each kept component, its eigenvalue divided by
            the sum of all eigenvalues (i.e. a ratio, not a raw variance).
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance = None

    def fit(self, X):
        """Learn the feature means and the leading principal axes of ``X``.

        ``X`` is a 2-D array with one sample per row.
        """
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        # atleast_2d: np.cov returns a 0-d array when there is a single feature.
        cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

        # The covariance matrix is symmetric, so use the symmetric solver.
        # The general-purpose eig() can return complex eigenpairs from
        # round-off alone; eigh() always returns real eigenvalues and an
        # orthonormal set of real eigenvectors.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # eigh() sorts ascending; we want the largest-variance axes first.
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]

        self.components = eigenvectors[:, :self.n_components]
        self.explained_variance = eigenvalues[:self.n_components] / np.sum(eigenvalues)

    def transform(self, X):
        """Project ``X`` onto the fitted principal axes (requires a prior ``fit``)."""
        X_centered = X - self.mean
        return np.dot(X_centered, self.components)

    def fit_transform(self, X):
        """Fit on ``X`` and return its projection in one call."""
        self.fit(X)
        return self.transform(X)


In [8]:
# Fit the projection on the training partition only, then apply that same
# projection to both partitions.
pca = PCA(n_components=8)
pca.fit(X_tr_n)
X_tr_pca = pca.transform(X_tr_n)
X_te_pca = pca.transform(X_te_n)

In [9]:
# Report the feature count before/after PCA and the variance share of each kept component.
print(
    f"orginal :  {X_tr_n.shape[1]}",
    f"reduced: {X_tr_pca.shape[1]}",
    f"explained variance: {pca.explained_variance}",
    f"cumulative variance: {np.cumsum(pca.explained_variance)}",
    sep="\n",
)

orginal :  12
reduced: 8
explained variance: [0.39486244 0.16463508 0.13487671 0.11752416 0.09083875 0.03631981
 0.02893547 0.01408696]
cumulative variance: [0.39486244 0.55949752 0.69437423 0.81189839 0.90273715 0.93905696
 0.96799242 0.98207939]


In [10]:
class SVM:
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    The decision function is ``X @ w - b``. Training makes ``n_iters`` passes
    over the samples in their given order, updating after every sample.

    Parameters
    ----------
    learning_rate : float
        Step size of each update.
    lambda_param : float
        Strength of the L2 penalty on ``w``.
    n_iters : int
        Number of full passes over the training data.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Fit ``w`` and ``b`` on ``X`` (2-D array, one sample per row) and labels ``y``.

        Labels ``<= 0`` are treated as the negative class (-1), everything
        else as the positive class (+1).
        """
        n_features = X.shape[1]
        y_ = np.where(np.asarray(y) <= 0, -1, 1)
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                margin_ok = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if margin_ok:
                    # Outside the margin: only the regulariser contributes.
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    # Inside the margin or misclassified: hinge term is active.
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]

    def predict(self, X):
        """Return the predicted class of each row of ``X`` as -1.0 or +1.0.

        Points lying exactly on the decision boundary are assigned +1 so the
        result never contains 0, which is not a valid class label (``np.sign``
        would return 0 there).
        """
        approx = np.dot(X, self.w) - self.b
        return np.where(approx >= 0, 1.0, -1.0)

In [11]:
# Train the linear SVM on the scaled features without PCA.
svm_no_pca = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm_no_pca.fit(X_tr_n, y_tr)

# Map the model's -1 predictions to 0 (everything else to 1) so they can be
# compared against the 0/1 labels in y_te.
y_pred_svm_no_pca = np.where(svm_no_pca.predict(X_te_n) == -1, 0, 1)

In [12]:
# Score the no-PCA SVM on the held-out partition.
acc_svm_no_pca, prec_svm_no_pca, rec_svm_no_pca, f1_svm_no_pca = (
    metric(y_te, y_pred_svm_no_pca)
    for metric in (accuracy, precision, recall, f1_score)
)
cm_svm_no_pca = confusion_matrix(y_te, y_pred_svm_no_pca)

In [14]:
# Print the held-out metrics for the SVM trained without PCA.
print(
    f"accuracy:  {acc_svm_no_pca:.4f} ({acc_svm_no_pca*100:.2f}%)",
    f"precision: {prec_svm_no_pca:.4f}",
    f"recall:    {rec_svm_no_pca:.4f}",
    f"F1:  {f1_svm_no_pca:.4f}",
    sep="\n",
)
print("\nconfusion matrix:")
print(cm_svm_no_pca)

accuracy:  0.8101 (81.01%)
precision: 0.7541
recall:    0.7077
F1:  0.7302

confusion matrix:
[[99 15]
 [19 46]]


In [18]:
print("SVM WITH PCA\n")

# Same hyperparameters as the no-PCA run, trained on the PCA projection instead.
svm_with_pca = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm_with_pca.fit(X_tr_pca, y_tr)

# Map the model's -1 predictions to 0 (everything else to 1) to match y_te.
y_pred_svm_pca = np.where(svm_with_pca.predict(X_te_pca) == -1, 0, 1)

acc_svm_pca, prec_svm_pca, rec_svm_pca, f1_svm_pca = (
    metric(y_te, y_pred_svm_pca)
    for metric in (accuracy, precision, recall, f1_score)
)
cm_svm_pca = confusion_matrix(y_te, y_pred_svm_pca)

print(
    f"accuracy:  {acc_svm_pca:.4f} ({acc_svm_pca*100:.2f}%)",
    f"precision: {prec_svm_pca:.4f}",
    f"recall:    {rec_svm_pca:.4f}",
    f"f1:  {f1_svm_pca:.4f}",
    sep="\n",
)
print("\nconfusion matrix:")
print(cm_svm_pca)


SVM WITH PCA

accuracy:  0.8101 (81.01%)
precision: 0.7541
recall:    0.7077
f1:  0.7302

confusion matrix:
[[99 15]
 [19 46]]


In [22]:
# Same results with or without PCA for the SVM.

In [23]:
class LogReg:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Gradient-descent step size.
    itr : int
        Number of gradient steps performed by ``fit``.

    Attributes set by ``fit``:
        w: weight vector, one entry per feature.
        b: scalar bias.
        loss: list with one cross-entropy value per iteration of the most
            recent ``fit`` call.
    """

    def __init__(self, lr=0.01, itr=1000):
        self.lr = lr
        self.itr = itr
        self.w = None
        self.b = None
        self.loss = []

    def sigmoid(self, z):
        """Logistic function; ``z`` is clipped to [-500, 500] so ``exp`` cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def fit(self, X, y):
        """Fit weights and bias on ``X`` (2-D array, one sample per row) and 0/1 labels ``y``."""
        y = np.asarray(y, dtype=np.float64)
        n = X.shape[0]
        m = X.shape[1]
        self.w = np.zeros(m)
        self.b = 0
        # Start a fresh history: the parameters are re-initialised above, so
        # losses left over from an earlier fit() would not belong to this run.
        self.loss = []

        for i in range(self.itr):
            z = np.dot(X, self.w) + self.b
            yp = self.sigmoid(z)
            dw = (1/n) * np.dot(X.T, (yp - y))
            db = (1/n) * np.sum(yp - y)
            self.w -= self.lr * dw
            self.b -= self.lr * db
            # Cross-entropy of the predictions made before this step's update;
            # the 1e-8 keeps log() away from zero.
            l = -(1/n) * np.sum(y * np.log(yp + 1e-8) + (1 - y) * np.log(1 - yp + 1e-8))
            self.loss.append(l)

    def predict_proba(self, X):
        """Return the sigmoid of the linear score for each row of ``X``."""
        z = np.dot(X, self.w) + self.b
        return self.sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Return 0/1 predictions: 1 where ``predict_proba`` is at least ``threshold``."""
        return (self.predict_proba(X) >= threshold).astype(int)

In [34]:
print("\nLOGISTIC REGRESSION WITHOUT PCA\n")

# Train logistic regression on the scaled features without PCA.
logreg_no_pca = LogReg(lr=0.1, itr=2000)
logreg_no_pca.fit(X_tr_n, y_tr)
y_pred_logreg_no_pca = logreg_no_pca.predict(X_te_n)

acc_logreg_no_pca, prec_logreg_no_pca, rec_logreg_no_pca, f1_logreg_no_pca = (
    metric(y_te, y_pred_logreg_no_pca)
    for metric in (accuracy, precision, recall, f1_score)
)
cm_logreg_no_pca = confusion_matrix(y_te, y_pred_logreg_no_pca)

print(
    f"accuracy:  {acc_logreg_no_pca:.4f} ({acc_logreg_no_pca*100:.2f}%)",
    f"precision: {prec_logreg_no_pca:.4f}",
    f"recall:    {rec_logreg_no_pca:.4f}",
    f"f1:  {f1_logreg_no_pca:.4f}",
    sep="\n",
)
print("\nconfusion matrix:")
print(cm_logreg_no_pca)




LOGISTIC REGRESSION WITHOUT PCA

accuracy:  0.8156 (81.56%)
precision: 0.7667
recall:    0.7077
f1:  0.7360

confusion matrix:
[[100  14]
 [ 19  46]]


In [35]:


print("\nLOGISTIC REGRESSION WITH PCA\n")

# Same hyperparameters as the no-PCA run, trained on the PCA projection instead.
logreg_pca = LogReg(lr=0.1, itr=2000)
logreg_pca.fit(X_tr_pca, y_tr)
y_pred_logreg_pca = logreg_pca.predict(X_te_pca)

acc_logreg_pca, prec_logreg_pca, rec_logreg_pca, f1_logreg_pca = (
    metric(y_te, y_pred_logreg_pca)
    for metric in (accuracy, precision, recall, f1_score)
)
cm_logreg_pca = confusion_matrix(y_te, y_pred_logreg_pca)

print(
    f"accuracy:  {acc_logreg_pca:.4f} ({acc_logreg_pca*100:.2f}%)",
    f"precision: {prec_logreg_pca:.4f}",
    f"recall:    {rec_logreg_pca:.4f}",
    f"f1:  {f1_logreg_pca:.4f}",
    sep="\n",
)
print("\nconfusion matrix:")
print(cm_logreg_pca)




LOGISTIC REGRESSION WITH PCA

accuracy:  0.8324 (83.24%)
precision: 0.7692
recall:    0.7692
f1:  0.7692

confusion matrix:
[[99 15]
 [15 50]]
