In [12]:
import numpy as np
import pandas as pd
from numba import jit, njit
from scipy.special import expit, logit

# Load 'spam.data' as a space-separated table without a header row
# (parsed with the pyarrow CSV engine).
df = pd.read_csv('spam.data', engine='pyarrow', sep=' ', header=None)
# Every column except the last is treated as a feature ...
X = df.iloc[:, :-1].to_numpy()
# ... and the last column is the class label, stored as int8.
Y = df.iloc[:, -1].to_numpy(dtype=np.int8)

# Standardisation is currently switched off: X_scale is just a second name
# for the raw feature matrix X (same array object, not a copy).
# X_scale = (X-X.mean(axis=0))/X.std(axis=0)
X_scale = X

In [13]:
@njit()
def sigmoid(x: np.ndarray):
    """Logistic function ``1 / (1 + exp(-x))`` applied element-wise.

    The value is obtained as ``exp(-log(1 + exp(-x)))``, where the inner
    logarithm is delegated to ``np.logaddexp(0, -x)``.

    Parameters
    ----------
    x : np.ndarray
        Input values (logits).

    Returns
    -------
    np.ndarray
        Element-wise sigmoid of ``x``.
    """
    # log(1 + exp(-x)), i.e. the log of the sigmoid's denominator.
    log_denominator = np.logaddexp(0, -x)
    return np.exp(-log_denominator)

@njit()
def confusion_matrix(true, pred):
    '''Computes a confusion matrix for two 1-D integer label arrays.

    ``result[i, j]`` is the number of samples whose true label is ``i`` and
    whose predicted label is ``j``. Labels are used directly as array
    indices, so they must be non-negative integers.

    The matrix is ``K x K`` with ``K = max(label) + 1``, where the maximum is
    taken over BOTH ``true`` and ``pred``. Sizing the matrix from the number
    of distinct values in ``true`` alone is not enough: a label that only
    occurs in ``pred`` (or a gap in the labels of ``true``) would index past
    the end of the matrix, and numba does not bounds-check array indexing by
    default.

    For labels that are exactly ``0..K-1`` the result should agree with
    ``sklearn.metrics.confusion_matrix`` while avoiding the dependency on
    sklearn.

    Parameters
    ----------
    true : np.ndarray
        Ground-truth labels.
    pred : np.ndarray
        Predicted labels, same length as ``true``.

    Returns
    -------
    np.ndarray
        ``(K, K)`` array of dtype int32; ``(0, 0)`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the inputs differ in length or contain a negative label.
    '''
    n = len(true)
    if len(pred) != n:
        raise ValueError("true and pred must have the same length")
    if n == 0:
        # Nothing to count; max() of an empty array is undefined.
        return np.zeros((0, 0), dtype=np.int32)
    if true.min() < 0 or pred.min() < 0:
        # A negative label would wrap around and be counted in the wrong cell.
        raise ValueError("labels must be non-negative integers")

    # Number of classes: large enough for every label seen in either array.
    K = max(int(true.max()), int(pred.max())) + 1
    result = np.zeros((K, K), dtype=np.int32)

    for i in range(n):
        result[true[i], pred[i]] += 1

    return result

In [14]:
@njit()
def discriminant_func(X: np.ndarray, S_inv: np.ndarray, mu_j: np.ndarray, pi_j: float):
    """Linear discriminant score of every row of ``X`` for a single class.

    Evaluates ``X S_inv mu_j - 1/2 mu_j^T S_inv mu_j + log(pi_j)``.

    Parameters
    ----------
    X : np.ndarray
        Sample matrix, one observation per row.
    S_inv : np.ndarray
        Inverse of the covariance matrix.
    mu_j : np.ndarray
        Class mean; it is right-multiplied as-is and left-multiplied through
        ``mu_j.T``, i.e. it is used as a column vector.
    pi_j : float
        Class prior probability.

    Returns
    -------
    np.ndarray
        The discriminant scores, one per row of ``X``.
    """
    log_prior = np.log(pi_j)
    # Term that depends on the sample.
    linear_term = (X @ S_inv) @ mu_j
    # Sample-independent offset coming from the class mean.
    quadratic_term = ((1/2 * mu_j.T) @ S_inv) @ mu_j
    return linear_term - quadratic_term + log_prior

@njit()
def qda_func(X: np.ndarray, S: np.ndarray, mu_j: np.ndarray, pi_j: float):
    """Quadratic discriminant score of every row of ``X`` for a single class.

    Evaluates ``-1/2 log|S| - 1/2 r_j^2 + log(pi_j)`` where
    ``r_j^2 = (x - mu_j)^T S^{-1} (x - mu_j)`` is the squared Mahalanobis
    distance of each row ``x`` to the class mean.

    ``log|S|`` is taken from ``np.linalg.slogdet`` rather than
    ``np.log(np.linalg.det(S))``: the determinant of a covariance matrix with
    many dimensions can underflow to 0 or overflow to inf, which would turn
    the score into +/-inf. ``slogdet`` returns the logarithm of the absolute
    value of the determinant, so a (non positive-definite) ``S`` with a
    negative determinant now yields a finite term instead of NaN.

    Parameters
    ----------
    X : np.ndarray
        Sample matrix, one observation per row.
    S : np.ndarray
        Class covariance matrix (must be invertible).
    mu_j : np.ndarray
        Class mean, broadcastable against the rows of ``X``.
    pi_j : float
        Class prior probability.

    Returns
    -------
    tuple
        ``(delta_j, r_j2)``: the discriminant scores and the squared
        Mahalanobis distances, one entry per row of ``X``.
    """
    S_inv = np.linalg.inv(S)

    # Centre once and reuse for both factors of the quadratic form.
    centered = X - mu_j
    r_j2 = np.sum((centered @ S_inv) * centered, axis=1)

    # Only the log-determinant is needed; the sign is ignored.
    _, log_det = np.linalg.slogdet(S)
    return -1/2*log_det - 1/2*r_j2 + np.log(pi_j), r_j2

def discrinimant_analysis(X: np.ndarray, Y: np.ndarray, atype: str='LDA'):
    """Fit a two-class LDA or QDA model on ``(X, Y)`` and classify ``X`` itself.

    Class priors, means and (maximum-likelihood, i.e. divided by the class
    size) covariance matrices are estimated from the data; the same rows are
    then assigned to the class with the larger discriminant score.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one observation per row.
    Y : np.ndarray
        1-D array of 0/1 class labels, one per row of ``X``.
    atype : str, default 'LDA'
        ``'LDA'`` uses the pooled covariance and linear discriminants; any
        other value selects QDA with one covariance matrix per class.

    Returns
    -------
    np.ndarray
        Predicted 0/1 label for every row of ``X``.

    Raises
    ------
    ValueError
        If one of the two classes has no samples (its mean and covariance
        would be undefined).
    """
    n = len(Y)
    n_0 = (1-Y).sum()
    n_1 = Y.sum()
    if n_0 == 0 or n_1 == 0:
        raise ValueError("both classes (0 and 1) must be present in Y")

    # Prior probabilities estimated from the class frequencies.
    pi_0 = n_0/n
    pi_1 = n_1/n

    X_0 = X[Y == 0, :]
    X_1 = X[Y == 1, :]

    # keepdims=True keeps the means as (1, p) rows so they broadcast against X.
    mu_0 = X_0.mean(axis=0, keepdims=True)
    mu_1 = X_1.mean(axis=0, keepdims=True)

    S_0 = ((X_0 - mu_0).T @ (X_0 - mu_0))/n_0
    S_1 = ((X_1 - mu_1).T @ (X_1 - mu_1))/n_1

    if atype == 'LDA':
        # Pooled covariance: class covariances weighted by class size.
        S = (n_0*S_0 + n_1*S_1)/n
        S_inv = np.linalg.inv(S)

        # discriminant_func expects the class mean as a column vector.
        delta_0 = discriminant_func(X, S_inv, mu_0.T, pi_0)
        delta_1 = discriminant_func(X, S_inv, mu_1.T, pi_1)

        return np.squeeze(delta_1 > delta_0)*1

    # QDA: only the squared Mahalanobis distances are needed from qda_func.
    _, r_02 = qda_func(X, S_0, mu_0, pi_0)
    _, r_12 = qda_func(X, S_1, mu_1, pi_1)

    # log(|S_0| / |S_1|) computed as a difference of log-determinants: the raw
    # determinants can under/overflow, which would make their ratio NaN.
    _, logdet_0 = np.linalg.slogdet(S_0)
    _, logdet_1 = np.linalg.slogdet(S_1)

    # Equivalent to delta_1 > delta_0 for the quadratic discriminants.
    return (r_12 < r_02 + 2*np.log(pi_1/pi_0) + logdet_0 - logdet_1)*1

In [15]:
# Y_lda = discrinimant_analysis(X, Y)
# lda_mat = confusion_matrix(Y, Y_lda)
# lda_mat

In [16]:
# Y_qda = discrinimant_analysis(X, Y, atype='QDA')
# qda_mat = confusion_matrix(Y, Y_qda)
# qda_mat

In [17]:
def logistic_func(X: np.ndarray, beta_j: np.ndarray):
    """Predicted probabilities of a logistic model with an intercept.

    A column of ones is inserted in front of ``X`` so that the first entry of
    ``beta_j`` acts as the intercept; the linear predictor is then mapped
    through the logistic function (``scipy.special.expit``).

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix WITHOUT an intercept column.
    beta_j : np.ndarray
        Coefficients, intercept first.

    Returns
    -------
    np.ndarray
        ``expit([1, X] @ beta_j)``.
    """
    design = np.insert(X, 0, 1, axis=1)
    linear_predictor = design @ beta_j
    return expit(linear_predictor)

def logistic_regression(X: np.ndarray, Y: np.ndarray, beta_j: np.ndarray, p_j: np.ndarray):
    """One iteratively-reweighted-least-squares (Newton) step of logistic regression.

    Solves ``(X^T W X) beta_new = X^T W Z`` with ``W = diag(p_j * (1 - p_j))``
    and working response ``Z = X beta_j + (Y - p_j) / (p_j * (1 - p_j))``.

    Implementation notes:

    * ``W`` is never materialised as an ``n x n`` matrix; the weights are
      applied by broadcasting, which keeps memory linear in the sample count.
    * ``W Z`` is expanded to ``W X beta_j + (Y - p_j)``, which removes the
      division by ``p_j * (1 - p_j)`` and therefore the 0/0 that occurs when
      a probability saturates at exactly 0 or 1.
    * The linear system is solved directly instead of forming an inverse.
    * ``log(1 + exp(t))`` is evaluated with ``np.logaddexp(0, t)`` so large
      logits do not overflow.

    Parameters
    ----------
    X : np.ndarray
        Design matrix of shape ``(n, p)``; it is used as given (no intercept
        column is added here).
    Y : np.ndarray
        0/1 responses, shaped like ``X @ beta_j`` (``(n, 1)`` or ``(n,)``).
    beta_j : np.ndarray
        Current coefficients, ``(p, 1)`` or ``(p,)``.
    p_j : np.ndarray
        Current fitted probabilities, shaped like ``Y``. They are taken as
        given and are not recomputed from ``beta_j``.

    Returns
    -------
    tuple
        ``(beta_new, L)``: the updated coefficients (same shape as
        ``beta_j``) and the negative log-likelihood evaluated at the
        incoming ``beta_j``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T W X`` is singular.
    """
    diag_W = p_j*(1-p_j)
    logit_j = X @ beta_j

    # Negative log-likelihood at the incoming coefficients.
    L = -(Y*logit_j - np.logaddexp(0, logit_j)).sum()

    # X^T W X with W applied row-wise through broadcasting.
    weighted_X = X * np.reshape(diag_W, (-1, 1))
    lhs = X.T @ weighted_X

    # X^T W Z, using W Z = W * logit + (Y - p) to avoid dividing by the weights.
    rhs = X.T @ (diag_W*logit_j + (Y - p_j))

    return np.linalg.solve(lhs, rhs), L

In [18]:
# X1 = np.insert(X_scale, 0, 1, axis=1)
# if len(Y.shape) == 1:
#     Y1 = np.expand_dims(Y, axis=1)
# else:
#     Y1 = Y

# beta_hat = np.zeros((X1.shape[1], 1))
# p_hat = expit(X1 @ beta_hat)

# L_old = L_new = L_diff = np.inf
# tol = 1e-9

# while L_diff > tol:
#     L_old = L_new
#     beta_hat, L_new = logistic_regression(X1, Y1, beta_hat, p_hat)
#     p_hat: np.ndarray = expit(X1 @ beta_hat)
#     p_hat[p_hat == 1.0] = 1-tol
#     L_diff = np.abs(L_old - L_new)
#     print(L_diff)

# Y_pred = 1*(p_hat>0.5).squeeze()
# print(confusion_matrix(Y.squeeze(), Y_pred))

In [19]:
# from sklearn.tree import DecisionTreeClassifier

# classifier = DecisionTreeClassifier(random_state=42)
# classifier.fit(X, Y)
# Y_pred = classifier.predict(X)

# print(confusion_matrix(Y_pred.squeeze(), Y.squeeze()))

In [20]:
# NOTE(review): this import lives in the middle of the notebook; consider
# moving it to the import cell at the top.
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling; random_state is fixed to 42.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Print the row indices of X that fall into the train / test part of each fold.
# train_index and test_index stay bound to the last fold after the loop.
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

Fold 0:
  Train: index=[   0    1    2 ... 4595 4598 4599]
  Test:  index=[   8   12   14   17   19   23   26   29   33   45   51   61   69   70
   73   79   80   84   90   93   95   96   99  100  109  111  113  120
  122  132  134  135  139  144  149  150  151  152  157  166  168  175
  177  179  180  184  188  191  192  196  199  203  205  211  214  220
  230  233  238  239  240  251  252  254  270  274  279  287  289  290
  291  296  297  298  305  308  309  314  315  318  322  332  350  351
  354  356  360  366  367  371  376  387  393  402  410  414  415  416
  426  429  432  433  438  443  445  450  452  457  461  463  465  468
  471  472  478  486  490  494  495  497  505  506  511  530  533  534
  538  544  551  555  561  565  567  568  582  584  586  589  594  596
  598  599  602  605  618  621  625  626  633  642  650  655  657  668
  670  677  680  683  691  693  705  718  720  721  731  734  746  751
  759  764  776  787  789  790  798  800  802  803  805  807  810  811
  8

In [21]:
%reset -f