In [137]:
from itertools import chain, combinations

import numpy as np
import pandas as pd
from scipy import stats as st

np.set_printoptions(suppress = True)
pd.set_option('display.width', 100)

data = pd.read_csv('coris.csv', usecols=lambda x: x != 'row.names')

X_features = data.columns.to_list()
X_features.remove('chd')
Y = data['chd'].to_numpy()[:, None]
data

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.00,5.73,23.11,1,49,25.30,97.20,52,1
1,144,0.01,4.41,28.61,0,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,1,52,29.14,3.81,46,0
3,170,7.50,6.41,38.03,1,51,31.99,24.26,58,1
4,134,13.60,3.50,27.78,1,60,25.99,57.34,49,1
...,...,...,...,...,...,...,...,...,...,...
457,214,0.40,5.98,31.72,0,64,28.45,0.00,58,0
458,182,4.20,4.41,32.10,0,52,28.61,18.72,52,1
459,108,3.00,1.59,15.23,0,40,20.09,26.64,55,0
460,118,5.40,11.61,30.79,0,64,27.35,23.97,40,0


In [138]:
def regression(X: np.ndarray, Y: np.ndarray, feature_set: list[str] = None):
    # Num of samples
    n = X.shape[0]

    # Insert (1, X)
    X1 = np.insert(X, 0, 1, axis=1)

    # Num of features
    k = X1.shape[1]

    X_inv = np.linalg.inv(X1.T @ X1)

    # Fitted beta
    beta_hat: np.ndarray = X_inv @ X1.T @ Y

    # Residuals
    epsilon_hat: np.ndarray = Y - X1 @ beta_hat
    # Residuals error
    rss = (epsilon_hat.T @ epsilon_hat).squeeze()
    sigma2_hat = 1/(n-k) * rss

    se_hat = np.sqrt(np.diag( sigma2_hat * X_inv ))[:, None]

    # t-statistics
    t_scores = beta_hat / se_hat
    p_values = 2 * (1 - st.t.cdf(np.abs(t_scores), n-1))

    fit_results = np.concatenate((beta_hat, se_hat, t_scores, p_values), axis=1)
    index = ['const']+list(feature_set) if feature_set is not None else None
    fit_df = pd.DataFrame(
        fit_results, 
        index=index, 
        columns=['beta_hat', 'se_hat', 't_scores', 'p_values']
    )

    return fit_df, rss, sigma2_hat

def log_likelihood(rss: float, n: int) -> float:
    return -n/2*np.log(2*np.pi) - n/2*np.log(rss/n) - n/2

def aic(S: tuple[str] | list[str], data: pd.DataFrame, Y: np.ndarray):
    X = data[list(S)].to_numpy()
    _, rss, _ = regression(X, Y)

    n = X.shape[0]
    k = X.shape[1]

    return 2*(k+1) - 2*log_likelihood(rss, n)

In [139]:
# Backward

S = X_features.copy()
score = (aic(S, data, Y), S)

for _ in range(len(X_features)):
    for x_feature in S:
        S_new = S.copy()
        S_new.remove(x_feature)

        if (aic_score := aic(S_new, data, Y)) < score[0]:
            print(len(S_new), score)
            print()
            score = (aic_score, S_new)

    S = score[1]

8 (520.4113241731397, ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age'])

8 (520.0446551788808, ['tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age'])

8 (518.6494770521196, ['sbp', 'tobacco', 'ldl', 'famhist', 'typea', 'obesity', 'alcohol', 'age'])

7 (518.494546311615, ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'age'])

7 (518.0704768681712, ['tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'age'])

6 (516.7173170095111, ['sbp', 'tobacco', 'ldl', 'famhist', 'typea', 'obesity', 'age'])



In [140]:
%reset -f