In [39]:
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from preprocessing import *

In [2]:
np.random.seed(42)

In [6]:
#Load Data
x_train_raw, x_test_raw,y_train, train_ids, test_ids = load_csv_data('/Users/mpecaut/Documents/EPFL/MASTER/MA3/ML/projects/project1/dataset')

In [7]:
#Reformat y_train to have 0's instead of -1's
y_train_original = y_train.copy()
y_train[y_train == -1] = 0

In [8]:
#replace NaN by a float to treat NaN as a categorical feature
x_test_raw = np.nan_to_num(x_train_raw, nan = -10.0)
x_test_raw.shape

(328135, 321)

In [12]:
#look at ratio of heart attack/no heart attack in y
print(len(y_train[y_train == 0])/len(y_train))
print(len(y_train[y_train==1])/len(y_train))
print(len(y_train))


0.911697929205967
0.08830207079403295
328135


In [None]:
num_features = x_train_raw.shape[1]
numerical_idx = {}
categorical_idx = {}
for feature in range(num_features) : 
    full_column = x_train_raw[:,feature]
    nb_unique_vals = len(np.unique(full_column))
    if nb_unique_vals < 20 : 
        categorical_idx[feature] = nb_unique_vals
    else : 
        numerical_idx[feature] = nb_unique_vals

display(categorical_idx)
display(numerical_idx)

#to test my one hot encode, i use feature 1 on following cell 

{1: 12,
 3: 12,
 5: 2,
 6: 2,
 9: 2,
 10: 3,
 11: 2,
 12: 2,
 13: 3,
 14: 3,
 15: 17,
 16: 13,
 17: 12,
 18: 2,
 19: 2,
 20: 3,
 21: 3,
 22: 2,
 23: 3,
 24: 5,
 26: 8,
 30: 4,
 31: 5,
 32: 5,
 33: 8,
 34: 7,
 35: 5,
 36: 4,
 37: 7,
 38: 5,
 39: 4,
 40: 4,
 41: 5,
 42: 5,
 43: 4,
 44: 4,
 45: 5,
 46: 4,
 47: 4,
 48: 7,
 50: 2,
 51: 7,
 52: 7,
 53: 5,
 54: 5,
 55: 9,
 56: 5,
 57: 5,
 58: 9,
 60: 11,
 61: 5,
 64: 5,
 65: 5,
 66: 5,
 67: 5,
 68: 5,
 69: 5,
 70: 5,
 71: 5,
 72: 5,
 73: 6,
 74: 5,
 75: 11,
 76: 6,
 87: 5,
 95: 5,
 96: 5,
 97: 6,
 98: 14,
 99: 9,
 100: 5,
 102: 14,
 103: 5,
 104: 5,
 106: 12,
 107: 5,
 108: 6,
 109: 4,
 115: 8,
 116: 5,
 117: 5,
 118: 6,
 119: 18,
 120: 8,
 121: 7,
 122: 16,
 123: 5,
 124: 5,
 125: 9,
 126: 5,
 127: 9,
 128: 8,
 129: 8,
 130: 11,
 131: 9,
 132: 5,
 133: 5,
 134: 4,
 135: 4,
 136: 5,
 137: 8,
 138: 8,
 139: 8,
 140: 8,
 141: 5,
 142: 5,
 144: 5,
 146: 4,
 147: 9,
 148: 12,
 149: 19,
 151: 9,
 152: 8,
 153: 7,
 154: 10,
 155: 5,
 156: 5,
 157: 

{0: 53,
 2: 414,
 4: 31,
 7: 21780,
 8: 21780,
 25: 26,
 27: 34,
 28: 33,
 29: 34,
 49: 99,
 59: 26,
 62: 551,
 63: 143,
 77: 41,
 78: 49,
 79: 44,
 80: 59,
 81: 110,
 82: 135,
 83: 109,
 84: 116,
 85: 104,
 86: 134,
 88: 78,
 89: 116,
 90: 195,
 91: 79,
 92: 100,
 93: 169,
 94: 102,
 101: 29,
 105: 413,
 110: 80,
 111: 73,
 112: 48,
 113: 42,
 114: 46,
 143: 137,
 145: 73,
 150: 26,
 195: 94,
 197: 93,
 219: 1306,
 220: 1305,
 222: 5528,
 226: 20453,
 228: 107,
 229: 216913,
 248: 63,
 250: 53,
 251: 104,
 252: 542,
 253: 3525,
 262: 36,
 264: 237,
 266: 98,
 267: 122,
 268: 97,
 269: 103,
 270: 93,
 271: 118,
 276: 501,
 277: 1025,
 285: 30,
 286: 30,
 287: 163,
 288: 156,
 291: 193,
 292: 167,
 293: 111,
 294: 95,
 295: 730,
 296: 577,
 297: 97,
 299: 858,
 300: 673,
 301: 2667,
 302: 535,
 303: 416,
 304: 1374}

In [38]:
x_train_one_hot_encode = x_train_raw[:,1]
unique_vals = np.unique(x_train_one_hot_encode)
print(np.unique(x_train_one_hot_encode)) #my array has value from 1 to 12 : 12 categories -> want to create Nx12 matrix filled up with 0 except for when sample N belong to category i : filled with 1 
#create maping : 
mapping = {float(val) : i for i,val in enumerate(unique_vals)}
display(mapping)
x_mapped =  np.array([mapping[v] for v in x_train_one_hot_encode])
x_encoded = np.eye(len(unique_vals))[x_mapped]

display(x_encoded)
#x_train_encoded = np.eye(len(np.unique(x_train_one_hot_encode)))[x_train_one_hot_encode]

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]


{1.0: 0,
 2.0: 1,
 3.0: 2,
 4.0: 3,
 5.0: 4,
 6.0: 5,
 7.0: 6,
 8.0: 7,
 9.0: 8,
 10.0: 9,
 11.0: 10,
 12.0: 11}

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(328135, 12))

In [None]:
numerical_idx, categorical_idx = find_categorical(x_train_raw)

In [43]:
for i in categorical_idx: 
    x = x_train_raw[:,i]
    print(x)

[11. 12. 10. ... 10. 12.  9.]
[11. 12. 10. ... 10. 12.  9.]
[2015. 2015. 2015. ... 2015. 2015. 2015.]
[1100. 1200. 1100. ... 1100. 1100. 1100.]
[nan  1.  1. ...  1.  1.  1.]
[nan  1.  1. ...  1.  1.  1.]
[nan nan nan ... nan nan nan]
[nan  1.  1. ...  1.  1.  1.]
[nan  1.  2. ...  1.  1.  2.]
[nan nan nan ... nan nan nan]
[nan  2.  2. ...  1.  3.  2.]
[nan  1.  1. ...  0.  2.  1.]
[nan  1.  1. ...  1.  1.  1.]
[ 1. nan nan ... nan nan nan]
[ 1. nan nan ... nan nan nan]
[ 2. nan nan ... nan nan nan]
[ 1. nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[ 1. nan nan ... nan nan nan]
[ 1. nan nan ... nan nan nan]
[2. 4. 2. ... 3. 3. 2.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 2.]
[2. 2. 2. ... 2. 2. 1.]
[1. 1. 1. ... 1. 1. 1.]
[3. 1. 3. ... 3. 3. 1.]
[nan  1. nan ... nan nan  2.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
[2. 2. 1. ... 1. 2. 2.]
[2. 2. 2. ... 2. 2. 2.]
[2. 2. 2. ... 2. 1. 2.]
[nan nan nan ... nan  1. nan]
[2. 2. 2. ... 2. 2. 2.]
[2. 2. 2. ... 2. 2. 2.]
[2. 

### Malamud function for OLS - just did copy past

In [7]:

def ols_matrix(X, y):
    """Perform OLS regression using matrix inversion."""
    X_mat = np.column_stack((np.ones(len(X)), X))  # Add intercept term
    beta = np.linalg.pinv(X_mat.T @ X_mat) @ X_mat.T @ y  # OLS formula: (X'X)^-1 X'Y
    y_pred = X_mat @ beta  # Predicted values

    # Compute residuals
    residuals = y - y_pred
    n, k = X_mat.shape

    # Compute residual variance
    residual_var = np.sum(residuals**2) / (n - k)

    # Compute standard errors
    XTX_inv = np.linalg.inv(X_mat.T @ X_mat)
    std_errors = np.sqrt(np.diag(residual_var * XTX_inv))

    # Compute t-statistics
    t_stats = beta / std_errors

   

    return {
        'Intercept': beta[0],
        'Coefficient': beta[1],
        'Std Error (Coef)': std_errors[1],
        'T-Statistic (Coef)': t_stats[1],
        'T-Statistic (Intercept)': t_stats[0]
    }



In [15]:
ols_reg = ols_matrix(x_train,y_train)

LinAlgError: SVD did not converge