In [43]:
import numpy as np
import matplotlib.pyplot as plt
import random
try:
    import importlib
    importlib.reload(h)
    importlib.reload(f)
    importlib.reload(d)
except NameError: # It hasn't been imported yet
    import helpers as h
    import implementations as f
    import data_processing as d

# Data preprocessing and feature selections

## Data preprocessing

In [3]:
#For this to work, the data folder needs to be one level above the project folder and the folder name needs
#to be 'data'
data_folder = '../data/'
x_train, x_test, y_train, train_ids, test_ids = h.load_csv_data(data_folder)


In [4]:
#load all the features names and remove the ID column
features_name = np.genfromtxt('../data/x_train.csv', delimiter=',', dtype=str, max_rows=1)[1:] 

## Feature selection

one paper on internet suggests to use these features : 

 _RFHYPE5, TOLDHI2, _CHOLCHK, _BMI5, SMOKE100, CVDSTRK3, DIABETE3, _TOTINDA, _FRTLT1, _VEGLT1, _RFDRHV5, HLTHPLN1, MEDCOST, GENHLTH, MENTHLTH, PHYSHLTH, DIFFWALK, SEX, _AGEG5YR, EDUCA, and INCOME2

 then, iterating through them, it removes the missing values, made the data binary when possible, removed the 'don't know, not sure', and ordinal (categorical) variables ares changed to 0,1,2,..., and renamed them

# Main

In [5]:
#Select the important features
features_list = ['_RFHYPE5', 'TOLDHI2', '_CHOLCHK', '_BMI5', 'SMOKE100', 'CVDSTRK3', 'DIABETE3', '_TOTINDA', '_FRTLT1', '_VEGLT1', '_RFDRHV5', 
                 'HLTHPLN1', 'MEDCOST', 'GENHLTH', 'MENTHLTH', 'PHYSHLTH', 'DIFFWALK', 'SEX', '_AGEG5YR', 'EDUCA', 'INCOME2']

#Create a mask to filter the data
mask = np.isin(features_name, features_list)

x_train_featured = x_train[:, mask]
x_test_featured = x_test[:, mask]

In [6]:
#remove all missing values on X and remove corresponding lines in Y and ids

x_train_featured_clean = x_train_featured[~np.isnan(x_train_featured).any(axis=1)]
#x_test_featured_clean = x_test_featured[~np.isnan(x_test_featured).any(axis=1)]

y_train_clean = y_train[~np.isnan(x_train_featured).any(axis=1)]

train_ids_filtered = train_ids[~np.isnan(x_train_featured).any(axis=1)]
#test_ids_filtered = test_ids[~np.isnan(x_test_featured).any(axis=1)]

print(x_train_featured_clean.shape, y_train_clean.shape, x_test_featured.shape, train_ids_filtered.shape)


(257733, 21) (257733,) (109379, 21) (257733,)


### We want to clean the data for each feature, making them binary for yes/no, etc... and rename them

In [7]:
#train data

x_train_processed, y_train_processed, ids_train_processed = d.feature_processing(x_train_featured_clean, y_train_clean, train_ids_filtered)

In [8]:
#Test data
x_test_processed = d.feature_processing_test(x_test_featured)

### Now that the preprocessing has been done, we can format the data to be used by the algorithms

In [9]:
tX_train = np.c_[np.ones((len(y_train_processed), 1)), x_train_processed]
tX_test = np.c_[np.ones((len(x_test_featured), 1)), x_test_featured]

## And then, we can run the algorithms

1. MSE gradient descent

In [10]:
#Compute gradient descent with MSE as loss function (see functions.py for the function)

initial_w = [random.choice([1, -1]) for i in range(len(tX_train[0]))]

w_mse_gd, loss_mse_gd = f.mean_squared_error_gd(y_train_processed, tX_train, initial_w, 500, 0.01)

Gradient Descent(499/500): loss=0.36610420165584884


In [11]:
#Test the model on the test sample. Do we need to standardize ?

y_test = tX_test.dot(w_mse_gd)
y_test_rounded = np.where(y_test > 0, 1, -1) #not sure about this line

print('weights = \n\n', w_mse_gd,'\n\n Loss = ', loss_mse_gd,'\n\n*****************************************************************************',
      ' \n\n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded == 1)/len(y_test_rounded))

weights = 

 [ 0.9983649  -0.16200943  0.00430593  0.001176    0.79282177  0.74400033
  0.31161574  0.76798885 -0.26295255 -0.30374897 -0.12552476 -0.02685832
 -0.7805071  -0.36720938 -0.54901789 -0.82418507 -0.01715525 -1.00861971
  0.79307391  0.21966954  0.38087663 -0.56989337] 

 Loss =  0.36610420165584884 

*****************************************************************************  

 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.04727598533539345


2. MSE SGD

In [16]:
w_mse_sgd, loss_mse_sgd = f.mean_squared_error_sgd(y_train_processed, tX_train, initial_w, 500, 0.0001)

SGD iter. 499/499: loss=1.9697986341411864, w0=1.1354000674791846, w1=-0.6413476775707898


In [17]:
y_test_sgd = tX_test.dot(w_mse_sgd)
y_test_rounded_sgd = np.where(y_test_sgd > 0, 1, -1)

print('weights = \n', w_mse_sgd,'\n Loss = ', loss_mse_sgd,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded_sgd == 1)/len(y_test_rounded_sgd))

weights = 
 [ 1.13540007 -0.64134768 -0.05683514  0.21212907  1.00469463  1.12510268
  1.06869715  1.12562852 -0.74617025 -0.91480522 -0.28146503 -0.06965598
 -0.88484654 -0.89438242 -0.9100599  -0.87255593  0.2335261  -0.96389908
  1.13267781  1.02203206  1.00867986 -0.96641201] 
 Loss =  1.9697986341411864 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.13274943087795646


3. Least squares

In [18]:
w_ls, loss_ls = f.least_squares(y_train_processed, tX_train)

In [19]:

y_test_ls = tX_test.dot(w_ls)
y_test_ls = np.where(y_test_ls > 0, 1, -1)

print('weights = \n', w_ls,'\n Loss = ', loss_ls,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ls == 1)/len(y_test_ls))

weights = 
 [-4.23458311e-01  6.69229048e-02  1.84255656e-03 -2.88677208e-04
 -1.72692968e-02 -1.45978253e-02 -7.62584533e-02 -3.73059100e-01
 -4.33297730e-02 -1.06050366e-01  1.03039949e-03 -6.91952059e-03
 -9.97705263e-02 -4.55439527e-02 -6.73076101e-02  2.69260672e-02
  2.23087967e-02 -2.36533255e-01  3.86932707e-02 -6.06623971e-03
 -8.09010894e-03 -9.17165058e-03] 
 Loss =  0.14512298742909366 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0014993737371890399


4. Ridge regression

In [31]:
w_ridge, loss_ridge = f.ridge_regression(y_train_processed, tX_train, 0.1)

In [32]:
y_test_ridge = tX_test.dot(w_ridge)
y_test_ridge = np.where(y_test_ridge > 0, 1, -1)

print('weights = \n', w_ridge,'\n Loss = ', loss_ridge,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ridge == 1)/len(y_test_ridge))

weights = 
 [-0.05446848  0.01909874  0.00402915 -0.00136054 -0.01866968 -0.03347632
 -0.07015045 -0.10700854 -0.06969889 -0.07622882 -0.04981145 -0.02171525
 -0.07341898 -0.04008595 -0.06586933 -0.03631186  0.01438435 -0.01918326
 -0.03391057 -0.01702109 -0.01232494 -0.01216396] 
 Loss =  0.149955122105239 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  1.8285045575476097e-05


5. Logistic regression

In [37]:
y_train_processed_logreg = np.where(y_train_processed == 1, 1, 0)

In [59]:
w_logreg, loss_logreg = f.logistic_regression(y_train_processed_logreg, tX_train,np.zeros(22),200, 0.1)

Gradient Descent(0/199): loss=0.7334222943120227, w0=-0.040599017831350614, w1=-0.09388219228446126
Gradient Descent(1/199): loss=0.5039173063536879, w0=-0.03159831712146977, w1=-0.06308501345033278
Gradient Descent(2/199): loss=0.32940776728793036, w0=-0.023563175321223904, w1=-0.034601898395042616
Gradient Descent(3/199): loss=0.32452908449651496, w0=-0.0202363658371515, w1=-0.019933155058361823
Gradient Descent(4/199): loss=0.34103798151574394, w0=-0.024910102653780547, w1=-0.031202820035394305
Gradient Descent(5/199): loss=0.3164223870335607, w0=-0.02091128744753009, w1=-0.013252799527350114
Gradient Descent(6/199): loss=0.3339948863536741, w0=-0.026879583586654387, w1=-0.02700788358035513
Gradient Descent(7/199): loss=0.3113204183989352, w0=-0.02311165613296598, w1=-0.009735638290876935
Gradient Descent(8/199): loss=0.3283683288358732, w0=-0.029124308305221572, w1=-0.023647992108958425
Gradient Descent(9/199): loss=0.3078863184776599, w0=-0.025412727385532043, w1=-0.00668021173056

In [60]:
y_test_logreg = tX_test.dot(w_logreg)
y_test_logreg = np.where(y_test_logreg > 0.5, 1, 0)

print('weights = \n', w_logreg,'\n Loss = ', loss_logreg,'\n*****************************************************************************',
        ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_logreg == 1)/len(y_test_logreg))

weights = 
 [-0.14176837  0.15730149  0.03346908  0.00515173 -0.03886945 -0.11844364
 -0.31634258 -0.27480137 -0.33766603 -0.31785316 -0.29703101 -0.10415193
 -0.22435663 -0.20535771 -0.30740196 -0.10683725  0.17908248 -0.03747371
 -0.11498309 -0.04570317 -0.03555813 -0.0252816 ] 
 Loss =  0.26745057030222974 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.009160807833313524
