In [2]:
import numpy as np
import matplotlib.pyplot as plt
import random
try:
    import importlib
    importlib.reload(h)
    importlib.reload(f)
    importlib.reload(d)
except NameError: # It hasn't been imported yet
    import helpers as h
    import functions as f
    import data_processing as d

# Data preprocessing and feature selections

## Data preprocessing

In [4]:
#For this to work, the data folder needs to be one level above the project folder and the folder name needs
#to be 'data'
data_folder = '../data/'
x_train, x_test, y_train, train_ids, test_ids = h.load_csv_data(data_folder)


In [15]:
test_ids

array([328135, 328136, 328137, ..., 437511, 437512, 437513])

In [5]:
#load all the features names and remove the ID column
features_name = np.genfromtxt('../data/x_train.csv', delimiter=',', dtype=str, max_rows=1)[1:] 

## Feature selection

one paper on internet suggests to use these features : 

 _RFHYPE5, TOLDHI2, _CHOLCHK, _BMI5, SMOKE100, CVDSTRK3, DIABETE3, _TOTINDA, _FRTLT1, _VEGLT1, _RFDRHV5, HLTHPLN1, MEDCOST, GENHLTH, MENTHLTH, PHYSHLTH, DIFFWALK, SEX, _AGEG5YR, EDUCA, and INCOME2

 then, iterating through them, it removes the missing values, made the data binary when possible, removed the 'don't know, not sure', and ordinal (categorical) variables ares changed to 0,1,2,..., and renamed them

# Main

In [24]:
#Select the important features
features_list = ['_RFHYPE5', 'TOLDHI2', '_CHOLCHK', '_BMI5', 'SMOKE100', 'CVDSTRK3', 'DIABETE3', '_TOTINDA', '_FRTLT1', '_VEGLT1', '_RFDRHV5', 
                 'HLTHPLN1', 'MEDCOST', 'GENHLTH', 'MENTHLTH', 'PHYSHLTH', 'DIFFWALK', 'SEX', '_AGEG5YR', 'EDUCA', 'INCOME2']

#Create a mask to filter the data
mask = np.isin(features_name, features_list)

x_featured = x_train[:, mask]
x_test_featured = x_test[:, mask]

In [25]:
#remove all missing values on X and remove corresponding lines in Y and ids

x_train_featured_clean = x_featured[~np.isnan(x_featured).any(axis=1)]
#x_test_featured_clean = x_test_featured[~np.isnan(x_test_featured).any(axis=1)]

y_train_clean = y_train[~np.isnan(x_featured).any(axis=1)]

train_ids_filtered = train_ids[~np.isnan(x_featured).any(axis=1)]
#test_ids_filtered = test_ids[~np.isnan(x_test_featured).any(axis=1)]

print(x_train_featured_clean.shape, y_train_clean.shape, x_test_featured.shape, train_ids_filtered.shape)


(257733, 21) (257733,) (109379, 21) (257733,)


### We want to clean the data for each feature, making them binary for yes/no, etc... and rename them

In [26]:
#train data

x_train_processed, y_train_processed, ids_train_processed = d.feature_processing(x_train_featured_clean, y_train_clean, train_ids_filtered)

In [9]:
#Test data
#x_test_processed, y_test_processed, ids_test_processed = d.feature_processing(x_test_featured_clean,np.ones(len(x_test_featured_clean)), test_ids_filtered)

### Now that the preprocessing has been done, we can format the data to be used by the algorithms

In [29]:
tX_train = np.c_[np.ones((len(y_train_processed), 1)), x_train_processed]
tX_test = np.c_[np.ones((len(x_test), 1)), x_test_featured]

In [30]:
tX_test.shape

(109379, 22)

## And then, we can run the algorithms

1. MSE gradient descent

In [41]:
#Compute gradient descent with MSE as loss function (see functions.py for the function)

initial_w = [random.choice([1, -1]) for i in range(len(tX_train[0]))]

loss_mse_gd, w_mse_gd = f.mean_squared_error_gd(y_train_processed, tX_train, initial_w, 200, 0.01)

Gradient Descent(199/199): loss=0.6564220127901458


In [42]:
#Test the model on the test sample. Do we need to standardize ?

y_test = tX_test.dot(w_mse_gd)
y_test_rounded = np.where(y_test > 0, 1, -1) #not sure about this line

print('weights = \n\n', w_mse_gd,'\n\n Loss = ', loss_mse_gd,'\n\n*****************************************************************************',
      ' \n\n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded == 1)/len(y_test_rounded))

weights = 

 [ 0.96909138 -0.16080975  0.00532468 -0.01263067  0.87080396 -0.83911177
 -0.8652867   0.84927538  0.2189146   0.53502819  0.15335661 -0.06288844
 -0.97434237 -0.70586766  0.50893218 -0.92353659  0.04739167  1.00946872
 -0.8902922   0.74136761 -0.73924173 -0.69446739] 

 Loss =  0.6564220127901458 

*****************************************************************************  

 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.7841724645498679


In [36]:
y_test_rounded.shape

(109379,)

In [35]:
h.create_csv_submission(test_ids, y_test_rounded, 'submission.csv')

2. MSE SGD

In [43]:
loss_mse_sgd, w_mse_sgd = f.mean_squared_error_sgd(y_train_processed, tX_train, initial_w, 300, 0.001)

SGD iter. 0/299: loss=16.569797445000003, w0=1.0057567, w1=-0.9884866
SGD iter. 1/299: loss=127.66835690929922, w0=1.0217359588632453, w1=-0.9565280822735097
SGD iter. 2/299: loss=16.403821803904922, w0=1.027463754565591, w1=-0.9393446951664727
SGD iter. 3/299: loss=6.016517171418726, w0=1.0309326210004421, w1=-0.928938095861919
SGD iter. 4/299: loss=44.83882847741122, w0=1.040402449771588, w1=-0.9005286095484814
SGD iter. 5/299: loss=163.4850281715464, w0=1.0584847631286531, w1=-0.8281993561202212
SGD iter. 6/299: loss=36.84751964074203, w0=1.0670693446141704, w1=-0.7938610301781526
SGD iter. 7/299: loss=0.16484806229861784, w0=1.0676435363283192, w1=-0.7927126467498548
SGD iter. 8/299: loss=131.14347469574156, w0=1.0514482608014415, w1=-0.8412984733304878
SGD iter. 9/299: loss=0.5675677455569745, w0=1.0503828334265755, w1=-0.8423639007053538
SGD iter. 10/299: loss=8.840032541922145, w0=1.04617806647895, w1=-0.850773434600605
SGD iter. 11/299: loss=2.5125056441463913, w0=1.04841972017

In [44]:
y_test_sgd = tX_test.dot(w_mse_sgd)
y_test_rounded_sgd = np.where(y_test_sgd > 0, 1, -1)

print('weights = \n', w_mse_sgd,'\n Loss = ', loss_mse_sgd,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded_sgd == 1)/len(y_test_rounded_sgd))

weights = 
 [ 1.02002222 -0.69625629 -0.04484852  0.04037839  0.97347682 -0.93044266
 -0.97467541  0.99463631  0.89890713  0.95639175  0.88569948 -0.61152395
 -1.00416119 -0.93668369  0.86975533 -0.96138052 -0.03191353  0.99446869
 -0.96492499  0.99373941 -0.95773467 -0.94086369] 
 Loss =  0.021615558953509077 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.6719297122848079


3. Least squares

In [91]:
loss_ls, w_ls = f.least_squares(y_train_processed, tX_train)

In [92]:
y_test_ls = tX_test.dot(w_ls)
y_test_ls = np.where(y_test_ls > 0, 1, -1)

print('weights = \n', w_ls,'\n Loss = ', loss_ls,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ls == 1)/len(y_test_ls))

weights = 
 [-4.23458311e-01  6.69229048e-02  1.84255656e-03 -2.88677208e-04
 -1.72692968e-02 -1.45978253e-02 -7.62584533e-02 -3.73059100e-01
 -4.33297730e-02 -1.06050366e-01  1.03039949e-03 -6.91952059e-03
 -9.97705263e-02 -4.55439527e-02 -6.73076101e-02  2.69260672e-02
  2.23087967e-02 -2.36533255e-01  3.86932707e-02 -6.06623971e-03
 -8.09010894e-03 -9.17165058e-03] 
 Loss =  0.14512298742909363 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0036185509994469465


4. Ridge regression

In [93]:
loss_ridge, w_ridge = f.ridge_regression(y_train_processed, tX_train, 0.1)

In [79]:
y_test_ridge = tX_test.dot(w_ridge)
y_test_ridge = np.where(y_test_ridge > 0.5, 1, 0)

print('weights = \n', w_ridge,'\n Loss = ', loss_ridge,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ridge == 1)/len(y_test_ridge))

weights = 
 [-0.05446848  0.01909874  0.00402915 -0.00136054 -0.01866968 -0.03347632
 -0.07015045 -0.10700854 -0.06969889 -0.07622882 -0.04981145 -0.02171525
 -0.07341898 -0.04008595 -0.06586933 -0.03631186  0.01438435 -0.01918326
 -0.03391057 -0.01702109 -0.01232494 -0.01216396] 
 Loss =  0.149955122105239 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0


5. ridge & least square polynomial regression

In [102]:
loss_log, w_log = f.logistic_regression(y_train_processed, tX_train, 100, 0.000001, initial_w)

y_test_log = tX_test.dot(w_log)
print(y_test_log)
y_test_log = np.where(y_test_log > 0, 1, -1)

print('weights = \n', w_log,'\n Loss = ', loss_log,'\n*****************************************************************************',
        ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_log == 1)/len(y_test_log))

Gradient Descent(0/99): loss=-1903764.6888709585, w0=0.8285066393687142, w1=0.5788726652137013
Gradient Descent(1/99): loss=-6393559.696314749, w0=0.6739094924049219, w1=0.22137822314805816
Gradient Descent(2/99): loss=-10119382.338791667, w0=0.5193124923635676, w1=-0.13611577696355825
Gradient Descent(3/99): loss=-13845203.169172822, w0=0.3647154923635435, w1=-0.49360977696361913
Gradient Descent(4/99): loss=-17571023.999199484, w0=0.21011849236354352, w1=-0.8511037769636192
Gradient Descent(5/99): loss=-21296844.829225928, w0=0.055521492363543534, w1=-1.2085977769636191
Gradient Descent(6/99): loss=-25022665.65925239, w0=-0.09907550763645645, w1=-1.5660917769636191
Gradient Descent(7/99): loss=-28748486.48927884, w0=-0.25367250763645643, w1=-1.923585776963619
Gradient Descent(8/99): loss=-32474307.3193053, w0=-0.4082695076364564, w1=-2.281079776963619
Gradient Descent(9/99): loss=-36200128.149331756, w0=-0.5628665076364564, w1=-2.638573776963619
Gradient Descent(10/99): loss=-3992594