In [23]:
import numpy as np
import matplotlib.pyplot as plt
import random
try:
    import importlib
    importlib.reload(h)
    importlib.reload(f)
    importlib.reload(d)
except NameError: # It hasn't been imported yet
    import helpers as h
    import implementations as f
    import data_processing as d

In [24]:
#Functions testing

y = np.array([0.1, 0.3, 0.5])
tx = np.array([[2.3, 3.2],
       [1. , 0.1],
       [1.4, 2.3]])
initial_w = np.array([0.5, 1. ])

print(f.mean_squared_error_gd(y, tx, initial_w, 2, 0.1))


Gradient Descent(1/2): loss=0.05153391102516714
(0.05153391102516714, array([-0.0505865,  0.203718 ]))


# Data preprocessing and feature selections

## Data preprocessing

In [2]:
#For this to work, the data folder needs to be one level above the project folder and the folder name needs
#to be 'data'
data_folder = '../data/'
x_train, x_test, y_train, train_ids, test_ids = h.load_csv_data(data_folder)


In [3]:
#load all the features names and remove the ID column
features_name = np.genfromtxt('../data/x_train.csv', delimiter=',', dtype=str, max_rows=1)[1:] 

## Feature selection

one paper on internet suggests to use these features : 

 _RFHYPE5, TOLDHI2, _CHOLCHK, _BMI5, SMOKE100, CVDSTRK3, DIABETE3, _TOTINDA, _FRTLT1, _VEGLT1, _RFDRHV5, HLTHPLN1, MEDCOST, GENHLTH, MENTHLTH, PHYSHLTH, DIFFWALK, SEX, _AGEG5YR, EDUCA, and INCOME2

 then, iterating through them, it removes the missing values, made the data binary when possible, removed the 'don't know, not sure', and ordinal (categorical) variables ares changed to 0,1,2,..., and renamed them

# Main

In [4]:
#Select the important features
features_list = ['_RFHYPE5', 'TOLDHI2', '_CHOLCHK', '_BMI5', 'SMOKE100', 'CVDSTRK3', 'DIABETE3', '_TOTINDA', '_FRTLT1', '_VEGLT1', '_RFDRHV5', 
                 'HLTHPLN1', 'MEDCOST', 'GENHLTH', 'MENTHLTH', 'PHYSHLTH', 'DIFFWALK', 'SEX', '_AGEG5YR', 'EDUCA', 'INCOME2']

#Create a mask to filter the data
mask = np.isin(features_name, features_list)

x_train_featured = x_train[:, mask]
x_test_featured = x_test[:, mask]

In [5]:
#remove all missing values on X and remove corresponding lines in Y and ids

x_train_featured_clean = x_train_featured[~np.isnan(x_train_featured).any(axis=1)]
#x_test_featured_clean = x_test_featured[~np.isnan(x_test_featured).any(axis=1)]

y_train_clean = y_train[~np.isnan(x_train_featured).any(axis=1)]

train_ids_filtered = train_ids[~np.isnan(x_train_featured).any(axis=1)]
#test_ids_filtered = test_ids[~np.isnan(x_test_featured).any(axis=1)]

print(x_train_featured_clean.shape, y_train_clean.shape, x_test_featured.shape, train_ids_filtered.shape)


(257733, 21) (257733,) (109379, 21) (257733,)


### We want to clean the data for each feature, making them binary for yes/no, etc... and rename them

In [6]:
#train data

x_train_processed, y_train_processed, ids_train_processed = d.feature_processing(x_train_featured_clean, y_train_clean, train_ids_filtered)

In [7]:
#Test data
x_test_processed = d.feature_processing_test(x_test_featured)

### Now that the preprocessing has been done, we can format the data to be used by the algorithms

In [8]:
tX_train = np.c_[np.ones((len(y_train_processed), 1)), x_train_processed]
tX_test = np.c_[np.ones((len(x_test_featured), 1)), x_test_featured]

## And then, we can run the algorithms

1. MSE gradient descent

In [20]:
#Compute gradient descent with MSE as loss function (see functions.py for the function)

initial_w = [random.choice([1, -1]) for i in range(len(tX_train[0]))]

loss_mse_gd, w_mse_gd = f.mean_squared_error_gd(y_train_processed, tX_train, initial_w, 500, 0.01)

Gradient Descent(499/499): loss=0.32769387068796324


In [22]:
#Test the model on the test sample. Do we need to standardize ?

y_test = tX_test.dot(w_mse_gd)
y_test_rounded = np.where(y_test > 0, 1, -1) #not sure about this line

print('weights = \n\n', w_mse_gd,'\n\n Loss = ', loss_mse_gd,'\n\n*****************************************************************************',
      ' \n\n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded == 1)/len(y_test_rounded))

weights = 

 [-1.12003736  0.04369493  0.00924979  0.00405113 -0.76925385  0.55948445
  0.34281681 -0.99374111 -0.17802971  0.18945982 -0.10338647 -0.01428739
  0.63213463  0.17630682 -0.48662352  0.69059892 -0.0125298  -1.04848411
  0.62677997 -0.43484478  0.50208397  0.40486598] 

 Loss =  0.32769387068796324 

*****************************************************************************  

 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.03527185291509339


In [30]:
h.create_csv_submission(test_ids, y_test_rounded, 'submission.csv')

2. MSE SGD

In [38]:
loss_mse_sgd, w_mse_sgd = f.mean_squared_error_sgd(y_train_processed, tX_train, initial_w, 100, 0.001)

SGD iter. 99/99: loss=0.8913555580611868, w0=-1.0957512141863186, w1=0.6218335454261331


In [39]:
y_test_sgd = tX_test.dot(w_mse_sgd)
y_test_rounded_sgd = np.where(y_test_sgd > 0, 1, -1)

print('weights = \n', w_mse_sgd,'\n Loss = ', loss_mse_sgd,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded_sgd == 1)/len(y_test_rounded_sgd))

weights = 
 [-1.09575121  0.62183355 -0.01184333 -0.06391322 -1.02483247  0.91982737
  0.9950234  -1.06192009 -1.07145493  0.95977936 -1.28126005  0.47825127
  0.87661805  0.98765948 -1.00156047  0.90492215  0.07786516 -1.02807118
  0.90289457 -1.01769473  0.9575455   0.94399984] 
 Loss =  0.8913555580611868 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.08267583356951518


In [36]:
h.create_csv_submission(test_ids, y_test_rounded_sgd, 'submission_sgd.csv')

3. Least squares

In [40]:
loss_ls, w_ls = f.least_squares(y_train_processed, tX_train)

In [41]:

y_test_ls = tX_test.dot(w_ls)
y_test_ls = np.where(y_test_ls > 0, 1, -1)

print('weights = \n', w_ls,'\n Loss = ', loss_ls,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ls == 1)/len(y_test_ls))

weights = 
 [-4.23458311e-01  6.69229048e-02  1.84255656e-03 -2.88677208e-04
 -1.72692968e-02 -1.45978253e-02 -7.62584533e-02 -3.73059100e-01
 -4.33297730e-02 -1.06050366e-01  1.03039949e-03 -6.91952059e-03
 -9.97705263e-02 -4.55439527e-02 -6.73076101e-02  2.69260672e-02
  2.23087967e-02 -2.36533255e-01  3.86932707e-02 -6.06623971e-03
 -8.09010894e-03 -9.17165058e-03] 
 Loss =  0.14512298742909363 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0014993737371890399


4. Ridge regression

In [53]:
loss_ridge, w_ridge = f.ridge_regression(y_train_processed, tX_train, 0.01)

In [54]:
y_test_ridge = tX_test.dot(w_ridge)
y_test_ridge = np.where(y_test_ridge > 0, 1, -1)

print('weights = \n', w_ridge,'\n Loss = ', loss_ridge,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ridge == 1)/len(y_test_ridge))

weights = 
 [-0.16377828  0.04761936  0.0022347  -0.00075101 -0.03589838 -0.03794182
 -0.08334175 -0.30033357 -0.05525385 -0.10661762 -0.01748007 -0.01083984
 -0.11183888 -0.04632714 -0.07325671 -0.03681777  0.01920381 -0.08080507
 -0.01634787 -0.01536691 -0.01296922 -0.01597534] 
 Loss =  0.14581618262297555 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0007222593002313058


5. Logistic regression

In [100]:
loss_logreg, w_logreg = f.logistic_regression(y_train_processed, tX_train, 100, 1, np.random.randn(len(tX_train[0])))

Gradient Descent(0/99): loss=12.664567912987415, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(1/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(2/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(3/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(4/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(5/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(6/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(7/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(8/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(9/99): loss=17.640293270782664, w0=-0.6112201862116786, w1=-3.161829064379262
Gradient Descent(10/99): loss=17.640293270782664, 

In [101]:
y_test_logreg = tX_test.dot(w_logreg)
y_test_logreg = np.where(y_test_logreg > 0, 1, -1)

print('weights = \n', w_logreg,'\n Loss = ', loss_logreg,'\n*****************************************************************************',
        ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_logreg == 1)/len(y_test_logreg))

weights = 
 [-0.61122019 -3.16182906 -3.76541213 -2.6172506   1.32280547  0.20572008
 -1.30555144  0.01649966 -0.96933361 -0.23708139 -4.47291248 -6.77729425
 -1.58715381 -1.29099027  0.63929557 -1.23620535 -7.10154332 -1.24994876
  3.39785215 -0.736755   -0.82514383  0.06352064] 
 Loss =  17.640293270782664 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0
