In [9]:
import numpy as np
import matplotlib.pyplot as plt
import random
try:
    import importlib
    importlib.reload(h)
    importlib.reload(f)
    importlib.reload(d)
except NameError: # It hasn't been imported yet
    import helpers as h
    import implementations as f
    import data_processing as d

# Data preprocessing and feature selections

## Data preprocessing

#For this to work, the data folder needs to be one level above the project folder and the folder name needs
#to be 'data'
data_folder = '../data/'
x_train, x_test, y_train, train_ids, test_ids = h.load_csv_data(data_folder)


In [10]:
x_train = np.load("../data/x_train.npy")
x_test = np.load("../data/x_test.npy")
y_train = np.load("../data/y_train.npy")
train_ids = np.load("../data/trains_ids.npy")
test_ids = np.load("../data/x_train.npy")

In [11]:
#features_named all the features names and remove the ID column
features_name = np.genfromtxt('../data/x_train.csv', delimiter=',', dtype=str, max_rows=1)[1:] 

In [12]:
features_name

array(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES',
       'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1',
       'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE',
       'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH',
       'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS',
       'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW',
       'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2',
       'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA',
       'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3',
       'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3',
       'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK',
       'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2',
       'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5',
    

## Feature selection

one paper on internet suggests to use these features : 

 _RFHYPE5, TOLDHI2, _CHOLCHK, _BMI5, SMOKE100, CVDSTRK3, DIABETE3, _TOTINDA, _FRTLT1, _VEGLT1, _RFDRHV5, HLTHPLN1, MEDCOST, GENHLTH, MENTHLTH, PHYSHLTH, DIFFWALK, SEX, _AGEG5YR, EDUCA, and INCOME2

 then, iterating through them, it removes the missing values, made the data binary when possible, removed the 'don't know, not sure', and ordinal (categorical) variables ares changed to 0,1,2,..., and renamed them

# Main

In [8]:
#Select the important features
features_list = ['_RFHYPE5', 'TOLDHI2', '_CHOLCHK', '_BMI5', 'SMOKE100', 'CVDSTRK3', 'DIABETE3', '_TOTINDA', '_FRTLT1', '_VEGLT1', '_RFDRHV5', 
                 'HLTHPLN1', 'MEDCOST', 'GENHLTH', 'MENTHLTH', 'PHYSHLTH', 'DIFFWALK', 'SEX', '_AGEG5YR', 'EDUCA', 'INCOME2']

#Create a mask to filter the data
mask = np.isin(features_name, features_list)

x_train_featured = x_train[:, mask]
x_test_featured = x_test[:, mask]

In [6]:
#remove all missing values on X and remove corresponding lines in Y and ids

x_train_featured_clean = x_train_featured[~np.isnan(x_train_featured).any(axis=1)]
#x_test_featured_clean = x_test_featured[~np.isnan(x_test_featured).any(axis=1)]

y_train_clean = y_train[~np.isnan(x_train_featured).any(axis=1)]

train_ids_filtered = train_ids[~np.isnan(x_train_featured).any(axis=1)]
#test_ids_filtered = test_ids[~np.isnan(x_test_featured).any(axis=1)]

print(x_train_featured_clean.shape, y_train_clean.shape, x_test_featured.shape, train_ids_filtered.shape)


(257733, 21) (257733,) (109379, 21) (257733,)


### We want to clean the data for each feature, making them binary for yes/no, etc... and rename them

In [7]:
#train data

x_train_processed, y_train_processed, ids_train_processed = d.feature_processing(x_train_featured_clean, y_train_clean, train_ids_filtered)

In [8]:
#Test data
x_test_processed = d.feature_processing_test(x_test_featured)

### Now that the preprocessing has been done, we can format the data to be used by the algorithms

In [9]:
tX_train = np.c_[np.ones((len(y_train_processed), 1)), x_train_processed]
tX_test = np.c_[np.ones((len(x_test_featured), 1)), x_test_featured]

## And then, we can run the algorithms

1. MSE gradient descent

In [219]:
#Compute gradient descent with MSE as loss function (see functions.py for the function)


tX_train_train = tX_train[:int(len(tX_train)*0.7)]
y_train_train = y_train_processed[:int(len(tX_train)*0.7)]
tX_train_test = tX_train[int(len(tX_train)*0.7):]
y_train_test = y_train_processed[int(len(tX_train)*0.7):]

initial_w = [random.choice([1, -1]) for i in range(len(tX_train_train[0]))]


w_mse_gd, loss_mse_gd = f.mean_squared_error_gd(y_train_train, tX_train_train, np.ones(len(tX_train_train[0])), 800, 0.01)



Gradient Descent(799/800): loss=0.1897731751797139


In [220]:
y_pred = tX_train_test.dot(w_mse_gd)
y_pred = np.where(y_pred > 0, 1, -1)

_,_,_,_,f1 = f.confusion_matrix(y_train_test, y_pred)

print("Accuracy: ", np.sum(y_pred == y_train_test)/len(y_train_test))
print("F1 score: ", f1)


Accuracy:  0.8930303401670198
F1 score:  0.0847813061713601


In [180]:
#Test the model on the test sample. Do we need to standardize ?

y_test = tX_test.dot(w_mse_gd)
y_test_rounded = np.where(y_test > 0, 1, -1) #not sure about this line

print('weights = \n\n', w_mse_gd,'\n\n Loss = ', loss_mse_gd,'\n\n*****************************************************************************',
      ' \n\n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded == 1)/len(y_test_rounded))

weights = 

 [ 0.20016236 -0.12801528  0.00898558 -0.0014339   0.59881605  0.22863561
 -0.10955409  0.00920166 -0.22065606 -0.04104045 -0.1761094  -0.02571184
  0.0342869   0.00815134 -0.075883    0.21763749 -0.00166043  0.73671946
  0.10441582  0.00314219  0.14855273  0.15085362] 

 Loss =  0.20383784711309283 

*****************************************************************************  

 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.012305835672295413


Lets run some cross validation to see the best initial weights (as a function of the proportion of 1, -1 and 0)


2. MSE SGD

In [225]:
w_mse_sgd, loss_mse_sgd = f.mean_squared_error_sgd(y_train_train, tX_train_train, initial_w, 1000, 0.0001)

SGD iter. 9999/9999: loss=1.1143493933198159, w0=-0.9329338588478854, w1=0.5625435719796987


In [223]:
y_pred = tX_train_test.dot(w_mse_sgd)
y_pred = np.where(y_pred > 0, 1, -1)

_,_,_,_,f1 = f.confusion_matrix(y_train_test, y_pred)

print("Accuracy: ", np.sum(y_pred == y_train_test)/len(y_train_test))

print("F1 score: ", f1)

Accuracy:  0.6104973826572594
F1 score:  0.2515139281388777


In [17]:
y_test_sgd = tX_test.dot(w_mse_sgd)
y_test_rounded_sgd = np.where(y_test_sgd > 0, 1, -1)

print('weights = \n', w_mse_sgd,'\n Loss = ', loss_mse_sgd,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_rounded_sgd == 1)/len(y_test_rounded_sgd))

weights = 
 [ 1.13540007 -0.64134768 -0.05683514  0.21212907  1.00469463  1.12510268
  1.06869715  1.12562852 -0.74617025 -0.91480522 -0.28146503 -0.06965598
 -0.88484654 -0.89438242 -0.9100599  -0.87255593  0.2335261  -0.96389908
  1.13267781  1.02203206  1.00867986 -0.96641201] 
 Loss =  1.9697986341411864 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.13274943087795646


3. Least squares

In [18]:
w_ls, loss_ls = f.least_squares(y_train_train, tX_train_train)

In [226]:
y_pred = tX_train_test.dot(w_ls)
y_pred = np.where(y_pred > 0, 1, -1)

_,_,_,_,f1 = f.confusion_matrix(y_train_test, y_pred)

print("Accuracy: ", np.sum(y_pred == y_train_test)/len(y_train_test))

print("F1 score: ", f1)

Accuracy:  0.9067035487315954
F1 score:  0.04651994990159242


In [19]:

y_test_ls = tX_test.dot(w_ls)
y_test_ls = np.where(y_test_ls > 0, 1, -1)

print('weights = \n', w_ls,'\n Loss = ', loss_ls,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ls == 1)/len(y_test_ls))

weights = 
 [-4.23458311e-01  6.69229048e-02  1.84255656e-03 -2.88677208e-04
 -1.72692968e-02 -1.45978253e-02 -7.62584533e-02 -3.73059100e-01
 -4.33297730e-02 -1.06050366e-01  1.03039949e-03 -6.91952059e-03
 -9.97705263e-02 -4.55439527e-02 -6.73076101e-02  2.69260672e-02
  2.23087967e-02 -2.36533255e-01  3.86932707e-02 -6.06623971e-03
 -8.09010894e-03 -9.17165058e-03] 
 Loss =  0.14512298742909366 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.0014993737371890399


4. Ridge regression

In [31]:
w_ridge, loss_ridge = f.ridge_regression(y_train_processed, tX_train, 0.1)

In [32]:
y_test_ridge = tX_test.dot(w_ridge)
y_test_ridge = np.where(y_test_ridge > 0, 1, -1)

print('weights = \n', w_ridge,'\n Loss = ', loss_ridge,'\n*****************************************************************************',
      ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_ridge == 1)/len(y_test_ridge))

weights = 
 [-0.05446848  0.01909874  0.00402915 -0.00136054 -0.01866968 -0.03347632
 -0.07015045 -0.10700854 -0.06969889 -0.07622882 -0.04981145 -0.02171525
 -0.07341898 -0.04008595 -0.06586933 -0.03631186  0.01438435 -0.01918326
 -0.03391057 -0.01702109 -0.01232494 -0.01216396] 
 Loss =  0.149955122105239 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  1.8285045575476097e-05


5. Logistic regression

In [228]:
y_train_processed_logreg = np.where(y_train_processed == 1, 1, 0)
y_train_train_lg = np.where(y_train_train == 1, 1, 0)

In [240]:
w_logreg, loss_logreg = f.logistic_regression(y_train_train_lg, tX_train_train,np.ones(22),500, 0.1)

Gradient Descent(0/499): loss=21.96877831777393, w0=0.9093797833110306, w1=0.7806904468969754
Gradient Descent(1/499): loss=8.516230403823496, w0=0.8187595680658941, w1=0.5613808965387262
Gradient Descent(2/499): loss=1.1112130549467802, w0=0.7286759324390027, w1=0.34323625815741576
Gradient Descent(3/499): loss=1.0287880971182712, w0=0.724688268692458, w1=0.3382666493352598
Gradient Descent(4/499): loss=0.9734401744169507, w0=0.7227414855200985, w1=0.33966505190774937
Gradient Descent(5/499): loss=0.9284300324438038, w0=0.7206397572223124, w1=0.3403953756696024
Gradient Descent(6/499): loss=0.8917895612248119, w0=0.7182297083472068, w1=0.3399488031710274
Gradient Descent(7/499): loss=0.8613608549225421, w0=0.7155614087749902, w1=0.3384931066951417
Gradient Descent(8/499): loss=0.8353535631998257, w0=0.7127087064730241, w1=0.3362884006516644
Gradient Descent(9/499): loss=0.8125198922615304, w0=0.7097237609525731, w1=0.3335244674526573
Gradient Descent(10/499): loss=0.7920565838584024, 

In [241]:
y_pred = tX_train_test.dot(w_logreg)
y_pred = np.where(y_pred > 0.5, 1, 0)
y_pred = np.where(y_pred == 1, 1, -1)

_,_,_,_,f1 = f.confusion_matrix(y_train_test, y_pred)

print("Accuracy: ", np.sum(y_pred == y_train_test)/len(y_train_test))

print("F1 score: ", f1)

Accuracy:  0.9054255151525762
F1 score:  0.008443465491923641


In [85]:
y_test_logreg = tX_test.dot(w_logreg)
y_test_logreg = np.where(y_test_logreg > 0.5, 1, 0)

print('weights = \n', w_logreg,'\n Loss = ', loss_logreg,'\n*****************************************************************************',
        ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_logreg == 1)/len(y_test_logreg))

weights = 
 [-0.14176837  0.15730149  0.03346908  0.00515173 -0.03886945 -0.11844364
 -0.31634258 -0.27480137 -0.33766603 -0.31785316 -0.29703101 -0.10415193
 -0.22435663 -0.20535771 -0.30740196 -0.10683725  0.17908248 -0.03747371
 -0.11498309 -0.04570317 -0.03555813 -0.0252816 ] 
 Loss =  0.26745057030222974 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.009160807833313524


In [116]:
w_reg_logreg, loss_reg_logreg = f.reg_logistic_regression(y_train_processed_logreg, tX_train, 0.01, np.ones(22), 100, 0.1)

Gradient Descent(0/99): loss=21.930727286892388, w0=0.9084009821717713, w1=0.7795541111935378
Gradient Descent(1/99): loss=8.474978621057401, w0=0.8168935647377576, w1=0.5593286709508312
Gradient Descent(2/99): loss=1.112755674914998, w0=0.7260332139209921, w1=0.34053076285489187
Gradient Descent(3/99): loss=1.0292131955241746, w0=0.7214682859988492, w1=0.33565670633552386
Gradient Descent(4/99): loss=0.9720318875039671, w0=0.7188306896184088, w1=0.3367921321756957
Gradient Descent(5/99): loss=0.9256597286828387, w0=0.7160268703935094, w1=0.33721570066806733
Gradient Descent(6/99): loss=0.8879895731118403, w0=0.7129063352623713, w1=0.3364261892619551
Gradient Descent(7/99): loss=0.8567271875426155, w0=0.7095237161169086, w1=0.33460774515773006
Gradient Descent(8/99): loss=0.8299952733430804, w0=0.7059554768504959, w1=0.33203024436617773
Gradient Descent(9/99): loss=0.8065019449064174, w0=0.7022547164454287, w1=0.328886788625222
Gradient Descent(10/99): loss=0.7854250756153601, w0=0.698

In [117]:
y_test_reg_logreg = tX_test.dot(w_reg_logreg)
y_test_reg_logreg = np.where(y_test_reg_logreg > 0.5, 1, 0)

print('weights = \n', w_reg_logreg,'\n Loss = ', loss_reg_logreg,'\n*****************************************************************************',
        ' \n Train sample : \n', 'Heart attack rate = ', np.count_nonzero(y_train_processed == 1)/len(y_train_processed), '\n \n Test sample : \n', 'Heart attack rate = ', np.count_nonzero(y_test_reg_logreg == 1)/len(y_test_reg_logreg))


weights = 
 [ 0.41740406 -0.0347784   0.02820227 -0.02871117  0.76251219  0.49102874
  0.28948992  0.32714718 -0.18646387  0.39548634 -0.86924372 -0.34607578
  0.36652173  0.4430387   0.29726436  0.45858772  0.00361668  0.76280634
  0.43376869  0.55855625  0.64858325  0.66830989] 
 Loss =  0.3882537507258965 
*****************************************************************************  
 Train sample : 
 Heart attack rate =  0.09400982168649387 
 
 Test sample : 
 Heart attack rate =  0.013146947768767313


In [119]:
y_sub = np.where(y_test_reg_logreg == 1, 1, -1)
h.create_csv_submission(test_ids, y_sub, 'submission_reg_logreg2.csv')