## Quation 5 part a

In [241]:
import numpy as np
from sklearn import linear_model

raw_heart_data = np.loadtxt('heart.csv',delimiter=',', dtype=str)

# delete first row which are just column names
column_names = raw_heart_data[0].T
raw_heart_data = np.delete(raw_heart_data, 0, axis=0)

# change from type str to float
raw_heart_data = raw_heart_data.astype(float)

# randomize the data
np.random.shuffle(raw_heart_data)
print(raw_heart_data)
print(np.shape(raw_heart_data))

[[62.  0.  0. ...  3.  2.  0.]
 [52.  1.  0. ...  0.  0.  0.]
 [62.  1.  2. ...  3.  3.  1.]
 ...
 [57.  1.  0. ...  0.  1.  1.]
 [54.  1.  0. ...  1.  3.  0.]
 [40.  1.  0. ...  0.  3.  0.]]
(303, 14)


In [242]:
# separate data

heart_data = raw_heart_data[:,0:13]
heart_labels = raw_heart_data[:,13].T

training_data = heart_data[0:200]
test_data = heart_data[200:303]
training_labels = heart_labels[0:200]
test_labels = heart_labels[200:303]

print(np.shape(training_data))
print(np.shape(test_data))
print(np.shape(training_labels))
print(np.shape(test_labels))

(200, 13)
(103, 13)
(200,)
(103,)


In [243]:
# fit logistic regression

logistic_model = linear_model.LogisticRegression(max_iter=1000).fit(training_data, training_labels)
w = logistic_model.coef_
w

array([[ 0.00742989, -1.1784425 ,  0.87922993, -0.01628068, -0.00322369,
        -0.41193656,  0.35555269,  0.02012489, -0.58441923, -0.52096278,
         0.4736624 , -0.85890855, -0.6033157 ]])

In [244]:
# sort indicies of highest magnitude to lowest
sorted_indicies = np.argsort(np.absolute(w[0]) * -1)

# three most influential feature
print("Three most influential features")
for i in sorted_indicies[0:3]:
    print("Index " + str(i) + " " + str(column_names[i]))

Three most influential features
Index 1 sex
Index 2 cp
Index 11 ca


## Question 5 part b

In [245]:
# get test error
test_accuracy = logistic_model.score(test_data, test_labels)
print("Test Error: " + str(1 - test_accuracy))

Test Error: 0.13592233009708743


Question 5 part c

In [246]:
k = 5
total_error = 0
size = len(training_data) // k
for i in range(5):
    total_error += 1 - logistic_model.score(training_data[(i*size):(i*size+size)], training_labels[(i*size):(i*size+size)])
average_error = total_error / 5
print('5-fold cross-validation error: ' + str(average_error))

5-fold cross-validation error: 0.16999999999999998


The test error and the cross-validation error are both about the same with an error of 0.145

In [264]:
s_feature_names = column_names[1:1+1]
s_feature_names = np.append(s_feature_names, column_names[3])
s_feature_names

array(['sex', 'trestbps'], dtype='<U8')

## Question 6 part a

In [278]:
not_s = training_data
not_s_feature_names = column_names

# find the first feature to be in S
min_error = 1
min_feature = 0
for j in range(len(not_s[0])):
    k = 5
    total_error = 0

    sparse_model = linear_model.LogisticRegression(max_iter=1000).fit(not_s[:,j:j+1], training_labels)

    # estimate error using k-fold cross validation
    size = len(training_data) // k
    for i in range(5):
        total_error += 1 - sparse_model.score(not_s[(i*size):(i*size+size),j:j+1], training_labels[(i*size):(i*size+size)])
    average_error = total_error / 5
    if average_error < min_error:
        min_error = average_error
        min_feature = j

print('Feature selected at k = 1: ' + not_s_feature_names[min_feature])

# move the name of feature selected from the feature not in s to in s
s_feature_names = not_s_feature_names[min_feature:min_feature+1]
not_s_feature_names = np.delete(not_s_feature_names, min_feature)

# move the actual feature from not in s to in s
s = not_s[:, min_feature:min_feature+1]
not_s = np.delete(not_s, min_feature, axis=1)

# find all the next features to be in S
for h in range(len(not_s[0])):
    min_error = 1
    min_feature = 0
    for j in range(len(not_s[0])):
        k = 5
        total_error = 0

        sparse_model = linear_model.LogisticRegression(max_iter=1000).fit(np.hstack((s, not_s[:,j:j+1])), training_labels)

        size = len(training_data) // k
        for i in range(5):
            total_error += 1 - sparse_model.score(np.hstack((s[(i*size):(i*size+size)], not_s[(i*size):(i*size+size),j:j+1])), training_labels[(i*size):(i*size+size)])
        average_error = total_error / 5
        if average_error < min_error:
            min_error = average_error
            min_feature = j

    print('Feature selected at k = ' + str(h+2) + ': ' + not_s_feature_names[min_feature])

    s_feature_names = np.append(s_feature_names, not_s_feature_names[min_feature])
    not_s_feature_names = np.delete(not_s_feature_names, min_feature)
    s = np.hstack((s, not_s[:, min_feature:min_feature+1]))
    not_s = np.delete(not_s, min_feature, axis=1)



Feature selected at k = 1: cp
Feature selected at k = 2: ca
Feature selected at k = 3: thal
Feature selected at k = 4: trestbps
Feature selected at k = 5: ï»¿age
Feature selected at k = 6: chol
Feature selected at k = 7: fbs
Feature selected at k = 8: thalach
Feature selected at k = 9: sex
Feature selected at k = 10: slope
Feature selected at k = 11: oldpeak
Feature selected at k = 12: exang
Feature selected at k = 13: restecg
