# Predicting the Likelihood of Heart Disease within a Patient

## Introduction
[REPLACE]

## Methods & Results


In [5]:
#loading all packages
library(tidymodels)
library(tidyverse)
library(repr)
library(rvest)
library(readxl)
library(RColorBrewer)
library(cowplot)
     

In [6]:
#read csv file from UCI
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
heart_data <- read_csv(url, col_names = FALSE)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [7]:
# Cleaning and wrangling data
# Added meaningful column names. 
# We changed the orignal attribute name "num" to "Heart_Disease" because "num" had little meaning

set.seed(1000)

heart_data <- rename(heart_data,
                     Age = X1,
                     Sex = X2,
                     Chest_Pain_Type = X3,
                     Resting_Blood_Pressure = X4,
                     Serum_Cholestoral = X5,
                     Fasting_Blood_Sugar = X6,
                     Resting_Electrocardiographic_Results = X7,
                     Maximum_Heart_Rate = X8,
                     Exercise_Induced_Angina = X9,
                     ST_Depression = X10,
                     Slope_Peak_excercise = X11,
                     Major_Vessels = X12,
                     Thalassemia = X13,
                     Heart_Disease = X14)



We are predicting if the patient has heart disease so we want to change it into a factor.

In [8]:
heart_data$Heart_Disease <- as.factor(heart_data$Heart_Disease)

We only want to know if each patient is tested positive or negative for heart disease. This means we only need the numbers 0 (negative) and 1 (postive) and want to remove other numbers. We reassigned the numbers 2, 3, and 4 to 1 because numbers that are greater 1 also mean that the patient has heart disease.

In [9]:
heart_data$Heart_Disease[heart_data$Heart_Disease== "4"]<- "1"
heart_data$Heart_Disease[heart_data$Heart_Disease== "3"]<- "1"
heart_data$Heart_Disease[heart_data$Heart_Disease== "2"]<- "1"


Here we summarized the data in one table. [ADD MORE EXPLAINATION]

In [10]:

summary_table <- heart_data |> 
                   group_by(Heart_Disease) |>
                   summarize(number_patients = n(),
                           mean_age = mean(Age, na.rm = TRUE),   
                           median_age = median(Age, na.rm = TRUE),
                           mean_resting_blood_pressure = mean(Resting_Blood_Pressure, na.rm = TRUE),
                           median_resting_blood_pressure = median(Resting_Blood_Pressure, na.rm = TRUE),  
                           mean_max_heart_rate = mean(Maximum_Heart_Rate, na.rm = TRUE),
                           median_max_heart_rate = median(Maximum_Heart_Rate, na.rm = TRUE),
                           number_rows_missing_data = sum(heart_data=="?"))
                           #total_entries = sum(number_patients)) #how to get 303?
summary_table


Heart_Disease,number_patients,mean_age,median_age,mean_resting_blood_pressure,median_resting_blood_pressure,mean_max_heart_rate,median_max_heart_rate,number_rows_missing_data
<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
0,164,52.58537,52,129.25,130,158.378,161,6
1,139,56.6259,58,134.5683,130,139.259,142,6


In [19]:
set.seed(1000)

heart_cut <- heart_data |> select(Age, Sex, Heart_Disease, Maximum_Heart_Rate)
heart_split <- initial_split(heart_cut, prop = 0.75, strata = Heart_Disease) 
heart_train <- training(heart_split)   
heart_test <- testing(heart_split)


#### Table Legend
Table 1 <br>
Table 2 <br>
Table 3 <br>

#### Figure Legend
Figure 1 <br>
Figure 2 <br>
Figure 3 <br>

#### Table 1: Training Data

In [16]:
head(heart_train)

Age,Sex,Heart_Disease,Maximum_Heart_Rate
<dbl>,<dbl>,<fct>,<dbl>
63,1,0,150
37,1,0,187
41,0,0,172
57,0,0,163
57,1,0,148
44,1,0,173


#### Table 2: Testing Data

In [17]:
head(heart_test)

Age,Sex,Heart_Disease,Maximum_Heart_Rate
<dbl>,<dbl>,<fct>,<dbl>
67,1,1,108
56,1,0,178
63,1,1,147
56,0,0,153
57,1,0,174
54,1,0,160


#### Table 3: KNN Calculations

In [20]:

set.seed(200)

heart_initial_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")


heart_recipe <- recipe(Heart_Disease ~ Age + Sex + Maximum_Heart_Rate, data = heart_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

heart_vfold <- vfold_cv(heart_train, v = 5, strata = Heart_Disease)

k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

heart_results <- workflow() |>
       add_recipe(heart_recipe) |>
       add_model(heart_initial_spec) |>
       tune_grid(resamples = heart_vfold, grid = k_vals) |>
       collect_metrics()

set.seed(200)
accuracies <- heart_results |>
  filter(.metric == "accuracy") |> 
    arrange(desc(mean)) |>
    slice(1)

accuracies

final_k <- accuracies |> select(n)

final_k
     


[33m![39m [33mFold1: internal:
  No observations were detected in `truth` for level(s): '2', '3', '4'
  Computation will proceed by ignoring those levels.[39m

[33m![39m [33mFold2: internal:
  No observations were detected in `truth` for level(s): '2', '3', '4'
  Computation will proceed by ignoring those levels.[39m

[33m![39m [33mFold3: internal:
  No observations were detected in `truth` for level(s): '2', '3', '4'
  Computation will proceed by ignoring those levels.[39m

[33m![39m [33mFold4: internal:
  No observations were detected in `truth` for level(s): '2', '3', '4'
  Computation will proceed by ignoring those levels.[39m

[33m![39m [33mFold5: internal:
  No observations were detected in `truth` for level(s): '2', '3', '4'
  Computation will proceed by ignoring those levels.[39m



neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
26,accuracy,multiclass,0.7449846,5,0.01451897,Preprocessor1_Model06


n
<int>
5


In [None]:
set.seed(200)
heart_final_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = final_k) |>
  set_engine("kknn") |>
  set_mode("classification")

heart_final_fit <- workflow() |>
       add_recipe(heart_recipe) |>
       add_model(heart_final_spec) |>
       fit(data = heart_train)

heart_testing <- predict(heart_final_fit, heart_test) |> bind_cols(heart_test)

predictions <- heart_testing |> conf_mat(truth = Heart_Disease, estimate = .pred_class)

predictions

prediction_accuracy <- heart_testing |> metrics(truth = Heart_Disease, estimate = .pred_class)

prediction_accuracy

## Discussion




#### Conclusion & Future Areas of Investigation


## References
