In [1]:
heart_subset <- heart_train |> 
  select(age, 
         sex, 
         chest_pain, 
         resting_blood_pressure,
         cholesterol,
         fasting_blood_sugar,
         rest_ecg,
         max_heart_rate,
         exercise_induced_angina,
         ST_depression,
         ST_slope,
         thalassemia, 
         diagnosis)

names <- colnames(heart_subset |> select(-diagnosis))

heart_subset

ERROR: Error in select(heart_train, age, sex, chest_pain, resting_blood_pressure, : could not find function "select"


In [2]:
# create an empty tibble to store the results
accuracies <- tibble(size = integer(), 
                     model_string = character(), 
                     accuracy = numeric())

# create a model specification
knn_spec <- nearest_neighbor(weight_func = "rectangular", 
                             neighbors = tune()) |>
     set_engine("kknn") |>
     set_mode("classification")

# create a 5-fold cross-validation object
heart_vfold <- vfold_cv(heart_train, v = 5, strata = diagnosis)

# store the total number of predictors
n_total <- length(names)

# stores selected predictors
selected <- c()

# for every size from 1 to the total number of predictors
for (i in 1:n_total) {
    # for every predictor still not added yet
    accs <- list()
    models <- list()
    for (j in 1:length(names)) {
        # create a model string for this combination of predictors
        preds_new <- c(selected, names[[j]])
        model_string <- paste("diagnosis", "~", paste(preds_new, collapse="+"))

        # create a recipe from the model string
       heart_recipe<- recipe(as.formula(model_string), 
                                data = heart_train) |>
                          step_scale(all_predictors()) |>
                          step_center(all_predictors())

        # tune the KNN classifier with these predictors, 
        # and collect the accuracy for the best K
        acc <- workflow() |>
          add_recipe(heart_recipe) |>
          add_model(knn_spec) |>
          tune_grid(resamples = heart_vfold, grid = 10) |>
          collect_metrics() |>
          filter(.metric == "accuracy") |>
          summarize(mx = max(mean))
        acc <- acc$mx |> unlist()

        # add this result to the dataframe
        accs[[j]] <- acc
        models[[j]] <- model_string
    }
    jstar <- which.max(unlist(accs))
    accuracies <- accuracies |> 
      add_row(size = i, 
              model_string = models[[jstar]], 
              accuracy = accs[[jstar]])
    selected <- c(selected, names[[jstar]])
    names <- names[-jstar]
}
accuracies

ERROR: Error in tibble(size = integer(), model_string = character(), accuracy = numeric()): could not find function "tibble"


In [None]:
choose_preds <- ggplot(accuracies, aes(x = size, y = accuracy)) +
  geom_point() +
  geom_line() +
  xlab("Number of Predictors") +
  ylab("Estimated Accuracy") +
  scale_x_continuous(breaks = 1:12) + 
  theme(text = element_text(size = 12))

choose_preds

In [None]:
library(GGally)
#testing correlation coefficients in order to narrow down important varibles - I only used ones with numerical values as the 0,1s wouldnt show anything. 
options(repr.plot.width=10, repr.plot.height=10)
heart_select <- heart_train|> 
    select("age", "resting_blood_pressure", "cholesterol", "max_heart_rate","ST_depression") 
heart_ggpairs <- heart_select|>
    ggpairs(mapping = aes(alpha = 0.4)) +
    theme(text = element_text(size=15))
heart_ggpairs

In [3]:
Age is really well correlated with all of them. 
Max heart rate and ST_depression 

- * 5% p-value 
- ** 1% p-value
- *** 0.1% p-value 
    Max heart rate and ST_depression 

ERROR: Error in parse(text = x, srcfile = src): <text>:1:5: unexpected symbol
1: Age is
        ^


In [4]:
#first accuracy test, all predictors 
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
            set_engine("kknn") |>
            set_mode("classification")

recipe_12 <- recipe(diagnosis ~ age + sex + chest_pain + resting_blood_pressure + cholesterol + fasting_blood_sugar + rest_ecg + max_heart_rate + exercise_induced_angina + ST_depression + ST_slope + thalassemia, data = heart_train)

heart_vfold <- vfold_cv(heart_train, v = 5, strata = diagnosis)

fit_12 <- workflow() |>
                    add_recipe(recipe_12) |>
                    add_model(knn_spec) |>
                    fit(data = heart_train)|> 
                    tune_grid(resamples = heart_vfold, grid = 10) |>
                      collect_metrics() |>
                      filter(.metric == "accuracy")|> 
                    summarize(best_accuracy = max(mean))
fit_12

ERROR: Error in set_mode(set_engine(nearest_neighbor(weight_func = "rectangular", : could not find function "set_mode"


In [None]:
#Clean and wrangle your data into a tidy format
heart_data_2 <- na.omit(heart_data)
colnames(heart_data_2) <- c("ID", "age", "sex", "location", "chest_pain", "resting_blood_pressure", "cholesterol", "fasting_blood_sugar", "rest_ecg", "max_heart_rate", "exercise_induced_angina", "ST_depression", "ST_slope", "num_major_vessels", "thalassemia", "diagnosis")

#Datasets of other locations do not have inputs for some variables. So will limit dataframe to Cleveland.
heart_data_3 <- heart_data_2|>
    filter(location == "Cleveland")

# Select relevant columns and convert diagnosis and other specified columns to binary
heart_data_4 <- heart_data_3 |>
  select(-location, -num_major_vessels) |>
  mutate(diagnosis = ifelse(diagnosis > 0, 1, 0)) |>
  mutate(sex = ifelse(sex == "female", 0, 1)) |>
  mutate(chest_pain = as.numeric(ifelse(chest_pain == "typical angina", 0, 1))) |>
  mutate(fasting_blood_sugar = as.numeric(ifelse(fasting_blood_sugar == "lower than 120mg/dl", 0, 1))) |>
  mutate(rest_ecg = as.numeric(ifelse(rest_ecg == "normal", 0, 1))) |>
  mutate(exercise_induced_angina = as.numeric(ifelse(exercise_induced_angina == "no", 0, 1))) |>
  mutate(ST_slope = as.numeric(ifelse(ST_slope == "upsloping", 0, 1))) |>
  mutate(thalassemia = as.numeric(ifelse(thalassemia == "normal", 0, 1)))

heart_data_4 <- heart_data_4|>
    mutate(diagnosis = as_factor(diagnosis))
heart_data_4