In [7]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)
source('tests.R')
source('cleanup.R')

# Load the raw diabetes health-indicator data and preview it.
diabetes_data <- read_csv("group project/diabetes_012_health_indicators_BRFSS2015.csv")
diabetes_data

# Keep only the response (Diabetes_012) and the two predictors used below
# (BMI, Age). The response is converted to a factor so that it is treated as
# a categorical outcome by the classification model.
diabetes_data <- diabetes_data |>
  mutate(Diabetes_012 = as_factor(Diabetes_012)) |>
  select(Diabetes_012, BMI, Age)

# Fix the RNG seed so the random train/test split is reproducible.
set.seed(1)

# 75% training / 25% testing. The split is stratified on the response column
# Diabetes_012: it is the only label column left after select(), so it is the
# column that strata must name.
diabetes_split <- initial_split(diabetes_data, prop = 0.75, strata = Diabetes_012)
diabetes_train <- training(diabetes_split)
diabetes_test <- testing(diabetes_split)


# Preprocessing: the response is Diabetes_012 and every other column of the
# training data is a predictor; predictors are scaled, then centered.
diabetes_recipe <- recipe(Diabetes_012 ~ ., data = diabetes_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model: K-nearest-neighbours classifier with K = 3 and unweighted
# ("rectangular") votes, using the kknn engine.
knn_spec <- set_mode(
  set_engine(
    nearest_neighbor(weight_func = "rectangular", neighbors = 3),
    "kknn"
  ),
  "classification"
)

# Bundle recipe + model into a workflow and fit it on the training set.
knn_fit <- workflow() |>
  add_recipe(diabetes_recipe) |>
  add_model(knn_spec) |>
  fit(data = diabetes_train)

# Predict on the held-out test set and attach the predictions (.pred_class)
# to the test rows.
diabetes_test_predictions <- bind_cols(
  predict(knn_fit, diabetes_test),
  diabetes_test
)

# Test-set accuracy: compare predicted class against the true label.
filter(
  metrics(diabetes_test_predictions, truth = Diabetes_012, estimate = .pred_class),
  .metric == "accuracy"
)

diabetes_test_predictions

# Cross-validation ----
# Split the training data into 5 folds. The folds are stratified on the
# response column Diabetes_012, which is the label column present in
# diabetes_train.
diabetes_vfold <- vfold_cv(diabetes_train, v = 5, strata = Diabetes_012)

# Refit the recipe + KNN model on each resample of the folds.
diabetes_resample_fit <- workflow() |>
  add_recipe(diabetes_recipe) |>
  add_model(knn_spec) |>
  fit_resamples(resamples = diabetes_vfold)

diabetes_resample_fit

# Aggregate the per-fold metrics (includes accuracy) across the folds.
diabetes_metrics <- collect_metrics(diabetes_resample_fit)

# Choosing K ----
# Same KNN specification as before, but the number of neighbours is left as a
# tuning parameter instead of a fixed value.
knn_tune <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")
knn_tune

# Candidate values of K to evaluate: 1 through 10 in steps of 1.
# NOTE(review): the original author was unsure about this grid and considered
# just using 10 - confirm the intended range.
kvals <- tibble(neighbors = seq(1, 10, 1))

# Evaluate every candidate K on the cross-validation folds and collect the
# aggregated metrics.
knn_results <- collect_metrics(
  tune_grid(
    add_model(add_recipe(workflow(), diabetes_recipe), knn_tune),
    resamples = diabetes_vfold,
    grid = kvals
  )
)

knn_results

# Keep only the accuracy rows.
accuracies <- filter(knn_results, .metric == "accuracy")
accuracies

# Plot the estimated accuracy against the number of neighbours.
accuracy_vs_k <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12))
accuracy_vs_k
               



“cannot open file 'tests.R': No such file or directory”


ERROR: Error in file(filename, "r", encoding = encoding): cannot open the connection
