In [None]:
install.packages("kknn")
install.packages("gridExtra")
library(tidyverse)
library(tidymodels)
library(gridExtra)
library(dplyr)
library(kknn)
library(class)

# Fix the random-number seed so that steps relying on random sampling
# (e.g. the cross-validation fold assignment made later with vfold_cv())
# give the same result on every run.
set.seed(5)

In [None]:
# Disabled: load the single combined file instead.
# water <- read_csv("train.csv")

# Load the training and test sets. Each one gains a numeric `urban_bi`
# column derived from `urban`: "R" -> 0, "U" -> 1 (anything else -> NA,
# because case_when() has no fallback branch here).
water_train <- read_csv("water_train_new.csv") |>
  mutate(
    urban_bi = case_when(
      urban == "R" ~ 0,
      urban == "U" ~ 1
    )
  )

water_test <- read_csv("water_test_new.csv") |>
  mutate(
    urban_bi = case_when(
      urban == "R" ~ 0,
      urban == "U" ~ 1
    )
  )

In [None]:
# water_split <- initial_split(water, prop = 0.75, strata = water_index)
# water_train <- training(water_split)
# water_test <- testing(water_split)

In [None]:
# nrow(water_train)
# nrow(water_test)
# nrow(water)

In [None]:
# Preprocessing: model water_index from asset_index only; the predictor is
# scaled first and then centered.
water_recipe <- recipe(water_index ~ asset_index, data = water_train)
water_recipe <- step_scale(water_recipe, all_predictors())
water_recipe <- step_center(water_recipe, all_predictors())

# KNN regression via the kknn engine with the "rectangular" weight function;
# the number of neighbours is left open for tuning.
water_spec <- set_mode(
  set_engine(
    nearest_neighbor(neighbors = tune(), weight_func = "rectangular"),
    "kknn"
  ),
  "regression"
)

# Five cross-validation folds of the training data, stratified on the outcome.
water_vfold <- vfold_cv(water_train, v = 5, strata = water_index)

# Bundle preprocessing and model into one workflow.
water_wkflw <- add_model(
  add_recipe(workflow(), water_recipe),
  water_spec
)

water_wkflw

In [None]:
# Candidate neighbour counts: 1, 4, 7, ..., 199.
gridvals <- tibble(neighbors = seq(1, 200, by = 3))

# Evaluate every candidate on the cross-validation folds, then keep only the
# RMSE rows of the collected metrics.
water_tuned <- tune_grid(water_wkflw, resamples = water_vfold, grid = gridvals)
water_results <- filter(collect_metrics(water_tuned), .metric == "rmse")

# Display the per-K RMSE table.
water_results

In [None]:
# Keep exactly one row: the neighbour count with the lowest mean
# cross-validated RMSPE.
# slice_min(..., with_ties = FALSE) returns a single row even when several
# values of K share the minimum (the first such row is kept), so the K later
# pulled from `water_min` is always a scalar. The previous
# `filter(mean == min(mean))` could return several rows on a tie, and zero
# rows if any mean was NA.
water_min <- water_results |>
  slice_min(mean, n = 1, with_ties = FALSE)

water_min

In [None]:
# The smallest RMSPE occurs when K = 145.

In [None]:
# Refit on the full training set (water_train) with the selected K, then
# measure RMSE on water_test.

kmin <- water_min$neighbors

# Same model as during tuning, but with the neighbour count fixed at kmin.
water_spec <- nearest_neighbor(
  neighbors = kmin,
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("regression")

final_wkflw <- workflow() |>
  add_recipe(water_recipe) |>
  add_model(water_spec)

water_fit <- fit(final_wkflw, data = water_train)

# Attach predictions to the test rows and keep only the RMSE metric.
test_preds <- bind_cols(predict(water_fit, water_test), water_test)

water_summary <- test_preds |>
  metrics(truth = water_index, estimate = .pred) |>
  filter(.metric == "rmse")

water_summary

In [None]:
# Evenly spaced asset_index values from -3 to 3 (step 0.01) on which to
# evaluate the fitted model.
asset_grid <- tibble(asset_index = seq(-3, 3, by = 0.01))

# Predicted water_index (.pred) alongside the grid values.
water_preds <- bind_cols(predict(water_fit, asset_grid), asset_grid)

water_preds

# Training points with the fitted KNN curve drawn on top.
plot_final <- ggplot(water_train, aes(x = asset_index, y = water_index)) +
  geom_point(alpha = 0.4) +
  geom_line(
    data = water_preds,
    mapping = aes(x = asset_index, y = .pred),
    color = "blue"
  ) +
  labs(
    x = "Asset Index",
    y = "Water Index",
    title = paste0("K = ", kmin)
  ) +
  theme(text = element_text(size = 12))

plot_final

In [None]:
# Predict water_index for the rows of test_masked_new.csv
# (presumably the file without observed water_index values -- confirm).
#
# Loaded with read_csv(), like the training and test sets, so columns are
# parsed the same way as the data the model was fitted on.
test_masked_new <- read_csv("test_masked_new.csv") |>
  mutate(
    urban_bi = case_when(
      urban == "R" ~ 0,
      urban == "U" ~ 1
    )
  )

# Reuse `water_fit`, the workflow already fitted on water_train with
# K = kmin. It applies the same scaling/centering recipe and uses asset_index
# as the only predictor, so no manual feature matrix is needed.
#
# The previous hand-rolled knn.reg() call could not run: it indexed columns
# with bare names inside `[`, called a function that none of the loaded
# packages provide, and read a `$water_index` field that the result does not
# have.
Y_pred <- water_fit |>
  predict(test_masked_new) |>
  pull(.pred)

# Original columns plus the predictions in a `Y_pred` column.
test_results <- bind_cols(test_masked_new, Y_pred = Y_pred)
test_results