In [None]:
library(tidyverse)
library(tidymodels)
library(readxl)
library(dplyr)
library(gridExtra)
# notebook display setting: repr.matrix.max.rows = 6 caps how many rows are
# shown when a table is rendered (assumes the IRkernel/repr front end -- confirm)
options(repr.matrix.max.rows = 6)

In [None]:
# Download the UCI user-modeling workbook and load its training and testing
# sheets into `training_data` and `testing_data`.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls"

# .xls is a binary format: mode = "wb" forces a binary transfer so the file is
# not corrupted by a text-mode download on Windows (no effect elsewhere).
download.file(url, "data.xls", mode = "wb")
training_data <- read_excel("data.xls", sheet = 2) # sheet 2 => training data
testing_data <- read_excel("data.xls", sheet = 3) # sheet 3 => testing data


# keep only the first six columns and turn the class label (UNS) into a factor
training_data <- training_data %>%
    select(1:6) %>%
    mutate(UNS = as_factor(UNS))

testing_data <- testing_data %>%
    select(1:6) %>%
    mutate(UNS = as_factor(UNS))

# uncomment to inspect the loaded tables
# training_data
# testing_data

In [None]:
# Tune the number of neighbours (K) for a KNN classifier that predicts UNS
# from PEG, using 5-fold cross-validation on the training data.

# fix the RNG before the folds are drawn so the split is reproducible
set.seed(1)
data_vfold <- vfold_cv(training_data, v = 5, strata = UNS)

# candidate values of K: 1, 6, 11, ..., 96
k_vals <- tibble(neighbors = seq(1, 100, by = 5))

# model specification; `neighbors` is left as a placeholder to be tuned
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_mode("classification") %>%
  set_engine("kknn")

# preprocessing: scale, then center, every predictor (only PEG here)
data_recipe <- recipe(UNS ~ PEG, data = training_data) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# evaluate every candidate K on the folds and collect the resulting metrics
tuning_workflow <- workflow() %>%
  add_recipe(data_recipe) %>%
  add_model(knn_spec)

knn_results <- collect_metrics(
  tune_grid(tuning_workflow, resamples = data_vfold, grid = k_vals)
)

# keep only the accuracy rows
accuracies <- filter(knn_results, .metric == "accuracy")

# accuracy estimate as a function of K, used to choose K visually
accuracy_vs_k <- accuracies %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  xlab("Neighbors") +
  ylab("Accuracy Estimate")

# accuracy_vs_k

# the single K with the highest mean accuracy (first one wins on ties)
best_k <- accuracies %>%
  arrange(desc(mean)) %>%
  head(n = 1) %>%
  select(mean, neighbors)

# best_k
# result recorded for this run: the best K is 11

In [None]:
# Refit one KNN classifier with the K picked during tuning (K = 11) and set up
# four hypothetical exam scores to classify with it.

# KNN with 11 neighbours; "rectangular" is the kernel kknn documents as
# standard unweighted KNN
knn_spec <- nearest_neighbor(neighbors = 11, weight_func = "rectangular") %>%
  set_mode("classification") %>%
  set_engine("kknn")

# train on the training data with PEG as the only predictor
knn_fit <- fit(knn_spec, UNS ~ PEG, data = training_data)

# four made-up observations, one PEG (exam score) value each
very_low <- tibble(PEG = 0.0)
low <- tibble(PEG = 0.2)
mid <- tibble(PEG = 0.4)
high <- tibble(PEG = 0.7)

# uncomment to predict the class of each made-up observation
# predict(knn_fit, high)
# predict(knn_fit, mid)
# predict(knn_fit, low)
# predict(knn_fit, very_low)


In [None]:
# Bar chart with one bar per knowledge level (UNS) showing its average exam
# score (PEG), overlaid with horizontal reference lines at the four made-up
# PEG values (`high`, `mid`, `low`, `very_low`) defined in an earlier cell.

# average PEG per UNS class, highest average first
average_UNS <- training_data %>%
  select("UNS", "PEG") %>%
  group_by(UNS) %>%
  summarize(average_PEG = mean(PEG)) %>%
  arrange(desc(average_PEG))

# PEG value of each made-up observation (first column of its one-row tibble)
# and the label drawn just above its reference line
reference_pegs <- c(high[[1]], mid[[1]], low[[1]], very_low[[1]])
reference_labels <- c("UNS = High", "UNS = Mid", "UNS = Low", "UNS = Very low")

# bars are ordered by increasing average PEG.
# NOTE(review): the object is still called `plot` (same name as base R's
# plot()) so that any later reference to it keeps working.
plot <- average_UNS %>%
  ggplot(aes(fct_reorder(UNS, average_PEG), average_PEG, fill = UNS)) +
  geom_col() +
  geom_hline(yintercept = high[[1]], colour = "Green") +
  geom_hline(yintercept = mid[[1]], colour = "Purple") +
  geom_hline(yintercept = low[[1]], colour = "Cyan") +
  geom_hline(yintercept = very_low[[1]], colour = "Red") +
  # annotate() draws each label exactly once; geom_text() with constant
  # aesthetics would repeat every label once per row of `average_UNS`,
  # stacking identical text on top of itself
  annotate(
    "text",
    x = 1,
    y = reference_pegs + 0.01,
    label = reference_labels
  ) +
  xlab("User knowledge level (UNS)") +
  ylab("Average exam score (PEG)") +
  ggtitle("User knowledge levels average exam scores")

# average_UNS
plot