In [6]:
# CAUTION: installing themis can take a long time.
# Install only when the package is not already available, so re-running this
# cell does not trigger a slow and unnecessary reinstall.
if (!requireNamespace("themis", quietly = TRUE)) {
  install.packages("themis")
}

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [4]:
# Load the packages this notebook relies on.
# tidyverse: read_csv(), the dplyr verbs and ggplot2 used below.
library(tidyverse)
# repr: notebook display of R objects (configured via options() below).
library(repr)
# tidymodels: recipes, workflows, model specifications, resampling, metrics.
library(tidymodels)
# themis: provides step_upsample() for balancing the outcome classes.
library(themis)
# Cap the number of rows shown when a table is displayed in the notebook.
options(repr.matrix.max.rows = 10)

In [None]:
# Read the pulsar data set, supplying descriptive names for its nine columns,
# and convert the `class` label to a factor so it can be used as a
# classification outcome.
# NOTE(review): because `col_names` is supplied, read_csv() treats the first
# line of the file as data -- confirm that pulsar_data.csv has no header row.
unscaled_data <- read_csv(
  "pulsar_data.csv",
  col_names = c(
    "mean_integrated_profile",
    "stand_dev_integrated_profile",
    "exc_kurtosis_integrated_profile",
    "skew_integrated_profile",
    "mean_dmsnr",
    "stand_dev_dmsnr",
    "exc_kurtosis_dmsnr",
    "skew_dmsnr",
    "class"
  )
) |>
  mutate(class = as_factor(class))
# unscaled_data  # uncomment to preview the loaded data

# Class proportions in the pulsar data set (imbalanced).
# The total row count is computed once and reused for the percentage column.
num_obs <- nrow(unscaled_data)
pulsar_proportions <- unscaled_data |>
  count(class) |>
  mutate(percent = 100 * n / num_obs)
pulsar_proportions

# Fix the RNG seed so the train/test partition is reproducible, then keep 75%
# of the rows for training and hold out the rest for testing. The split is
# stratified on `class`.
set.seed(1)
pulsar_split <- unscaled_data |>
  initial_split(prop = 0.75, strata = class)
pulsar_train <- pulsar_split |> training()
pulsar_test <- pulsar_split |> testing()

# Preprocessing recipe: scale and center the two skewness predictors, then
# upsample so both classes are equally represented (over_ratio = 1).
#
# The previous version had the pipe after step_center() commented out while
# step_upsample() still ended in a dangling `|>`, so the cell did not parse
# (the dangling pipe swallowed the next statement).
#
# step_upsample() is left at its default `skip = TRUE`: the upsampling is
# applied only when the recipe is trained, never when new data (the test set
# or a cross-validation assessment fold) is baked for prediction. With
# `skip = FALSE` those held-out rows would be resampled as well.
pulsar_recipe <- recipe(
  class ~ skew_integrated_profile + skew_dmsnr,
  data = pulsar_train
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors()) |>
  step_upsample(class, over_ratio = 1)

# Optional sanity check of the class balance after upsampling
# (uncomment to run; bake(new_data = NULL) returns the processed training set):
# pulsar_recipe |>
#   prep() |>
#   bake(new_data = NULL) |>
#   count(class)

# K-nearest-neighbours classifier with k = 3 and the "rectangular" weight
# function, using the kknn engine.
knn_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
  set_engine(engine = "kknn") |>
  set_mode(mode = "classification")

# Bundle the preprocessing recipe with the model and train on the training set.
pulsar_fit <- workflow() |>
  add_recipe(recipe = pulsar_recipe) |>
  add_model(spec = knn_spec) |>
  fit(data = pulsar_train)
pulsar_fit


# Predict the class of every held-out observation and attach the predictions
# to the test set so they can be compared with the true labels.
pulsar_test_predictions <- pulsar_fit |>
  predict(new_data = pulsar_test) |>
  bind_cols(pulsar_test)
# pulsar_test_predictions  # uncomment to preview

# Accuracy on the test set (the only row kept from the metrics() summary).
pulsar_prediction_accuracy <- metrics(
  pulsar_test_predictions,
  truth = class,
  estimate = .pred_class
) |>
  filter(.metric == "accuracy")
pulsar_prediction_accuracy

# Confusion matrix of true versus predicted classes.
confusion <- conf_mat(
  pulsar_test_predictions,
  truth = class,
  estimate = .pred_class
)
confusion

In [7]:
# 10-fold cross-validation on the training set, stratified on `class`.
pulsar_vfold <- pulsar_train |>
  vfold_cv(v = 10, strata = class)

# Fit the recipe + k = 3 model on each fold's analysis set and evaluate it on
# the matching assessment set.
pulsar_resample_fit <- workflow() |>
  add_recipe(recipe = pulsar_recipe) |>
  add_model(spec = knn_spec) |>
  fit_resamples(resamples = pulsar_vfold)

# Average the per-fold metrics.
pulsar_metrics <- pulsar_resample_fit |>
  collect_metrics()
pulsar_metrics

.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
accuracy,binary,0.9699767,10,0.0005568752,Preprocessor1_Model1
roc_auc,binary,0.9061469,10,0.0044618624,Preprocessor1_Model1


In [None]:
# CAUTION: this cell takes a long time to run.
# Same KNN specification as before, but with the number of neighbours left as
# a tuning parameter.
knn_tune <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_engine(engine = "kknn") |>
  set_mode(mode = "classification")
# knn_tune

# Candidate K values: 1, 6, 11, ..., 96.
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

# Evaluate every candidate K on the cross-validation folds and gather the
# averaged metrics.
knn_results <- workflow() |>
  add_recipe(recipe = pulsar_recipe) |>
  add_model(spec = knn_tune) |>
  tune_grid(resamples = pulsar_vfold, grid = k_vals) |>
  collect_metrics()
# knn_results

# Keep only the accuracy rows for plotting.
accuracies <- filter(knn_results, .metric == "accuracy")

# Estimated accuracy as a function of K.
accuracy_versus_k <- ggplot(
  data = accuracies,
  mapping = aes(x = neighbors, y = mean)
) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate")
accuracy_versus_k


In [None]:
# Plan: use a separate cell for each combination of predictors that we try.
# Pick a single K value to use for every combination, so we avoid re-tuning K each time (the best K could vary between combinations).
# We could name the predictor combinations combo1, combo2, etc.
# Then compare their accuracies and choose the most accurate set of predictors.