<h1><b>Final Project Report</b></h1>

In [1]:
library(tidyverse)
library(tidymodels)


# Read the raw player table from disk into `players`.
# NOTE(review): the recorded run of this cell stopped with
# "'data/players.csv' does not exist in current working directory" --
# confirm the path relative to where the notebook is launched.
players <- read_csv("data/players.csv")

# Keep only the rows whose experience level is "Amateur", then discard every
# row that still has a missing value in any column.
filtered_players <- na.omit(filter(players, experience == "Amateur"))

# Display the cleaned table.
filtered_players

# Fix the random seed so the random split below is reproducible.
set.seed(1)

# Put 75% of the filtered players in the training set and the rest in the
# testing set, stratifying the split on played_hours.
player_split <- initial_split(
  filtered_players,
  prop = 0.75,
  strata = played_hours
)
player_training <- player_split |> training()
player_testing <- player_split |> testing()


# creating workflow

# Recipe: predict played_hours from Age, scaling and then centering the
# predictor. It is built from the training set only, matching the data used
# for cross-validation and the final fit, so that preparing this recipe on its
# own can never compute scaling statistics from test rows.
knn_recipe <- recipe(played_hours ~ Age, data = player_training) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model object: K-nearest-neighbours regression on the kknn engine using the
# "rectangular" weight function. `neighbors = tune()` leaves K unset so it can
# be chosen by cross-validation.
knn_spec <- nearest_neighbor(
  mode = "regression",
  engine = "kknn",
  neighbors = tune(),
  weight_func = "rectangular"
)

# Cross-Validation

# Candidate values of K: every integer from 1 to 15.
grid_vals <- tibble(neighbors = seq(from = 1, to = 15, by = 1))

# Five cross-validation folds drawn from the training set, stratified on
# played_hours.
player_vfold <- vfold_cv(player_training, v = 5, strata = played_hours)

# Tuning workflow: the recipe plus the model spec whose K is still untuned.
player_workflow <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(knn_recipe)

# Evaluate every candidate K on the folds and keep only the RMSE rows of the
# collected metrics.
player_results <- tune_grid(
  player_workflow,
  resamples = player_vfold,
  grid = grid_vals
) |>
  collect_metrics() |>
  filter(.metric == "rmse")

# Plot of the mean cross-validation RMSE (the `mean` column of player_results)
# against the number of neighbours K. Axis labels are set explicitly so the
# y axis is not shown as the raw column name "mean".
result_plot <- player_results |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Number of neighbours (K)",
    y = "Mean cross-validation RMSE"
  )

result_plot

# Choose K as the candidate with the smallest mean cross-validation RMSE
# rather than hard-coding it, so the value follows the tuning results if the
# data, seed, or grid change. `with_ties = FALSE` guarantees a single value.
best_k <- player_results |>
  slice_min(mean, n = 1, with_ties = FALSE) |>
  pull(neighbors)

# Final model spec: same KNN regression settings as the tuning spec, with K
# fixed at the selected value.
player_spec <- nearest_neighbor(
  mode = "regression",
  engine = "kknn",
  neighbors = best_k,
  weight_func = "rectangular"
)

# Fit the final workflow on the training set.
player_fit <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(player_spec) |>
  fit(data = player_training)

# Predict on the held-out testing set and report the RMSE between the
# predictions and the observed played_hours.
player_summary <- player_fit |>
  predict(player_testing) |>
  bind_cols(player_testing) |>
  metrics(truth = played_hours, estimate = .pred) |>
  filter(.metric == "rmse")

player_summary

# Grid of ages, in steps of 1, running from the smallest to the largest Age
# found in the filtered data.
pred_grid <- tibble(
  Age = seq(
    from = min(filtered_players$Age),
    to = max(filtered_players$Age),
    by = 1
  )
)

# Predictions of the fitted workflow at every grid age, with the Age column
# attached next to the `.pred` column.
player_preds <- bind_cols(
  predict(player_fit, pred_grid),
  pred_grid
)

# Scatter plot of played_hours against Age for the filtered players, overlaid
# with the fitted model's predictions across the age grid; the title reports
# the K that was used.
player_plot <- filtered_players |>
  ggplot(aes(x = Age, y = played_hours)) +
  geom_point(alpha = 0.4) +
  geom_line(
    data = player_preds,
    mapping = aes(x = Age, y = .pred),
    color = "steelblue",
    linewidth = 1
  ) +
  labs(
    x = "Age of players (Years)",
    y = "Hours Played & Contributed",
    title = paste0("K = ", best_k)
  ) +
  theme(text = element_text(size = 12))

player_plot

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

ERROR: Error: 'data/players.csv' does not exist in current working directory ('/home/jovyan/work/dsci-100-project-individual/Dsci100_Group_Project').
