In [None]:
# loads data 
# wrangles and cleans the data to the format necessary for the planned analysis
# performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 
# creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis
# performs the data analysis
# creates a visualization of the analysis 
# note: all figures should have a figure number and a legend




In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
library(janitor)

In [None]:
# Load and wrangle
# Reads the raw player table, standardises the column names with
# janitor::clean_names(), and converts the two categorical columns to factors:
#   * experience -> ordered factor, Beginner < Amateur < Regular < Veteran < Pro
#   * gender     -> Agender, Two-Spirited, Other and Prefer not to say are
#                   lumped into a single "Other" level because each occurs
#                   only a handful of times.
data <- read_csv("data/players.csv") |>
  clean_names() |>
  mutate(
    experience = factor(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro"),
      ordered = TRUE
    ),
    gender = factor(
      fct_collapse(
        gender,
        Other = c("Agender", "Two-Spirited", "Other", "Prefer not to say")
      ),
      levels = c("Male", "Female", "Non-binary", "Other")
    )
  )

data
# Summary
# Overall and per-group descriptive statistics used for the exploratory
# analysis. Every aggregate passes na.rm = TRUE so that a single missing value
# in played_hours or age cannot turn the whole summary into NA; this matches
# how played_hours is treated elsewhere in the notebook (quantile with
# na.rm = TRUE, and an explicit NA filter before modelling).

# Whole data set: size, centre/spread of played hours, and the age range.
sum_data <- data |>
  summarize(
    n = n(),
    mean_played = mean(played_hours, na.rm = TRUE),
    sd_played = sd(played_hours, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE)
  )

# One row per experience level.
sum_data_by_exp <- data |>
  group_by(experience) |>
  summarize(
    n = n(),
    mean_played = mean(played_hours, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE)
  )

# One row per (collapsed) gender category.
sum_data_by_gender <- data |>
  group_by(gender) |>
  summarize(
    n = n(),
    mean_played = mean(played_hours, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE)
  )

sum_data
sum_data_by_exp
sum_data_by_gender



In [None]:
# Figure 1A: histogram of played hours over the full observed range.
p1_full <- ggplot(data, aes(played_hours)) +
  geom_histogram(bins = 40, color = "black", fill = "skyblue") +
  labs(
    title = "Figure 1A: Full Distribution of Played Hours",
    x = "Played Hours",
    y = "Count"
  ) +
  theme_minimal()

# Figure 1B: the same histogram restricted to values at or below the 95th
# percentile, so the bulk of the distribution is not flattened by the tail.
p95 <- quantile(data$played_hours, 0.95, na.rm = TRUE)
p1_zoom <- ggplot(data |> filter(played_hours <= p95), aes(played_hours)) +
  geom_histogram(bins = 40, color = "black", fill = "steelblue") +
  labs(
    # The en dash is written as a Unicode escape so the title renders
    # correctly regardless of the encoding the file is saved or read in.
    title = "Figure 1B: Zoomed Distribution (0\u{2013}95th percentile)",
    x = "Played Hours",
    y = "Count"
  ) +
  theme_minimal()

p1_full
p1_zoom


# Figure 2: played hours by experience level.
# A log scale is used because the y values differ by orders of magnitude; on a
# linear axis the boxes are too compressed to read.
# log10(0) is -Inf and ggplot2 drops non-finite values, so plotting raw hours
# would silently remove any player with zero played hours. Hours are therefore
# shifted by +1 before the log transform, and the axis label says so.
p2 <- ggplot(data, aes(experience, played_hours + 1, fill = experience)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_y_log10() +
  labs(
    title = "Figure 2: Played Hours by Experience (Log Scale)",
    x = "Experience",
    y = "Played Hours + 1 (log scale)",
    fill = "Experience"
  ) +
  theme_minimal()

p2

# Figure 3: played hours by (collapsed) gender category, on a log scale.
# log10(0) is -Inf and ggplot2 drops non-finite values, so plotting raw hours
# would silently remove any player with zero played hours. Hours are therefore
# shifted by +1 before the log transform, and the axis label says so.
p3 <- ggplot(data, aes(gender, played_hours + 1, fill = gender)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_y_log10() +
  labs(
    title = "Figure 3: Played Hours by Gender (Log Scale)",
    x = "Gender",
    y = "Played Hours + 1 (log scale)",
    fill = "Gender"
  ) +
  theme_minimal()

p3

# Figure 4: scatter plot of age against played hours, with points coloured by
# experience level.
p4 <- ggplot(data = data) +
  geom_point(
    mapping = aes(x = age, y = played_hours, color = experience),
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Figure 4: Age vs Played Hours",
    x = "Age",
    y = "Played Hours",
    color = "Experience"
  )
p4

In [None]:
# KNN regression: modelling data and train/test split

# Keep the outcome (played_hours) and the three predictors. k-NN needs a
# distance between complete feature vectors, so rows with a missing value in
# any of these columns are dropped explicitly here rather than relying on the
# engine's NA handling. Unused experience levels are removed afterwards.
model_data <- data |>
  select(played_hours, age, experience, gender) |>
  drop_na() |>
  mutate(experience = fct_drop(experience))

# Fix the RNG seed so the random split (and everything downstream of it) is
# reproducible between runs.
set.seed(2024)

# 80/20 train/test split, no stratification (original author's note: no strata
# because the predictors are categorical; 0.8 is a commonly used proportion).
split <- initial_split(model_data, prop = 0.8)
training_data <- training(split)
testing_data <- testing(split)

# Drop factor levels that do not occur in the training rows, so the recipe
# only ever sees levels it can learn from.
training_data <- training_data |>
  mutate(
    experience = fct_drop(experience),
    gender = fct_drop(gender)
  )

# Re-level the test factors to exactly the training levels. Any test value
# whose level is absent from training becomes NA.
testing_data <- testing_data |>
  mutate(
    experience = factor(
      experience,
      levels = levels(training_data$experience),
      ordered = TRUE
    ),
    gender = factor(gender, levels = levels(training_data$gender))
  )


# Preprocessing: predict played_hours from every other column.
knn_recipe <- recipe(played_hours ~ ., data = training_data) |>
  # Convert experience and gender to dummy variables.
  step_dummy(all_nominal_predictors(), one_hot = TRUE) |>
  # Remove predictors with zero variance in the training data.
  step_zv(all_predictors()) |>
  # Centre and scale so every predictor contributes comparably to distances.
  step_normalize(all_predictors())

# k-NN regression with unweighted ("rectangular") neighbours; the number of
# neighbours is left as a tuning parameter.
knn_model <- nearest_neighbor(
  mode = "regression",
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn")

# Fix the RNG seed so the fold assignment is reproducible between runs.
set.seed(2024)

# 5 folds, chosen because some categorical predictor groups are small.
data_vfold <- vfold_cv(training_data, v = 5)

# Candidate k values: odd numbers from 3 to 25. The step of 2 limits
# computation; the range itself is a general-purpose choice.
k_grid <- tibble(neighbors = seq(3, 25, by = 2))

knn_workflow <- workflow() |>
  add_model(knn_model) |>
  add_recipe(knn_recipe)

# Evaluate every candidate k on the cross-validation folds, recording RMSE,
# R-squared and MAE.
tune_results <- tune_grid(
  knn_workflow,
  resamples = data_vfold,
  grid = k_grid,
  metrics = metric_set(rmse, rsq, mae)
)

# Pick the k with the lowest cross-validated RMSE. `metric` is passed by name:
# recent versions of tune reject a positional metric (it lands in `...`,
# which must be empty), while the named form works on old and new versions.
best_k <- select_best(tune_results, metric = "rmse")
best_k

# Plug the chosen k into the workflow and refit on the full training set.
final_knn <- finalize_workflow(knn_workflow, best_k)

final_knn_fit <- fit(final_knn, data = training_data)

# Predict on the held-out test set and keep the predictions next to the
# original test columns.
preds <- predict(final_knn_fit, testing_data) |>
  bind_cols(testing_data)

# Test-set performance of the final model.
metrics(preds, truth = played_hours, estimate = .pred)

In [None]:
# Figure 5: predicted against actual played hours on the test set. The dashed
# red line is y = x; points on it are perfect predictions.
pred_plot <- ggplot(data = preds, mapping = aes(x = played_hours, y = .pred)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "Figure 5: KNN Predicted vs Actual Played Hours",
    x = "Actual Played Hours",
    y = "Predicted Played Hours"
  )

pred_plot

In [None]:
# Number of players in each (collapsed) gender category.
count(data, gender)