In [None]:
# loads data 
# wrangles and cleans the data to the format necessary for the planned analysis
# performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 
# creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis
# performs the data analysis
# creates a visualization of the analysis 
# note: all figures should have a figure number and a legend




In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
library(janitor)

In [None]:
# Load and wrangle ----
# Read the player table, pass the column names through clean_names(), and
# convert the two categorical columns to factors with explicit level sets.
data <- read_csv("data/players.csv") |>
  clean_names() |>
  mutate(
    # Ordered factor; levels are ranked in the order listed here.
    experience = ordered(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro")
    ),
    # Unordered factor; a value that is not in `levels` becomes NA.
    gender = factor(
      gender,
      levels = c(
        "Male", "Female", "Non-binary", "Prefer not to say",
        "Agender", "Two-Spirited", "Other"
      )
    )
  )

# Show the wrangled table.
data
# Summary ----
# Every aggregate below drops missing values (na.rm = TRUE) so that a single
# NA in played_hours or age cannot turn a whole summary column into NA.

# Overall summary: row count, mean and standard deviation of played hours,
# and the observed age range.
sum_data <- data |>
  summarize(
    n = n(),
    mean_played = mean(played_hours, na.rm = TRUE),
    sd_played = sd(played_hours, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE)
  )

# Per-experience-level summary: group size, mean played hours, mean age.
sum_data_by_exp <- data |>
  group_by(experience) |>
  summarize(
    n = n(),
    mean_played = mean(played_hours, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE)
  )

# Per-gender summary: group size, mean played hours, mean age.
sum_data_by_gender <- data |>
  group_by(gender) |>
  summarize(
    n = n(),
    mean_played = mean(played_hours, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE)
  )

# Display the three summary tables.
sum_data
sum_data_by_exp
sum_data_by_gender



In [None]:
# Figure 1A: histogram of played hours over the full observed range.
p1_full <- ggplot(data, aes(played_hours)) +
  geom_histogram(bins = 40, color = "black", fill = "skyblue") +
  labs(
    title = "Figure 1A: Full Distribution of Played Hours",
    x = "Played Hours",
    y = "Count"
  ) +
  theme_minimal()

# Figure 1B: the same histogram restricted to values at or below the 95th
# percentile of played hours, so the bulk of the distribution is visible.
p95 <- quantile(data$played_hours, 0.95, na.rm = TRUE)
p1_zoom <- ggplot(data |> filter(played_hours <= p95), aes(played_hours)) +
  geom_histogram(bins = 40, color = "black", fill = "steelblue") +
  labs(
    # The en dash is written as a Unicode escape so the title stays correct
    # even if the file is re-saved or read under a different encoding.
    title = "Figure 1B: Zoomed Distribution (0\u{2013}95th percentile)",
    x = "Played Hours",
    y = "Count"
  ) +
  theme_minimal()

p1_full
p1_zoom


# Figure 2: boxplots of played hours for each experience level.
# A log scale is used because the y values differ by orders of magnitude; on a
# linear axis the boxes would be too compressed to read.
# log10(0) is -Inf and ggplot2 drops non-finite values (with a warning), so a
# player with 0 played hours would silently disappear from the boxplot.
# Plotting played_hours + 1 keeps those rows: 0 hours sits at 1 on the axis.
# NOTE(review): assumes played_hours can be 0 - confirm against the data.
p2 <- ggplot(data, aes(experience, played_hours + 1, fill = experience)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_y_log10() +
  labs(
    title = "Figure 2: Played Hours by Experience (Log Scale)",
    x = "Experience",
    y = "Played Hours + 1 (log scale)",
    fill = "Experience"
  ) +
  theme_minimal()

p2

# Figure 3: boxplots of played hours for each gender, on a log scale.
# log10(0) is -Inf and ggplot2 drops non-finite values (with a warning), so a
# player with 0 played hours would silently disappear from the boxplot.
# Plotting played_hours + 1 keeps those rows: 0 hours sits at 1 on the axis.
# NOTE(review): assumes played_hours can be 0 - confirm against the data.
p3 <- ggplot(data, aes(gender, played_hours + 1, fill = gender)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_y_log10() +
  labs(
    title = "Figure 3: Played Hours by Gender (Log Scale)",
    x = "Gender",
    y = "Played Hours + 1 (log scale)",
    fill = "Gender"
  ) +
  theme_minimal()

p3

# Figure 4: scatter plot of age against played hours, one point per row,
# coloured by experience level.
p4 <- ggplot(
  data = data,
  mapping = aes(x = age, y = played_hours, color = experience)
) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Figure 4: Age vs Played Hours",
    x = "Age",
    y = "Played Hours",
    color = "Experience"
  ) +
  theme_minimal()

p4

In [None]:
# Knn
