# DSCI 100 - Project Final Report - Group 8, Section 3
## Predicting Newsletter Subscription from Player Age and Playtime
Group Members
- Nick Huang (87451522)
- Kailey Hong (64795990)
- Ian Zhu (82209362)
- Jonathan Yan

# 1 | Introduction

kailey type stuff here this ur section

# 2 | Methods and Results

## Loading data into R and wrangling

**Loading necessary libraries, reading in player data using link directly from GitHub**

In [None]:
# loading all necessary libraries
# NOTE: keep the attach order as is -- later packages mask earlier ones.

library(tidyverse)   # read_csv(), mutate(), select(), ggplot() used below
library(tidymodels)  # initial_split(), recipe(), workflow(), tune_grid() used below
library(repr)        # provides the repr.* options set in this notebook
library(infer)       # NOTE(review): not referenced in the visible cells -- confirm it is needed
# presumably caps how many rows are shown when a data frame is displayed -- TODO confirm
options(repr.matrix.max.rows = 6)
# runs cleanup.R from the working directory; its contents are not visible here
source('cleanup.R')

In [None]:
# Read the player data straight from the raw CSV hosted on GitHub and preview
# the first rows.
players_url <- "https://raw.githubusercontent.com/nhuang07/dsci-100-2025w1-group-8/refs/heads/main/data/players.csv"
players <- players_url |>
  read_csv()
players |>
  head()

**Tidying the data**

The players dataset is already tidy, so no reshaping is needed. The experience and gender variables are categorical, so we convert them from their current types to the "factor" type below; note that we will not be using these variables as predictors.

In [None]:
# Store the categorical columns `experience` and `gender` as factors, then
# preview the result.
players <- players |>
  mutate(
    experience = as_factor(experience),
    gender = as_factor(gender)
  )
players |>
  head()

**Computing the mean of the quantitative variables in the "players" dataset**

In [None]:
# Mean of each quantitative variable (played_hours, Age), ignoring missing
# values and rounded to 2 decimal places. The result is a one-row tibble with
# one column per variable.
# summarise(across()) replaces the superseded purrr::map_df() and folds the
# two separate rounding mutate() calls into the same step.
mean_players <- players |>
  summarise(
    across(
      c(played_hours, Age),
      \(x) round(mean(x, na.rm = TRUE), digits = 2)
    )
  )
mean_players

## Exploratory visualizations of the distribution of the predictor and response variables

In [None]:
# Histogram (25 bins) of the hours each player spent on the server.

options(repr.plot.width = 8, repr.plot.height = 8)

played_hours_plot <- players |>
  ggplot(aes(x = played_hours)) +
  geom_histogram(bins = 25, fill = "blue", color = "white") +
  labs(
    title = "Distribution of Played Hours",
    x = "Hours played on server (hrs)",
    y = "Count (# of players)"
  )
played_hours_plot

In [None]:
# Histogram (25 bins) of player age.

options(repr.plot.width = 8, repr.plot.height = 8)

age_plot <- players |>
  ggplot(aes(x = Age)) +
  geom_histogram(bins = 25, fill = "red", color = "white") +
  labs(
    title = "Distribution of Age",
    x = "Age (years)",
    y = "Count (# of players)"
  )
age_plot

In [None]:
# Bar chart of subscription status: one bar per value of `subscribe`, with the
# bar height giving the number of players in that group.

options(repr.plot.width = 8, repr.plot.height = 8)

subscription_plot <- players |>
  ggplot(aes(x = subscribe, fill = subscribe)) +
  geom_bar() +
  labs(
    title = "Subscription status of players on server",
    x = "Subscription status",
    y = "Count",
    fill = "Subscribed?"
  )
subscription_plot

## Exploratory visualizations of age and hours played as predictors for the subscription status

In [None]:
# Scatter plot of the two predictors: Age on the x-axis, played_hours on the
# y-axis, with points coloured by the `subscribe` response variable.
options(repr.plot.width = 10, repr.plot.height = 10)
age_to_played_hours_plot <- players |>
  ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
  geom_point(alpha = 0.8) +
  labs(
    title = "Age of player vs. hours played on minecraft server",
    x = "Age (in years)",
    y = "Hours played on PLAICraft.ai server",
    color = "Subscribed to a game related newsletter?"
  )
age_to_played_hours_plot

In [None]:
# Train/test split, preprocessing recipe, tunable K-NN workflow and
# cross-validation folds for predicting `subscribe` from played_hours and Age.

# The seed has to be set BEFORE the first random operation. Previously it was
# set only before vfold_cv(), so initial_split() produced a different
# train/test partition on every run.
set.seed(2020) # DO NOT REMOVE

# K-NN classification needs a factor outcome. Convert it once, before
# splitting, rather than with step_mutate() inside the recipe: recipes advises
# handling outcome transformations outside the recipe, and this way
# players_train / players_test also carry a factor outcome for later
# prediction and metric steps.
players_split <- players |>
  mutate(subscribe = factor(subscribe)) |>
  initial_split(prop = 0.7, strata = subscribe)
players_train <- training(players_split)
players_test <- testing(players_split)

# Standardise both predictors (scale, then centre), since K-NN is
# distance-based and the two predictors are measured in different units
# (hours vs. years).
players_recipe <- recipe(subscribe ~ played_hours + Age, data = players_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
players_recipe

# K-NN classifier with the number of neighbours left open for tuning.
knn_spec <- nearest_neighbor(
  mode = "classification",
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn")

knn_workflow <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(knn_spec)

# 5-fold cross-validation on the training set, stratified by the outcome.
players_vfold <- vfold_cv(players_train, v = 5, strata = subscribe)
players_vfold


In [None]:
# Tune the number of neighbours K: evaluate K = 1..10 on the cross-validation
# folds in `players_vfold`, then plot the estimated accuracy against K.

# K-NN classifier with `neighbors` left as a tuning parameter.
knn_tune <- nearest_neighbor(
  mode = "classification",
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn")
knn_tune

set.seed(1234)

# Candidate values of K.
k_vals <- tibble(neighbors = seq(1, 10, by = 1))

# Fit and evaluate every candidate K on each fold, then collect the metrics.
knn_results <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(knn_tune) |>
  tune_grid(resamples = players_vfold, grid = k_vals) |>
  collect_metrics()
knn_results

# Keep only the accuracy rows for plotting.
accuracies <- filter(knn_results, .metric == "accuracy")

accuracy_versus_k <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  scale_x_continuous(breaks = seq(0, 14, by = 1)) + # x-axis tick at every integer
  scale_y_continuous(limits = c(0.4, 1.0)) # fix the y-axis range to 0.4-1.0
accuracy_versus_k

# 3 | Discussion

jon write ur stuff here

# 4 | References

anyone add to this when necessary