In [None]:
library(tidyverse)
library(repr)
library(tidymodels)

In [None]:
# Read the raw player records from players.csv (expected in the working
# directory) into a tibble
players <- "players.csv" |>
  read_csv()

## Relevant Background information 

A computer science research group at UBC, led by Frank Wood, set up a Minecraft server and collected datasets that describe how people play video games. To help target recruitment efforts, our group used their dataset to address the question: "Which 'kinds' of players are most likely to contribute a large amount of data, so that those players can be targeted in recruiting?"

Our group examined whether a player's characteristics, specifically age, are predictive of how many total hours they play. Total hours played represents how much data a player contributes to the study, so being able to predict which players will generate more data can help the research team recruit new players effectively and develop clearer, more precise recruitment strategies.

The dataset we used for this project is the `players.csv` file, which contains many different variables.

### Context of Minecraft 
Minecraft is an open-world sandbox game in which players can freely explore, build structures, and interact with others in the world.

#### Reference 
Mojang Studios (2011)
https://www.minecraft.net/




In [None]:
# Keep only the players that have both an age and a played-hours value
# recorded, so later summaries and models never see 'NA'
players <- players |>
  filter(!is.na(Age), !is.na(played_hours))

The data is in tidy format, as each row depicts a single observation (a unique player), each column represents a single variable (age, name, etc.), and each cell contains a single value.

In [None]:
# Narrow the data down to the response (played_hours) and the single
# predictor (Age) used in the analysis
players_data <- select(players, played_hours, Age)

In [None]:
# Summary statistics (mean, median, range, standard deviation) for the two
# variables of interest, each rounded to two decimal places
# NOTE: MAYBE ADD IQR??
played_hours_summary <- players |>
  summarise(
    played_hours_mean = mean(played_hours),
    played_hours_median = median(played_hours),
    # range = largest value minus smallest value
    played_hours_range = diff(range(played_hours)),
    played_hours_sd = sd(played_hours)
  ) |>
  round(digits = 2)
played_hours_summary

age_summary <- players |>
  summarise(
    age_mean = mean(Age),
    age_median = median(Age),
    # range = largest value minus smallest value
    age_range = diff(range(Age)),
    age_sd = sd(Age)
  ) |>
  round(digits = 2)
age_summary


## **graph below shows no real trend, comment on this**

#### Figure 1: Scatterplot showing the relationship between Age and Hours Played with trend line

In [None]:
# Scatterplot of total hours played against player age, with a straight
# least-squares trend line (no confidence band) drawn on top of the points.
# Every layer, including labs(), must be chained with `+`; without it the
# title and axis labels are evaluated separately and never reach the plot.
ggplot(players, aes(x = Age, y = played_hours)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  labs(
    title = "Relationship Between Age and Hours Played",
    x = "Player Age (years)",
    y = "Time Played (hours)"
  )

#### Figure 2: Distribution of Player Ages

In [None]:
# Histogram of player ages, grouped into 5-year-wide bins
players_data |>
  ggplot(aes(x = Age)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribution of Player Ages",
    x = "Age (years)",
    y = "Number of Players"
  )

#### Figure 3: Distribution of Total Hours Played

In [None]:
# Histogram of hours played, grouped into 5-hour-wide bins
players_data |>
  ggplot(aes(x = played_hours)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribution of Played Hours",
    x = "Time played (hours)",
    y = "Number of Players"
  )

In [None]:
# Splitting dataset into training (70%) and testing (30%) sets, stratified
# on the response so both sets cover a similar range of played hours.
# The split is random, so fix the seed first: without it the rows in each
# set, and therefore every downstream result, change on each run.
set.seed(2024)
players_split <- initial_split(players_data, prop = 0.7, strata = played_hours)
players_train <- training(players_split)
players_test <- testing(players_split)

In [None]:
# Training, evaluating, and tuning the model
# K-nearest-neighbours regression of played_hours on Age; the number of
# neighbours is chosen by 10-fold cross-validation on the training set.

# Standardise the predictor (scale, then centre) so distances are computed
# on a common scale
players_recipe <- recipe(played_hours ~ Age, data = players_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Unweighted ("rectangular") KNN with the neighbour count left open for tuning
players_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("regression")

# Fold assignment is random; fix the seed so the tuning results (and the
# selected number of neighbours) are reproducible
set.seed(2024)
players_vfold <- vfold_cv(players_train, v = 10, strata = played_hours)

players_wkflw <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(players_spec)

# Candidate neighbour counts: 1, 4, 7, ..., 100
gridvals <- tibble(neighbors = seq(from = 1, to = 100, by = 3))

# Cross-validated RMSE for every candidate neighbour count
players_results <- players_wkflw |>
  tune_grid(resamples = players_vfold, grid = gridvals) |>
  collect_metrics() |>
  filter(.metric == "rmse")

# Pick the row with the lowest mean RMSE. with_ties = FALSE guarantees a
# single row even when several neighbour counts share the minimum, so kmin
# is always one number that can be passed to nearest_neighbor() later.
players_min <- players_results |>
  slice_min(mean, n = 1, with_ties = FALSE)

kmin <- players_min |> pull(neighbors)

kmin

In [None]:
# Evaluate on the test set
# Refit KNN regression on the full training set using the tuned number of
# neighbours (kmin), then score its predictions on the held-out test set.
players_spec2 <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = kmin
) |>
  set_engine("kknn") |>
  set_mode("regression")

players_fit <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(players_spec2) |>
  fit(data = players_train)

# Predictions (.pred) placed alongside the original test-set columns
players_preds <- bind_cols(
  predict(players_fit, players_test),
  players_test
)

# Keep only the RMSE row of the regression metrics
players_summary <- players_preds |>
  metrics(truth = played_hours, estimate = .pred) |>
  filter(.metric == "rmse")

players_summary

# results in RMSPE on test data

#### Figure 4: Comparison of Actual and Predicted Hours Played vs Age

In [None]:
# Test-set comparison of actual and predicted hours played by age:
# points are the actual played_hours values, the blue line traces the
# model's predictions (.pred). The y axis carries both, so it is labelled
# neutrally rather than as "predicted" only.
players_plot <- players_preds |>
  ggplot(aes(x = Age, y = played_hours)) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = .pred), color = "blue") +
  labs(
    x = "Age (years)",
    y = "Time played (hours)",
    title = "Actual vs Predicted Hours Played by Age"
  )

players_plot