# DSCI 100 Group 17 Final Project - Predicting Experience Level as a Means to Target Minecraft Players for Recruitment

### Introduction to Data Science in Player Behavior Analysis

Under the direction of Frank Wood, a computer science research team at UBC is gathering information on video game play. Their team has collected player and session data in the hope that it will help focus their recruitment efforts and ensure that they have sufficient resources (such as server hardware and software licenses) to manage the volume of players they draw in. Here we address whether any correlations can be found between a Minecraft player's habits and the data that has been collected from the open-world Minecraft server. We would like to know which "kinds" of players are most likely to contribute a large amount of data so that we can target those players in our recruiting efforts; specifically, **how can session start and end time help to predict the experience level of an individual Minecraft player**, so we know where to target advertisements (e.g., at a professional gaming event vs. on kids' YouTube channels). The two data sets we use come in the form of CSV files (which we have converted to HTML files for accessibility): the players data set with 7 variables (experience, subscribe, hashedEmail, played_hours, name, gender, and age) and the sessions data set with 5 variables (hashedEmail, start_time, end_time, original_start_time, and original_end_time). The players data set has around 100 observations, while the sessions data set has about 1500 observations. The data is collected from Minecraft game logs and linked to each player's profile information through hashedEmail, and we will use this alongside start_time and end_time to predict experience through reading, wrangling, and classification. These three key steps (data ingestion, preprocessing, and supervised learning) form the foundation of our analysis.

In [None]:
# Packages used throughout the notebook: tidyverse (wrangling and plotting),
# repr (notebook display options), tidymodels (modelling), lubridate
# (date-time helpers).
library(tidyverse)
library(repr)
library(tidymodels)
library(lubridate)
# Cap the number of data-frame rows rendered in notebook output at 6.
options(repr.matrix.max.rows = 6)

In [None]:
# Read the players table directly from the project's GitHub repository.
player_url <- "https://raw.githubusercontent.com/mulch-eater-prime/DSCI-Group-17-Project/refs/heads/main/players%20(1).csv"
player <- player_url |>
    read_csv()
player

In [None]:
# Read the sessions table directly from the project's GitHub repository.
sessions_url <- "https://raw.githubusercontent.com/mulch-eater-prime/DSCI-Group-17-Project/03abe89c68205649e71bfa9b8a9cf90efe553b24/sessions%20(1).csv"
sessions <- sessions_url |>
    read_csv()
sessions

In [None]:
# Attach each player's experience and total played hours to every one of their
# sessions, then keep only the time-of-day part of the session timestamps.
mashup <- merge(
    select(player, hashedEmail, experience, played_hours),
    sessions,
    by = "hashedEmail"
) |>
    # Timestamps are split on the space into a date part and a clock-time part;
    # the clock-time part reuses the original column name.
    separate(col = start_time, into = c("start_date", "start_time"), sep = " ") |>
    separate(col = end_time, into = c("end_date", "end_time"), sep = " ") |>
    # Selecting by name drops the date parts and the original_* columns.
    select(experience, start_time, end_time, played_hours, hashedEmail)
mashup

In [None]:
# Break the clock times into hour and minute components and make them numeric
# so they can be used in arithmetic.
msm <- mashup |>
    separate(col = start_time, into = c("start_hour", "start_minute"), sep = ":") |>
    separate(col = end_time, into = c("end_hour", "end_minute"), sep = ":") |>
    mutate(
        across(c(start_hour, start_minute, end_hour, end_minute), as.numeric)
    )
msm

In [None]:
# Express each session's start and end as minutes since midnight ("msm") and
# derive the session length in minutes.
msm_calc <- msm |>
    mutate(
        start_msm = start_hour * 60 + start_minute,
        end_msm = end_hour * 60 + end_minute,
        # Only the time of day is available here, so a session that runs past
        # midnight has end_msm < start_msm. Wrapping modulo one day (1440
        # minutes) keeps playtime non-negative; same-day sessions are unchanged.
        # Assumes every session is shorter than 24 hours -- TODO confirm.
        playtime = (end_msm - start_msm) %% 1440
    ) |>
    select(experience, start_msm, end_msm, playtime, played_hours, hashedEmail)
msm_calc

In [None]:
# Scatter plot of every session: start time vs. end time, coloured by the
# player's experience level. Points are semi-transparent to reveal overplotting.
options(repr.plot.width = 13, repr.plot.height = 10)
msm_plot <- msm_calc |> ggplot(aes(x = start_msm, y = end_msm, color = experience)) +
    geom_point(alpha = 0.25) +
    # Legend title previously read "Experince Level" (typo in displayed text).
    labs(x = "Start Time (Minutes Since Midnight)", y = "End Time (Minutes Since Midnight)", color = "Experience Level") +
    theme(text = element_text(size = 15))
msm_plot

In [None]:
# Build a one-row-per-player table: experience label plus that player's mean
# session start and end time (minutes since midnight), then plot it.
player_new <- player |> select(hashedEmail, experience)

# na.rm = TRUE so that a single session with a missing start or end time does
# not turn the player's whole average into NA (which would then propagate into
# the model's predictors).
# NOTE(review): an arithmetic mean of clock times ignores the wrap-around at
# midnight (e.g. 23:50 and 00:10 average to about noon) -- confirm this is
# acceptable for the analysis.
msm_new_1 <- msm_calc |>
    group_by(hashedEmail) |>
    summarize(start = mean(start_msm, na.rm = TRUE))
msm_new_2 <- msm_calc |>
    group_by(hashedEmail) |>
    summarize(end = mean(end_msm, na.rm = TRUE))

# Combine the two per-player averages and attach the experience label.
msm_new <- msm_new_1 |> merge(msm_new_2, by = "hashedEmail") |>
    merge(player_new, by = "hashedEmail")
msm_new
options(repr.plot.width = 8, repr.plot.height = 6)

new_plot <- msm_new |> ggplot(aes(x = start, y = end, color = experience)) +
    geom_point(alpha = 0.6) +
    labs(x = "Start Time (minutes since midnight)", y = "End Time (minutes since midnight)", color = "experience level")
new_plot

In [None]:
# Tune the number of neighbours for a K-nearest-neighbours classifier that
# predicts experience from a player's mean session start and end time.

# initial_split() and vfold_cv() both draw random numbers; fixing the seed makes
# the train/test split, the folds, and therefore the selected k reproducible
# from run to run.
set.seed(2024)

# 75/25 train/test split, stratified by the outcome.
# NOTE(review): experience appears to be read in as text; classification metrics
# on the test set typically need a factor outcome -- confirm it is converted
# before evaluation.
minecraft_split <- initial_split(msm_new, prop = 0.75, strata = experience)
minecraft_train <- training(minecraft_split)
minecraft_test <- testing(minecraft_split)

# Scale and center both predictors so neither dominates the distance metric.
minecraft_recipe <- recipe(experience ~ start + end, data = minecraft_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# KNN with unweighted ("rectangular") votes; the neighbour count is tuned.
minecraft_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

# 10-fold cross-validation on the training set, stratified by the outcome.
minecraft_vfold <- vfold_cv(minecraft_train, v = 10, strata = experience)

# Candidate neighbour counts: 1, 6, 11, ..., 71.
k_vals <- tibble(neighbors = seq(from = 1, to = 75, by = 5))

# Fit every candidate k on every fold and keep only the accuracy rows.
minecraft_results <- workflow() |>
    add_recipe(minecraft_recipe) |>
    add_model(minecraft_spec) |>
    tune_grid(resamples = minecraft_vfold, grid = k_vals) |>
    collect_metrics() |>
    filter(.metric == "accuracy")
minecraft_results

In [None]:
# Plot cross-validated accuracy against the number of neighbours.
minecraft_best_k <- ggplot(minecraft_results, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy") +
    theme(text = element_text(size = 15))
minecraft_best_k

# Neighbour count with the highest mean accuracy (first one if several tie).
minecraft_k <- minecraft_results |>
    arrange(desc(mean)) |>
    pull(neighbors) |>
    head(1)
minecraft_k