# TITLE

### Intro

In [None]:
#loading in data and doing some wrangling
library(tidyverse)
library(tidymodels)
# Make sure the local data directory exists before downloading into it.
if (!dir.exists("data")) {
    dir.create("data")
}

# Fetch the raw player and session tables from the project repository.
download.file("https://raw.githubusercontent.com/nt8669/DSCI-100-007-24-Project/refs/heads/main/data/players.csv", "data/players.csv")
download.file("https://raw.githubusercontent.com/nt8669/DSCI-100-007-24-Project/refs/heads/main/data/sessions.csv", "data/sessions.csv")

players_data <- read_csv("data/players.csv")
sessions_data <- read_csv("data/sessions.csv")

# Parse the session timestamps (day/month/year hour:minute) into POSIXct.
# POSIXct is the date-time class meant for data-frame columns (POSIXlt is a
# list under the hood), and an explicit time zone keeps the parsed clock times
# identical on every machine instead of depending on the local zone and its
# daylight-saving gaps.
sessions_data <- sessions_data |>
    mutate(start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M", tz = "UTC"),
           end_time = as.POSIXct(end_time, format = "%d/%m/%Y %H:%M", tz = "UTC"))

# Treat the categorical player attributes as factors.
players_data <- players_data |>
    mutate(experience = as.factor(experience),
           gender = as.factor(gender))

# One row per session, plus one row for every player without any session.
# has_played is FALSE for rows that carry no session start time; the raw
# original_* timestamp columns are dropped from the combined table.
combined_data <- full_join(players_data, sessions_data, by = "hashedEmail") |>
    mutate(has_played = !is.na(start_time)) |>
    select(-original_start_time, -original_end_time)
head(combined_data)

### Summary

In [None]:
# Find the middle time of each play session.
# Two date-times cannot be added to each other, so the midpoint is computed as
# the start time plus half of the session duration (end - start).
#
# mid_time_dateless keeps only the clock time of the midpoint: the time of day
# is re-attached to one fixed reference date (1970-01-01, UTC) so sessions from
# different days can be compared on a single 24-hour axis. A fixed date and
# time zone are used (rather than "today" in the local zone) so the values are
# the same on every run and are not affected by daylight-saving transitions.
# The explicit parse format makes rows with a missing midpoint stay NA.
combined_data <- combined_data |>
    mutate(mid_time = start_time + (end_time - start_time) / 2,
           mid_time_dateless = as.POSIXct(
               paste("1970-01-01", format(mid_time, "%H:%M:%S")),
               format = "%Y-%m-%d %H:%M:%S",
               tz = "UTC"
           )) |>
    # rows without a known player age are dropped
    filter(!is.na(Age))
head(combined_data)

In [None]:
# Exploratory figures
options(repr.plot.width = 10)

# Scatterplot of total playtime against age for a subset of players;
# point shape shows newsletter subscription and point colour shows gender.
playtime_scatter <- function(players, plot_title) {
    ggplot(players, aes(x = Age, y = played_hours)) +
        geom_point(aes(shape = subscribe, color = gender)) +
        labs(title = plot_title,
             x = "User Age (years)",
             y = "Total Playtime (hours)",
             color = "User's Gender",
             shape = "User is Subscribed to Newsletter")
}

# Figure 1: distribution of player ages, one bar per year of age.
age_hist <- ggplot(players_data, aes(x = Age)) +
    geom_histogram(binwidth = 1, fill = "light blue", color = "dark blue") +
    labs(title = "Figure 1: User Age Histogram",
         x = "User Age (years)",
         y = "Number of Users")

# Figures 2a/2b: the players are split at 4 hours of playtime so each group
# gets its own y-axis range.
age_v_playtime_low <- playtime_scatter(
    filter(players_data, played_hours < 4),
    "Figure 2a: Age-Playtime Scatterplot (low playtime)"
)

age_v_playtime_high <- playtime_scatter(
    filter(players_data, played_hours >= 4),
    "Figure 2b: Age-Playtime Scatterplot (high playtime)"
)

# Figure 3: share of subscribed / unsubscribed players within each gender.
players_bar <- ggplot(players_data, aes(x = gender, fill = subscribe)) +
    geom_bar(position = "fill") +
    labs(title = "Figure 3: Gender-Subscription Bar Plot",
         x = "User's Gender",
         y = "Portion of Users",
         fill = "User is Subscribed to Newsletter")

# Figure 4: session midpoints by time of day; binwidth is in seconds
# (1800 s = 30 minutes).
mid_time_hist <- ggplot(combined_data, aes(x = mid_time_dateless)) +
    geom_histogram(binwidth = 1800, fill = "light blue", color = "dark blue") +
    labs(title = "Figure 4: Session Midpoint Histogram",
         x = "Mid Point of Play Session",
         y = "Number of Sessions")

age_hist
age_v_playtime_low
age_v_playtime_high
players_bar
mid_time_hist

In [None]:
# Build the modelling table: one row per player who has at least one session,
# holding the player's attributes plus the average time of day at which their
# sessions are centred.
#
# na.rm = TRUE: a session with no computable midpoint (e.g. a missing end time)
# is ignored instead of turning the player's whole average into NA, which
# would otherwise drop that player from the model. A player whose sessions are
# all missing gets NaN here and is removed by the final filter.
# NOTE(review): this is a plain arithmetic mean of clock times, so it does not
# account for wrap-around at midnight (23:30 and 00:30 average to 12:00) -
# confirm this is acceptable for the analysis.
model_data <- combined_data |>
    filter(has_played) |>
    group_by(hashedEmail) |>
    summarize(mean_mid_time = mean(mid_time_dateless, na.rm = TRUE)) |>
    full_join(players_data, by = "hashedEmail") |>
    mutate(mean_mid_time = as.POSIXct(mean_mid_time),
           subscribe = as.factor(subscribe)) |>
    # players without any usable session have no mean midpoint and are dropped
    filter(!is.na(mean_mid_time))
head(model_data)

In [None]:
# Split the data, define the preprocessing recipe, and tune K for a
# K-nearest-neighbours classifier that predicts newsletter subscription.
set.seed(10844538)
combined_split <- initial_split(model_data, prop = 0.85, strata = subscribe)
combined_training <- training(combined_split)
combined_testing <- testing(combined_split)

# Candidate numbers of neighbours to try.
k_vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

# Preprocessing:
#  1. step_time() splits mean_mid_time into hour / minute / second columns.
#  2. step_mutate() recombines them into a single "seconds since midnight"
#     predictor (hours * 3600 + minutes * 60 + seconds).
#  3. step_rm() drops the helper columns so that only the three predictors named
#     in the formula reach the model.
#  4. step_normalize() centres and scales every predictor, since KNN is
#     distance-based.
# The recipe is defined on the training set only.
combined_recipe <- recipe(subscribe ~ Age + played_hours + mean_mid_time, data = combined_training) |>
    step_time(mean_mid_time, features = c("hour", "minute", "second")) |>
    step_mutate(mean_mid_time = (mean_mid_time_hour * 3600 +
                                 mean_mid_time_minute * 60 +
                                 mean_mid_time_second)) |>
    step_rm(mean_mid_time_hour, mean_mid_time_minute, mean_mid_time_second) |>
    step_normalize(all_predictors())

tuning_model <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

# Cross-validate on the training set only, so the held-out test set plays no
# part in choosing K.
combined_vfold <- vfold_cv(combined_training, v = 3, strata = subscribe)

# Only accuracy is used to compare the candidate values of K.
tuning_accuracy <- workflow() |>
    add_model(tuning_model) |>
    add_recipe(combined_recipe) |>
    tune_grid(resamples = combined_vfold, grid = k_vals) |>
    collect_metrics() |>
    filter(.metric == "accuracy")

head(tuning_accuracy)

# NOTE(review): the K used for the final model is hard-coded in a later cell;
# re-check it against this plot whenever the preprocessing or resampling
# changes.
k_plot <- tuning_accuracy |>
    ggplot(aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "K", y = "Mean Accuracy", title = "Figure 6: Tuning plot Accuracy vs K")

k_plot

We set K = 7 based on the tuning plot above. It is also important to note that once K gets larger than about 9 or 10, the algorithm simply predicts TRUE for every point, which is a problem, so we do not use values that large.

In [None]:
# Final K-nearest-neighbours specification, using the K chosen from tuning.
final_model <- nearest_neighbor(neighbors = 7, weight_func = "rectangular") |>
    set_engine("kknn") |>
    set_mode("classification")

# Fit the preprocessing recipe and the model together on the training set.
combined_fit <- workflow() |>
    add_model(final_model) |>
    add_recipe(combined_recipe) |>
    fit(data = combined_training)

# Predicted classes for the test set, placed in front of the test-set columns.
combined_test_preds <- bind_cols(
    predict(combined_fit, new_data = combined_testing),
    combined_testing
)

# Test-set accuracy (the other rows returned by metrics() are discarded).
test_accuracy <- filter(
    metrics(combined_test_preds, truth = subscribe, estimate = .pred_class),
    .metric == "accuracy"
)

# Confusion matrix of predicted vs. actual subscription status.
test_confmat <- conf_mat(combined_test_preds, truth = subscribe, estimate = .pred_class)

test_confmat
test_accuracy

In [None]:
test_preds_plot <- combined_test_preds |>
    