In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
options(repr.matrix.max.rows = 6)

## **Introduction**

## **Methods & Results**

In [None]:
# Read the raw player and session tables straight from the project's
# GitHub repository with read_csv().
players_origin <- read_csv(
  "https://raw.githubusercontent.com/mcheng250/DSCI_Project_final_report/refs/heads/main/data/players.csv"
)
sessions_origin <- read_csv(
  "https://raw.githubusercontent.com/mcheng250/DSCI_Project_final_report/refs/heads/main/data/sessions.csv"
)

In [None]:
# Tidy sessions.csv into one row per player:
#   1. Parse start_time and end_time into date-times
#      (assumes the text is day-month-year hour:minute -- TODO confirm
#      against the raw file).
#   2. Compute each session's length in minutes with difftime().
#   3. Average the session lengths per hashedEmail, ignoring sessions whose
#      length is missing, and round to 2 decimal places for readability.
# The result keeps only hashedEmail and avg_session_length.
#
# dmy_hm() is namespace-qualified because lubridate is only attached by
# library(tidyverse) from tidyverse 2.0.0 onwards; the explicit prefix keeps
# this cell working on older installations as well.
sessions_tidy <- sessions_origin |>
  mutate(
    start_time = lubridate::dmy_hm(start_time),
    end_time = lubridate::dmy_hm(end_time),
    session_length = as.numeric(
      difftime(end_time, start_time, units = "mins")
    )
  ) |>
  group_by(hashedEmail) |>
  summarize(
    avg_session_length = round(mean(session_length, na.rm = TRUE), 2)
  )
head(sessions_tidy)

In [None]:
# Tidy players.csv by keeping only the three columns used later on:
# Age, subscribe and hashedEmail.
players_tidy <- select(players_origin, Age, subscribe, hashedEmail)
head(players_tidy)

In [None]:
# Build the final analysis table: attach each player's average session
# length by hashedEmail, then drop the rows where that average is missing
# (e.g. players with no matching row in sessions_tidy).
players_with_sessions <- left_join(
  players_tidy,
  sessions_tidy,
  by = "hashedEmail"
)
tidy_data <- drop_na(players_with_sessions, avg_session_length)
rm(players_with_sessions)
head(tidy_data)

In [None]:
# Descriptive statistics for tidy_data (all values rounded to 2 decimals).

# Mean, minimum and maximum of avg_session_length.
summarize(
  tidy_data,
  mean_avg_session_length = round(mean(avg_session_length, na.rm = TRUE), 2),
  min_avg_session_length = round(min(avg_session_length, na.rm = TRUE), 2),
  max_avg_session_length = round(max(avg_session_length, na.rm = TRUE), 2)
)

# Mean, minimum and maximum of Age.
summarize(
  tidy_data,
  mean_age = round(mean(Age, na.rm = TRUE), 2),
  min_age = round(min(Age, na.rm = TRUE), 2),
  max_age = round(max(Age, na.rm = TRUE), 2)
)

# Number of players whose subscribe value is FALSE.
summarize(
  filter(tidy_data, subscribe == FALSE),
  subscribe_FALSE = n()
)

# Number of players whose subscribe value is TRUE.
summarize(
  filter(tidy_data, subscribe == TRUE),
  subscribe_TRUE = n()
)

# Long-format table of the two mean values: one row per variable.
tidy_data |>
  summarize(
    mean_avg_session_length = round(mean(avg_session_length, na.rm = TRUE), 2),
    mean_age = round(mean(Age, na.rm = TRUE), 2)
  ) |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "mean_value"
  )

In [None]:
# Fix the random seed so the train/test split is reproducible.
set.seed(3456)

# tidymodels (parsnip) classification models require the outcome to be a
# factor. `subscribe` is handled as a logical elsewhere in this notebook
# (it is compared against TRUE/FALSE), so convert it before splitting.
# as.factor() leaves the column unchanged if it is already a factor.
model_data <- tidy_data |>
  mutate(subscribe = as.factor(subscribe))

# Randomly assign 75% of the rows to the training set and 25% to the testing
# set, stratified on subscribe so both sets keep a similar class balance.
data_split <- initial_split(model_data, prop = 0.75, strata = subscribe)
train_data <- training(data_split)
test_data <- testing(data_split)

# Recipe predicting subscribe from Age and avg_session_length; both
# predictors are scaled and centred using the training data only.
# NOTE(review): only avg_session_length is NA-filtered upstream; if Age can
# be missing, decide how those rows should be handled before fitting.
knn_recipe <- recipe(
  subscribe ~ Age + avg_session_length,
  data = train_data
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

## **Discussion**

## **References**