# TITLE

### Intro

In [None]:
# Load the player and session tables, then tidy them for analysis.
library(tidyverse)

# Make sure the local data directory exists before downloading into it.
if (!dir.exists("data")) {
    dir.create("data")
}

download.file("https://raw.githubusercontent.com/nt8669/DSCI-100-007-24-Project/refs/heads/main/data/players.csv", "data/players.csv")
download.file("https://raw.githubusercontent.com/nt8669/DSCI-100-007-24-Project/refs/heads/main/data/sessions.csv", "data/sessions.csv")

players_data <- read_csv("data/players.csv")
sessions_data <- read_csv("data/sessions.csv")

# Parse the "dd/mm/YYYY HH:MM" strings as POSIXct rather than POSIXlt:
# POSIXlt is a list-based class and is a poor fit for data-frame columns.
# The format string carries no timezone, so the wall-clock values are pinned
# to UTC; this keeps parsing independent of the machine's timezone and avoids
# NA values for times that fall into a local daylight-saving gap.
sessions_data <- sessions_data |>
    mutate(
        start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M", tz = "UTC"),
        end_time = as.POSIXct(end_time, format = "%d/%m/%Y %H:%M", tz = "UTC")
    )

# Treat the categorical player attributes as factors.
players_data <- players_data |>
    mutate(
        experience = as.factor(experience),
        gender = as.factor(gender)
    )

# One row per session; players without any session are kept (full join) and
# flagged through has_played, since their start_time is NA after the join.
combined_data <- full_join(players_data, sessions_data, by = "hashedEmail") |>
    mutate(has_played = !is.na(start_time)) |>
    select(-original_start_time, -original_end_time)
head(combined_data)

In [None]:
# Find the middle time of each play session.
combined_data <- combined_data |>
    mutate(
        # Start plus half the session length (a difftime can be added to a
        # date-time, whereas two date-times cannot be added together).
        mid_time = start_time + (end_time - start_time) / 2,
        # Time of day only: keep the clock time of mid_time and attach it to a
        # fixed reference date in UTC. A fixed anchor (instead of "today" in
        # the local timezone) makes the column identical on every run and
        # avoids NA values for clock times that do not exist on a
        # daylight-saving changeover day.
        mid_time_dateless = as.POSIXct(
            paste("1970-01-01", format(mid_time, "%H:%M:%S")),
            format = "%Y-%m-%d %H:%M:%S",
            tz = "UTC"
        )
    )
head(combined_data)

In [None]:
# Plots
options(repr.plot.width = 10)

# Build an age vs. total playtime scatterplot for a subset of players.
# Point colour encodes gender and point shape encodes newsletter subscription;
# the legend titles below must stay in sync with that mapping.
age_playtime_scatter <- function(data, plot_title) {
    ggplot(data, aes(x = Age, y = played_hours)) +
        geom_point(aes(shape = subscribe, color = gender)) +
        labs(
            x = "User Age (years)",
            y = "Total Playtime (hours)",
            title = plot_title,
            color = "User's Gender",
            shape = "User is Subscribed to Newsletter"
        )
}

# Figure 1: distribution of player ages, one bin per year.
age_hist <- players_data |>
    ggplot(aes(x = Age)) +
    geom_histogram(color = "dark blue", fill = "light blue", binwidth = 1) +
    labs(x = "User Age (years)", y = "Number of Users", title = "Figure 1: User Age Histogram")

# Figures 2a/2b: the players are split at 4 hours of playtime so the many
# low-playtime points are not squashed by the few high-playtime ones.
age_v_playtime_low <- players_data |>
    filter(played_hours < 4) |>
    age_playtime_scatter("Figure 2a: Age-Playtime Scatterplot (low playtime)")

age_v_playtime_high <- players_data |>
    filter(played_hours >= 4) |>
    age_playtime_scatter("Figure 2b: Age-Playtime Scatterplot (high playtime)")

# Figure 3: share of subscribed users within each gender
# (position = "fill" rescales every bar to a proportion).
players_bar <- players_data |>
    ggplot(aes(x = gender, fill = subscribe)) +
    geom_bar(position = "fill") +
    labs(x = "User's Gender", y = "Portion of Users",
         fill = "User is Subscribed to Newsletter",
         title = "Figure 3: Gender-Subscription Bar Plot")

# Figure 4: time of day of session midpoints in 30-minute (1800 s) bins.
# Rows without a midpoint (players with no sessions) are dropped up front so
# the histogram does not emit a removed-rows warning, and the axis is labelled
# with the clock time only because the date part of the column is a placeholder.
mid_time_hist <- combined_data |>
    filter(!is.na(mid_time_dateless)) |>
    ggplot(aes(x = mid_time_dateless)) +
    geom_histogram(color = "dark blue", fill = "light blue", binwidth = 1800) +
    scale_x_datetime(date_labels = "%H:%M") +
    labs(x = "Mid Point of Play Session", y = "Number of Sessions", title = "Figure 4: Session Midpoint Histogram")

age_hist
age_v_playtime_low
age_v_playtime_high
players_bar
mid_time_hist