In [1]:
library(tidyverse)
library(readr)
library(ggplot2)
library(dplyr)
library(lubridate)
library(knitr)

# Load the two input tables. Both paths are relative to the current working
# directory, so the notebook has to be run from the folder that holds data/.
players <- read_csv("data/players.csv")
sessions <- read_csv("data/sessions.csv")

# Show the column names of each table.
names(players)
names(sessions)

# Attach the player-level columns to every session row, matching on `player`.
merged_data <- sessions %>%
  left_join(players, by = "player")

# Per-player activity summary: one row per `player`.
#   total_sessions - number of session rows for the player
#   total_playtime - sum of `duration` over all of the player's sessions
#   avg_playtime   - mean `duration` per session
#   first_day      - calendar date of the player's earliest session start
#   week1_playtime - sum of `duration` for sessions that start within the
#                    first seven calendar days (first_day .. first_day + 6)
player_summary <- merged_data %>%
  # Convert the start value to a date once, instead of twice per group.
  mutate(start_day = as_date(start)) %>%
  group_by(player) %>%
  summarise(
    total_sessions = n(),
    total_playtime = sum(duration),
    avg_playtime = mean(duration),
    first_day = min(start_day),
    # Strict `<`: `<= first_day + days(7)` would admit an eighth calendar day
    # (day 0 through day 7) and over-count the first week.
    week1_playtime = sum(duration[start_day < first_day + days(7)])
  )

# Cut-off for "high contributor" status: the 75th percentile of total playtime
# across all players in the summary.
threshold <- player_summary %>%
  pull(total_playtime) %>%
  quantile(probs = 0.75)

# Flag every player whose total playtime is at or above the cut-off.
player_summary <- mutate(
  player_summary,
  high_contributor = total_playtime >= threshold
)

# Re-attach the player-level columns to the per-player summary.
eda_data <- player_summary %>%
  left_join(players, by = "player")

# Scatter plot of session count against total playtime, coloured by the
# high-contributor flag.
ggplot(
  data = eda_data,
  mapping = aes(
    x = total_sessions,
    y = total_playtime,
    colour = high_contributor
  )
) +
  geom_point() +
  labs(
    title = "Sessions vs Total Playtime",
    x = "Total Sessions",
    y = "Total Playtime (minutes)",
    colour = "High Contributor"
  ) +
  theme_minimal()

# Compare the two contributor groups: mean total playtime, mean session count,
# and the number of players in each group.
eda_summary <- summarise(
  group_by(eda_data, high_contributor),
  avg_playtime = mean(total_playtime),
  avg_sessions = mean(total_sessions),
  total_players = n()
)

# Render the comparison as a table, rounded to two decimal places.
eda_summary %>%
  knitr::kable(digits = 2)

# Keep only the summary columns and order players from the most to the least
# total playtime.
eda_data <- arrange(
  select(
    eda_data,
    player, total_sessions, total_playtime, avg_playtime,
    first_day, week1_playtime, high_contributor
  ),
  desc(total_playtime)
)

# Display the resulting table, then a compact column-wise overview of it.
eda_data

glimpse(eda_data)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


ERROR: Error: 'data/players.csv' does not exist in current working directory ('/home/jovyan/work').
