In [1]:
library(tidyverse)

# Basic results_df without extra info
results <- read_csv("https://raw.githubusercontent.com/petebrown/update-results/main/data/results_df.csv", show_col_types = FALSE)

# game_date and venue for join and wrangling
game_dates <- read_csv("https://raw.githubusercontent.com/petebrown/update-results/main/data/results_df.csv", show_col_types = FALSE) %>%
    select(game_date, venue)

# Extra infor for 2022/23 season
s2223 <- read_csv("./input/2022-23.csv", show_col_types = FALSE)
# Extra info for seasons prior to 2022/23
pre_2223 <- read_csv("https://raw.githubusercontent.com/petebrown/scrape-results-playwright/main/data/records.csv", show_col_types = FALSE) %>%
    filter(!game_date %in% s2223$game_date) %>%
    mutate(
        game_date = case_when(
            .default = game_date,
            game_date == "03 Jan 1998" ~ "13 Jan 1998",
            game_date == "05 Dec 1923" ~ "06 Dec 1923",
        )
    )

# Manually added extra info for games missing from 11v11
missing_records <- read_csv("./input/2015-18-cup-games.csv", show_col_types = FALSE)

# create df containing extra details for all 11v11 games by binding pre_2223 and s2223
df <- pre_2223 %>%
    filter(!game_date %in% missing_records$game_date) %>%
    rbind(s2223) %>%
    mutate(game_date = as.Date(dmy(game_date)))

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
df_left <- results

In [3]:
df_right <- df %>%
    inner_join(game_dates, by = "game_date") %>%
    mutate(
        ko_time = str_match(match_notes, "(?:[Kk]ick[\\s|-][Oo]ff:\\s)(\\d+.\\d+[a|p]m)")[,2],
    ) %>%
    mutate(
        competition_2 = str_replace(competition_2, "Associate Members' Cup", "Associate Members Cup"),
        competition_2 = str_replace(competition_2, "Sherpa Van Trophy", "Sherpa Vans Trophy"),
        cup_stage = str_match(competition_2, str_glue("{competition}\\s(.*)"))[,2],
        cup_round_no = str_match(competition_2, "(\\d+).*\\sround")[,2],
        cup_round = case_when(
            str_detect(cup_stage, regex("quarter-final", ignore_case = TRUE)) ~ "QF",
            str_detect(cup_stage, regex("semi-final", ignore_case = TRUE)) ~ "SF",
            str_detect(cup_stage, regex("^final", ignore_case = TRUE)) ~ "F",
            str_detect(cup_stage, regex("preliminary", ignore_case = TRUE)) ~ "P",
            str_detect(cup_stage, regex("group", ignore_case = TRUE)) ~ "G",
            .default = cup_round_no
        ),
        cup_replay = case_when(
            grepl("\\sreplay", competition_2, ignore.case = TRUE) ~ 1,
            TRUE ~ 0
        ),
        cup_section = case_when(
            str_detect(cup_stage, regex("\\s([a-zA-Z]+)\\s[Ss]ection")) ~ str_match(str_to_title(cup_stage), "\\s([a-zA-Z]+)\\s[Ss]ection")[,2],
            str_detect(cup_stage, regex("Group\\s[a-zA-Z]\\s–\\s[a-zA-Z]+")) ~ str_match(str_to_title(cup_stage), "(Group\\s[a-zA-Z]\\s–\\s[a-zA-Z]+)")[,2],
            .default = NA
        ),
        cup_leg = str_match(competition_2, "(\\d+)[a-zA-Z]+\\sleg")[,2]
    ) %>%
    mutate(
        extra_time = case_when(
            grepl("after extra time", match_notes, ignore.case = TRUE) ~ 1,
            grepl("golden goal rule", match_notes, ignore.case = TRUE) ~ 1,
            TRUE ~ 0)
    ) %>%
    mutate(
        penalties = case_when(
            grepl("penalt", score_2, ignore.case = TRUE) ~ str_match(score_2, "(?:[Pp]enalties\\s)(\\d+-\\d+ to\\s.+)"),
            grepl("penalt", match_notes, ignore.case = FALSE) ~ str_match(match_notes, "(?:penalty\\sshoot[-|\\s]out\\s)\\((\\d+-\\d+)\\)"),
            grepl("PENALT", match_notes, ignore.case = FALSE) ~ str_match(match_notes, "(?:[^|\\|].+\\sWON\\s)(\\d+-\\d+)(?:\\sON\\sPENALTIES)"),
            .default = NA)[,2],
        pen_winners = case_when(
            grepl("\\d+-\\d+ to", penalties, ignore.case = TRUE) ~ str_match(penalties, "\\d+-\\d+\\sto\\s(.*)")[,2],
            grepl("were awarded a bonus point", match_notes, ignore.case = TRUE) ~ str_match(match_notes, ",\\s(.*)\\swere\\sawarded\\sa\\sbonus\\spoint")[,2],
            grepl("won \\d+-\\d+ on penalties", match_notes, ignore.case = TRUE) ~ str_match(str_to_title(match_notes), "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\sWon\\s\\d+-\\d+\\sOn\\sPenalties")[,2],
            TRUE ~ NA
        ),
        pen_score = case_when(
            is.na(pen_winners) ~ NA,
            !grepl("Tranmere", pen_winners, ignore.case = TRUE) ~ paste0(str_match(penalties, "-(\\d+)")[,2], "-", str_match(penalties, "(\\d+)-")[,2]),
            TRUE ~ str_match(penalties, "(\\d+-\\d+)")[,2]),
        pen_outcome = case_when(
            is.na(pen_winners) ~ NA,
            grepl("Tranmere", pen_winners, ignore.case = TRUE) ~ "W",
            !grepl("Tranmere", pen_winners, ignore.case = TRUE) ~ "L"
        ),
        pen_gf = case_when(
            !is.na(pen_score) & venue == "H" ~ str_match(pen_score, "(\\d+)-")[,2],
            !is.na(pen_score) & venue == "A" ~ str_match(pen_score, "-(\\d+)")[,2],
            .default = NA
        ),
        pen_ga = case_when(
            !is.na(pen_score) & venue == "H" ~ str_match(pen_score, "-(\\d+)")[,2],
            !is.na(pen_score) & venue == "A" ~ str_match(pen_score, "(\\d+)-")[,2],
            .default = NA
        ),
        pen_gf = as.numeric(pen_gf),
        pen_ga = as.numeric(pen_ga),
    ) %>%
    mutate(
        agg_score = case_when(
            grepl("on aggregate", match_notes, ignore.case = TRUE) ~ str_match(match_notes, "(\\d+-\\d+)\\son\\saggregate")[,2],
            grepl("agg:", score, ignore.case = TRUE) ~ str_match(score, "[Aa]gg:\\s?(\\d+-\\d+)")[,2],
            .default = NA
          ),
        agg_winners = str_match(match_notes, "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\swon\\s\\d+-\\d+\\son\\saggregate")[,2],
        agg_outcome = case_when(
            cup_leg == "2" & is.na(agg_winners) & !is.na(agg_score) ~ "D",
            cup_leg == "2" & !is.na(agg_winners) & str_detect(agg_winners, regex("Tranmere", ignore_case = TRUE)) ~ "W",
            cup_leg == "2" & !is.na(agg_winners) & str_detect(agg_winners, regex("Tranmere", ignore_case = TRUE), negate = TRUE) ~ "L",
            .default = NA),
        agg_score = case_when(
            agg_outcome == "D" ~ agg_score,
            agg_outcome == "W" ~ agg_score,
            agg_outcome == "L" ~ paste0(str_sub(agg_score, 3, 3), "-", str_sub(agg_score, 1, 1)),
            .default = NA
        )
    ) %>%
    mutate(
        away_goals_winner = str_match(match_notes, "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\swon\\son\\sthe\\saway\\sgoals\\srule")[,2],
        away_goal_outcome = case_when(
            is.na(away_goals_winner) ~ NA,
            grepl("Tranmere", away_goals_winner, ignore.case = TRUE) ~ "W",
            !grepl("Tranmere", away_goals_winner, ignore.case = TRUE) ~ "L"
        )
    ) %>%
    mutate(
        gg_winner = str_match(match_notes, "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\sWON\\sON\\sTHE\\sGOLDEN\\sGOAL\\sRULE")[,2],
        gg_outcome = case_when(
            is.na(gg_winner) ~ NA,
            grepl("Tranmere", gg_winner, ignore.case = TRUE) ~ "W",
            !grepl("Tranmere", gg_winner, ignore.case = TRUE) ~ "L"
        )
    ) %>%
    mutate(
        cup_leg = case_when(
            !is.na(agg_outcome) & is.na(cup_leg) ~ "2",
            .default = cup_leg
        )
    ) %>%
    mutate(
        agg_gf = case_when(
            cup_leg == "2" & is.na(agg_winners) & !is.na(agg_score) & venue == "H" ~ str_match(agg_score, "(\\d+)-")[,2],
            cup_leg == "2" & is.na(agg_winners) & !is.na(agg_score) & venue == "A" ~ str_match(agg_score, "-(\\d+)")[,2],
            .default = NA
        ),
        agg_ga = case_when(
            cup_leg == "2" & is.na(agg_winners) & !is.na(agg_score) & venue == "H" ~ str_match(agg_score, "-(\\d+)")[,2],
            cup_leg == "2" & is.na(agg_winners) & !is.na(agg_score) & venue == "A" ~ str_match(agg_score, "(\\d+)-")[,2],
            .default = NA
        ),
        agg_gf = as.numeric(agg_gf),
        agg_ga = as.numeric(agg_ga),
        agg_outcome = case_when(
            agg_gf > agg_ga ~ "W",
            agg_gf < agg_ga ~ "L",
            agg_gf == agg_ga ~ "D",
            .default = agg_outcome
        ),
        agg_score = case_when(
            agg_outcome == "W" & venue == "A" ~ paste0(str_sub(agg_score, 3, 3), "-", str_sub(agg_score, 1, 1)),
            .default = agg_score
        ),
    ) %>%
    mutate(cup_leg = as.numeric(cup_leg)) %>%
    select(
        game_date,
        ko_time,
        cup_round, cup_leg, cup_stage, cup_replay, cup_section, 
        extra_time,
        pen_outcome, pen_score, pen_gf, pen_ga,
        agg_outcome, agg_score, agg_gf, agg_ga,
        away_goal_outcome,
        gg_outcome
    )

In [4]:
df_right <- df_right %>%
    bind_rows(missing_records)

In [5]:
df <- df_left %>%
    left_join(df_right, by = "game_date") %>%
    select(-secondary_score) %>%
    arrange(desc(game_date))

write_csv(df, "results_df_extra.csv", na = "")

In [6]:
extra_details <- df %>% select(names(df_right))

write_csv(extra_details, "extra_details.csv", na = "")

In [7]:
# create a new column in extra_details that converts the time in ko_time to 24 hours from hh.mm am/pm
game_datetime <- extra_details %>%
    mutate(
        ko_hr = case_when(
            grepl("am", ko_time, ignore.case = TRUE) ~ str_match(ko_time, "(\\d+)\\.")[,2],
            grepl("pm", ko_time, ignore.case = TRUE) & str_match(ko_time, "(\\d+)\\.")[,2] == "12" ~ "12",
            grepl("pm", ko_time, ignore.case = TRUE) ~ as.character(as.numeric(str_match(ko_time, "(\\d+)\\.")[,2]) + 12)
        ),
        ko_min = str_match(ko_time, "\\.(\\d+)")[,2],
        ko_hr = as.numeric(ko_hr),
        ko_min = as.numeric(ko_min),
        datetime = as.POSIXct(paste0(game_date, " ", ko_hr, ":", ko_min, ":00"), format = "%Y-%m-%d %H:%M:%S")
    ) %>%
    select(game_date, ko_time, ko_hr, ko_min, datetime) %>%
    filter(!is.na(datetime))
    
game_datetime %>%
    head()

game_date,ko_time,ko_hr,ko_min,datetime
<date>,<chr>,<dbl>,<dbl>,<dttm>
2023-05-08,12.30pm,12,30,2023-05-08 12:30:00
2023-04-29,3.00pm,15,0,2023-04-29 15:00:00
2023-04-22,3.00pm,15,0,2023-04-22 15:00:00
2023-04-18,7.45pm,19,45,2023-04-18 19:45:00
2023-04-15,3.00pm,15,0,2023-04-15 15:00:00
2023-04-10,3.00pm,15,0,2023-04-10 15:00:00


In [8]:
cup_cols <- names(extra_details %>% select("cup_round": "gg_outcome") %>% select(-c("cup_replay", "extra_time")))

cup_details <- extra_details %>%
    filter(if_any(cup_cols, ~!is.na(.)))

write_csv(cup_details, "cup_details.csv", na = "")

“[1m[22mUsing an external vector in selections was deprecated in tidyselect 1.1.0.
[36mℹ[39m Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(cup_cols)

  # Now:
  data %>% select(all_of(cup_cols))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.”
