In [31]:
library(tidyverse)

df <- read_csv("https://raw.githubusercontent.com/petebrown/scrape-results-playwright/main/data/records.csv", show_col_types = FALSE)

In [40]:
df2 <- df %>%
    mutate(
        game_date = lubridate::dmy(game_date),
        ko_time = str_match(match_notes, "(?:[Kk]ick[\\s|-][Oo]ff:\\s)(\\d+.\\d+[a|p]m)")[,2],
        venue = case_when(
            stadium == "Wembley Stadium" ~ "N",
            home_team == "Tranmere Rovers" & stadium != "Wembley Stadium" ~ "H",
            away_team == "Tranmere Rovers" & stadium != "Wembley Stadium" ~ "A",
            FALSE ~ "?"
           )
    ) %>%
    mutate(
        competition_2 = str_replace(competition_2, "Associate Members' Cup", "Associate Members Cup"),
        cup_stage = str_match(competition_2, str_glue("{competition}\\s(.*)"))[,2],
        cup_round_no = str_match(competition_2, "(\\d+).*\\sround")[,2],
        cup_round = case_when(
            grepl("quarter-final", cup_stage, ignore.case = TRUE) ~ "QF",
            grepl("semi-final", cup_stage, ignore.case = TRUE) ~ "SF",
            grepl("^final", cup_stage, ignore.case = TRUE) ~ "F",
            grepl("qualifying", cup_stage, ignore.case = TRUE) ~ str_glue("Q{cup_round_no}"),
            grepl("preliminary", cup_stage, ignore.case = TRUE) ~ "P",
            grepl("group", cup_stage, ignore.case = TRUE) ~ "G",
            TRUE ~ cup_round_no
        ),
        cup_replay = case_when(
            grepl("\\sreplay", competition_2, ignore.case = TRUE) ~ 1,
            TRUE ~ 0
        ),
        cup_section = str_match(str_to_title(cup_stage), "\\s([a-zA-Z]+)\\sSection")[,2],
        cup_leg = str_match(competition_2, "(\\d+)[a-zA-Z]+\\sleg")[,2]
    ) %>%
    mutate(
        extra_time = case_when(
            grepl("after extra time", match_notes, ignore.case = TRUE) ~ 1,
            TRUE ~ 0)
    ) %>%
    mutate(
        penalties = case_when(
            grepl("penalt", score_2, ignore.case = TRUE) ~ str_match(score_2, "(?:[Pp]enalties\\s)(\\d+-\\d+ to\\s.+)"),
            grepl("penalt", match_notes, ignore.case = FALSE) ~ str_match(match_notes, "(?:penalty\\sshoot[-|\\s]out\\s)\\((\\d+-\\d+)\\)"),
            grepl("PENALT", match_notes, ignore.case = FALSE) ~ str_match(match_notes, "(?:[^|\\|].+\\sWON\\s)(\\d+-\\d+)(?:\\sON\\sPENALTIES)"),
            .default = NA)[,2],
        pen_winners = case_when(
            grepl("\\d+-\\d+ to", penalties, ignore.case = TRUE) ~ str_match(penalties, "\\d+-\\d+\\sto\\s(.*)")[,2],
            grepl("were awarded a bonus point", match_notes, ignore.case = TRUE) ~ str_match(match_notes, ",\\s(.*)\\swere\\sawarded\\sa\\sbonus\\spoint")[,2],
            grepl("won \\d+-\\d+ on penalties", match_notes, ignore.case = TRUE) ~ str_match(str_to_title(match_notes), "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\sWon\\s\\d+-\\d+\\sOn\\sPenalties")[,2],
            TRUE ~ NA
        ),
        pen_score = case_when(
            is.na(pen_winners) ~ NA,
            !grepl("Tranmere", pen_winners, ignore.case = TRUE) ~ paste0(str_match(penalties, "-(\\d+)")[,2], "-", str_match(penalties, "(\\d+)-")[,2]),
            TRUE ~ str_match(penalties, "(\\d+-\\d+)")[,2]),
        pen_outcome = case_when(
            is.na(pen_winners) ~ NA,
            grepl("Tranmere", pen_winners, ignore.case = TRUE) ~ "W",
            !grepl("Tranmere", pen_winners, ignore.case = TRUE) ~ "L"
        )
    ) %>%
    mutate(
        agg_score = case_when(
            grepl("on aggregate", match_notes, ignore.case = TRUE) ~ str_match(match_notes, "(\\d+-\\d+)\\son\\saggregate")[,2],
            grepl("agg:", score, ignore.case = TRUE) ~ str_match(score, "[Aa]gg:\\s?(\\d+-\\d+)")[,2],
            .default = NA
          ),
        agg_winners = str_match(match_notes, "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\swon\\s\\d+-\\d+\\son\\saggregate")[,2],
        agg_outcome = case_when(
            is.na(agg_winners) & !is.na(agg_score) & cup_leg == "2" ~ "D",
            !is.na(agg_winners) & !grepl("Tranmere", agg_winners, ignore.case = TRUE) ~ "L",
            !is.na(agg_winners) & grepl("Tranmere", agg_winners, ignore.case = TRUE) ~ "W",
            TRUE ~ NA),
        agg_score = case_when(
            agg_outcome == "D" ~ agg_score,
            agg_outcome == "W" ~ agg_score,
            agg_outcome == "L" ~ paste0(str_sub(agg_score, 3, 3), "-", str_sub(agg_score, 1, 1)),
            is.na(agg_outcome) ~ NA
        )
    ) %>%
    mutate(
        away_goals_winner = str_match(match_notes, "(([a-zA-Z]+\\s)?[a-zA-Z]+)\\swon\\son\\sthe\\saway\\sgoals\\srule")[,2],
        away_goal_outcome = case_when(
            is.na(away_goals_winner) ~ NA,
            grepl("Tranmere", away_goals_winner, ignore.case = TRUE) ~ "W",
            !grepl("Tranmere", away_goals_winner, ignore.case = TRUE) ~ "L"
        )
    ) %>%
    mutate(
        cup_leg = case_when(
            !is.na(agg_outcome) & is.na(cup_leg) ~ "2",
            TRUE ~ cup_leg
        )
    )

df2 %>% select(ko_time, cup_stage, cup_round, cup_replay, cup_section, cup_leg, extra_time, pen_outcome, pen_score,  agg_outcome, agg_score, away_goal_outcome)

ko_time,cup_stage,cup_round,cup_replay,cup_section,cup_leg,extra_time,pen_outcome,pen_score,agg_outcome,agg_score,away_goal_outcome
<chr>,<chr>,<glue>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
3.00pm,,,0,,,0,,,,,
3.00pm,,,0,,,0,,,,,
7.45pm,1st round,1,0,,,0,W,12-11,,,
3.00pm,,,0,,,0,,,,,
7.45pm,,,0,,,0,,,,,
3.00pm,,,0,,,0,,,,,
7.45pm,2nd round,2,0,,,0,,,,,
3.00pm,,,0,,,0,,,,,
3.00pm,,,0,,,0,,,,,
7.45pm,,,0,,,0,,,,,


In [33]:
# df2 %>% select(ko_time, cup_stage, cup_round, cup_replay, cup_section, cup_leg, extra_time, pen_outcome, pen_score,  agg_outcome, agg_score, away_goal_outcome)

df2 %>% glimpse()

Rows: 4,684
Columns: 30
$ game_date         [3m[90m<date>[39m[23m 2022-07-30, 2022-08-06, 2022-08-09, 2022-08-13, 202…
$ home_team         [3m[90m<chr>[39m[23m "Tranmere Rovers", "Mansfield Town", "Accrington Sta…
$ away_team         [3m[90m<chr>[39m[23m "Stevenage", "Tranmere Rovers", "Tranmere Rovers", "…
$ game_url          [3m[90m<chr>[39m[23m "https://www.11v11.com/matches/tranmere-rovers-v-ste…
$ outcome           [3m[90m<chr>[39m[23m "L", "L", "W", "W", "D", "L", "L", "W", "L", "L", "W…
$ score             [3m[90m<chr>[39m[23m "1-2", "1-0", "2-2 (11-12)", "3-0", "0-0", "2-1", "1…
$ competition       [3m[90m<chr>[39m[23m "League Two", "League Two", "League Cup", "League Tw…
$ stadium           [3m[90m<chr>[39m[23m "Prenton Park", "One Call Stadium", "Wham Stadium", …
$ attendance        [3m[90m<dbl>[39m[23m 6111, 7128, 1035, 5414, 4072, 4070, 10961, 5041, 657…
$ competition_2     [3m[90m<chr>[39m[23m "League Two", "League Two", "League Cup 

In [34]:
df_left <- read_csv("https://raw.githubusercontent.com/petebrown/update-results/main/data/results_df.csv", show_col_types = FALSE) %>%
    rename(game_url = source_url)

df_right <- df2 %>%
    select(game_url, competition_2, ko_time, cup_stage, cup_round, cup_replay, cup_section, cup_leg, extra_time, pen_outcome, pen_score,  agg_outcome, agg_score, away_goal_outcome)

In [44]:
df_left %>%
    left_join(df_right, by = "game_url") %>%
    filter(!game_url %in% df_right$game_url)

season,game_date,opposition,venue,score,home_team,away_team,outcome,home_goals,away_goals,⋯,cup_round,cup_replay,cup_section,cup_leg,extra_time,pen_outcome,pen_score,agg_outcome,agg_score,away_goal_outcome
<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,⋯,<glue>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
2017/18,2018-05-12,Boreham Wood,N,2-1,,,W,1,2,⋯,,,,,,,,,,
2017/18,2018-05-05,Ebbsfleet United,H,4-2,Tranmere Rovers,Ebbsfleet United,W,4,2,⋯,,,,,,,,,,
2017/18,2018-04-28,Hartlepool United,H,1-2,Tranmere Rovers,Hartlepool United,L,1,2,⋯,,,,,,,,,,
2017/18,2018-04-24,Solihull Moors,H,1-2,Tranmere Rovers,Solihull Moors,L,1,2,⋯,,,,,,,,,,
2017/18,2018-04-21,F.C. Hailfax Town,A,2-0,F.C. Hailfax Town,Tranmere Rovers,W,0,2,⋯,,,,,,,,,,
2017/18,2018-04-17,Dover Athletic,A,1-0,Dover Athletic,Tranmere Rovers,W,0,1,⋯,,,,,,,,,,
2017/18,2018-04-14,Aldershot Town,H,2-0,Tranmere Rovers,Aldershot Town,W,2,0,⋯,,,,,,,,,,
2017/18,2018-04-10,Gateshead,H,4-2,Tranmere Rovers,Gateshead,W,4,2,⋯,,,,,,,,,,
2017/18,2018-04-07,Chester,A,2-0,Chester,Tranmere Rovers,W,0,2,⋯,,,,,,,,,,
2017/18,2018-03-24,Eastleigh,H,3-1,Tranmere Rovers,Eastleigh,W,3,1,⋯,,,,,,,,,,


In [43]:
df_left %>%
    filter(!game_url %in% df_right$game_url)

season,game_date,opposition,venue,score,home_team,away_team,outcome,home_goals,away_goals,⋯,attendance,stadium,league_tier,generic_comp,game_type,goal_diff,ssn_game_no,ssn_comp_game_no,weekday,manager
<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,⋯,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
2017/18,2018-05-12,Boreham Wood,N,2-1,,,W,1,2,⋯,16306,Wembley Stadium,5,Non-League,League Play-Off,1,52,2,Saturday,Micky Mellon
2017/18,2018-05-05,Ebbsfleet United,H,4-2,Tranmere Rovers,Ebbsfleet United,W,4,2,⋯,8898,Prenton Park,5,Non-League,League Play-Off,2,51,1,Saturday,Micky Mellon
2017/18,2018-04-28,Hartlepool United,H,1-2,Tranmere Rovers,Hartlepool United,L,1,2,⋯,5499,Prenton Park,5,Non-League,League,-1,50,46,Saturday,Micky Mellon
2017/18,2018-04-24,Solihull Moors,H,1-2,Tranmere Rovers,Solihull Moors,L,1,2,⋯,3946,Prenton Park,5,Non-League,League,-1,49,45,Tuesday,Micky Mellon
2017/18,2018-04-21,F.C. Hailfax Town,A,2-0,F.C. Hailfax Town,Tranmere Rovers,W,0,2,⋯,3113,,5,Non-League,League,2,48,44,Saturday,Micky Mellon
2017/18,2018-04-17,Dover Athletic,A,1-0,Dover Athletic,Tranmere Rovers,W,0,1,⋯,1231,,5,Non-League,League,1,47,43,Tuesday,Micky Mellon
2017/18,2018-04-14,Aldershot Town,H,2-0,Tranmere Rovers,Aldershot Town,W,2,0,⋯,5444,Prenton Park,5,Non-League,League,2,46,42,Saturday,Micky Mellon
2017/18,2018-04-10,Gateshead,H,4-2,Tranmere Rovers,Gateshead,W,4,2,⋯,4328,Prenton Park,5,Non-League,League,2,45,41,Tuesday,Micky Mellon
2017/18,2018-04-07,Chester,A,2-0,Chester,Tranmere Rovers,W,0,2,⋯,3103,,5,Non-League,League,2,44,40,Saturday,Micky Mellon
2017/18,2018-03-24,Eastleigh,H,3-1,Tranmere Rovers,Eastleigh,W,3,1,⋯,4619,Prenton Park,5,Non-League,League,2,43,39,Saturday,Micky Mellon
