In [299]:
import pandas as pd
import numpy as np

In [300]:
# Known misspellings / alternative forms of player names -> the form used in this project
name_fixes = {
    "Corey Taylor": "Corey Blackett-Taylor",
    "Craig Carl Curran": "Craig Curran",
    "Chris Edwards": "Christian Edwards",
    "Dave Nugent": "David Nugent",
    "Dylan Mottley Henry": "Dylan Mottley-Henry",
    "Ian Thomas-Moore": "Ian Moore",
    "John-Louis Akpa Akpro": "Jean-Louis Akpa Akpro",
    "John Morrissey": "Johnny Morrissey",
    "Jonathon Margetts": "Johnny Margetts",
    "Joseph Maguire": "Joe Maguire",
    "Kaylden Brown": "Kayleden Brown",
    "Lewis Sinnot": "Lewis Sinnott",
    "Lateef Elford Alliyu": "Lateef Elford-Alliyu",
    "Michael Jackson": "Mike Jackson",
    "Richard Sutton": "Ritchie Sutton",
    "Robert Taylor": "Rob Taylor",
    "Samuel Taylor": "Sam Taylor",
    "Steven O'Leary": "Stephen O'Leary"
}


def fix_name(name):
    """Return the corrected form of ``name``.

    Names listed as keys of ``name_fixes`` are replaced by their mapped value;
    every other value is returned unchanged.
    """
    return name_fixes.get(name, name)

In [301]:
# Load the managers table; the start and end of each spell are parsed as datetimes
managers_df = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/output/managers.csv",
    parse_dates=["date_from", "date_to"],
)

managers_df.tail(3)

Unnamed: 0,manager_name,date_from,date_to,role
48,Micky Mellon,2021-06-01,2023-03-19,Manager
49,Ian Dawes,2023-03-20,2023-05-04,Caretaker
50,Ian Dawes,2023-05-04,2024-04-27,Manager


In [302]:
# Save the managers table to a local CSV, leaving out the DataFrame index
managers_df.to_csv('data/managers.csv', index = False)

# Create `results_df`

In [303]:
def get_manager(game_date):
    """Return the name(s) of whoever was in charge on ``game_date``.

    Selects every row of ``managers_df`` whose ``date_from``/``date_to`` range
    (inclusive at both ends) contains the date and joins the names with " & ".

    Each name is listed once even when two of that person's spells both
    contain the date (e.g. a caretaker spell that ends on the day the
    permanent spell starts), so the result is "Ian Dawes" rather than
    "Ian Dawes & Ian Dawes". Returns an empty string when no spell matches.
    """
    in_charge = managers_df[
        (managers_df.date_from <= game_date) & (managers_df.date_to >= game_date)
    ].manager_name

    # drop_duplicates keeps first-seen order, so joint managers stay in table order
    return " & ".join(in_charge.drop_duplicates())

# Load the results table and record who was in charge for each game
results = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/update-results/main/data/results_df.csv",
    parse_dates=["game_date"],
)
results["manager"] = results["game_date"].apply(get_manager)

# Set W/D/L from the goals columns
won = results["goals_for"] > results["goals_against"]
drew = results["goals_for"] == results["goals_against"]
lost = results["goals_for"] < results["goals_against"]
results.loc[won, "outcome"] = "W"
results.loc[drew, "outcome"] = "D"
results.loc[lost, "outcome"] = "L"

# Normalise the "lay-off"/"lay-offs"/"lay-Off" spellings in competition names to "lay-Offs"
results["competition"] = results["competition"].str.replace(r"lay-[oO]ffs?", "lay-Offs", regex=True)

results.head(1)

Unnamed: 0,season,game_date,opposition,venue,score,home_team,away_team,outcome,home_goals,away_goals,...,attendance,stadium,league_tier,generic_comp,game_type,goal_diff,ssn_game_no,ssn_comp_game_no,weekday,manager
0,2023/24,2023-08-12,Milton Keynes Dons,A,0-1,Milton Keynes Dons,Tranmere Rovers,L,1.0,0.0,...,5831.0,Stadium MK,4.0,Football League,League,-1.0,3,2,Saturday,Ian Dawes


In [304]:
# League ranking (renamed to league_pos) and points by game date, oldest first
league_positions = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/league-position-tool/main/docs/input/results_mini.csv",
        parse_dates=["game_date"],
    )
    [["game_date", "ranking", "pts"]]
    .rename(columns={"ranking": "league_pos"})
    .sort_values("game_date")
    .reset_index(drop=True)
)

league_positions.head(3)

Unnamed: 0,game_date,league_pos,pts
0,1921-08-27,3.0,2
1,1921-09-03,1.0,3
2,1921-09-10,11.0,3


In [305]:
# Attach league position and points by date; results with no matching date keep NaN
results = results.merge(league_positions, how="left", on="game_date")

results.head(1)

Unnamed: 0,season,game_date,opposition,venue,score,home_team,away_team,outcome,home_goals,away_goals,...,league_tier,generic_comp,game_type,goal_diff,ssn_game_no,ssn_comp_game_no,weekday,manager,league_pos,pts
0,2023/24,2023-08-12,Milton Keynes Dons,A,0-1,Milton Keynes Dons,Tranmere Rovers,L,1.0,0.0,...,4.0,Football League,League,-1.0,3,2,Saturday,Ian Dawes,22.0,0.0


In [306]:
# Load the extra cup-game details; `extra_time` is renamed to `aet`
cup_game_details = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/11v11-extra-details/cup_details.csv",
    parse_dates = ["game_date"]
).rename(columns = {"extra_time": "aet"})

# Manual correction for the 1984-05-18 game: clear the leg and strip " (1st leg)" from the stage.
# regex=False is explicit because the parentheses would be read as a regex group
# (and never match) on pandas versions where str.replace defaults to regex=True.
is_1984_game = cup_game_details.game_date == pd.Timestamp("1984-05-18")
cup_game_details.loc[is_1984_game, "cup_leg"] = np.nan
cup_game_details.loc[is_1984_game, "cup_stage"] = cup_game_details.loc[is_1984_game, "cup_stage"].str.replace(" (1st leg)", "", regex = False)

# Games flagged manually as having gone to extra time.
# The dates are converted first: matching a datetime column against strings in isin() is deprecated.
aet_dates = pd.to_datetime(["2014-10-07", "2014-12-09", "1995-11-08"])
cup_game_details.loc[cup_game_details.game_date.isin(aet_dates), "aet"] = 1

# Kick-off times use "." between hours and minutes; swap in ":" (as a literal, not a regex)
# so they parse as 12-hour clock times
cup_game_details.ko_time = pd.to_datetime(
    cup_game_details.ko_time.str.replace(".", ":", regex = False),
    format = "%I:%M%p"
).dt.time

# Store 0 as missing in the replay and extra-time flag columns
cup_game_details.cup_replay = cup_game_details.cup_replay.replace(0, np.nan)
cup_game_details.aet = cup_game_details.aet.replace(0, np.nan)

cup_game_details.head(3)

Unnamed: 0,game_date,ko_time,cup_round,cup_leg,cup_stage,cup_replay,cup_section,aet,pen_outcome,pen_score,pen_gf,pen_ga,agg_outcome,agg_score,agg_gf,agg_ga,away_goal_outcome,gg_outcome
0,2022-11-22,19:00:00,2,,2nd round - northern section,,Northern,,L,4-5,5.0,4.0,,,,,,
1,2022-11-05,15:00:00,1,,1st round,,,,,,,,,,,,,
2,2022-10-18,19:00:00,G,,Group B - North,,,,,,,,,,,,,


In [307]:
def outcome_decider(row):
    """Return how a cup game/tie was decided, based on which outcome column is filled.

    The columns are checked in priority order and the first populated one wins:
    ``pen_outcome`` -> "pens", ``away_goal_outcome`` -> "away_goals",
    ``gg_outcome`` -> "golden_goal", ``agg_outcome`` -> "agg" (or "replay" for
    the 1975-08-26 game). Returns NaN when none is populated.

    Missing values are detected with ``pd.notna`` rather than ``is not np.nan``:
    the identity test only recognises the ``np.nan`` singleton, so any other
    NaN float object, ``None`` or ``pd.NA`` would wrongly count as a value.
    """
    if pd.notna(row["pen_outcome"]):
        return "pens"
    if pd.notna(row["away_goal_outcome"]):
        return "away_goals"
    if pd.notna(row["gg_outcome"]):
        return "golden_goal"
    if pd.notna(row["agg_outcome"]):
        # Special case: this game has an aggregate outcome recorded but is labelled a replay
        if row["game_date"] == pd.to_datetime("1975-08-26"):
            return "replay"
        return "agg"
    return np.nan
    
def get_cup_outcome(row):
    """Return the deciding cup outcome for a row.

    Takes the first populated value among ``pen_outcome``,
    ``away_goal_outcome``, ``gg_outcome`` and ``agg_outcome`` (in that order),
    or NaN when all four are missing.

    ``pd.notna`` is used instead of ``is not np.nan`` so that every kind of
    missing value (any NaN float, ``None``, ``pd.NA``) is treated as missing.
    """
    for column in ("pen_outcome", "away_goal_outcome", "gg_outcome", "agg_outcome"):
        value = row[column]
        if pd.notna(value):
            return value
    return np.nan
    
def get_outcome_desc(row):
    """Return a readable description of how a cup game/tie was decided.

    Examples of the text produced:
      * penalties after an aggregate score: "3-3 on agg. Lost 4-5 on pens"
      * penalties only:                     "Won 4-2 on pens"
      * away goals:                         "Won on away goals rule"
      * golden goal:                        "Lost on golden goal"
      * aggregate:                          "Won 3-2 on agg"
        ("Game replayed" for the 1975-08-26 game)
    Returns NaN when none of the outcome columns is populated.

    Fixes over the earlier version: a lost shoot-out after an aggregate score
    was rendered as "Lost4-5" (missing space), and missing values are now
    detected with ``pd.notna`` instead of the ``is not np.nan`` identity test.
    """
    def spell_out(outcome):
        # "W"/"L" become "Won"/"Lost"; any other value is passed through unchanged
        return {"W": "Won", "L": "Lost"}.get(outcome, outcome)

    if pd.notna(row["pen_outcome"]):
        shootout = spell_out(row["pen_outcome"]) + " " + row["pen_score"] + " on pens"
        if pd.notna(row["agg_outcome"]):
            return row["agg_score"] + " on agg. " + shootout
        return shootout

    if pd.notna(row["away_goal_outcome"]):
        return spell_out(row["away_goal_outcome"]) + " on away goals rule"

    if pd.notna(row["gg_outcome"]):
        return spell_out(row["gg_outcome"]) + " on golden goal"

    if pd.notna(row["agg_outcome"]):
        if row["game_date"] == pd.to_datetime("1975-08-26"):
            return "Game replayed"
        return spell_out(row["agg_outcome"]) + " " + row["agg_score"] + " on agg"

    return np.nan
    
# Add the three derived columns, one row-wise pass per helper function
for new_column, row_func in (
    ("decider", outcome_decider),
    ("cup_outcome", get_cup_outcome),
    ("outcome_desc", get_outcome_desc),
):
    cup_game_details[new_column] = cup_game_details.apply(row_func, axis = 1)

In [308]:
def calc_game_length(row):
    """Return the length of a game in minutes.

    * 116 when the row has a golden-goal outcome (a fixed value; presumably the
      minute the deciding goal ended the game - TODO confirm against the data),
    * 120 when the game is flagged as having gone to extra time (``aet == 1``),
    * 90 otherwise.

    A missing ``gg_outcome`` is detected with ``pd.notna``; the previous
    ``is not np.nan`` identity test treated any NaN other than the ``np.nan``
    singleton (and ``None``) as a golden-goal game.
    """
    if pd.notna(row["gg_outcome"]):
        return 116
    if row["aet"] == 1:
        return 120
    return 90

# Columns carried into the saved results table, in output order
results_columns = [
    "season", "game_date", "game_no", "opposition", "venue", "score", "outcome",
    "goals_for", "goals_against", "goal_diff", "game_type", "competition",
    "generic_comp", "ssn_comp_game_no", "league_tier", "league_pos", "pts",
    "attendance", "weekday", "manager",
]

results_df = results.rename(columns = {"ssn_game_no": "game_no"})[results_columns]

# Add the cup details by date; games without a cup record get NaN in those columns
results_df = results_df.merge(cup_game_details, how = "left", on = "game_date")

results_df["game_length"] = results_df.apply(calc_game_length, axis = 1)

results_df.to_csv("data/results.csv", index = False)

# Create `goals_df`

In [309]:
# Load the Soccerbase per-game player records and drop the "sb_" prefix from the ID columns
sb_match_apps = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/update-player-stats/main/data/players_df.csv",
    parse_dates=["game_date"],
)
sb_match_apps = sb_match_apps.rename(columns={"sb_game_id": "game_id", "sb_player_id": "player_id"})

# Remove the "tpg" text from game IDs and store them as integers
sb_match_apps["game_id"] = sb_match_apps["game_id"].str.replace("tpg", "").astype(int)

# Apply the manual player-name corrections
sb_match_apps["player_name"] = sb_match_apps["player_name"].apply(fix_name)

In [310]:
# Distinct game ID / date / season combinations, oldest first
sb_game_ids = (
    sb_match_apps[["game_id", "game_date", "season"]]
    .drop_duplicates()
    .sort_values(by=["game_date"])
    .reset_index(drop=True)
)

sb_game_ids.head(3)

Unnamed: 0,game_id,game_date,season
0,240309,1996-08-17,1996/97
1,243200,1996-08-21,1996/97
2,242383,1996-08-23,1996/97


In [311]:
# Distinct player ID / name pairs from the Soccerbase appearance data
sb_player_ids = (
    sb_match_apps[["player_id", "player_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
sb_player_ids["player_name"] = sb_player_ids["player_name"].apply(fix_name)

sb_player_ids.head(3)

Unnamed: 0,player_id,player_name
0,128729,Jake Burton
1,18605,Peter Clarke
2,103555,Josh Dacres-Cogley


In [312]:
# Lookup from (season, game_no) to game_date, taken from the results table
game_nos = (
    results[["game_date", "season", "ssn_game_no"]]
    .rename(columns={"ssn_game_no": "game_no"})
    .sort_values(["game_date"])
    .reset_index(drop=True)
)

game_nos.head(3)

Unnamed: 0,game_date,season,game_no
0,1921-08-27,1921/22,1
1,1921-09-03,1921/22,2
2,1921-09-10,1921/22,3


### Import goal details scraped from Soccerbase

**Seasons:** 1996/97 - 2022/23

**Fields:**
- `player_name`
- `minute`
- `penalty` (0/1)
- `own_goal` (0/1)
- `game_date` _(after join)_

In [313]:
# Import Soccerbase goal details
sb_goals = pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-goals/main/data/goals.csv")

# Filter for Tranmere goals
sb_goals = sb_goals.loc[sb_goals["goal_type"] == "for"]

# Fix player names
sb_goals["player_name"] = sb_goals["player_name"].apply(fix_name)

# Add game dates via a join, order by date and minute, then keep and rename the output columns
sb_goals = (
    sb_goals
    .merge(sb_game_ids, on="game_id", how="left")
    .sort_values(["game_date", "minute"])
    [["game_date", "player_name", "minute", "penalty", "own_goal"]]
    .rename(columns={"minute": "goal_min"})
)

sb_goals.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
1713,1996-08-17,Johnny Morrissey,22,0,0
1714,1996-08-21,John Aldridge,30,0,0
1715,1996-08-21,Ivano Bonetti,56,0,0


### Import goals from Complete Record

**Seasons:** 1921/22 - 1995/96 (after filter)

**Fields:**

* `game_date`
* `player_name`
* `goals_scored` (no. of goals scored by player in game)

In [314]:
# Import Complete Record scorers, add game dates via season/game number,
# and convert to a list of dictionaries (one per scorer per game)
all_goals = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/scorers-long.csv")
    .merge(game_nos, how = "left", on = ["season", "game_no"])
    [["season", "game_date", "player_name", "goals_scored"]]
    .to_dict("records")
)

# Expand to one record per goal: each record is repeated goals_scored times
cr_goals = [goal for goal in all_goals for _ in range(goal["goals_scored"])]

# Convert to Pandas dataframe
cr_goals = pd.DataFrame(cr_goals).drop(columns = "goals_scored")

# Add own_goal column (1 when the scorer is recorded as "OG", else 0) to match sb_goals
cr_goals["own_goal"] = (cr_goals.player_name == "OG").astype(int)

# Keep only seasons before 1996/97, i.e. those NOT covered by the Soccerbase data
cr_goals = cr_goals[cr_goals.season < "1996"]

# Drop the season column
cr_goals = cr_goals.drop(columns = "season")

cr_goals.head(3)

Unnamed: 0,game_date,player_name,own_goal
0,1921-08-27,Tom Stuart,0
1,1921-08-27,Charles Milnes,0
2,1921-08-27,Fred Groves,0


### Import manually collected goal minutes

In [315]:
# Import manually collected goal minute data, ordered by date then minute
cr_goal_mins = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/goal_mins.csv",
    parse_dates=["game_date"],
).sort_values(["game_date", "goal_min"])

# Amend penalty column to 0/1: "pen" becomes 1, anything else 0
cr_goal_mins["penalty"] = cr_goal_mins["penalty"].apply(lambda flag: int(flag == "pen"))

# Number each player's goals within a game (e.g. 1-3 for a hat-trick) for use as a join key
cr_goal_mins["pl_goal_no"] = cr_goal_mins.groupby(["player_name", "game_date"]).cumcount().add(1)

cr_goal_mins.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,pl_goal_no
0,1964-09-04,John Manning,60,0,1
1,1986-09-27,Johnny Morrissey,50,0,1
2,1986-09-27,Gary Williams,54,0,1


### Join manually collected goal minutes to Complete Record goal data

In [316]:
# Number each player's goals within a game (the join key), attach the manually
# collected minutes, then drop the temporary pl_goal_no column
cr_goals = (
    cr_goals
    .assign(pl_goal_no = lambda goals: goals.groupby(["player_name", "game_date"]).cumcount() + 1)
    .merge(cr_goal_mins, how = "left", on = ["game_date", "player_name", "pl_goal_no"])
    .drop(columns = ["pl_goal_no"])
)

cr_goals.head(3)

Unnamed: 0,game_date,player_name,own_goal,goal_min,penalty
0,1921-08-27,Tom Stuart,0,,
1,1921-08-27,Charles Milnes,0,,
2,1921-08-27,Fred Groves,0,,


### Import FA Trophy goals

**Fields:**

* `game_date`
* `player_name`
* `minute`
* `penalty` (0/1)
* `own_goal` (0/1)

In [317]:
# Load FA Trophy goals; `minute` is renamed to `goal_min` to match the other goal tables
fa_trophy_goals = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_goals.csv",
    parse_dates=["game_date"],
).rename(columns={"minute": "goal_min"})

fa_trophy_goals.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
0,2015-12-12,James Norwood,45,0,0
1,2015-12-12,James Norwood,82,0,0
2,2016-12-10,Ritchie Sutton,26,0,0


In [318]:
# Concatenate the Complete Record, Soccerbase and FA Trophy goal dataframes,
# ordered by date and minute, with a fixed column order
goals_df = (
    pd.concat([cr_goals, sb_goals, fa_trophy_goals], axis = 0)
    .sort_values(by = ["game_date", "goal_min"])
    [["game_date", "player_name", "goal_min", "penalty", "own_goal"]]
)

goals_df.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
0,1921-08-27,Tom Stuart,,,0
1,1921-08-27,Charles Milnes,,,0
2,1921-08-27,Fred Groves,,,0


#### Save goals to CSV

In [319]:
# Write the combined goals table to CSV without the DataFrame index
goals_df.to_csv('data/goals.csv', index=False)

# Create `sub_mins_df`

In [320]:
# Distinct game date / season pairs from the results table, oldest first
season_dates = (
    results[["game_date", "season"]]
    .drop_duplicates()
    .sort_values("game_date")
    .reset_index(drop=True)
)

season_dates.head(3)

Unnamed: 0,game_date,season
0,1921-08-27,1921/22
1,1921-09-03,1921/22
2,1921-09-10,1921/22


In [321]:
# Number the dates 1..n within each season in date order.
# Note: this replaces the `game_nos` built earlier from `ssn_game_no`.
game_nos = season_dates.assign(
    game_no = lambda dates: dates.sort_values("game_date").groupby("season").cumcount() + 1
)

game_nos.head(3)

Unnamed: 0,game_date,season,game_no
0,1921-08-27,1921/22,1
1,1921-09-03,1921/22,2
2,1921-09-10,1921/22,3


In [322]:
# Preview the game ID lookup that is joined to the Soccerbase sub data below
sb_game_ids.head(3)

Unnamed: 0,game_id,game_date,season
0,240309,1996-08-17,1996/97
1,243200,1996-08-21,1996/97
2,242383,1996-08-23,1996/97


In [323]:
# Preview the player ID lookup that is joined to the Soccerbase sub data below
sb_player_ids.head(3)

Unnamed: 0,player_id,player_name
0,128729,Jake Burton
1,18605,Peter Clarke
2,103555,Josh Dacres-Cogley


In [324]:
# Import Soccerbase subs and red card CSV, adding game dates and player names via joins
sb_sub_mins = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-events/main/data/subs-and-reds.csv")
    .merge(sb_game_ids, how = "left", on = "game_id")
    .merge(sb_player_ids, how = "left", on = "player_id")
    .sort_values("game_date")
    [["game_date", "player_name", "min_on", "min_off"]]
    .reset_index(drop = True)
)

# Import manual fixes to Soccerbase sub data
sb_sub_min_fixes = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/sb_sub_fixes.csv",
    parse_dates = ["game_date"],
)[["game_date", "player_name", "min_on", "min_off"]]

# Fix known error with Luke Norris sub v. Barrow
is_norris_v_barrow = (sb_sub_mins.game_date == "2023-08-05") & (sb_sub_mins.player_name == "Luke Norris")
sb_sub_mins.loc[is_norris_v_barrow, "min_off"] = np.nan

# Filter out sendings off, i.e. rows with neither an on nor an off minute
sb_sub_mins = sb_sub_mins[sb_sub_mins.min_on.notna() | sb_sub_mins.min_off.notna()]

# Filter out records for dates with manual fixes
fixed_dates = sb_sub_min_fixes.game_date.unique()
sb_sub_mins = sb_sub_mins[~sb_sub_mins.game_date.isin(fixed_dates)]

# Append manual fixes to Soccerbase sub data
sb_sub_mins = (
    pd.concat([sb_sub_mins, sb_sub_min_fixes])
    .sort_values("game_date")
    .reset_index(drop = True)
)

sb_sub_mins.head(3)

Unnamed: 0,game_date,player_name,min_on,min_off
0,1996-08-17,Kenny Irons,,69.0
1,1996-08-17,John Aldridge,,79.0
2,1996-08-17,Johnny Morrissey,,86.0


In [325]:
# Load Complete Record sub minutes; a value of "0" is read as missing
cr_sub_mins = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/output/cr_subs_and_reds.csv",
    parse_dates=["game_date"],
    na_values=["0"],
)[["game_date", "player_name", "min_off", "min_on"]]

cr_sub_mins.head(3)

Unnamed: 0,game_date,player_name,min_off,min_on
0,1965-08-23,Jack Lornie,,75.0
1,1965-08-23,Mandy Hill,75.0,
2,1965-10-15,Eddie Stuart,82.0,


In [326]:
# FA Trophy line-ups, keeping only rows with a positive on or off minute
fa_trophy_sub_mins = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_fixtures.csv",
        parse_dates=["game_date"],
    )
    [["game_date", "player_name", "min_off", "min_on"]]
    .query("min_off > 0 or min_on > 0")
)

fa_trophy_sub_mins.head(3)

Unnamed: 0,game_date,player_name,min_off,min_on
5,2015-12-12,Ben Tomlinson,67.0,
7,2015-12-12,Jake Kirby,67.0,
11,2015-12-12,Andy Mangan,,67.0


In [327]:
# Stack the Complete Record, Soccerbase and FA Trophy sub minutes, oldest first
sub_mins_df = (
    pd.concat([cr_sub_mins, sb_sub_mins, fa_trophy_sub_mins])
    .sort_values("game_date")
    .reset_index(drop=True)
)

sub_mins_df.head(3)

Unnamed: 0,game_date,player_name,min_off,min_on
0,1965-08-23,Jack Lornie,,75.0
1,1965-08-23,Mandy Hill,75.0,
2,1965-10-15,Eddie Stuart,82.0,


# Create `shirt_nos_df`

In [328]:
# Write the combined sub minutes table to CSV without the DataFrame index
sub_mins_df.to_csv('data/sub_mins.csv', index = False)

In [329]:
# Squad numbers by season and player
shirt_nos_df = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/squad_nos/squad_nos.csv"
)

shirt_nos_df.head(3)

Unnamed: 0,season,squad_no,player_name
0,1999/00,1,John Achterberg
1,1999/00,2,Graham Allen
2,1999/00,3,Andy Thompson


# Create `red_cards_df`

In [330]:
# Manually collected red cards; the `red_cards` column is not needed downstream
cr_red_cards = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/cards_red.csv",
    parse_dates=["game_date"],
).drop(columns=["red_cards"])

cr_red_cards.head(3)

Unnamed: 0,game_date,player_name,min_so
0,1986-09-27,Andy Thorpe,90
1,1986-10-11,Frank Worthington,77
2,1987-02-14,Simon Farnworth,87


In [331]:
# Soccerbase events with game dates and player names attached, oldest first
sb_red_cards = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-events/main/data/subs-and-reds.csv")
    .merge(sb_game_ids, how = "left", on = "game_id")
    .merge(sb_player_ids, how = "left", on = "player_id")
    [["game_date", "player_name", "min_so"]]
    .sort_values("game_date")
)

# Keep only the rows that record a sending-off minute
sb_red_cards = sb_red_cards[sb_red_cards.min_so.notna()].reset_index(drop = True)

sb_red_cards.head(3)

Unnamed: 0,game_date,player_name,min_so
0,1996-08-17,Alan Rogers,69.0
1,1997-01-14,John Aldridge,78.0
2,1997-01-18,Dave Higgins,34.0


In [332]:
# FA Trophy cards, filtered to red cards; the flag column is dropped once used
fa_trophy_reds = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_cards.csv",
        parse_dates=["game_date"],
    )
    [["game_date", "player_name", "red_card", "min_so"]]
    .query("red_card == 1")
    .drop(columns="red_card")
    .reset_index(drop=True)
)

fa_trophy_reds.head(3)

Unnamed: 0,game_date,player_name,min_so
0,2015-10-27,Scott Davies,14.0


In [333]:
# Stack the three red card sources, ordered by date and sending-off minute
red_cards_df = (
    pd.concat([cr_red_cards, sb_red_cards, fa_trophy_reds], axis=0)
    .sort_values(["game_date", "min_so"])
    .reset_index(drop=True)
)

red_cards_df.head(3)

Unnamed: 0,game_date,player_name,min_so
0,1986-09-27,Andy Thorpe,90.0
1,1986-10-11,Frank Worthington,77.0
2,1987-02-14,Simon Farnworth,87.0


#### Save red_cards to CSV

In [334]:
# Write the combined red cards table to CSV without the DataFrame index
red_cards_df.to_csv('data/red_cards.csv', index = False)

# Create `yellow_cards_df`

In [335]:
# Manually collected yellow cards
cr_yellows = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/cards_yellow.csv",
    parse_dates=["game_date"],
)

cr_yellows.head(3)

Unnamed: 0,game_date,player_name,yellow_cards
0,1986-09-27,Andy Thorpe,1
1,1986-09-27,Ronnie Moore,1
2,1986-09-27,Mark Hughes,1


In [336]:
# Soccerbase appearances with at least one yellow card, oldest first
sb_yellows = (
    sb_match_apps[["game_date", "player_name", "yellow_cards"]]
    .query("yellow_cards > 0")
    .sort_values(by="game_date")
)

sb_yellows.head(3)

Unnamed: 0,game_date,player_name,yellow_cards
17600,1996-08-17,Ivano Bonetti,1
17700,1996-08-21,Paul Cook,1
18130,1996-08-23,Shaun Teale,1


In [337]:
# FA Trophy yellow cards; the column is renamed to match the other yellow card tables
fa_trophy_yellows = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_cards.csv",
        parse_dates=["game_date"],
    )
    [["game_date", "player_name", "yellow_card"]]
    .query("yellow_card == 1")
    .rename(columns={"yellow_card": "yellow_cards"})
)

fa_trophy_yellows.head(3)

Unnamed: 0,game_date,player_name,yellow_cards
0,2015-12-12,Lee Vaughan,1
1,2015-12-12,Lois Maynard,1
2,2016-12-10,Jeff Hughes,1


In [338]:
# Stack the three yellow card sources, oldest first
yellow_cards_df = (
    pd.concat([cr_yellows, sb_yellows, fa_trophy_yellows], axis=0)
    .sort_values(by="game_date")
    .reset_index(drop=True)
)

yellow_cards_df.head(3)

Unnamed: 0,game_date,player_name,yellow_cards
0,1986-09-27,Andy Thorpe,1
1,1986-09-27,Ronnie Moore,1
2,1986-09-27,Mark Hughes,1


#### Save yellow_cards to CSV

In [339]:
# Write the combined yellow cards table to CSV without the DataFrame index
yellow_cards_df.to_csv('data/yellow_cards.csv', index = False)

# Create `player_apps_df`

In [340]:
# Player/season records for 1997/98
pl_ssns_9798 = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_ssns_9798.csv"
)
pl_ssns_9798.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,Aldridge,John,John Aldridge,1997,1997/98,Aldridge
1,Branch,Graham,Graham Branch,1997,1997/98,Branch
2,Challinor,Dave,Dave Challinor,1997,1997/98,Challinor


In [341]:
# Player/season records for 1998/99
pl_ssns_9899 = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_ssns_9899.csv"
)
pl_ssns_9899.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,Simonsen,Steve,Steve Simonsen,1998,1998/99,Simonsen
1,Frail,Stephen,Stephen Frail,1998,1998/99,Frail
2,Thompson,Andy,Andy Thompson,1998,1998/99,Thompson


In [342]:
# Main player/season table, including each player's disambiguated name
pl_ssns = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_ssns.csv"
)
pl_ssns.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,A'Court,Alan,Alan A'Court,1964,1964/65,A'Court
1,A'Court,Alan,Alan A'Court,1965,1965/66,A'Court
2,Adams,Arthur,Arthur Adams,1932,1932/33,Adams A


In [343]:
# Combine the three player/season tables, ordered by surname, forename and season start year
cr_pl_ssns = (
    pd.concat([pl_ssns, pl_ssns_9798, pl_ssns_9899], axis=0)
    .sort_values(by=["surname", "forename", "ssn"])
    .reset_index(drop=True)
)
cr_pl_ssns.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,A'Court,Alan,Alan A'Court,1964,1964/65,A'Court
1,A'Court,Alan,Alan A'Court,1965,1965/66,A'Court
2,Achterberg,John,John Achterberg,1998,1998/99,Achterberg


In [344]:
# Lookup from (season, disambiguated name) to full player name
cr_pl_ssns_prepped = (
    cr_pl_ssns[["season", "disam_name", "player_name"]]
    .drop_duplicates()
    .reset_index(drop = True)
)

# Complete Record appearances: add game dates via season/game number, then swap the
# name stored in the appearance data (matched against disam_name) for the full name
cr_player_apps = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/apps_long.csv")
    .merge(game_nos, how = "left", on = ["season", "game_no"])
    .merge(cr_pl_ssns_prepped, how = "left", left_on = ["season", "player_name"], right_on = ["season", "disam_name"])
    .rename(columns = {"player_name_y": "player_name"})
    [["game_date", "player_name", "shirt_no", "role"]]
)

cr_player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,1921-08-27,Harry Bradshaw,1,starter
1,1921-08-27,John Grainger,2,starter
2,1921-08-27,Tom Stuart,3,starter


In [345]:
# FA Trophy appearances, in the same column layout as the other appearance tables
fa_trophy_player_apps = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_fixtures.csv",
    parse_dates=["game_date"],
)[["game_date", "player_name", "shirt_no", "role"]]

fa_trophy_player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,2015-12-12,Scott Davies,1,starter
1,2015-12-12,Lee Vaughan,7,starter
2,2015-12-12,Steve McNulty,24,starter


In [346]:
def get_squad_no(season, player_name, game_date):
    """Return a player's squad number for a season, or None if none is recorded.

    The number is the first ``squad_no`` in ``shirt_nos_df`` matching the
    season and player name. One case is hard-coded: Janoi Donacien in 2014/15
    is given 19 before 2015-03-07 and 12 from that date onwards.

    A missing season/player combination is handled explicitly instead of
    through a bare ``except``, so genuine errors (a missing column, a
    wrongly-typed date, an interrupt) are no longer silently turned into None.
    """
    if season == "2014/15" and player_name == "Janoi Donacien":
        switch_date = pd.Timestamp("2015-03-07")
        if game_date < switch_date:
            return 19
        if game_date >= switch_date:
            return 12

    matches = shirt_nos_df[
        (shirt_nos_df.season == season) & (shirt_nos_df.player_name == player_name)
    ].squad_no.values

    # An empty result means no squad number is recorded for this player and season
    return matches[0] if len(matches) else None

# Soccerbase appearances, leaving out the three seasons excluded here
# (presumably because the Complete Record data already covers them - TODO confirm)
excluded_seasons = ["1996/97", "1997/98", "1998/99"]
sb_apps_in_scope = sb_match_apps[~sb_match_apps.season.isin(excluded_seasons)]

sb_player_apps = sb_apps_in_scope[["game_date", "player_name"]].copy()

# Look up squad numbers only for the rows being kept; previously they were computed
# for every row of sb_match_apps and the excluded seasons were dropped by index alignment
sb_player_apps["shirt_no"] = sb_apps_in_scope.apply(lambda x: get_squad_no(x.season, x.player_name, x.game_date), axis = 1)

In [347]:
def get_role(game_date, player_name):
    """Return "sub" or "starter" for a player in a game.

    A player counts as a "sub" when ``sub_mins_df`` has at least one row for
    that game date and player name with a non-missing ``min_on``; otherwise
    the player is a "starter".
    """
    came_on = (
        sub_mins_df.min_on.notna()
        & (sub_mins_df.game_date == game_date)
        & (sub_mins_df.player_name == player_name)
    )

    return "sub" if came_on.any() else "starter"

# Label each Soccerbase appearance as "starter" or "sub"
sb_player_apps["role"] = sb_player_apps.apply(
    lambda app: get_role(app["game_date"], app["player_name"]),
    axis = 1,
)

sb_player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,2022-03-05,Jake Burton,27.0,starter
1,2021-08-07,Peter Clarke,26.0,starter
2,2021-08-10,Peter Clarke,26.0,starter


In [348]:
# Stack the three appearance sources, ordered by date, then role, then shirt number
player_apps = (
    pd.concat([cr_player_apps, sb_player_apps, fa_trophy_player_apps], axis=0)
    .sort_values(by=["game_date", "role", "shirt_no"])
    .reset_index(drop=True)
)

player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,1921-08-27,Harry Bradshaw,1.0,starter
1,1921-08-27,John Grainger,2.0,starter
2,1921-08-27,Tom Stuart,3.0,starter


#### Save player_apps to csv

In [349]:
# Write the combined appearances table to CSV without the DataFrame index
player_apps.to_csv('data/players_apps.csv', index = False)

# Create `subs_df`

In [350]:
# Complete Record appearances with game dates and full player names attached,
# ordered by date and shirt number
cr_subs = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/apps_long.csv")
    .merge(game_nos, how = "left", on = ["season", "game_no"])
    .merge(cr_pl_ssns_prepped, how = "left", left_on = ["season", "player_name"], right_on = ["season", "disam_name"])
    .rename(columns = {"player_name_y": "player_name"})
    .sort_values(["game_date", "shirt_no"])
    [["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
)

# Keep only players involved in a substitution (either on_for or off_for is filled)
cr_subs = cr_subs[cr_subs.on_for.notna() | cr_subs.off_for.notna()]
cr_subs.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
18288,1965-08-23,7,Mandy Hill,,12.0
18293,1965-08-23,12,Jack Lornie,7.0,
18419,1965-10-15,5,Eddie Stuart,,12.0


In [351]:
# Spot-check: Soccerbase sub records (for dates not in cr_subs) for the 2023-08-05 game.
# The same sb_subs frame is rebuilt at the start of the next cell.
sb_subs = sb_sub_mins[~sb_sub_mins.game_date.isin(cr_subs.game_date)].copy()
sb_subs.query("game_date == '2023-08-05'")

Unnamed: 0,game_date,player_name,min_on,min_off
6422,2023-08-05,Sam Taylor,89.0,
6423,2023-08-05,Josh Hawkes,,89.0
6424,2023-08-05,Harvey Saunders,,81.0
6425,2023-08-05,Brad Walker,,45.0
6426,2023-08-05,Kieron Morris,45.0,
6427,2023-08-05,Reece McAlear,81.0,
6428,2023-08-05,Connor Jennings,,71.0
6429,2023-08-05,Kristian Dennis,81.0,
6430,2023-08-05,Lee O'Connor,,81.0
6431,2023-08-05,Luke Norris,71.0,


In [352]:
# Soccerbase sub records for game dates that do not appear in the Complete Record subs
not_in_cr = ~sb_sub_mins.game_date.isin(cr_subs.game_date)
sb_subs = sb_sub_mins[not_in_cr].copy()

# Get lower number from min_on and min_off (missing values are ignored)
sb_subs['sub_min'] = sb_subs[['min_on', 'min_off']].min(axis = 1)

# Split into players coming on and players going off
sb_subs_on = sb_subs[sb_subs.min_on.notna()]
sb_subs_off = sb_subs[sb_subs.min_off.notna()]

sb_subs_on.head(3)

Unnamed: 0,game_date,player_name,min_on,min_off,sub_min
703,1999-08-07,David Kelly,80.0,,80.0
705,1999-08-07,Michael Black,80.0,,80.0
706,1999-08-10,Michael Black,72.0,,72.0


In [353]:
# (game_date, min_on) pairs where exactly one player came on in that minute
sb_subs_on_unique = (
    sb_subs_on[["game_date", "min_on"]]
    .groupby(["game_date", "min_on"])
    .size()
    .reset_index(name = "count")
    .query("count == 1")
    .drop(columns = ["count"])
)

# Keep those players and add their shirt numbers from the appearances table
sb_subs_on_unique = (
    sb_subs_on
    .merge(sb_subs_on_unique, on = ["game_date", "min_on"], how = "inner")
    .merge(player_apps, on = ["game_date", "player_name"], how = "inner")
    [["game_date", "shirt_no", "player_name", "min_on"]]
)

sb_subs_on_unique.head(3)

Unnamed: 0,game_date,shirt_no,player_name,min_on
0,1999-08-10,18.0,Stephen Frail,85.0
1,1999-08-14,17.0,Jason Koumas,80.0
2,1999-08-14,9.0,David Kelly,73.0


In [354]:
# (game_date, min_off) pairs where exactly one player went off in that minute
sb_subs_off_unique = (
    sb_subs_off[["game_date", "min_off"]]
    .groupby(["game_date", "min_off"])
    .size()
    .reset_index(name = "count")
    .query("count == 1")
    .drop(columns = ["count"])
)

# Keep those players and add their shirt numbers from the appearances table
sb_subs_off_unique = (
    sb_subs_off
    .merge(sb_subs_off_unique, on = ["game_date", "min_off"], how = "inner")
    .merge(player_apps, on = ["game_date", "player_name"], how = "inner")
    [["game_date", "shirt_no", "player_name", "min_off"]]
)

sb_subs_off_unique.tail(10)

Unnamed: 0,game_date,shirt_no,player_name,min_off
2380,2023-04-29,33.0,Jay Turner-Cook,55.0
2381,2023-05-08,17.0,Rhys Hughes,45.0
2382,2023-05-08,14.0,Jordan Turnbull,89.0
2383,2023-05-08,4.0,Lee O'Connor,67.0
2384,2023-08-05,11.0,Josh Hawkes,89.0
2385,2023-08-05,4.0,Brad Walker,45.0
2386,2023-08-05,18.0,Connor Jennings,71.0
2387,2023-08-08,20.0,Sam Taylor,62.0
2388,2023-08-08,7.0,Kieron Morris,89.0
2389,2023-08-12,19.0,Harvey Saunders,59.0


In [355]:
# Pair each uniquely-timed player coming on with the uniquely-timed player going off
# in the same game and minute. Rows describe the player coming on; on_for is the
# shirt number of the player going off.
sb_subs_1 = (
    sb_subs_on_unique
    .merge(sb_subs_off_unique, how = "inner", left_on = ["game_date", "min_on"], right_on = ["game_date", "min_off"])
    .rename(columns = {"shirt_no_x": "shirt_no", "player_name_x": "player_name", "shirt_no_y": "on_for"})
    [["game_date", "shirt_no", "player_name", "min_on", "on_for"]]
)

sb_subs_1.head(3)

Unnamed: 0,game_date,shirt_no,player_name,min_on,on_for
0,1999-08-10,18.0,Stephen Frail,85.0,17.0
1,1999-08-14,17.0,Jason Koumas,80.0,10.0
2,1999-08-14,9.0,David Kelly,73.0,32.0


In [356]:
# The same pairing seen from the other side. Rows describe the player going off;
# off_for is the shirt number of the player coming on.
sb_subs_2 = (
    sb_subs_on_unique
    .merge(sb_subs_off_unique, how = "inner", left_on = ["game_date", "min_on"], right_on = ["game_date", "min_off"])
    .rename(columns = {"shirt_no_y": "shirt_no", "player_name_y": "player_name", "shirt_no_x": "off_for"})
    [["game_date", "shirt_no", "player_name", "min_off", "off_for"]]
)

sb_subs_2.head(3)

Unnamed: 0,game_date,shirt_no,player_name,min_off,off_for
0,1999-08-10,17.0,Jason Koumas,85.0,18.0
1,1999-08-14,10.0,Scott Taylor,80.0,17.0
2,1999-08-14,32.0,Pedro Miguel Matias,73.0,9.0


In [357]:
# Stack the "coming on" and "going off" rows; each row fills only one of on_for / off_for
sb_subs = (
    pd.concat([sb_subs_1, sb_subs_2], axis = 0)
    .sort_values("game_date")
    .reset_index(drop = True)
    [["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
)

sb_subs.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
0,1999-08-10,18.0,Stephen Frail,17.0,
1,1999-08-10,17.0,Jason Koumas,,18.0
2,1999-08-14,9.0,David Kelly,32.0,


In [358]:
# Import manual sub fixes, oldest first
sb_sub_fixes = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/sb_sub_fixes.csv",
        parse_dates = ["game_date"],
    )
    [["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
    .sort_values(["game_date"])
)

sb_sub_fixes.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
0,1999-08-07,29,Michael Black,7.0,
1,1999-08-07,7,Alan Mahon,,29.0
2,1999-08-07,9,David Kelly,14.0,


In [359]:
# FA Trophy line-ups, keeping only rows where on_for or off_for is positive
fa_trophy_subs = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_fixtures.csv",
        parse_dates = ["game_date"],
    )
    [["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
    .query("on_for > 0 or off_for > 0")
)

fa_trophy_subs.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
5,2015-12-12,29,Ben Tomlinson,,14.0
7,2015-12-12,23,Jake Kirby,,20.0
11,2015-12-12,14,Andy Mangan,29.0,


In [360]:
# Combine all sub sources, dropping every game date that has a manual fix ...
subs_df = pd.concat([cr_subs, sb_subs, fa_trophy_subs], axis = 0)
subs_df = subs_df[~subs_df.game_date.isin(sb_sub_fixes.game_date)]

# ... then add the manual fixes in their place
subs_df = (
    pd.concat([subs_df, sb_sub_fixes], axis = 0)
    .sort_values("game_date")
    .reset_index(drop = True)
    [["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
)

subs_df.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
0,1965-08-23,7.0,Mandy Hill,,12.0
1,1965-08-23,12.0,Jack Lornie,7.0,
2,1965-10-15,5.0,Eddie Stuart,,12.0


#### Save subs_df to CSV

In [361]:
# Write the combined substitutions table to CSV without the DataFrame index
subs_df.to_csv("data/subs.csv", index = False)

In [362]:
# Game length in minutes for each game date, used when working out minutes played
game_lengths = results_df.loc[:, ["game_date", "game_length"]]

game_lengths.head(3)

Unnamed: 0,game_date,game_length
0,2023-08-12,90
1,2023-08-08,90
2,2023-08-05,90


In [363]:
def calc_mins_played(row):
    """Return the number of minutes a player was on the pitch in one game.

    Reads ``role`` ("starter" or "sub"), ``min_on``, ``min_off``, ``min_so``
    (sending-off minute) and ``game_length`` from the row.

    * The player enters at minute 0 (starter) or at ``min_on`` (sub).
    * The player leaves at ``min_so`` if sent off, else at ``min_off`` if
      subbed off, else at ``game_length``.

    Fix over the earlier version: a substitute who was later sent off is now
    credited ``min_so - min_on``. Previously the "played to end" test ran
    first, so such a player was given ``game_length - min_on``.

    Returns the string "OTHER" when ``role`` is neither "starter" nor "sub"
    (kept from the earlier version). The result is NaN when a sub has no
    recorded ``min_on``.
    """
    role = row["role"]

    if role == "starter":
        entered = 0
    elif role == "sub":
        entered = row["min_on"]
    else:
        return "OTHER"

    # pd.notna also copes with None, which np.isnan would reject
    if pd.notna(row["min_so"]):
        left = row["min_so"]
    elif pd.notna(row["min_off"]):
        left = row["min_off"]
    else:
        left = row["game_length"]

    return left - entered
    

# Combine each appearance with its sub minutes, sending-off minute and game length.
# NOTE(review): these are left joins on keys that are not checked for uniqueness.
# If sub_mins_df holds separate "on" and "off" rows for the same player and game,
# or game_lengths has more than one row for a date, the appearance would be
# duplicated here - verify against the data.
player_events = player_apps \
    .merge(sub_mins_df, how = "left", on = ["game_date", "player_name"]) \
    .merge(red_cards_df, how = "left", on = ["game_date", "player_name"]) \
    .merge(game_lengths, how = "left", on = ["game_date"])

# Row-wise calculation of minutes on the pitch
player_events["mins_played"] = player_events.apply(calc_mins_played, axis = 1)

player_events.head()

Unnamed: 0,game_date,player_name,shirt_no,role,min_off,min_on,min_so,game_length,mins_played
0,1921-08-27,Harry Bradshaw,1.0,starter,,,,90,90.0
1,1921-08-27,John Grainger,2.0,starter,,,,90,90.0
2,1921-08-27,Tom Stuart,3.0,starter,,,,90,90.0
3,1921-08-27,Johnny Campbell,4.0,starter,,,,90,90.0
4,1921-08-27,Charles Milnes,5.0,starter,,,,90,90.0


In [364]:
# Sub appearances with no recorded minute of entry
player_events[(player_events.role == "sub") & player_events.min_on.isna()].tail(10)

Unnamed: 0,game_date,player_name,shirt_no,role,min_off,min_on,min_so,game_length,mins_played
35995,1995-04-17,Jon Kenworthy,12.0,sub,,,,90,
36007,1995-04-21,Jon Kenworthy,12.0,sub,,,,90,
36008,1995-04-21,Ged Brannan,14.0,sub,,,,90,
36020,1995-04-30,Johnny Morrissey,12.0,sub,,,,90,
36021,1995-04-30,Ged Brannan,14.0,sub,,,,90,
36044,1995-05-07,Jon Kenworthy,12.0,sub,,,,90,
36045,1995-05-07,Chris Malkin,14.0,sub,,,,90,
36057,1995-05-14,Ged Brannan,14.0,sub,,,,90,
36069,1995-05-17,Jon Kenworthy,12.0,sub,,,,90,
36070,1995-05-17,Liam O'Brien,14.0,sub,,,,90,
