In [1]:
import pandas as pd
import numpy as np

In [2]:
# Known variant spellings of player names, mapped to the spelling that
# fix_name() substitutes for them.
name_fixes = {
    "Anthony Kay": "Antony Kay",
    "Corey Taylor": "Corey Blackett-Taylor",
    "Craig Carl Curran": "Craig Curran",
    "Chris Edwards": "Christian Edwards",
    "Daniel Robert Harrison": "Danny Harrison",
    "Dave Nugent": "David Nugent",
    "Dylan Mottley Henry": "Dylan Mottley-Henry",
    "Jack Flemming": "Jack Fleming",
    "Jay Devine": "James Devine",
    "Jay Turner-Cook": "Jay Turner-Cooke",
    "Jimmy McNulty": "Jim McNulty",
    "John-Louis Akpa Akpro": "Jean-Louis Akpa Akpro",
    "John Morrissey": "Johnny Morrissey",
    "Jonathon Margetts": "Johnny Margetts",
    "Joseph Maguire": "Joe Maguire",
    "Kaylden Brown": "Kayleden Brown",
    "Lewis Sinnot": "Lewis Sinnott",
    "Lateef Elford Alliyu": "Lateef Elford-Alliyu",
    "Matty Kennedy": "Matthew Kennedy",
    "Michael Jackson": "Mike Jackson",
    "Michael Jones": "Mike Jones",
    "Oliver Banks": "Ollie Banks",
    "Ousmane Kane": "Ousmane Kané",
    "Richard Sutton": "Ritchie Sutton",
    "Robert Apter": "Rob Apter",
    "Robert Taylor": "Rob Taylor",
    "Steven O'Leary": "Stephen O'Leary",
    "Tom Coughan": "Tom Croughan",
}


def fix_name(name):
    """Return the replacement spelling for ``name`` if one is listed in
    ``name_fixes``; otherwise return ``name`` unchanged."""
    return name_fixes.get(name, name)

In [3]:
# Load the managers table; date_from / date_to are parsed as datetimes.
managers_df = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/output/managers.csv",
    parse_dates=["date_from", "date_to"],
)

# Overwrite date_to for the most recently started spell with a fixed future
# date (presumably because the current manager has no end date in the source
# file - TODO confirm). Series.max() skips NaT, unlike the builtin max(), and
# an explicit Timestamp avoids relying on string -> datetime coercion.
managers_df.loc[
    managers_df.date_from == managers_df.date_from.max(), "date_to"
] = pd.Timestamp("2026-06-30")

managers_df.tail(3)

Unnamed: 0,manager_name,date_from,date_to,role
51,Ian Dawes,2023-05-04,2023-09-10,Manager
52,Nigel Adkins,2023-09-10,2023-11-02,Caretaker
53,Nigel Adkins,2023-11-02,2026-06-30,Manager


In [4]:
# Write the managers table to data/managers.csv, omitting the index column.
managers_df.to_csv('data/managers.csv', index = False)

# Create `results_df`

In [5]:
def get_manager(game_date):
    """Return the name(s) of whoever was in charge on ``game_date``.

    A row of ``managers_df`` matches when ``date_from <= game_date <= date_to``
    (both ends inclusive). Names of all matching rows are joined with " & ";
    an empty string is returned when no row matches.
    """
    started = managers_df.date_from <= game_date
    not_finished = managers_df.date_to >= game_date
    names_in_charge = managers_df.loc[started & not_finished, "manager_name"]
    return " & ".join(names_in_charge)

# Load the results table, parsing game_date as a datetime.
results = pd.read_csv("https://raw.githubusercontent.com/petebrown/data-updater/main/data/results.csv", parse_dates = ["game_date"])

# Look up the manager(s) in charge on each game date.
results["manager"] = results.game_date.apply(get_manager)

# Label each game W / D / L by comparing goals scored with goals conceded.
# Rows that satisfy none of the three comparisons (e.g. missing goal counts)
# are not written to by these statements.
results.loc[results.goals_for > results.goals_against, "outcome"] = "W"
results.loc[results.goals_for == results.goals_against, "outcome"] = "D"
results.loc[results.goals_for < results.goals_against, "outcome"] = "L"

# Normalise "lay-off" / "lay-Off" (with or without a trailing "s") to
# "lay-Offs" inside competition names.
results.competition = results.competition.str.replace(r"lay-[oO]ffs?", "lay-Offs", regex = True)

results.head(1)

Unnamed: 0,season,game_date,game_no,opposition,venue,score,outcome,goals_for,goals_against,goal_diff,...,agg_gf,agg_ga,away_goal_outcome,gg_outcome,decider,cup_outcome,outcome_desc,game_length,stadium,referee
0,1921/22,1921-08-27,1,Crewe Alexandra,H,4-1,W,4.0,1.0,3.0,...,,,,,,,,90,,


In [6]:
# League position and points after each game, ordered by date.
# The source column "ranking" is exposed here as "league_pos".
league_positions = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/league-position-tool/main/docs/input/results_mini.csv",
        parse_dates=["game_date"],
    )[["game_date", "ranking", "pts"]]
    .rename(columns={"ranking": "league_pos"})
    .sort_values("game_date")
    .reset_index(drop=True)
)

league_positions.head(3)

Unnamed: 0,game_date,league_pos,pts
0,1921-08-27,3.0,2
1,1921-09-03,1.0,3
2,1921-09-10,11.0,3


In [7]:
# Load extra details for cup games; "extra_time" is renamed to "aet".
cup_game_details = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/11v11-extra-details/cup_details.csv",
    parse_dates=["game_date"],
).rename(columns={"extra_time": "aet"})

# Manual correction for the game on 1984-05-18: clear the leg number and strip
# the " (1st leg)" suffix from the stage name. regex=False makes the
# parentheses literal on every pandas version (older versions defaulted to
# regex=True, where "(1st leg)" is a capture group and never matches).
cup_game_details.loc[cup_game_details.game_date == "1984-05-18", "cup_leg"] = np.nan
cup_game_details.loc[cup_game_details.game_date == "1984-05-18", "cup_stage"] = (
    cup_game_details.loc[cup_game_details.game_date == "1984-05-18"]
    .cup_stage.str.replace(" (1st leg)", "", regex=False)
)

# Flag three games as having gone to extra time. The dates are converted to
# datetimes first: isin() on a datetime column with plain strings is deprecated.
cup_game_details.loc[
    cup_game_details.game_date.isin(pd.to_datetime(["2014-10-07", "2014-12-09", "1995-11-08"])),
    "aet",
] = 1

# Kick-off times: swap "." for ":" (literal replacement, not a regex wildcard)
# and parse as 12-hour clock times.
cup_game_details.ko_time = pd.to_datetime(
    cup_game_details.ko_time.str.replace(".", ":", regex=False),
    format="%I:%M%p",
).dt.time

# Treat 0 as "not applicable" for the replay and extra-time flags.
cup_game_details.cup_replay = cup_game_details.cup_replay.replace(0, np.nan)
cup_game_details.aet = cup_game_details.aet.replace(0, np.nan)

cup_game_details.head(3)

  cup_game_details.loc[cup_game_details.game_date.isin(["2014-10-07", "2014-12-09", "1995-11-08"]), "aet"] = 1


Unnamed: 0,game_date,ko_time,cup_round,cup_leg,cup_stage,cup_replay,cup_section,aet,pen_outcome,pen_score,pen_gf,pen_ga,agg_outcome,agg_score,agg_gf,agg_ga,away_goal_outcome,gg_outcome
0,2022-11-22,19:00:00,2,,2nd round - northern section,,Northern,,L,4-5,5.0,4.0,,,,,,
1,2022-11-05,15:00:00,1,,1st round,,,,,,,,,,,,,
2,2022-10-18,19:00:00,G,,Group B - North,,,,,,,,,,,,,


In [8]:
def outcome_decider(row):
    """Return how a cup tie was decided, or NaN if no tie-breaker applied.

    Checks, in priority order: penalties ("pens"), away goals ("away_goals"),
    golden goal ("golden_goal"), then aggregate score ("agg"). The game on
    1975-08-26 is special-cased as "replay".

    Missing values are detected with ``pd.notna`` rather than an identity
    comparison against ``np.nan``, so any NaN/None counts as missing, not only
    the ``np.nan`` singleton object.
    """
    if pd.notna(row["pen_outcome"]):
        return "pens"
    if pd.notna(row["away_goal_outcome"]):
        return "away_goals"
    if pd.notna(row["gg_outcome"]):
        return "golden_goal"
    if pd.notna(row["agg_outcome"]):
        # Hard-coded exception for this one date.
        if row["game_date"] == pd.to_datetime("1975-08-26"):
            return "replay"
        return "agg"
    return np.nan
    
def get_cup_outcome(row):
    """Return the value of the first tie-breaker outcome that is present.

    Columns are checked in priority order: pen_outcome, away_goal_outcome,
    gg_outcome, agg_outcome. Returns NaN when all four are missing.

    ``pd.notna`` is used so that any NaN/None counts as missing, not only the
    ``np.nan`` singleton object.
    """
    for column in ("pen_outcome", "away_goal_outcome", "gg_outcome", "agg_outcome"):
        if pd.notna(row[column]):
            return row[column]
    return np.nan
    
def get_outcome_desc(row):
    """Build a human-readable description of how a cup tie was decided.

    Examples: "Won 5-4 on pens", "2-2 on agg. Lost 4-5 on pens",
    "Won on away goals rule", "Lost on golden goal", "Won 3-2 on agg".
    The game on 1975-08-26 is special-cased as "Game replayed".
    Returns NaN when no tie-breaker column is populated.

    Missing values are detected with ``pd.notna`` so any NaN/None counts as
    missing, not only the ``np.nan`` singleton object.
    """
    if pd.notna(row["pen_outcome"]):
        desc = row["pen_outcome"] + " " + row["pen_score"] + " on pens"
        if pd.notna(row["agg_outcome"]):
            # Shoot-out after a two-legged tie: lead with the aggregate score.
            desc = row["agg_score"] + " on agg. " + desc
    elif pd.notna(row["away_goal_outcome"]):
        desc = row["away_goal_outcome"] + " on away goals rule"
    elif pd.notna(row["gg_outcome"]):
        desc = row["gg_outcome"] + " on golden goal"
    elif pd.notna(row["agg_outcome"]):
        # Hard-coded exception for this one date.
        if row["game_date"] == pd.to_datetime("1975-08-26"):
            return "Game replayed"
        desc = row["agg_outcome"] + " " + row["agg_score"] + " on agg"
    else:
        return np.nan

    # Expand the single-letter outcome codes. The trailing space is kept in
    # both replacements so the word never runs into the score that follows.
    return desc.replace("W ", "Won ").replace("L ", "Lost ")
    
# Derive three columns row by row: how the tie was decided, the resulting
# outcome value, and a readable description of it.
cup_game_details["decider"] = cup_game_details.apply(outcome_decider, axis = 1)    
cup_game_details["cup_outcome"] = cup_game_details.apply(get_cup_outcome, axis = 1)
cup_game_details["outcome_desc"] = cup_game_details.apply(get_outcome_desc, axis = 1)

In [9]:
def calc_game_length(row):
    """Return the length of a game in minutes.

    * 116 when a golden-goal outcome is recorded (fixed value used for every
      golden-goal game - NOTE(review): confirm where 116 comes from),
    * 120 when the game went to extra time (``aet == 1``),
    * 90 otherwise.

    ``pd.notna`` is used so that any NaN/None in ``gg_outcome`` counts as
    missing, not only the ``np.nan`` singleton object.
    """
    if pd.notna(row["gg_outcome"]):
        return 116
    # gg_outcome is known to be missing here, so only the extra-time flag matters.
    if row["aet"] == 1:
        return 120
    return 90

# Select the columns kept in the output. The rename maps "ssn_game_no" to
# "game_no" if that column is present (rename ignores missing labels).
results_df = results.rename(columns = {
    "ssn_game_no": "game_no",
})[["season", "game_date", "game_no", "opposition", "venue", "score", "outcome", "goals_for", "goals_against", "goal_diff", "game_type", "competition", "generic_comp", "ssn_comp_game_no", "league_tier", "league_pos", "pts", "attendance", "weekday", "manager"]]

# Attach cup details by date; games without a match get NaN in those columns.
results_df = pd.merge(left = results_df, right = cup_game_details, how = "left", on = "game_date")

# Game length (90 / 116 / 120) is derived from the merged aet and gg_outcome columns.
results_df["game_length"] = results_df.apply(calc_game_length, axis = 1)

# Write the combined results table to data/results.csv without the index.
results_df.to_csv("data/results.csv", index = False)

# Create `goals_df`

In [10]:
# Load Soccerbase per-player match appearances.
sb_match_apps = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/update-player-stats/main/data/players_df.csv",
    parse_dates=["game_date"],
)

# Drop the "sb_" prefix from the two id columns.
sb_match_apps = sb_match_apps.rename(
    columns={"sb_game_id": "game_id", "sb_player_id": "player_id"}
)

# Game ids arrive as strings containing "tpg"; remove it and convert to int.
sb_match_apps["game_id"] = sb_match_apps["game_id"].str.replace("tpg", "").astype(int)

# Apply the name-spelling fixes.
sb_match_apps["player_name"] = sb_match_apps["player_name"].apply(fix_name)

In [11]:
# Distinct (game_id, game_date, season) combinations, ordered by date.
sb_game_ids = (
    sb_match_apps[["game_id", "game_date", "season"]]
    .drop_duplicates()
    .sort_values(by=["game_date"])
    .reset_index(drop=True)
)

sb_game_ids.head(3)

Unnamed: 0,game_id,game_date,season
0,240309,1996-08-17,1996/97
1,243200,1996-08-21,1996/97
2,242383,1996-08-23,1996/97


In [12]:
# Distinct (player_id, player_name) pairs from the appearances table.
sb_player_ids = sb_match_apps[["player_id", "player_name"]].drop_duplicates().reset_index(drop = True)
# fix_name was already applied to sb_match_apps.player_name when it was loaded;
# it is applied again here to the extracted names.
sb_player_ids.player_name = sb_player_ids.player_name.apply(fix_name)

sb_player_ids.head(3)

Unnamed: 0,player_id,player_name
0,128729,Jake Burton
1,18605,Peter Clarke
2,103555,Josh Dacres-Cogley


In [13]:
# Date, season and game number for every game, ordered by date.
# (A duplicate earlier assignment, immediately overwritten by this one, has
# been removed; its rename of "ssn_game_no" could not affect the three
# columns selected here.)
game_nos = results[["game_date", "season", "game_no"]].sort_values(["game_date"]).reset_index(drop = True)

game_nos.head(3)

Unnamed: 0,game_date,season,game_no
0,1921-08-27,1921/22,1
1,1921-09-03,1921/22,2
2,1921-09-10,1921/22,3


### Import goal details scraped from Soccerbase

**Seasons:** 1996/97 - 2022/23

**Fields:**
- `player_name`
- `minute`
- `penalty` (0/1)
- `own_goal` (0/1)
- `game_date` _(after join)_

In [14]:
# Build the Soccerbase goals table in one pipeline:
#   1. read the scraped goals,
#   2. keep only rows whose goal_type is "for",
#   3. apply the name-spelling fixes,
#   4. attach game dates by joining on game_id,
#   5. order by date and minute,
#   6. keep the output columns and rename "minute" to "goal_min".
sb_goals = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-goals/main/data/goals.csv")
    .query('goal_type == "for"')
    .assign(player_name=lambda goals: goals.player_name.apply(fix_name))
    .merge(sb_game_ids, on="game_id", how="left")
    .sort_values(["game_date", "minute"])
    [["game_date", "player_name", "minute", "penalty", "own_goal"]]
    .rename(columns={"minute": "goal_min"})
)

sb_goals.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
1713,1996-08-17,Johnny Morrissey,22,0,0
1714,1996-08-21,John Aldridge,30,0,0
1715,1996-08-21,Ivano Bonetti,56,0,0


### Import goals from Complete Record

**Seasons:** 1921/22 - 1995/96 (after filter)

**Fields:**

* `game_date`
* `player_name`
* `goals_scored` (no. of goals scored by player in game)

In [15]:
# Load Complete Record scorers, attach game dates via (season, game_no),
# and convert to a list of records (one per player per game).
all_goals = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/scorers-long.csv")
    .merge(game_nos, how="left", on=["season", "game_no"])
    [["season", "game_date", "player_name", "goals_scored"]]
    .to_dict("records")
)

# Expand to one record per goal: a record with goals_scored == n appears n times.
cr_goals = [goal for goal in all_goals for _ in range(goal["goals_scored"])]

# Back to a dataframe; the per-game goal count is no longer needed.
cr_goals = pd.DataFrame(cr_goals).drop(columns="goals_scored")

# Add an own_goal flag (1 when the scorer is recorded as "OG", else 0) to match sb_goals.
cr_goals["own_goal"] = cr_goals.player_name.eq("OG").astype(int)

# Keep only seasons before 1996/97, i.e. those NOT covered by the Soccerbase
# data (season labels such as "1995/96" compare as strings against "1996"),
# then drop the season column.
cr_goals = cr_goals[cr_goals.season < "1996"].drop(columns="season")

cr_goals.head(3)

Unnamed: 0,game_date,player_name,own_goal
0,1921-08-27,Tom Stuart,0
1,1921-08-27,Charles Milnes,0
2,1921-08-27,Fred Groves,0


### Import manually collected goal minutes

In [16]:
# Load the manually collected goal minutes, ordered by date then minute.
cr_goal_mins = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/goal_mins.csv",
    parse_dates=["game_date"],
).sort_values(["game_date", "goal_min"])

# Recode the penalty column: "pen" -> 1, anything else (including blanks) -> 0.
cr_goal_mins["penalty"] = cr_goal_mins["penalty"].eq("pen").astype("int64")

# Number each player's goals within a game (1, 2, 3 for a hat-trick).
cr_goal_mins["pl_goal_no"] = (
    cr_goal_mins.groupby(["player_name", "game_date"]).cumcount().add(1)
)

cr_goal_mins.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,pl_goal_no
0,1964-09-04,John Manning,60,0,1
1,1984-05-21,Dave Philpotts,74,1,1
2,1984-11-10,Colin Clarke,39,0,1


### Join manually collected goal minutes to Complete Record goal data

In [17]:
# Temporary join key: number each player's goals within a game (1..n).
cr_goals["pl_goal_no"] = cr_goals.groupby(["player_name", "game_date"]).cumcount().add(1)

# Left-join the goal minutes on (game_date, player_name, pl_goal_no); goals
# without a recorded minute keep NaN. The temporary key is dropped afterwards.
cr_goals = (
    cr_goals
    .merge(cr_goal_mins, how="left", on=["game_date", "player_name", "pl_goal_no"])
    .drop(columns=["pl_goal_no"])
)

cr_goals.head(3)

Unnamed: 0,game_date,player_name,own_goal,goal_min,penalty
0,1921-08-27,Tom Stuart,0,,
1,1921-08-27,Charles Milnes,0,,
2,1921-08-27,Fred Groves,0,,


### Import FA Trophy goals

**Fields:**

* `game_date`
* `player_name`
* `minute`
* `penalty` (0/1)
* `own_goal` (0/1)

In [18]:
# FA Trophy goals; "minute" is renamed to "goal_min" to match the other goal tables.
fa_trophy_goals = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_goals.csv",
    parse_dates=["game_date"],
)
fa_trophy_goals = fa_trophy_goals.rename(columns={"minute": "goal_min"})

fa_trophy_goals.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
0,2015-12-12,James Norwood,45,0,0
1,2015-12-12,James Norwood,82,0,0
2,2016-12-10,Ritchie Sutton,26,0,0


In [19]:
# Concatenate the Complete Record, Soccerbase and FA Trophy goal dataframes
goals_df = pd.concat([cr_goals, sb_goals, fa_trophy_goals], axis = 0)

# Order by date then minute and fix the output column order
goals_df = goals_df.sort_values(by = ["game_date", "goal_min"])[["game_date", "player_name", "goal_min", "penalty", "own_goal"]]

goals_df.head(3)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
0,1921-08-27,Tom Stuart,,,0
1,1921-08-27,Charles Milnes,,,0
2,1921-08-27,Fred Groves,,,0


#### Save goals to CSV

In [20]:
# Write the combined goals table to data/goals.csv, omitting the index column.
goals_df.to_csv('data/goals.csv', index=False)

# Create `sub_mins_df`

In [21]:
# Distinct (game_date, season) pairs from the results table, ordered by date.
season_dates = (
    results[["game_date", "season"]]
    .drop_duplicates()
    .sort_values("game_date")
    .reset_index(drop=True)
)

season_dates.head(3)

Unnamed: 0,game_date,season
0,1921-08-27,1921/22
1,1921-09-03,1921/22
2,1921-09-10,1921/22


In [22]:
# NOTE: this rebinds game_nos, which an earlier cell built directly from the
# results table. Here game numbers are recomputed as a running count of
# distinct game dates within each season.
game_nos = season_dates.copy()
game_nos["game_no"] = game_nos.sort_values("game_date").groupby("season").cumcount() + 1

game_nos.head(3)

Unnamed: 0,game_date,season,game_no
0,1921-08-27,1921/22,1
1,1921-09-03,1921/22,2
2,1921-09-10,1921/22,3


In [23]:
# Re-display the Soccerbase game id lookup (built earlier) for reference.
sb_game_ids.head(3)

Unnamed: 0,game_id,game_date,season
0,240309,1996-08-17,1996/97
1,243200,1996-08-21,1996/97
2,242383,1996-08-23,1996/97


In [24]:
# Re-display the Soccerbase player id lookup (built earlier) for reference.
sb_player_ids.head(3)

Unnamed: 0,player_id,player_name
0,128729,Jake Burton
1,18605,Peter Clarke
2,103555,Josh Dacres-Cogley


In [25]:
# Import Soccerbase subs and red card CSV, attach game dates (via game_id)
# and player names (via player_id), and keep only the substitution columns
sb_sub_mins = pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-events/main/data/subs-and-reds.csv").merge(sb_game_ids, how = "left", on = "game_id") \
    .merge(sb_player_ids, how = "left", on = "player_id") \
    .sort_values("game_date") \
    [["game_date", "player_name", "min_on", "min_off"]].reset_index(drop = True)

# Import manual fixes to Soccerbase sub data
sb_sub_min_fixes = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/sb_sub_fixes.csv", parse_dates = ["game_date"])[["game_date", "player_name", "min_on", "min_off"]]

# Fix known error with Luke Norris sub v. Barrow: blank his min_off on 2023-08-05
sb_sub_mins.loc[(sb_sub_mins.game_date == "2023-08-05") & (sb_sub_mins.player_name == "Luke Norris"), 'min_off'] = np.nan

# Filter out sendings off, i.e. rows with neither a min_on nor a min_off
sb_sub_mins = sb_sub_mins[~((sb_sub_mins.min_on.isna()) & (sb_sub_mins.min_off.isna()))]

# Filter out ALL records for dates with manual fixes (the fixes replace the whole game)
sb_sub_mins = sb_sub_mins[~sb_sub_mins.game_date.isin(sb_sub_min_fixes.game_date.unique())]

# Append manual fixes to Soccerbase sub data
sb_sub_mins = pd.concat([sb_sub_mins, sb_sub_min_fixes]).sort_values("game_date").reset_index(drop = True)

# Import the list of (game_date, player_name) records to remove
sb_sub_min_removals = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/sb_sub_min_removals.csv", parse_dates = ["game_date"])

# Anti-join: keep only rows with no matching (game_date, player_name) in the removals list
sb_sub_mins = sb_sub_mins.merge(sb_sub_min_removals, how = "outer", on = ["game_date", "player_name"], indicator = True).query("_merge == 'left_only'").drop(columns = "_merge")

# Import manually collected records that are missing from Soccerbase
sb_sub_min_additions = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/sb_sub_min_additions.csv", parse_dates = ["game_date"])[["game_date", "player_name", "min_on", "min_off"]]

# Append the additions and re-sort by date
sb_sub_mins = pd.concat([sb_sub_mins, sb_sub_min_additions]).sort_values("game_date").reset_index(drop = True)

sb_sub_mins.head(3)

Unnamed: 0,game_date,player_name,min_on,min_off
0,1996-08-17,Ged Brannan,79.0,
1,1996-08-17,Ian Thomas-Moore,86.0,
2,1996-08-17,John Aldridge,,79.0


In [26]:
# Manually collected substitution minutes. "0" is read as missing, and only
# rows with a positive min_off or min_on are kept.
cr_sub_mins = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/subs_missing_mins.csv",
        parse_dates=["game_date"],
        na_values=["0"],
    )[["game_date", "player_name", "min_off", "min_on"]]
    .query("min_off > 0 or min_on > 0")
)

cr_sub_mins.head(3)

Unnamed: 0,game_date,player_name,min_off,min_on
0,1965-08-23,Mandy Hill,75.0,
1,1965-08-23,Jack Lornie,,75.0
2,1965-10-15,Eddie Stuart,82.0,


In [27]:
# FA Trophy substitution minutes: only rows with a positive min_off or min_on are kept.
fa_trophy_sub_mins = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_fixtures.csv",
        parse_dates=["game_date"],
    )[["game_date", "player_name", "min_off", "min_on"]]
    .query("min_off > 0 or min_on > 0")
)

fa_trophy_sub_mins.head(3)

Unnamed: 0,game_date,player_name,min_off,min_on
5,2015-12-12,Ben Tomlinson,67.0,
7,2015-12-12,Jake Kirby,67.0,
11,2015-12-12,Andy Mangan,,67.0


In [28]:
# Hand-entered substitutions for the game on 2017-04-29: three players coming
# on (min_on) and the three they replaced (min_off). game_date is converted
# to a datetime to match the other substitution tables.
maidstone_sub_mins = pd.DataFrame(
    [
        {"game_date": "2017-04-29", "player_name": "Jack Fleming", "min_on": 36},
        {"game_date": "2017-04-29", "player_name": "Tom Croughan", "min_on": 64},
        {"game_date": "2017-04-29", "player_name": "James Devine", "min_on": 76},
        {"game_date": "2017-04-29", "player_name": "Eddie Clarke", "min_off": 36},
        {"game_date": "2017-04-29", "player_name": "Sam Ilesanmi", "min_off": 64},
        {"game_date": "2017-04-29", "player_name": "Adam Mekki", "min_off": 76},
    ]
).assign(game_date=lambda subs: pd.to_datetime(subs["game_date"]))

In [29]:
# Stack all four substitution sources and order the result by date.
sub_mins_df = (
    pd.concat([cr_sub_mins, sb_sub_mins, fa_trophy_sub_mins, maidstone_sub_mins])
    .sort_values("game_date")
    .reset_index(drop=True)
)

sub_mins_df.head(3)

Unnamed: 0,game_date,player_name,min_off,min_on
0,1965-08-23,Mandy Hill,75.0,
1,1965-08-23,Jack Lornie,,75.0
2,1965-10-15,Eddie Stuart,82.0,


In [30]:
# Remove rows that are exact duplicates across all columns.
sub_mins_df = sub_mins_df.drop_duplicates()

In [31]:
# Write the substitution minutes to data/sub_mins.csv, omitting the index column.
sub_mins_df.to_csv('data/sub_mins.csv', index = False)

# Create `shirt_nos_df`

In [32]:
# Squad numbers by season, as published in the source repository.
shirt_nos_df = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/squad_nos/squad_nos.csv"
)

# Hand-entered 2016/17 squad numbers for three additional players.
maidstone_shirts = [
    {"season": "2016/17", "squad_no": 32, "player_name": "Tom Croughan"},
    {"season": "2016/17", "squad_no": 36, "player_name": "Jack Fleming"},
    {"season": "2016/17", "squad_no": 42, "player_name": "James Devine"},
]

maidstone_shirts_df = pd.DataFrame(maidstone_shirts)

# Append the extra rows and order by season, then squad number.
shirt_nos_df = (
    pd.concat([shirt_nos_df, maidstone_shirts_df])
    .sort_values(["season", "squad_no"])
    .reset_index(drop=True)
)

shirt_nos_df.head(3)

Unnamed: 0,season,squad_no,player_name
0,1999/00,1,John Achterberg
1,1999/00,2,Graham Allen
2,1999/00,3,Andy Thompson


# Create `red_cards_df`

In [33]:
# Manually collected red cards; game_date is parsed as a datetime.
cr_red_cards = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/cards_red.csv", parse_dates = ["game_date"])

cr_red_cards.head(3)

Unnamed: 0,game_date,player_name,min_so
0,1967-08-19,Gerry Casey,49
1,1984-05-21,Dave Higgins,89
2,1986-09-27,Andy Thorpe,90


In [34]:
# Soccerbase events file (subs and red cards) with game dates and player
# names attached, reduced to the sending-off columns and ordered by date.
sb_red_cards = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-events/main/data/subs-and-reds.csv")
    .merge(sb_game_ids, how="left", on="game_id")
    .merge(sb_player_ids, how="left", on="player_id")
    [["game_date", "player_name", "min_so"]]
    .sort_values("game_date")
)

# Keep only the rows that record a sending-off minute.
sb_red_cards = sb_red_cards[sb_red_cards.min_so.notna()].reset_index(drop=True)

sb_red_cards.head(3)

Unnamed: 0,game_date,player_name,min_so
0,1996-08-17,Alan Rogers,69.0
1,1997-01-14,John Aldridge,78.0
2,1997-01-18,Dave Higgins,34.0


In [35]:
# FA Trophy red cards: rows flagged red_card == 1, with the flag column
# dropped once it has been used as the filter.
fa_trophy_reds = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_cards.csv",
        parse_dates=["game_date"],
    )[["game_date", "player_name", "red_card", "min_so"]]
    .query("red_card == 1")
    .drop(columns="red_card")
    .reset_index(drop=True)
)

fa_trophy_reds.head(3)

Unnamed: 0,game_date,player_name,min_so
0,2015-10-27,Scott Davies,14.0


In [36]:
# Stack the three red-card sources and order by date, then sending-off minute.
red_cards_df = (
    pd.concat([cr_red_cards, sb_red_cards, fa_trophy_reds], axis=0)
    .sort_values(["game_date", "min_so"])
    .reset_index(drop=True)
)

red_cards_df.head(3)

Unnamed: 0,game_date,player_name,min_so
0,1967-08-19,Gerry Casey,49.0
1,1984-05-21,Dave Higgins,89.0
2,1986-09-27,Andy Thorpe,90.0


#### Save red_cards to CSV

In [37]:
# Write the combined red cards to data/red_cards.csv, omitting the index column.
red_cards_df.to_csv('data/red_cards.csv', index = False)

# Create `yellow_cards_df`

In [38]:
# Manually collected yellow cards; game_date is parsed as a datetime.
cr_yellows = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/cards_yellow.csv", parse_dates = ["game_date"])

cr_yellows.head(3)

Unnamed: 0,game_date,player_name
0,1984-05-21,Dave Higgins
1,1984-05-21,Steve Mungall
2,1986-03-25,Ian Muir


In [39]:
# Soccerbase appearances in which the player has a positive yellow_cards value, by date.
sb_yellows = (
    sb_match_apps[["game_date", "player_name", "yellow_cards"]]
    .query("yellow_cards > 0")
    .sort_values(by="game_date")
)

sb_yellows.head(3)

Unnamed: 0,game_date,player_name,yellow_cards
17600,1996-08-17,Ivano Bonetti,1
17700,1996-08-21,Paul Cook,1
18130,1996-08-23,Shaun Teale,1


In [40]:
# FA Trophy yellow cards: rows flagged yellow_card == 1. The flag column is
# renamed to "yellow_cards" to line up with the Soccerbase table.
fa_trophy_yellows = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_cards.csv",
        parse_dates=["game_date"],
    )[["game_date", "player_name", "yellow_card"]]
    .query("yellow_card == 1")
    .rename(columns={"yellow_card": "yellow_cards"})
)

fa_trophy_yellows.head(3)

Unnamed: 0,game_date,player_name,yellow_cards
0,2015-12-12,Lee Vaughan,1
1,2015-12-12,Lois Maynard,1
2,2016-12-10,Jeff Hughes,1


In [41]:
# Hand-entered yellow card for the game on 2017-04-29, with game_date
# converted to a datetime to match the other card tables.
maidstone_yellows = pd.DataFrame(
    [{"game_date": "2017-04-29", "player_name": "Jack Fleming"}]
).assign(game_date=lambda cards: pd.to_datetime(cards["game_date"]))

In [42]:
# Stack all yellow-card sources, order by date then player, and drop the
# yellow_cards count column so each row is simply (game_date, player_name).
yellow_cards_df = (
    pd.concat([cr_yellows, sb_yellows, fa_trophy_yellows, maidstone_yellows], axis=0)
    .sort_values(by=["game_date", "player_name"])
    .drop(columns=["yellow_cards"])
    .reset_index(drop=True)
)

yellow_cards_df.head(3)

Unnamed: 0,game_date,player_name
0,1984-05-21,Dave Higgins
1,1984-05-21,Steve Mungall
2,1986-03-25,Ian Muir


#### Save yellow_cards to CSV

In [43]:
# Write the combined yellow cards to data/yellow_cards.csv, omitting the index column.
yellow_cards_df.to_csv('data/yellow_cards.csv', index = False)

# Create `player_apps_df`

In [44]:
# Player-season records from the 9798 file (the preview rows show season 1997/98).
pl_ssns_9798 = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_ssns_9798.csv")
pl_ssns_9798.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,Aldridge,John,John Aldridge,1997,1997/98,Aldridge
1,Branch,Graham,Graham Branch,1997,1997/98,Branch
2,Challinor,Dave,Dave Challinor,1997,1997/98,Challinor


In [45]:
# Player-season records from the 9899 file (the preview rows show season 1998/99).
pl_ssns_9899 = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_ssns_9899.csv")
pl_ssns_9899.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,Simonsen,Steve,Steve Simonsen,1998,1998/99,Simonsen
1,Frail,Stephen,Stephen Frail,1998,1998/99,Frail
2,Thompson,Andy,Andy Thompson,1998,1998/99,Thompson


In [46]:
# Main player-season file: one row per player per season, including the
# disambiguated name (disam_name) alongside the full player_name.
pl_ssns = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_ssns.csv")
pl_ssns.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,A'Court,Alan,Alan A'Court,1964,1964/65,A'Court
1,A'Court,Alan,Alan A'Court,1965,1965/66,A'Court
2,Adams,Arthur,Arthur Adams,1932,1932/33,Adams A


In [47]:
# Stack the three player-season tables and order by surname, forename, season start year.
cr_pl_ssns = (
    pd.concat([pl_ssns, pl_ssns_9798, pl_ssns_9899], axis=0)
    .sort_values(by=["surname", "forename", "ssn"])
    .reset_index(drop=True)
)
cr_pl_ssns.head(3)

Unnamed: 0,surname,forename,player_name,ssn,season,disam_name
0,A'Court,Alan,Alan A'Court,1964,1964/65,A'Court
1,A'Court,Alan,Alan A'Court,1965,1965/66,A'Court
2,Achterberg,John,John Achterberg,1998,1998/99,Achterberg


In [48]:
# Lookup from (season, disambiguated name) to full player name, de-duplicated.
cr_pl_ssns_prepped = cr_pl_ssns[["season", "disam_name", "player_name"]].drop_duplicates().reset_index(drop = True)

# Complete Record appearances:
#   1. attach game dates via (season, game_no);
#   2. match the appearance file's player_name against disam_name within the
#      same season. Both sides have a player_name column, so the merge
#      suffixes them _x (appearance file) and _y (lookup);
#   3. keep the lookup's full name (player_name_y) as player_name.
cr_player_apps = pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/apps_long.csv") \
    .merge(game_nos, how = "left", on = ["season", "game_no"]) \
    .merge(cr_pl_ssns_prepped, how = "left", left_on = ["season", "player_name"], right_on = ["season", "disam_name"]) \
    .rename(columns = {"player_name_y": "player_name"}) \
    [["game_date", "player_name", "shirt_no", "role"]]

cr_player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,1921-08-27,Harry Bradshaw,1,starter
1,1921-08-27,John Grainger,2,starter
2,1921-08-27,Tom Stuart,3,starter


In [49]:
# FA Trophy appearances, reduced to the columns shared by the other appearance tables.
fa_trophy_player_apps = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_fixtures.csv",
    parse_dates=["game_date"],
)
fa_trophy_player_apps = fa_trophy_player_apps[["game_date", "player_name", "shirt_no", "role"]]

fa_trophy_player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,2015-12-12,Scott Davies,1,starter
1,2015-12-12,Lee Vaughan,7,starter
2,2015-12-12,Steve McNulty,24,starter


In [50]:
# Hand-entered substitute appearances for the game on 2017-04-29, with
# game_date converted to a datetime to match the other appearance tables.
maidstone_player_apps = pd.DataFrame(
    [
        {"game_date": "2017-04-29", "player_name": "Tom Croughan", "shirt_no": 32, "role": "sub"},
        {"game_date": "2017-04-29", "player_name": "Jack Fleming", "shirt_no": 36, "role": "sub"},
        {"game_date": "2017-04-29", "player_name": "James Devine", "shirt_no": 42, "role": "sub"},
    ]
).assign(game_date=lambda apps: pd.to_datetime(apps["game_date"]))

In [51]:
def get_squad_no(season, player_name, game_date):
    """Return the squad number worn by ``player_name`` in ``season``.

    Janoi Donacien in 2014/15 is special-cased by date: 19 before 2015-03-07
    and 12 from that date onwards. Everyone else is looked up in
    ``shirt_nos_df`` by (season, player_name); the first match is returned.

    Returns None when no squad number is found, or when ``game_date`` cannot
    be compared with a Timestamp. Only those two expected failures are handled;
    other errors (e.g. an interrupted kernel or a missing ``shirt_nos_df``)
    now propagate instead of being silently turned into None.
    """
    if season == "2014/15" and player_name == "Janoi Donacien":
        try:
            if game_date < pd.Timestamp("2015-03-07"):
                return 19
            if game_date >= pd.Timestamp("2015-03-07"):
                return 12
        except TypeError:
            # game_date is not comparable with a Timestamp.
            return None
        # A date that is neither before nor on/after the cut-off (e.g. NaT)
        # falls through to the regular lookup below.

    squad_nos = shirt_nos_df[
        (shirt_nos_df.season == season) & (shirt_nos_df.player_name == player_name)
    ].squad_no.values

    if len(squad_nos) == 0:
        return None
    return squad_nos[0]

# Soccerbase appearances, excluding 1996/97-1998/99. The season column is
# carried along temporarily because get_squad_no() needs it.
sb_player_apps = sb_match_apps[~sb_match_apps.season.isin(["1996/97", "1997/98", "1998/99"])][["game_date", "player_name", "season"]].copy()

# Look up shirt numbers for the retained rows only. Previously the lookup was
# applied to every row of sb_match_apps, including the excluded seasons, and
# relied on index alignment to discard the surplus results.
sb_player_apps["shirt_no"] = sb_player_apps.apply(lambda x: get_squad_no(x.season, x.player_name, x.game_date), axis = 1)

# Drop the helper column so the frame has the same columns as before.
sb_player_apps = sb_player_apps.drop(columns = "season")

In [52]:
def get_role(game_date, player_name):
    """Classify a player's appearance on a given date as "sub" or "starter".

    Returns "sub" when the module-level ``sub_mins_df`` holds at least one row
    for this game_date/player_name with a non-null ``min_on``; otherwise
    returns "starter".
    """
    came_on = (
        sub_mins_df.min_on.notna()
        & (sub_mins_df.game_date == game_date)
        & (sub_mins_df.player_name == player_name)
    )
    return "sub" if came_on.any() else "starter"

# One get_role() call per appearance row, in row order.
sb_player_apps["role"] = [
    get_role(app_date, app_player)
    for app_date, app_player in zip(sb_player_apps.game_date, sb_player_apps.player_name)
]

sb_player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,2022-03-05,Jake Burton,27.0,starter
1,2021-08-07,Peter Clarke,26.0,starter
2,2021-08-10,Peter Clarke,26.0,starter


In [53]:
# Stack the four appearance tables, order by date / role / shirt number,
# and renumber the rows from zero.
player_apps = (
    pd.concat(
        [cr_player_apps, sb_player_apps, fa_trophy_player_apps, maidstone_player_apps],
        axis = 0,
    )
    .sort_values(by = ["game_date", "role", "shirt_no"])
    .reset_index(drop = True)
)

player_apps.head(3)

Unnamed: 0,game_date,player_name,shirt_no,role
0,1921-08-27,Harry Bradshaw,1.0,starter
1,1921-08-27,John Grainger,2.0,starter
2,1921-08-27,Tom Stuart,3.0,starter


In [54]:
# Manual override: set shirt number 32 for Ousmane Kané on 2024-04-20.
player_apps.loc[
    player_apps.game_date.eq('2024-04-20') & player_apps.player_name.eq('Ousmane Kané'),
    'shirt_no',
] = 32

In [55]:
print(len(player_apps))

# (game_date, player_name) pairs recorded as appearances that are incorrect.
incorrect_apps = [
    ('2016-08-09', 'Adam Mekki'),
    ('2018-10-20', 'Mark Ellis'),
]

# Keep every row that differs from the pair on date or on name.
for wrong_date, wrong_player in incorrect_apps:
    player_apps = player_apps[
        (player_apps.game_date != wrong_date) | (player_apps.player_name != wrong_player)
    ]

print(len(player_apps))

58720
58718


In [56]:
# Cast shirt numbers to int; astype(int) raises if any shirt_no is still missing.
player_apps = player_apps.assign(shirt_no = player_apps['shirt_no'].astype(int))

#### Save player_apps to CSV

In [57]:
# Write the appearance table without the positional index column.
player_apps.to_csv(path_or_buf = 'data/player_apps.csv', index = False)

# Create `subs_df`

In [58]:
# Long-format appearance file joined to game dates (via season + game_no) and
# to cr_pl_ssns_prepped (season + disambiguated name). The merged-in
# player_name_y column is renamed to player_name for the final selection.
cr_subs = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/apps_long.csv")
    .merge(game_nos, how = "left", on = ["season", "game_no"])
    .merge(
        cr_pl_ssns_prepped,
        how = "left",
        left_on = ["season", "player_name"],
        right_on = ["season", "disam_name"],
    )
    .rename(columns = {"player_name_y": "player_name"})
    .sort_values(["game_date", "shirt_no"])
    .loc[:, ["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
)

# Keep only rows that record a substitution in either direction.
cr_subs = cr_subs.dropna(subset = ["on_for", "off_for"], how = "all")
cr_subs.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
18849,1965-08-23,7,Mandy Hill,,12.0
18854,1965-08-23,12,Jack Lornie,7.0,
18980,1965-10-15,5,Eddie Stuart,,12.0


In [59]:
# Substitution rows from sb_sub_mins for dates that cr_subs does not contain.
sb_subs = sb_sub_mins.loc[~sb_sub_mins.game_date.isin(cr_subs.game_date)].copy()

# Spot-check a single game.
sb_subs[sb_subs.game_date == '2023-08-05']

  sb_subs.query("game_date == '2023-08-05'")


Unnamed: 0,game_date,player_name,min_on,min_off
6427,2023-08-05,Luke Norris,71.0,
6428,2023-08-05,Reece McAlear,81.0,
6429,2023-08-05,Lee O'Connor,,81.0
6430,2023-08-05,Kristian Dennis,81.0,
6431,2023-08-05,Kieron Morris,45.0,
6432,2023-08-05,Josh Hawkes,,89.0
6433,2023-08-05,Connor Jennings,,71.0
6434,2023-08-05,Brad Walker,,45.0
6435,2023-08-05,Samuel Taylor,89.0,
6436,2023-08-05,Harvey Saunders,,81.0


In [60]:
# Substitution rows from sb_sub_mins for dates that cr_subs does not contain,
# plus sub_min: the smaller of min_on / min_off (NaN is skipped, so this is
# whichever minute is present when only one is).
sb_subs = (
    sb_sub_mins
    .loc[~sb_sub_mins.game_date.isin(cr_subs.game_date)]
    .assign(sub_min = lambda events: events[['min_on', 'min_off']].min(axis = 1))
)

# Split into players coming on and players going off.
sb_subs_on = sb_subs.dropna(subset = ['min_on'])
sb_subs_off = sb_subs.dropna(subset = ['min_off'])

sb_subs_on.head(3)

Unnamed: 0,game_date,player_name,min_on,min_off,sub_min
707,1999-08-07,Michael Black,80.0,,80.0
709,1999-08-07,David Kelly,80.0,,80.0
712,1999-08-10,Stephen Frail,85.0,,85.0


In [61]:
# (game_date, min_on) pairs that occur exactly once, i.e. minutes in which only
# one player came on in that game.
sb_subs_on_unique = (
    sb_subs_on
    .groupby(["game_date", "min_on"])
    .size()
    .reset_index(name = 'count')
    .loc[lambda counts: counts["count"] == 1, ["game_date", "min_on"]]
)

# Restrict the "on" rows to those pairs and pick up shirt numbers from player_apps.
sb_subs_on_unique = (
    sb_subs_on
    .merge(sb_subs_on_unique, on = ["game_date", "min_on"], how = "inner")
    .merge(player_apps, on = ["game_date", "player_name"], how = "inner")
    .loc[:, ["game_date", "shirt_no", "player_name", "min_on"]]
)

sb_subs_on_unique.head(3)

Unnamed: 0,game_date,shirt_no,player_name,min_on
0,1999-08-10,18,Stephen Frail,85.0
1,1999-08-14,17,Jason Koumas,80.0
2,1999-08-14,14,Andy Parkinson,29.0


In [62]:
# (game_date, min_off) pairs that occur exactly once, i.e. minutes in which only
# one player went off in that game.
sb_subs_off_unique = (
    sb_subs_off
    .groupby(["game_date", "min_off"])
    .size()
    .reset_index(name = 'count')
    .loc[lambda counts: counts["count"] == 1, ["game_date", "min_off"]]
)

# Restrict the "off" rows to those pairs and pick up shirt numbers from player_apps.
sb_subs_off_unique = (
    sb_subs_off
    .merge(sb_subs_off_unique, on = ["game_date", "min_off"], how = "inner")
    .merge(player_apps, on = ["game_date", "player_name"], how = "inner")
    .loc[:, ["game_date", "shirt_no", "player_name", "min_off"]]
)

sb_subs_off_unique.tail(10)

Unnamed: 0,game_date,shirt_no,player_name,min_off
2519,2024-10-22,22,Lee O'Connor,89.0
2520,2024-10-26,3,Zak Bradshaw,79.0
2521,2024-11-02,9,Luke Norris,62.0
2522,2024-11-09,2,Cameron Norman,45.0
2523,2024-11-12,11,Josh Hawkes,70.0
2524,2024-11-12,20,Samuel Taylor,69.0
2525,2024-11-22,30,Omari Patrick,70.0
2526,2024-11-22,17,Sam Finley,89.0
2527,2024-11-22,9,Luke Norris,71.0
2528,2024-11-22,7,Kieron Morris,83.0


In [63]:
# Pair each player coming on with the player who went off in the same game and
# minute. The row describes the player coming on; on_for is the shirt number of
# the player replaced.
sb_subs_1 = (
    sb_subs_on_unique
    .merge(
        sb_subs_off_unique.rename(columns = {"shirt_no": "on_for"})[["game_date", "min_off", "on_for"]],
        how = "inner",
        left_on = ["game_date", "min_on"],
        right_on = ["game_date", "min_off"],
    )
    .loc[:, ["game_date", "shirt_no", "player_name", "min_on", "on_for"]]
)

sb_subs_1.head(3)

Unnamed: 0,game_date,shirt_no,player_name,min_on,on_for
0,1999-08-10,18,Stephen Frail,85.0,17
1,1999-08-14,17,Jason Koumas,80.0,10
2,1999-08-14,14,Andy Parkinson,29.0,30


In [64]:
# Same on/off pairing as sb_subs_1, seen from the other side. The row describes
# the player going off; off_for is the shirt number of the replacement.
sb_subs_2 = (
    sb_subs_on_unique
    .rename(columns = {"shirt_no": "off_for"})
    .loc[:, ["game_date", "min_on", "off_for"]]
    .merge(
        sb_subs_off_unique,
        how = "inner",
        left_on = ["game_date", "min_on"],
        right_on = ["game_date", "min_off"],
    )
    .loc[:, ["game_date", "shirt_no", "player_name", "min_off", "off_for"]]
)

sb_subs_2.head(3)

Unnamed: 0,game_date,shirt_no,player_name,min_off,off_for
0,1999-08-10,17,Jason Koumas,85.0,18
1,1999-08-14,10,Scott Taylor,80.0,17
2,1999-08-14,30,Reuben Hazell,29.0,14


In [65]:
# Combine the "came on" and "went off" rows. Each row has only one of
# on_for / off_for, so the other is NaN after the concat.
sb_subs = (
    pd.concat([sb_subs_1, sb_subs_2], axis = 0)
    .sort_values("game_date")
    .reset_index(drop = True)
    .loc[:, ["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
)

sb_subs.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
0,1999-08-10,18,Stephen Frail,17.0,
1,1999-08-10,17,Jason Koumas,,18.0
2,1999-08-14,32,Pedro Miguel Matias,,9.0


In [66]:
# Manually maintained substitution corrections, ordered by game date.
sb_sub_fixes = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/seasons/manual/sb_sub_fixes.csv",
        parse_dates = ["game_date"],
    )
    .loc[:, ["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
    .sort_values(["game_date"])
)

sb_sub_fixes.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
0,1999-08-07,29,Michael Black,7.0,
1,1999-08-07,7,Alan Mahon,,29.0
2,1999-08-07,9,David Kelly,14.0,


In [67]:
# FA Trophy fixture rows where a substitution is recorded in either direction
# (on_for or off_for greater than zero; NaN compares as False).
fa_trophy_subs = (
    pd.read_csv(
        "https://raw.githubusercontent.com/petebrown/complete-record/main/fa-trophy/fa_trophy_fixtures.csv",
        parse_dates = ["game_date"],
    )
    .loc[:, ["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
    .loc[lambda fixtures: (fixtures.on_for > 0) | (fixtures.off_for > 0)]
)

fa_trophy_subs.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
5,2015-12-12,29,Ben Tomlinson,,14.0
7,2015-12-12,23,Jake Kirby,,20.0
11,2015-12-12,14,Andy Mangan,29.0,


In [68]:
# Hand-entered substitution rows for 2017-04-29: each player with the shirt
# number they came on for. There is no off_for column in this table.
maidstone_subs = pd.DataFrame(
    [
        ('2017-04-29', 32, 'Tom Croughan', 30),
        ('2017-04-29', 36, 'Jack Fleming', 31),
        ('2017-04-29', 42, 'James Devine', 7),
    ],
    columns = ['game_date', 'shirt_no', 'player_name', 'on_for'],
)

maidstone_subs['game_date'] = pd.to_datetime(maidstone_subs['game_date'])

In [69]:
# Combine every substitution source, then drop all rows for dates that have a
# manual fix: the fix file replaces those games wholesale.
subs_df = (
    pd.concat([cr_subs, sb_subs, fa_trophy_subs, maidstone_subs], axis = 0)
    .loc[lambda subs: ~subs.game_date.isin(sb_sub_fixes.game_date)]
)

# Add the manual fixes back in and tidy the ordering and columns.
subs_df = (
    pd.concat([subs_df, sb_sub_fixes], axis = 0)
    .sort_values("game_date")
    .reset_index(drop = True)
    .loc[:, ["game_date", "shirt_no", "player_name", "on_for", "off_for"]]
)

subs_df.head(3)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for
0,1965-08-23,7,Mandy Hill,,12.0
1,1965-08-23,12,Jack Lornie,7.0,
2,1965-10-15,5,Eddie Stuart,,12.0


#### Save subs_df to CSV

In [70]:
# Write the substitutions table without the positional index column.
subs_df.to_csv(path_or_buf = "data/subs.csv", index = False)

In [71]:
# Temporary regression check: rows in the previously published subs.csv that
# are missing from the freshly built subs_df. An empty result means no loss.
new = subs_df
old = pd.read_csv(
    "https://raw.githubusercontent.com/petebrown/pre-2023-data-prep/main/data/subs.csv",
    parse_dates = ["game_date"],
)

(
    new
    .merge(old, how = "outer", indicator = True)
    .loc[lambda merged: merged["_merge"] == "right_only"]
)

Unnamed: 0,game_date,shirt_no,player_name,on_for,off_for,_merge


In [72]:
# def calc_mins_played(row):
#     role = row["role"]
#     min_on = row["min_on"]
#     min_off = row["min_off"]
#     min_so = row["min_so"]
#     game_length = row["game_length"]
    
#     # Started, played to end
#     if role == "starter" and np.isnan(min_off) and np.isnan(min_so):
#         return game_length
#     # Started, sent off
#     elif role == "starter" and not np.isnan(min_so):
#         return min_so
#     # Started, subbed off
#     elif role == "starter" and not np.isnan(min_off):
#         return min_off
#     # Subbed on, played to end
#     elif role == "sub" and np.isnan(min_off):
#         return game_length - min_on
#     # Subbed on, sent off
#     elif role == "sub" and not np.isnan(min_so):
#         return min_so - min_on
#     # Subbed on, subbed off
#     elif role == "sub" and not np.isnan(min_off):
#         return min_off - min_on
#     else:
#         return "OTHER"
    

# player_events = player_apps \
#     .merge(sub_mins_df, how = "left", on = ["game_date", "player_name"]) \
#     .merge(red_cards_df, how = "left", on = ["game_date", "player_name"]) \
#     .merge(game_lengths, how = "left", on = ["game_date"])

# player_events["mins_played"] = player_events.apply(calc_mins_played, axis = 1)

# player_events.head()

In [73]:
# player_events[(player_events.role == "sub") & (np.isnan(player_events.min_on))].tail(10)

In [74]:
sb_player_info = pd.read_csv("https://raw.githubusercontent.com/petebrown/scrape-player-info/main/data/player-info.csv")

# Manual date-of-birth overrides, keyed by player_id.
# Each value must be a plain 'YYYY-MM-DD' string. The N'Diaye entry previously
# ended with a stray comma, which assigned the 1-tuple ('1973-06-01',) instead.
sb_dob_fixes = {
    181422: '2004-09-23',  # Mikey Davies (DOB added)
    24524: '1980-01-31',   # Akpo Sodje
    51865: '1990-02-10',   # Arnaud Mendy
    59954: '1989-10-31',   # Ben Tomlinson
    69329: '1996-04-25',   # Brad Walker
    21233: '1981-09-05',   # Chris McCready
    49727: '1988-12-06',   # Clayton McDonald
    72487: '1995-02-20',   # Emmanuel Dieseruvwe
    66526: '1990-06-29',   # Nathan Blissett
    136092: '2003-04-23',  # Rob Apter
    51259: '1990-11-15',   # Robbie Burns
    60033: '1993-10-07',   # Ryan Edwards
    24422: '1973-06-01',   # Seyni N'Diaye
    28848: '1987-02-02',   # Stephen O'Leary
    74874: '1994-06-03',   # Will Aimson
}

# Apply each override to every row of that player.
for sb_id, sb_dob in sb_dob_fixes.items():
    sb_player_info.loc[sb_player_info.player_id == sb_id, 'player_dob'] = sb_dob


# One row per (player_id, player_dob) with a known DOB, joined to
# sb_player_ids; ids with no matching player_name are dropped.
sb_player_dobs = (
    sb_player_info[["player_id", "player_dob"]]
    .drop_duplicates()
    .sort_values(by = "player_dob")
    .dropna(subset = ["player_dob"])
    .merge(sb_player_ids, on = "player_id", how = "left")
    .dropna(subset = "player_name")
)

# Distinct (player, season, DOB) combinations taken from the match appearances.
sb_plr_ssns = (
    sb_match_apps[["player_id", "game_date"]]
    .merge(season_dates, how = "left", on = "game_date")
    .merge(sb_player_dobs, how = "left", on = "player_id")
    .loc[:, ["player_name", "season", "player_dob"]]
    .sort_values(["player_dob", "season"])
    .drop_duplicates()
    .reset_index(drop = True)
)

# Normalise player names via the fix_name() lookup.
sb_plr_ssns["player_name"] = sb_plr_ssns["player_name"].map(fix_name)

# player_dob is deliberately left unconverted here; the combined player_dobs
# table is passed through pd.to_datetime later on.

sb_plr_ssns

Unnamed: 0,player_name,season,player_dob
0,John Aldridge,1996/97,1958-09-18
1,John Aldridge,1997/98,1958-09-18
2,Dave Higgins,1996/97,1961-08-19
3,Eric Nixon,1996/97,1962-10-04
4,Eric Nixon,1999/00,1962-10-04
...,...,...,...
884,Samuel Taylor,2024/25,2003-12-23
885,Jay Turner-Cooke,2022/23,2003-12-31
886,Arthur Lomax,2022/23,2004-05-17
887,Josh Williams,2024/25,2004-07-13


In [75]:
def display_dob(dob, dob_qtr, dob_yr, dob_yr_is_est):
    """Build a fallback birth-date label for a player with no full date of birth.

    Returns None when ``dob`` is truthy (a full date exists, so no label is
    needed). Otherwise returns, in order of preference:

    * ``"Q<qtr> <year>"`` when both quarter and year are truthy,
    * ``"<year>"`` when only the year is truthy and it is not an estimate,
    * ``"~<year>"`` when only the year is truthy and it is an estimate,
    * ``"Unknown"`` when there is no year.

    Every ".0" in the label is removed, so float inputs such as 1885.0 are
    rendered as "1885".
    """
    if dob:
        return None

    if dob_yr and dob_qtr:
        label = f"Q{dob_qtr} {dob_yr}"
    elif dob_yr:
        label = f"~{dob_yr}" if dob_yr_is_est else str(dob_yr)
    else:
        label = "Unknown"

    return label.replace(".0", "")
    
cr_player_info = (
    pd.read_csv("https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_stats.csv")
    .rename(columns = {"dob": "player_dob"})
)

cr_player_info["player_name"] = cr_player_info["forename"] + " " + cr_player_info["surname"]

# DOBs found on Transfermarkt
dob_fixes = {
    'Edgar Walkden': '1914-11-04',
    'Ernie Davies': '1916-01-31',
    'Gary Bennett': '1963-09-20',
    'Gwyn Jones': '1912-02-21',
    'Jack Kearns': '1914-01-04',
    'James Cassidy': '1911-08-03',
    'John Griffiths': '1916-06-30'
}

for fixed_name, fixed_dob in dob_fixes.items():
    cr_player_info.loc[cr_player_info['player_name'] == fixed_name, 'player_dob'] = fixed_dob

# Missing DOB parts become False so display_dob() can test them for truthiness.
dob_part_cols = ["player_dob", "dob_qtr", "dob_yr"]
cr_player_info[dob_part_cols] = cr_player_info[dob_part_cols].fillna(False)

cr_player_info["dob_yr_is_est"] = cr_player_info["dob_yr_is_est"].astype(bool)

cr_player_info["dob_display"] = cr_player_info.apply(
    lambda info: display_dob(info.player_dob, info.dob_qtr, info.dob_yr, info.dob_yr_is_est),
    axis = 1,
)

# Names that appear with more than one distinct DOB.
(
    cr_player_info[['player_name', 'player_dob']]
    .drop_duplicates()
    .player_name
    .value_counts()
    .to_frame()
    .reset_index()
    .query("count > 1")
)

Unnamed: 0,player_name,count
0,John Byrne,2


In [76]:
cr_dob_dict = cr_player_info[["player_name", "player_dob", "dob_display", "ssn_join", "ssn_lve"]].to_dict("records")

# One record per player per season between ssn_join and ssn_lve (inclusive).
# Season labels are the start year plus the last two digits of the following
# year, e.g. 1921 -> "1921/22".
cr_dobs = pd.DataFrame([
    {
        "player_name": info["player_name"],
        "player_dob": info["player_dob"],
        "dob_display": info["dob_display"],
        "season": f"{start_year}/{str(start_year + 1)[2:]}",
    }
    for info in cr_dob_dict
    for start_year in range(info["ssn_join"], info["ssn_lve"] + 1)
])

# Keep only the (player, season) combinations that appear in cr_player_apps.
cr_plr_ssns = (
    cr_player_apps
    .merge(season_dates, on = ["game_date"], how = "left")
    .merge(cr_dobs, on = ["season", "player_name"], how = "inner")
    .loc[:, ["player_name", "season", "player_dob", "dob_display"]]
    .drop_duplicates()
    .reset_index(drop = True)
)

cr_plr_ssns

Unnamed: 0,player_name,season,player_dob,dob_display
0,Harry Bradshaw,1921/22,1895-01-22,
1,John Grainger,1921/22,False,~1896
2,Tom Stuart,1921/22,1893-10-25,
3,Johnny Campbell,1921/22,1894-10-14,
4,Charles Milnes,1921/22,False,Q2 1885
...,...,...,...,...
1636,Andy Thorn,1996/97,1966-11-12,
1637,Eric Nixon,1996/97,1962-10-04,
1638,Dave Challinor,1996/97,1975-10-02,
1639,Lee Jones,1996/97,1973-05-29,


In [77]:
# Hand-entered player-season rows (2016/17) with dates of birth, converted to
# datetimes once the frame is built.
maidstone_plr_ssns = pd.DataFrame(
    [
        ('Tom Croughan', '2016/17', '1999-09-19'),
        ('Jack Fleming', '2016/17', '1999-01-10'),
        ('James Devine', '2016/17', '1999-07-26'),
    ],
    columns = ['player_name', 'season', 'player_dob'],
)

maidstone_plr_ssns['player_dob'] = pd.to_datetime(maidstone_plr_ssns['player_dob'])

In [78]:
# Stack the three player-season tables. Only cr_plr_ssns has a dob_display
# column, so it is NaN for the other two; the original row labels are kept.
player_dobs = pd.concat(
    objs = [cr_plr_ssns, sb_plr_ssns, maidstone_plr_ssns],
    axis = 0,
)
player_dobs

Unnamed: 0,player_name,season,player_dob,dob_display
0,Harry Bradshaw,1921/22,1895-01-22,
1,John Grainger,1921/22,False,~1896
2,Tom Stuart,1921/22,1893-10-25,
3,Johnny Campbell,1921/22,1894-10-14,
4,Charles Milnes,1921/22,False,Q2 1885
...,...,...,...,...
887,Josh Williams,2024/25,2004-07-13,
888,Mikey Davies,2023/24,2004-09-23,
0,Tom Croughan,2016/17,1999-09-19 00:00:00,
1,Jack Fleming,2016/17,1999-01-10 00:00:00,


In [79]:
# Names that occur with more than one distinct player_dob, i.e. different
# players sharing a name (or one player with inconsistent DOB values).
dupe_players = (
    player_dobs[["player_name", "player_dob"]]
    .drop_duplicates()
    .player_name
    .value_counts()
    .loc[lambda dob_counts: dob_counts > 1]
    .index
    .tolist()
)

dupe_players

['John Byrne', 'John Johnson', 'Ryan Williams', 'Charlie Jolley']

In [80]:
# Display the combined player_dobs table again; nothing is modified in this cell.
player_dobs

Unnamed: 0,player_name,season,player_dob,dob_display
0,Harry Bradshaw,1921/22,1895-01-22,
1,John Grainger,1921/22,False,~1896
2,Tom Stuart,1921/22,1893-10-25,
3,Johnny Campbell,1921/22,1894-10-14,
4,Charles Milnes,1921/22,False,Q2 1885
...,...,...,...,...
887,Josh Williams,2024/25,2004-07-13,
888,Mikey Davies,2023/24,2004-09-23,
0,Tom Croughan,2016/17,1999-09-19 00:00:00,
1,Jack Fleming,2016/17,1999-01-10 00:00:00,


In [81]:
def get_pl_index(player_name, surname, forename, dob, dob_display):
    """Build the "Surname, Forename" index label for a player.

    Names listed in the module-level ``dupe_players`` get a birth-year suffix
    so that different players sharing a name stay distinguishable:

    * ``(b.<year>)`` from ``dob`` when a date of birth is present,
    * otherwise ``(b.<last four characters of dob_display>)``.

    Parameters
    ----------
    player_name : str
        Full name, checked against ``dupe_players``.
    surname, forename : str
        Name parts used to build the label.
    dob : pd.Timestamp or missing value
        Date of birth; NaT, None and NaN are all treated as missing.
    dob_display : str
        Fallback DOB label; only read when the name is duplicated and ``dob``
        is missing.
    """
    label = f"{surname}, {forename}"

    if player_name not in dupe_players:
        return label

    # pd.notna() recognises NaT as well as None/NaN. The previous identity test
    # (`dob is not pd.NaT`) let None/NaN through and then failed on `.year`.
    if pd.notna(dob):
        return f"{label} (b.{dob.year})"

    # NOTE(review): this takes the last four characters verbatim, so a
    # dob_display of "Unknown" would yield "(b.nown)" - confirm that no
    # duplicated name has an unknown birth year.
    return f"{label} (b.{dob_display[-4:]})"
    
# Split each name on spaces once and reuse the parts below.
name_parts = player_dobs.player_name.str.split(" ")

player_dobs["name_words"] = name_parts.apply(len)

# Names that are not exactly "Forename Surname" (kept for inspection).
long_names = player_dobs.query("name_words != 2").player_name.drop_duplicates()

# Default split: the first word is the forename, everything after it the surname.
player_dobs["forename"] = name_parts.str[0]
player_dobs["surname"] = name_parts.str[1:].str.join(" ")

# Exception: "Pedro Miguel" is a two-word forename.
is_pedro_matias = player_dobs.player_name == "Pedro Miguel Matias"
player_dobs.loc[is_pedro_matias, 'forename'] = 'Pedro Miguel'
player_dobs.loc[is_pedro_matias, 'surname'] = 'Matias'

# Unparseable values (such as the False placeholders) become NaT.
player_dobs["player_dob"] = pd.to_datetime(player_dobs.player_dob, errors = "coerce")

player_dobs["pl_index"] = player_dobs.apply(
    lambda rec: get_pl_index(rec.player_name, rec.surname, rec.forename, rec.player_dob, rec.dob_display),
    axis = 1,
)

player_dobs = (
    player_dobs
    .drop(columns = ["name_words"])
    .sort_values(by = ["pl_index", "season"])
    .drop_duplicates()
    .reset_index(drop = True)
)

player_dobs

Unnamed: 0,player_name,season,player_dob,dob_display,forename,surname,pl_index
0,Alan A'Court,1964/65,1934-09-30,,Alan,A'Court,"A'Court, Alan"
1,Alan A'Court,1965/66,1934-09-30,,Alan,A'Court,"A'Court, Alan"
2,John Achterberg,1998/99,1971-07-08,,John,Achterberg,"Achterberg, John"
3,John Achterberg,1999/00,1971-07-08,,John,Achterberg,"Achterberg, John"
4,John Achterberg,2000/01,1971-07-08,,John,Achterberg,"Achterberg, John"
...,...,...,...,...,...,...,...
2503,Tommy Young,1976/77,1947-12-24,,Tommy,Young,"Young, Tommy"
2504,Calvin Zola,2004/05,1984-12-31,,Calvin,Zola,"Zola, Calvin"
2505,Calvin Zola,2005/06,1984-12-31,,Calvin,Zola,"Zola, Calvin"
2506,Calvin Zola,2006/07,1984-12-31,,Calvin,Zola,"Zola, Calvin"


In [82]:
# Write the player DOB table without the positional index column.
player_dobs.to_csv(path_or_buf = "./data/player_dobs.csv", index = False)

In [83]:
# One row per distinct (pl_index, player_dob, dob_display) combination.
(
    player_dobs
    .loc[:, ["pl_index", "player_dob", "dob_display"]]
    .drop_duplicates()
    .reset_index(drop = True)
)

Unnamed: 0,pl_index,player_dob,dob_display
0,"A'Court, Alan",1934-09-30,
1,"Achterberg, John",1971-07-08,
2,"Adams, Arthur",NaT,Unknown
3,"Adams, Edward",1908-11-12,
4,"Adkins, Nigel",1965-03-11,
...,...,...,...
1031,"Yates, Wilf",NaT,1898
1032,"Yeats, Ron",1937-11-15,
1033,"Young, Jack",2000-10-21,
1034,"Young, Tommy",1947-12-24,


In [84]:
# Most recent goals that have no recorded minute.
goals_df[goals_df.goal_min.isna()].tail(10)

Unnamed: 0,game_date,player_name,goal_min,penalty,own_goal
5025,1994-04-16,Ian Muir,,,0
5026,1994-04-23,John Aldridge,,,0
5027,1994-04-23,John Aldridge,,,0
5028,1994-04-23,Kenny Irons,,,0
5029,1994-04-26,Ged Brannan,,,0
5030,1994-04-26,John Aldridge,,,0
5031,1994-04-30,John Aldridge,,,0
5032,1994-04-30,Ian Muir,,,0
5033,1994-05-08,Ian Muir,,,0
5034,1994-05-18,Pat Nevin,,,0
