## Process downloaded line-ups

In [39]:
import glob
import json
import pandas as pd

def shirt_number(person):
    """Return the shirt number from a player dict, or None when it is absent."""
    try:
        return person["meta"]["uniformNumber"]
    except (KeyError, TypeError):
        return None


def parse_line_up(data):
    """Flatten one line-up payload into plain record dicts.

    Args:
        data: Decoded JSON for one match. Either a dict holding "event_id"
            and "data", or a list whose first element is that dict.

    Returns:
        A tuple ``(apps, subs, sub_mins, cards)`` of lists of dicts:
        one appearance per listed Tranmere player, two rows per
        substitution in ``subs`` and in ``sub_mins`` (player coming on and
        player going off), and one row per booking.
    """
    # Some saved files wrap the payload in a single-element list.
    if isinstance(data, list):
        data = data[0]
    event_id = data["event_id"]

    teams = data["data"]["teams"]
    if teams["homeTeam"]["name"] == "Tranmere Rovers":
        side = "homeTeam"
    else:
        side = "awayTeam"

    apps, subs, sub_mins, cards = [], [], [], []
    for player in teams[side]["players"]:
        player_name = player["name"]["full"]
        shirt_no = shirt_number(player)

        apps.append({
            "event_id": event_id,
            "player_name": player_name,
            "shirt_no": shirt_no,
            "role": player["meta"]["status"].replace("bench", "sub"),
        })

        for booking in player.get("bookings") or []:
            cards.append({
                **booking,
                "event_id": event_id,
                "player_name": player_name,
            })

        substitutions = player.get("substitutions") or []
        if not substitutions:
            continue

        # Only the first substitution entry is used; `player` is the one
        # going off and "replacedBy" is the one coming on.
        replaced_by = substitutions[0]["replacedBy"]
        sub_min = substitutions[0]["timeElapsed"]
        sub_on_no = shirt_number(replaced_by)
        player_on = replaced_by["name"]["full"]

        sub_mins.append({
            "event_id": event_id,
            "player_name": player_name,
            "min_off": sub_min,
            "min_on": "",
        })
        sub_mins.append({
            "event_id": event_id,
            "player_name": player_on,
            "min_off": "",
            "min_on": sub_min,
        })

        # Each row pairs a shirt number with the name of the player who
        # wears it (the names were previously swapped between the rows).
        subs.append({
            "event_id": event_id,
            "shirt_no": sub_on_no,
            "player_name": player_on,
            "on_for": shirt_no,
            "off_for": "",
        })
        subs.append({
            "event_id": event_id,
            "shirt_no": shirt_no,
            "player_name": player_name,
            "on_for": "",
            "off_for": sub_on_no,
        })

    return apps, subs, sub_mins, cards


line_ups = sorted(glob.glob("./lineups/*.json"))

# Collect plain records first and build each DataFrame once at the end;
# concatenating inside the loops copies the accumulated data every time.
app_records = []
sub_records = []
sub_min_records = []
card_records = []

for line_up in line_ups:
    with open(line_up, encoding="utf-8") as f:
        data = json.load(f)

    apps, subs, sub_mins, cards = parse_line_up(data)
    app_records.extend(apps)
    sub_records.extend(subs)
    sub_min_records.extend(sub_mins)
    card_records.extend(cards)

# Explicit columns keep "event_id" present even when there are no rows, so a
# later merge on it still works. dtype=object stops shirt numbers and minutes
# being upcast to floats when some values are None.
all_apps_df = pd.DataFrame(
    app_records,
    columns=["event_id", "player_name", "shirt_no", "role"],
    dtype=object,
)
all_subs_df = pd.DataFrame(
    sub_records,
    columns=["event_id", "shirt_no", "player_name", "on_for", "off_for"],
    dtype=object,
)
all_sub_mins_df = pd.DataFrame(
    sub_min_records,
    columns=["event_id", "player_name", "min_off", "min_on"],
    dtype=object,
)

if card_records:
    cards_df = pd.json_normalize(card_records)
    card_type = cards_df["type"]

    yellows_df = cards_df.loc[card_type == "yellow-card", ["event_id", "player_name"]]
    all_yellows_df = yellows_df.assign(yellow_card=1).reset_index(drop=True)

    # Any booking type containing "red" counts as a sending-off.
    reds_df = cards_df.loc[
        card_type.str.contains("red", na=False),
        ["event_id", "player_name", "timeElapsed"],
    ]
    all_reds_df = reds_df.rename(columns={"timeElapsed": "min_so"}).reset_index(drop=True)
else:
    all_yellows_df = pd.DataFrame(columns=["event_id", "player_name", "yellow_card"])
    all_reds_df = pd.DataFrame(columns=["event_id", "player_name", "min_so"])

# ALL REQUIRE GAME DATE
# all_subs_df
# all_sub_mins_df
# all_yellows_df
# all_reds_df

## Process downloaded match data

In [2]:
import glob
import json
import pandas as pd
import re

def build_match_record(match, comp_data):
    """Build one flat result record for a completed competitive match.

    Args:
        match: One entry of a competition's "tournamentDatesWithEvents"
            list; its first "events" item holds the fixture details.
        comp_data: The competition's "tournamentMeta" dict.

    Returns:
        A dict of match fields from Tranmere's point of view, or None when
        the match is not finished ("post-event") or is a friendly.
    """
    match_data = match["events"][0]

    if match_data["eventStatus"] != "post-event":
        return None
    if comp_data["tournamentName"]["first"] == "Friendly Match":
        return None

    cup_data = match["round"]
    event_id = match_data["eventKey"]

    if match_data["homeTeam"]["name"]["full"] == "Tranmere Rovers":
        tranmere, opponent = "homeTeam", "awayTeam"
    else:
        tranmere, opponent = "awayTeam", "homeTeam"

    game_date = match_data["startTime"][:10]
    opposition = match_data[opponent]["name"]["full"]

    stadium = match_data["venue"]["name"]["full"]
    if stadium == "Wembley Stadium":
        venue = "N"
    elif stadium == "Prenton Park":
        venue = "H"
    else:
        venue = "A"

    goals_for = match_data[tranmere]["scores"]["score"]
    goals_against = match_data[opponent]["scores"]["score"]
    score = f"{goals_for}-{goals_against}"

    if goals_for > goals_against:
        outcome = "W"
    elif goals_for < goals_against:
        outcome = "L"
    else:
        outcome = "D"

    goal_diff = goals_for - goals_against

    if comp_data["tournamentName"]["first"] in ["League One", "League Two", "National League"]:
        # Default to a regular league game so that a round without
        # "Play-offs" in its name no longer inherits the previous match's value.
        # NOTE(review): the stage name is read from cup_data["name"] further
        # down, yet this test looks for a nested "round" key - confirm that
        # play-off games are actually detected here.
        game_type = "League"
        if "round" in cup_data and "Play-offs" in cup_data["round"]["full"]:
            game_type = "League Play-Off"
    else:
        game_type = "Cup"

    competition = comp_data["tournamentName"]["full"]
    ko_time = match_data["startTimeInUKHHMM"]

    # Every optional field starts as None so nothing carries over from the
    # previously processed match.
    cup_leg = None
    cup_replay = None
    event_type = match_data["eventType"]
    if event_type:
        if " leg" in event_type:
            cup_leg = event_type[:1]
        if event_type.upper() == "REPLAY":
            cup_replay = 1

    cup_stage = None
    cup_section = None
    if cup_data["name"]:
        cup_stage = cup_data["name"]["full"]
        section = re.search(r"North(?:ern)?", cup_stage)
        if section:
            cup_section = section.group(0)

    if match_data["eventProgress"] == "EXTRATIMECOMPLETE":
        aet = 1
    else:
        aet = None

    pen_gf = match_data[tranmere]["scores"]["shootout"]
    pen_ga = match_data[opponent]["scores"]["shootout"]

    # NOTE(review): the truthiness test skips a shootout in which Tranmere
    # scored 0 - confirm what the feed sends when there was no shootout.
    pen_outcome = None
    if pen_gf:
        if pen_gf > pen_ga:
            pen_outcome = "W"
        elif pen_gf < pen_ga:
            pen_outcome = "L"

    # No shootout values at all means no score, rather than "None-None".
    if pen_gf is None and pen_ga is None:
        pen_score = None
    else:
        pen_score = f"{pen_gf}-{pen_ga}"

    agg_gf = match_data[tranmere]["scores"]["aggregate"]
    agg_ga = match_data[opponent]["scores"]["aggregate"]

    agg_outcome = None
    if agg_gf:
        if agg_gf > agg_ga:
            agg_outcome = "W"
        elif agg_gf < agg_ga:
            agg_outcome = "L"
        else:
            agg_outcome = "D"

    agg_score = None
    if agg_outcome:
        agg_score = f"{agg_gf}-{agg_ga}"

    decider = match_data["eventOutcomeType"]
    if decider == "shootout":
        decider = "pens"
    if decider == "extra-time":
        decider = "extra time"

    if aet or pen_outcome or agg_outcome:
        cup_outcome = match_data[tranmere]["eventOutcome"].upper()[:1]
    else:
        cup_outcome = None

    if pen_outcome:
        verdict = "Won" if pen_outcome == "W" else "Lost"
        outcome_desc = f"{verdict} {pen_score} on pens"
        if agg_outcome:
            outcome_desc = f"{agg_score}. {outcome_desc}"
    elif agg_outcome:
        # NOTE(review): a level aggregate ("D") is described as "Lost".
        verdict = "Won" if agg_outcome == "W" else "Lost"
        outcome_desc = f"{verdict} {agg_score} on agg"
    else:
        outcome_desc = None

    game_length = 120 if aet == 1 else 90

    return {
        "game_date": game_date,
        "opposition": opposition,
        "venue": venue,
        "goals_for": goals_for,
        "goals_against": goals_against,
        "score": score,
        "outcome": outcome,
        "goal_diff": goal_diff,
        "game_type": game_type,
        "competition": competition,
        "ko_time": ko_time,
        "cup_leg": cup_leg,
        "cup_stage": cup_stage,
        "cup_replay": cup_replay,
        "cup_section": cup_section,
        "aet": aet,
        "pen_gf": pen_gf,
        "pen_ga": pen_ga,
        "pen_outcome": pen_outcome,
        "pen_score": pen_score,
        "agg_gf": agg_gf,
        "agg_ga": agg_ga,
        "agg_outcome": agg_outcome,
        "agg_score": agg_score,
        "decider": decider,
        "cup_outcome": cup_outcome,
        "outcome_desc": outcome_desc,
        "game_length": game_length,
        "stadium": stadium,
        "event_id": event_id,
    }


all_months_json = sorted(glob.glob("./matches/*.json"))

all_match_records = []
for month_json in all_months_json:
    with open(month_json, encoding="utf-8") as f:
        data = json.load(f)

    comps = data["payload"][0]["body"]["matchData"]

    for comp in comps:
        comp_data = comp["tournamentMeta"]

        for day_events in comp["tournamentDatesWithEvents"].values():
            # Only the first entry under each date key is processed.
            match = day_events[0]

            match_record = build_match_record(match, comp_data)
            if match_record is None:
                continue

            print(match_record["game_date"])
            all_match_records.append(match_record)

            # Keep the raw match JSON, named by event id, for later cells.
            event_id = match_record["event_id"]
            filename = f"./matches/match-data/{event_id}.json"
            with open(filename, "w", encoding="utf-8") as outfile:
                json.dump(match, outfile)
                print(f"Saved data to {filename}" )

matches_df = pd.DataFrame(all_match_records)

2011-08-06
Saved data to ./matches/match-data/EFBO361411.json
2011-08-13
Saved data to ./matches/match-data/EFBO361420.json
2011-08-16
Saved data to ./matches/match-data/EFBO361433.json
2011-08-20
Saved data to ./matches/match-data/EFBO361447.json
2011-08-27
Saved data to ./matches/match-data/EFBO361451.json
2011-08-30
Saved data to ./matches/match-data/EFBO381822.json
2011-09-03
Saved data to ./matches/match-data/EFBO361471.json
2011-09-10
Saved data to ./matches/match-data/EFBO361479.json
2011-09-13
Saved data to ./matches/match-data/EFBO361495.json
2011-09-17
Saved data to ./matches/match-data/EFBO361507.json
2011-09-24
Saved data to ./matches/match-data/EFBO361518.json
2011-10-01
Saved data to ./matches/match-data/EFBO361531.json
2011-10-01
Saved data to ./matches/match-data/EFBO361531.json
2011-10-08
Saved data to ./matches/match-data/EFBO361537.json
2011-10-12
Saved data to ./matches/match-data/EFBO419418.json
2011-10-15
Saved data to ./matches/match-data/EFBO361555.json
2011-10-

In [4]:
def insert_round_no(cup_stage):
    """Map a cup stage name to a short round code.

    Args:
        cup_stage: Stage name such as "Third Round" or "Play-off Final".
            None, NaN and any other non-string value are treated as missing.

    Returns:
        "F", "SF", "QF", "5", "4", "3", "2", "1" or "G", or None when the
        stage is missing or not recognised.
    """
    if not isinstance(cup_stage, str):
        return None

    stage = cup_stage.upper()

    # A bare "Final" must match too. The leading space in " FINAL" keeps
    # "SEMI-FINALS" and "QUARTER-FINALS" from being classed as the final.
    if stage == "FINAL" or " FINAL" in stage:
        return "F"
    if stage in ("PLAY-OFFS", "SEMI-FINALS"):
        return "SF"

    substring_codes = (
        ("QUARTER-FINALS", "QF"),
        ("FIFTH ROUND", "5"),
        ("FOURTH ROUND", "4"),
        ("THIRD ROUND", "3"),
        ("SECOND ROUND", "2"),
        ("FIRST ROUND", "1"),
        ("GROUP", "G"),
    )
    for marker, code in substring_codes:
        if marker in stage:
            return code
    return None

# Derive the short round code from each stage name, then preview a few of
# the rows that actually have a stage.
matches_df["cup_round"] = matches_df["cup_stage"].apply(insert_round_no)
matches_df.loc[matches_df["cup_stage"].notna(), ["cup_stage", "cup_round"]].head(3)

Unnamed: 0,cup_stage,cup_round
372,Play-off Final,F
381,Northern Group D,G
388,Northern Group D,G


In [5]:
# Strip the sponsor prefixes so competition names are stable across seasons.
matches_df["competition"] = (
    matches_df["competition"]
    .str.replace("Sky Bet League", "League")
    .str.replace("Vanarama National", "National")
)

In [6]:
# Sponsor-specific competition name -> generic competition family.
# Competitions missing from this mapping come out as NaN in "generic_comp".
generic_comps_map = dict([
    ("Carabao Cup", "League Cup"),
    ("FA Cup Qualifying", "FA Cup Qualifying"),
    ("Isuzu FA Trophy", "FA Trophy"),
    ("League One", "Football League"),
    ("League Two", "Football League"),
    ("National League", "Non-League"),
    ("Papa John's Trophy", "Associate Members' Cup"),
    ("The Emirates FA Cup", "FA Cup"),
])
matches_df["generic_comp"] = matches_df["competition"].map(generic_comps_map)

# Show every distinct pairing to check the mapping by eye.
comp_pairs = matches_df[["competition", "generic_comp"]].drop_duplicates()
comp_pairs.sort_values(by="competition")

Unnamed: 0,competition,generic_comp
102,Carabao Cup,League Cup
229,FA Cup Qualifying,FA Cup Qualifying
237,Isuzu FA Trophy,FA Trophy
0,League One,Football League
156,League Two,Football League
211,National League,Non-League
5,Papa John's Trophy,Associate Members' Cup
69,The Emirates FA Cup,FA Cup


In [11]:
import json
import pandas as pd
import requests
import glob
import time

def clean_meta_value(meta, key):
    """Return meta[key] as a string with commas removed, or None if absent/null."""
    value = meta.get(key)
    if value is None:
        return None
    return str(value).replace(",", "")


def extract_attendance_and_referee(data, lineup_url):
    """Pull attendance and referee out of a decoded line-ups response.

    Args:
        data: Decoded JSON body of the line-ups endpoint.
        lineup_url: URL the data came from; only used in the messages
            printed when a value is missing.

    Returns:
        ``(attendance, referee)``; each is a comma-free string or None.
    """
    meta = data.get("meta") if isinstance(data, dict) else None
    meta = meta or {}

    attendance = clean_meta_value(meta, "attendance")
    if attendance is None:
        print(f"Attendance not found at {lineup_url}")

    referee = clean_meta_value(meta, "referee")
    if referee is None:
        print(f"Referee not found at {lineup_url}")

    return attendance, referee


matches = sorted(glob.glob("./matches/match-data/*.json"))

attendances = []
for match in matches:
    event_id = match.replace("./matches/match-data/", "").replace(".json", "")

    lineup_url = f"https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/{event_id}/version/1.0.8"

    try:
        # A timeout stops one stalled connection from hanging the whole run.
        r = requests.get(lineup_url, timeout=10)
    except requests.RequestException as err:
        # Report the failed request and carry on with the remaining matches.
        print(f"Request failed at {lineup_url}: {err}")
        continue
    finally:
        # Pause between requests whether or not this one succeeded.
        time.sleep(0.2)

    try:
        data = r.json()
    except ValueError:
        # ValueError covers the JSON decode errors of both the stdlib json
        # module and simplejson, whichever requests is using.
        print(f"JSONDecodeError at {lineup_url}")
        continue

    attendance, referee = extract_attendance_and_referee(data, lineup_url)
    attendances.append({
        "event_id": event_id,
        "attendance": attendance,
        "referee": referee
    })

Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2127743/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2128099/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2128103/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2128104/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2130365/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2130383/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-football-team-lineups-data/event/EFBO2130387/version/1.0.8
Attendance not found at https://push.api.bbci.co.uk/data/bbc-morph-sport-foo

In [12]:
# Turn the collected per-match dicts into a table and preview the first rows.
attendances = pd.DataFrame(data=attendances)
attendances.head(n=3)

Unnamed: 0,event_id,attendance,referee
0,EFBO1014828,2174,Graham Salisbury
1,EFBO1014831,1379,Peter Wright
2,EFBO1014833,747,Paul Marsden


In [17]:
import json
import glob
import pandas as pd

matches = sorted(glob.glob("./matches/match-data/*.json"))

# One row per saved match file: the event id comes from the file name and
# the date is the first ten characters of the first event's start time.
game_dates = []
for match in matches:
    with open(match) as f:
        data = json.load(f)

    game_dates.append({
        "event_id": match.replace("./matches/match-data/", "").replace(".json", ""),
        "game_date": data["events"][0]["startTime"][:10],
    })

game_dates = pd.DataFrame(game_dates).drop_duplicates()

In [18]:
# Display the event_id -> game_date lookup table for a visual check.
game_dates

Unnamed: 0,event_id,game_date
0,EFBO1014828,2018-09-04
1,EFBO1014831,2018-10-09
2,EFBO1014833,2018-10-30
3,EFBO1032591,2018-11-10
4,EFBO1033275,2018-11-20
...,...,...
626,EFBO992566,2019-04-13
627,EFBO992579,2019-04-19
628,EFBO992584,2019-04-22
629,EFBO992603,2019-04-30


In [22]:
# Keep every dated match; attendance and referee are NaN where none was fetched.
atts_and_refs = pd.merge(game_dates, attendances, how="left", on="event_id")

In [40]:
def attach_game_date(frame, dates):
    """Swap a frame's ``event_id`` column for a leading ``game_date`` column.

    Args:
        frame: DataFrame keyed by ``event_id``.
        dates: DataFrame with ``event_id`` and ``game_date`` columns.

    Returns:
        A new DataFrame with ``game_date`` as the first column and
        ``event_id`` removed. Events missing from ``dates`` get NaN. A frame
        with no ``event_id`` column is returned with ``game_date`` as its
        first column (added empty if it was not already there).
    """
    if "event_id" not in frame.columns:
        # Either the frame was already processed (cell re-run) or it is an
        # empty frame with no columns; in both cases there is nothing to merge.
        if "game_date" in frame.columns:
            return frame
        return frame.reindex(columns=["game_date", *frame.columns])

    merged = frame.merge(dates, on="event_id", how="left")
    # pop() runs before insert(), which moves the column to the front.
    merged.insert(0, "game_date", merged.pop("game_date"))
    return merged.drop(columns=["event_id"])


all_apps_df = attach_game_date(all_apps_df, game_dates)
all_subs_df = attach_game_date(all_subs_df, game_dates)
all_sub_mins_df = attach_game_date(all_sub_mins_df, game_dates)
all_yellows_df = attach_game_date(all_yellows_df, game_dates)
all_reds_df = attach_game_date(all_reds_df, game_dates)

In [51]:
# Write each table to its CSV, leaving out rows whose event has no known date.
csv_targets = [
    (all_apps_df, "./processed-csvs/bbc_api_apps.csv"),
    (all_subs_df, "./processed-csvs/bbc_api_subs.csv"),
    (all_sub_mins_df, "./processed-csvs/bbc_api_sub_mins.csv"),
    (all_yellows_df, "./processed-csvs/bbc_api_yellows.csv"),
    (all_reds_df, "./processed-csvs/bbc_api_reds.csv"),
]
for frame, csv_path in csv_targets:
    dated_rows = frame.query("~game_date.isna()")
    dated_rows.to_csv(csv_path, index=False)