In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
pd.set_option('display.max_columns', 999)

In [22]:
# Get footywire data
df_footywire_data = pd.read_csv(r'inputs/footywire_data 2020-01-05_15-08.csv', index_col=0)
df_footywire_data.tail(3)

Unnamed: 0,Date,Season,Round,Venue,Player,Team,Opposition,Status,Match_id,CP,UP,ED,DE,CM,GA,MI5,One.Percenters,BO,TOG,K,HB,D,M,G,B,T,HO,GA1,I50,CL,CG,R50,FF,FA,AF,SC,CCL,SCL,SI,MG,TO,ITC,T5
90418,2019-09-28,2019,Grand Final,MCG,Adam Kennedy,GWS,Richmond,Away,9927,4,8,9,81.8,0,0,0,1,0,71,5,6,11,4,0,0,4,0,0,2,1,0,0,1,0,56,55,0.0,1.0,1.0,121.0,0.0,5.0,1.0
90419,2019-09-28,2019,Grand Final,MCG,Shane Mumford,GWS,Richmond,Away,9927,8,2,7,70.0,0,0,0,4,0,75,2,8,10,0,0,0,1,15,0,1,2,2,1,1,0,42,62,1.0,1.0,0.0,0.0,1.0,2.0,0.0
90420,2019-09-28,2019,Grand Final,MCG,Jeremy Finlayson,GWS,Richmond,Away,9927,1,1,1,100.0,0,0,0,3,0,63,1,0,1,1,0,0,1,1,0,0,1,1,0,0,1,8,7,1.0,0.0,0.0,26.0,1.0,0.0,1.0


In [6]:
# Get AFL Tables data
df_afl_tables_match_results = pd.read_csv(r'inputs/afl_tables_match_results 2020-01-06_09-40.csv', index_col=0)
df_afl_tables_match_results.tail(3)

Unnamed: 0,Game,Date,Round,Home.Team,Home.Goals,Home.Behinds,Home.Points,Away.Team,Away.Goals,Away.Behinds,Away.Points,Venue,Margin,Season,Round.Type,Round.Number
15612,15612,2019-09-20,PF,Richmond,12,13,85,Geelong,9,12,66,M.C.G.,19,2019,Finals,26
15613,15613,2019-09-21,PF,GWS,8,8,56,Collingwood,7,10,52,M.C.G.,4,2019,Finals,26
15614,15614,2019-09-28,GF,Richmond,17,12,114,GWS,3,7,25,M.C.G.,89,2019,Finals,27


In [47]:
# Create a column for a standard round name e.g. Grand Final should be GF
round_map_data = {
    "R1": "Round 1",
    "R2": "Round 2",
    "R3": "Round 3",
    "R4": "Round 4",
    "R5": "Round 5",
    "R6": "Round 6",
    "R7": "Round 7",
    "R8": "Round 8",
    "R9": "Round 9",
    "R10": "Round 10",
    "R11": "Round 11",
    "R12": "Round 12",
    "R13": "Round 13",
    "R14": "Round 14",
    "R15": "Round 15",
    "R16": "Round 16",
    "R17": "Round 17",
    "R18": "Round 18",
    "R19": "Round 19",
    "R20": "Round 20",
    "R21": "Round 21",
    "R22": "Round 22",
    "R23": "Round 23",
    "R24": "Round 24",
    "SF": "Semi Final",
    "GF": "Grand Final",
    "PF": "Preliminary Final",
    "QF": "Qualifying Final",
    "EF": "Elimination Final"
}

df_round_mapping = pd.DataFrame.from_dict(round_map_data, orient='index')
df_round_mapping = df_round_mapping.reset_index().rename(columns={'index': 'short_round', 0: 'long_round'})
round_mapping = dict(df_round_mapping[['long_round', 'short_round']].values)
df_footywire_data['short_round'] = df_footywire_data['Round'].map(round_mapping)
df_footywire_data

Unnamed: 0,Date,Season,Round,Venue,Player,Team,Opposition,Status,Match_id,CP,UP,ED,DE,CM,GA,MI5,One.Percenters,BO,TOG,K,HB,D,M,G,B,T,HO,GA1,I50,CL,CG,R50,FF,FA,AF,SC,CCL,SCL,SI,MG,TO,ITC,T5,short_round,Team_abbrev,Opposition_abbrev
1,2010-03-25,2010,Round 1,MCG,Daniel Connors,Richmond,Carlton,Home,5089,8,15,16,66.7,0,0,0,1,0,69,14,10,24,3,0,0,1,0,0,2,2,4,6,2,0,77,85,,,,,,,,R1,RIC,CAR
2,2010-03-25,2010,Round 1,MCG,Daniel Jackson,Richmond,Carlton,Home,5089,11,10,14,60.9,1,0,0,0,0,80,11,12,23,2,0,0,5,0,0,8,5,4,1,2,0,85,89,,,,,,,,R1,RIC,CAR
3,2010-03-25,2010,Round 1,MCG,Brett Deledio,Richmond,Carlton,Home,5089,7,14,16,76.2,0,0,0,0,0,89,12,9,21,5,1,0,6,0,0,4,3,4,3,1,2,94,93,,,,,,,,R1,RIC,CAR
4,2010-03-25,2010,Round 1,MCG,Ben Cousins,Richmond,Carlton,Home,5089,9,10,11,57.9,0,1,0,0,0,69,13,6,19,1,1,0,1,0,1,1,2,3,4,1,0,65,70,,,,,,,,R1,RIC,CAR
5,2010-03-25,2010,Round 1,MCG,Trent Cotchin,Richmond,Carlton,Home,5089,8,10,13,68.4,1,0,0,0,1,77,11,8,19,6,0,0,1,0,0,2,3,3,2,0,2,65,63,,,,,,,,R1,RIC,CAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90416,2019-09-28,2019,Grand Final,MCG,Harrison Himmelberg,GWS,Richmond,Away,9927,3,9,7,58.3,1,0,2,4,0,92,9,3,12,7,1,1,1,2,0,2,0,3,0,1,1,65,51,0.0,0.0,3.0,153.0,4.0,2.0,1.0,GF,GWS,RIC
90417,2019-09-28,2019,Grand Final,MCG,Samuel Reid,GWS,Richmond,Away,9927,5,6,8,66.7,0,0,0,4,0,96,5,7,12,2,0,0,3,0,0,0,1,2,1,0,0,47,55,0.0,1.0,1.0,97.0,2.0,5.0,0.0,GF,GWS,RIC
90418,2019-09-28,2019,Grand Final,MCG,Adam Kennedy,GWS,Richmond,Away,9927,4,8,9,81.8,0,0,0,1,0,71,5,6,11,4,0,0,4,0,0,2,1,0,0,1,0,56,55,0.0,1.0,1.0,121.0,0.0,5.0,1.0,GF,GWS,RIC
90419,2019-09-28,2019,Grand Final,MCG,Shane Mumford,GWS,Richmond,Away,9927,8,2,7,70.0,0,0,0,4,0,75,2,8,10,0,0,0,1,15,0,1,2,2,1,1,0,42,62,1.0,1.0,0.0,0.0,1.0,2.0,0.0,GF,GWS,RIC


In [48]:
# Create columns for standard team names e.g. Saint Kilda should be STK
team_map_data = {
    "Adelaide": { "footywire_team_name":"Adelaide", "abbrev_team_name":"ADE"},
    "Brisbane Lions": { "footywire_team_name":"Brisbane", "abbrev_team_name":"BRI"},
    "Carlton": { "footywire_team_name":"Carlton", "abbrev_team_name":"CAR"},
    "Collingwood": { "footywire_team_name":"Collingwood", "abbrev_team_name":"COL"},
    "Essendon": { "footywire_team_name":"Essendon", "abbrev_team_name":"ESS"},
    "Fitzroy": { "footywire_team_name":"", "abbrev_team_name":"FIT"},
    "Footscray": { "footywire_team_name":"Western Bulldogs", "abbrev_team_name":"WBD"},
    "Fremantle": { "footywire_team_name":"Fremantle", "abbrev_team_name":"FRE"},
    "Geelong": { "footywire_team_name":"Geelong", "abbrev_team_name":"GEE"},
    "Gold Coast": { "footywire_team_name":"Gold Coast", "abbrev_team_name":"GCS"},
    "GWS": { "footywire_team_name":"GWS", "abbrev_team_name":"GWS"},
    "Hawthorn": { "footywire_team_name":"Hawthorn", "abbrev_team_name":"HAW"},
    "Melbourne": { "footywire_team_name":"Melbourne", "abbrev_team_name":"MEL"},
    "North Melbourne": { "footywire_team_name":"North Melbourne", "abbrev_team_name":"NM"},
    "Port Adelaide": { "footywire_team_name":"Port Adelaide", "abbrev_team_name":"PTA"},
    "Richmond": { "footywire_team_name":"Richmond", "abbrev_team_name":"RIC"},
    "St Kilda": { "footywire_team_name":"St Kilda", "abbrev_team_name":"STK"},
    "Sydney": { "footywire_team_name":"Sydney", "abbrev_team_name":"SYD"},
    "University": { "footywire_team_name":"", "abbrev_team_name":"UNI"},
    "West Coast": { "footywire_team_name":"West Coast", "abbrev_team_name":"WCE"}
}
df_team_mapping = pd.DataFrame.from_dict(team_map_data, orient='index')
df_team_mapping = df_team_mapping.reset_index().rename(columns={'index': 'afl_tables_team_name'})
team_mapping = dict(df_team_mapping[['footywire_team_name', 'abbrev_team_name']].values)
df_footywire_data['Team_abbrev'] = df_footywire_data['Team'].map(team_mapping)
df_footywire_data['Opposition_abbrev'] = df_footywire_data['Opposition'].map(team_mapping)
df_footywire_data.tail()

Unnamed: 0,Date,Season,Round,Venue,Player,Team,Opposition,Status,Match_id,CP,UP,ED,DE,CM,GA,MI5,One.Percenters,BO,TOG,K,HB,D,M,G,B,T,HO,GA1,I50,CL,CG,R50,FF,FA,AF,SC,CCL,SCL,SI,MG,TO,ITC,T5,short_round,Team_abbrev,Opposition_abbrev
90416,2019-09-28,2019,Grand Final,MCG,Harrison Himmelberg,GWS,Richmond,Away,9927,3,9,7,58.3,1,0,2,4,0,92,9,3,12,7,1,1,1,2,0,2,0,3,0,1,1,65,51,0.0,0.0,3.0,153.0,4.0,2.0,1.0,GF,GWS,RIC
90417,2019-09-28,2019,Grand Final,MCG,Samuel Reid,GWS,Richmond,Away,9927,5,6,8,66.7,0,0,0,4,0,96,5,7,12,2,0,0,3,0,0,0,1,2,1,0,0,47,55,0.0,1.0,1.0,97.0,2.0,5.0,0.0,GF,GWS,RIC
90418,2019-09-28,2019,Grand Final,MCG,Adam Kennedy,GWS,Richmond,Away,9927,4,8,9,81.8,0,0,0,1,0,71,5,6,11,4,0,0,4,0,0,2,1,0,0,1,0,56,55,0.0,1.0,1.0,121.0,0.0,5.0,1.0,GF,GWS,RIC
90419,2019-09-28,2019,Grand Final,MCG,Shane Mumford,GWS,Richmond,Away,9927,8,2,7,70.0,0,0,0,4,0,75,2,8,10,0,0,0,1,15,0,1,2,2,1,1,0,42,62,1.0,1.0,0.0,0.0,1.0,2.0,0.0,GF,GWS,RIC
90420,2019-09-28,2019,Grand Final,MCG,Jeremy Finlayson,GWS,Richmond,Away,9927,1,1,1,100.0,0,0,0,3,0,63,1,0,1,1,0,0,1,1,0,0,1,1,0,0,1,8,7,1.0,0.0,0.0,26.0,1.0,0.0,1.0,GF,GWS,RIC


In [121]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.footywire.com/afl/footy/ft_match_list?year=2019"
res = res = requests.get(url)
soup = BeautifulSoup(res.text, features='lxml')
tables = soup.select(".data")

dates = []
teams = []
venues = []
crowds = []
results = []
match_ids = []
disposals = []
goals = []

counter = 0

for cell in tables:
    if counter == 0:
        dates.append(cell.text)
    elif counter == 1:
        teams.append(cell.text)
    elif counter == 2:
        venues.append(cell.text)
    elif counter == 3:
        crowds.append(cell.text)
    elif counter == 4:
        results.append(cell.text)
        for a in cell.find_all('a', href=True):
            match_ids.append(a['href'])
    elif counter == 5:
        disposals.append(cell.text)
    elif counter == 6:
        goals.append(cell.text)
    counter += 1
    if counter == 7:
        counter = 0

df_footywire_matches = pd.DataFrame(
    list(
        zip(
            dates,
            teams,
            venues,
            crowds,
            results,
            match_ids,
            disposals,
            goals,
        )
    ),
    columns=[
        'date',
        'teams',
        'venue',
        'crowd',
        'results',
        'match_id',
        'disposals',
        'goals'
    ])

split_teams = df_footywire_matches["teams"].str.split("\nv", n = 1, expand = True) 
df_footywire_matches["home_team"]= split_teams[0] 
df_footywire_matches["away_team"]= split_teams[1] 
split_results = df_footywire_matches["results"].str.split("-", n = 1, expand = True) 
df_footywire_matches["home_team_score"]= split_results[0] 
df_footywire_matches["away_team_score"]= split_results[1] 
df_footywire_matches.drop(columns =["teams", "results", "disposals", "goals"], inplace = True) 

df_footywire_matches['home_team'] = df_footywire_matches['home_team'].str.replace('\n', '')
df_footywire_matches['away_team'] = df_footywire_matches['away_team'].str.replace('\n', '')
df_footywire_matches['match_id'] = df_footywire_matches['match_id'].str.replace('ft_match_statistics\?mid=', '')
df_footywire_matches['match_id'] = df_footywire_matches['match_id'].astype(int)
df_footywire_matches
# df.to_csv(r'Outputs/footywire_match_results_2019_test.csv')

Unnamed: 0,date,venue,crowd,match_id,home_team,away_team,home_team_score,away_team_score
0,Thu 21 Mar 7:25pm,MCG,85016,9721,Carlton,Richmond,64,97
1,Fri 22 Mar 7:50pm,MCG,78017,9722,Collingwood,Geelong,65,72
2,Sat 23 Mar 1:45pm,MCG,38866,9723,Melbourne,Port Adelaide,61,87
3,Sat 23 Mar 4:05pm,Adelaide Oval,50180,9724,Adelaide,Hawthorn,55,87
4,Sat 23 Mar 7:20pm,Gabba,20029,9726,Brisbane,West Coast,102,58
...,...,...,...,...,...,...,...,...
202,Sat 17 Aug 6:10pm,Optus Stadium,47833,9923,Fremantle,Essendon,55,87
203,Sat 17 Aug 7:25pm,Marvel Stadium,17063,9924,North Melbourne,Port Adelaide,144,58
204,Sun 18 Aug 1:10pm,MCG,57415,9925,Richmond,West Coast,88,82
205,Sun 18 Aug 3:20pm,GIANTS Stadium,10139,9926,GWS,Western Bulldogs,65,126


In [120]:
df_footywire_player_match = pd.read_csv(r'Outputs/footywire_player_match_data.csv')
df_match_summary = pd.pivot_table(df_footywire_player_match, index=['Match_id'], values=['SC', 'AF'], columns=['Status'], aggfunc=np.sum)
df_match_summary = df_match_summary.reset_index()
df_match_summary['Match_id'] = df_match_summary['Match_id'].astype(int)
df_match_summary

Unnamed: 0_level_0,Match_id,AF,AF,SC,SC
Status,Unnamed: 1_level_1,Away,Home,Away,Home
0,4961,1411,1610,1656,1661
1,4962,1393,1713,1359,1944
2,4963,1452,1900,1332,1973
3,4964,1699,1364,1776,1534
4,4965,1643,1503,1742,1556
...,...,...,...,...,...
2032,9923,1363,1544,1495,1802
2033,9924,1338,1415,1622,1678
2034,9925,1492,1385,1680,1616
2035,9926,1518,1580,1657,1641


In [124]:
pd.merge(df_footywire_matches, df_match_summary,how='left', left_on='match_id', right_on='Match_id')

Unnamed: 0,date,venue,crowd,match_id,home_team,away_team,home_team_score,away_team_score,"(Match_id, )","(AF, Away)","(AF, Home)","(SC, Away)","(SC, Home)"
0,Thu 21 Mar 7:25pm,MCG,85016,9721,Carlton,Richmond,64,97,9721,1478,1642,1612,1687
1,Fri 22 Mar 7:50pm,MCG,78017,9722,Collingwood,Geelong,65,72,9722,1548,1525,1693,1608
2,Sat 23 Mar 1:45pm,MCG,38866,9723,Melbourne,Port Adelaide,61,87,9723,1726,1373,1830,1470
3,Sat 23 Mar 4:05pm,Adelaide Oval,50180,9724,Adelaide,Hawthorn,55,87,9724,1576,1683,1640,1658
4,Sat 23 Mar 7:20pm,Gabba,20029,9726,Brisbane,West Coast,102,58,9726,1277,1535,1489,1812
...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,Sat 17 Aug 6:10pm,Optus Stadium,47833,9923,Fremantle,Essendon,55,87,9923,1363,1544,1495,1802
203,Sat 17 Aug 7:25pm,Marvel Stadium,17063,9924,North Melbourne,Port Adelaide,144,58,9924,1338,1415,1622,1678
204,Sun 18 Aug 1:10pm,MCG,57415,9925,Richmond,West Coast,88,82,9925,1492,1385,1680,1616
205,Sun 18 Aug 3:20pm,GIANTS Stadium,10139,9926,GWS,Western Bulldogs,65,126,9926,1518,1580,1657,1641
