In [1]:
import urllib.request
import csv
import glob
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import numpy as np
import os

# Scrape Fanfooty data
This notebook scrapes the following data from Fanfooty:

1. Current player list
2. Match stats for each player
3. Match results/fixture

## REQUIRED - Specify match IDs to scrape:

In [2]:
# First and last Fanfooty match IDs to download. Both ends are inclusive:
# the download cell builds range(start_match, end_match + 1).
start_match = 7345
end_match = 7353

### Create folder to save all outputs

In [3]:
# Timestamp reused for the output folder name and for exported file names.
timestr = time.strftime("%Y%m%d-%H%M%S")
destination = "exports/scrape_{}".format(timestr)
# makedirs also creates the parent "exports" folder when it does not exist yet;
# os.mkdir would raise FileNotFoundError on a fresh checkout.
os.makedirs(destination, exist_ok=True)

## 1. Current player list

### Save historical fanfooty player IDs csv file

In [4]:
# Download the complete Fanfooty player list and store it as a CSV file with
# an explicit header row (the header text is written here, ahead of the rows
# returned by the endpoint).
url = "http://www.fanfooty.com.au/resource/player.php?type=all"
headers = "fanfooty_player_id,fanfooty_afl_player_id,fanfooty_first_name,fanfooty_surname,fanfooty_team,fanfooty_status,fanfooty_number,fanfooty_birth_date,fanfooty_height,fanfooty_weight,fanfooty_state_of_origin,fanfooty_recruited_from,fanfooty_games,fanfooty_goals"

# A timeout stops the cell hanging forever on a dead connection, and
# raise_for_status stops an HTTP error page being saved as if it were data.
res = requests.get(url, timeout=60)
res.raise_for_status()

# Rows are separated by a closing quote followed by CRLF.
split_rows_list = res.text.split('"\r\n')

# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open('{}/fanfooty_player_ids_historical.csv'.format(destination), 'w') as f:
    f.write('{}\n'.format(headers))
    for row in split_rows_list:
        f.write('{}\n'.format(row))

### Clean data:
* Remove duplicate players
* Add the standardised short team name (`short_team` column)

In [5]:
# Load the raw player export saved by the previous cell together with the
# team-name lookup table.
df_player_list = pd.read_csv(
    '{}/fanfooty_player_ids_historical.csv'.format(destination),
    encoding="ISO-8859-1",
)
df_team_names = pd.read_csv('inputs/all_sources_team_names.csv')

# Translate each Fanfooty team name into the short code from the lookup table.
team_lookup = dict(df_team_names[['fanfooty_team_name_mid', 'sc_team_name']].values)
df_player_list['short_team'] = df_player_list['fanfooty_team'].map(team_lookup)

# Keep only rows that carry a non-missing, non-zero AFL player id.
df_player_list = df_player_list.dropna(subset=['fanfooty_afl_player_id'])
df_player_list = df_player_list[df_player_list['fanfooty_afl_player_id'] != 0]

# Known duplicate entries: a row is removed only when id, first name, surname
# and team all match one of these records.
remove_duplicate_players = [
    {'player_id': 271128, 'first_name': 'Danny', 'surname': 'Butcher', 'team': 'Port Adelaide'},
    {'player_id': 296214, 'first_name': 'Jake', 'surname': 'Barratt', 'team': 'GWS'},
    {'player_id': 991933, 'first_name': 'Jason', 'surname': 'Cantstandya', 'team': 'Richmond'},
    {'player_id': 992351, 'first_name': 'Bailey', 'surname': 'Williams', 'team': 'None'}
]

for player in remove_duplicate_players:
    matches_listed_player = (
        (df_player_list['fanfooty_afl_player_id'] == player['player_id'])
        & (df_player_list['fanfooty_first_name'] == player['first_name'])
        & (df_player_list['fanfooty_surname'] == player['surname'])
        & (df_player_list['fanfooty_team'] == player['team'])
    )
    df_player_list = df_player_list[~matches_listed_player]

# Overwrite the raw export with the cleaned table.
df_player_list.to_csv('{}/fanfooty_player_ids_historical.csv'.format(destination))
df_player_list

Unnamed: 0,fanfooty_player_id,fanfooty_afl_player_id,fanfooty_first_name,fanfooty_surname,fanfooty_team,fanfooty_status,fanfooty_number,fanfooty_birth_date,fanfooty_height,fanfooty_weight,fanfooty_state_of_origin,fanfooty_recruited_from,fanfooty_games,fanfooty_goals,short_team
3,43031,294472.0,Rory,Atkins,Adelaide,senior,21.0,1994-07-12,186.0,85.0,VIC,Calder Cannons,100.0,46.0,ADE
6,1011,980001.0,Nathan,Bassett,Adelaide,,8.0,1976-12-07,190.0,88.0,SA,Norwood,210.0,25.0,ADE
7,44021,295026.0,James,Battersby,Adelaide,,43.0,1995-11-07,177.0,78.0,SA,Sturt,,,ADE
8,46114,280974.0,Jonathon,Beech,Adelaide,,1.0,1990-11-09,187.0,85.0,SA,West Adelaide,3.0,1.0,ADE
9,623,990006.0,James,Begley,Adelaide,,28.0,1980-07-22,0.0,0.0,,,61.0,10.0,ADE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5041,47084,993894.0,Lachlan,Tiziani,GWS,,33.0,1997-03-13,189.0,79.0,NSW,Broken Hill,0.0,0.0,GWS
5042,42215,291928.0,Gerald,Ugle,GWS,,32.0,1993-01-31,180.0,80.0,WA,Beverley,3.0,1.0,GWS
5043,39670,280109.0,Callan,Ward,GWS,senior,8.0,1990-04-10,187.0,84.0,VIC,Western Jets,220.0,109.0,GWS
5044,43001,294305.0,Lachie,Whitfield,GWS,senior,6.0,1994-07-18,187.0,81.0,VIC,Dandenong Stingrays,140.0,63.0,GWS


## 2. Match stats for each player

### Scrape match files from Fanfooty website

In [6]:
def return_list_of_urls(match_id):
    """Return the Fanfooty chat-file URL for every id in ``match_id``.

    Args:
        match_id: iterable of match ids (despite the singular name).

    Returns:
        A list of ``http://live.fanfooty.com.au/chat/<id>.txt`` strings, in
        the same order as the ids were supplied.
    """
    base_url = "http://live.fanfooty.com.au/chat/"
    suffix = ".txt"
    return ["{}{}{}".format(base_url, one_id, suffix) for one_id in match_id]



# Inclusive range of the match ids configured at the top of the notebook.
matches = list(range(start_match, end_match + 1))

list_of_urls = return_list_of_urls(matches)

# The loop variable must not be named `list`: that rebinds the builtin to a
# string, so the `list(range(...))` call above fails with TypeError the next
# time this cell is run.
for match_url in list_of_urls:
    print(match_url)

match_data_dir = "inputs/All Match Data"
os.makedirs(match_data_dir, exist_ok=True)

for url in list_of_urls:
    # Context managers close both the HTTP response and the output file, so
    # the downloaded bytes are fully flushed to disk before they are read back.
    with urllib.request.urlopen(url) as response:
        webContent = response.read()
    # Take everything after the last "/" instead of a fixed 8-character
    # slice, so the file name is right for ids of any length.
    filename = url.rsplit('/', 1)[-1]
    with open("{}/{}".format(match_data_dir, filename), 'wb') as f:
        f.write(webContent)
    print(filename)

http://live.fanfooty.com.au/chat/7345.txt
http://live.fanfooty.com.au/chat/7346.txt
http://live.fanfooty.com.au/chat/7347.txt
http://live.fanfooty.com.au/chat/7348.txt
http://live.fanfooty.com.au/chat/7349.txt
http://live.fanfooty.com.au/chat/7350.txt
http://live.fanfooty.com.au/chat/7351.txt
http://live.fanfooty.com.au/chat/7352.txt
http://live.fanfooty.com.au/chat/7353.txt
7345.txt
7346.txt
7347.txt
7348.txt
7349.txt
7350.txt
7351.txt
7352.txt
7353.txt


### Headers of each field in match file

In [7]:
# Header row for the exported match-data CSV (49 names). The first four names
# label the values this notebook prepends to every player row (match id,
# match URL, round, year); the remaining names label the comma-separated
# fields of each player line, in order.
# NOTE(review): the 'null*' and 'Not sure' entries look like placeholders for
# fields whose meaning has not been identified - confirm against the feed.
column_header_names = [
    'Fanfooty Match ID',
    'Fanfooty Match URL',
    'Round',
    'Year',
    'Player ID',
    'First Name',
    'Surname',
    'Team',
    'null',
    'DT',
    'SC',
    'null2',
    'null3',
    'null4',
    'Kicks',
    'Handballs',
    'Marks',
    'Tackles',
    'Hitouts',
    'Frees for',
    'Frees against',
    'Goals',
    'Behinds',
    'Not sure',
    'Tag',
    'Tag Notes',
    'Tag 2',
    'Tag 2 Notes',
    'null5',
    'null6',
    'null7',
    'null8',
    'Position',
    'Jumper Number',
    'null9',
    'null10',
    'null11',
    'DT own %',
    'SC own %',
    'AF own %',
    'null12',
    'AF Breakeven',
    'null13',
    'Contested Possessions',
    'Clearances',
    'Clangers',
    'Disposal efficiency',
    'Time on ground',
    'Metres gained'
]

### Read match files and write to csv

In [8]:
# Starts as an empty frame; it is reassigned from the exported CSV at the end
# of this cell.
df_fanfooty_player_raw = pd.DataFrame()
def get_number_of_lines_in_file(data):
    """Return how many pieces ``data`` splits into on the newline character.

    This is one more than the number of newline characters, so text that ends
    with a newline counts the trailing empty piece as well.
    """
    return data.count('\n') + 1


def get_match_id(data):
    """Return the match id taken from the first line of ``data``.

    The first line is the file name prepended by ``get_match_data_list``; the
    id is the four characters that precede the final four characters of that
    line (the ``.txt`` extension), so a 4-character id is assumed.
    """
    header_line = data.partition('\n')[0]
    return header_line[-8:-4]


def get_url_of_match(data):
    """Return the match-centre URL for the match stored in ``data``.

    The id is read from the first line of ``data`` (the prepended file name)
    with the same ``[-8:-4]`` slice that ``get_match_id`` uses.
    """
    header_line = data.partition('\n')[0]
    return "http://live.fanfooty.com.au/game/matchcentre.html?id={}".format(header_line[-8:-4])


def get_round(data):
    """Return the round label for the match stored in ``data``.

    Everything after the first line (the prepended file name) is split on
    commas and the fifth field is returned with surrounding whitespace
    removed. Only the leading fields are split, because later fields are
    never read.

    Raises:
        IndexError: if fewer than five comma-separated fields follow the
            first line.
    """
    remainder = data.partition('\n')[2]
    leading_fields = remainder.split(',', 5)
    return leading_fields[4].strip()


def get_year(data):
    """Return the year for the match stored in ``data``.

    Reads the third line of ``data`` (index 2; the first line is the
    prepended file name) and returns its second comma-separated field with
    surrounding whitespace removed.

    Raises:
        IndexError: if ``data`` has fewer than three lines or that line has
            fewer than two fields.
    """
    return data.splitlines()[2].split(',')[1].strip()


def get_match_data_list():
    """Read every downloaded match file into memory.

    Returns:
        A list with one string per ``.txt`` file in ``inputs/All Match Data``.
        Each string is the file path, a newline, then the file contents; the
        other helpers rely on that first line to recover the match id.
    """
    data_list = []
    path = "inputs/All Match Data/*.txt"

    # glob returns paths in an OS-dependent order; sorting makes the order of
    # the exported rows reproducible between runs and machines.
    for item in sorted(glob.glob(path)):
        # The context manager closes each file; the original left every
        # handle open for the lifetime of the kernel.
        with open(item, 'r') as file:
            data = file.read()
        data_list.append(item + '\n' + data)
    return data_list


def return_player_match_data(data_list):
    """Flatten the match files into one list of player rows.

    Args:
        data_list: strings as produced by ``get_match_data_list`` (file name
            on the first line, file contents after it).

    Returns:
        A list of rows. Each row is a list of strings: match id, match URL,
        round and year, followed by the whitespace-stripped comma-separated
        fields of one line of the match file.
    """
    player_data_for_match = []

    for match in data_list:
        number_of_lines = get_number_of_lines_in_file(match)
        # Values shared by every row of this match.
        row_prefix = [
            get_match_id(match),
            get_url_of_match(match),
            get_round(match),
            get_year(match),
        ]

        # Split the text once per match; the original re-split the whole file
        # for every row, which is quadratic in the number of lines.
        lines = match.splitlines()

        # Rows start at index 5 (the prepended file name plus the first four
        # lines of the file are skipped) and stop before index
        # number_of_lines - 1, exactly as the original range() did.
        for raw_line in lines[5:number_of_lines - 1]:
            fields = [x.strip() for x in raw_line.split(',')]
            player_data_for_match.append(row_prefix + fields)
    return player_data_for_match


# Parse every downloaded match file and export all player rows to one CSV.
match_data_list = get_match_data_list()
player_data = return_player_match_data(match_data_list)
file_name = "fanfooty_match_data_{}.csv".format(timestr)
match_data_csv_path = "{}/{}".format(destination, file_name)
with open(match_data_csv_path, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(column_header_names)
    writer.writerows(player_data)

# Read the export back, skipping (with a warning) rows whose field count does
# not match the header. `error_bad_lines` was deprecated in pandas 1.3 and
# removed in 2.0; `on_bad_lines="warn"` is its replacement. Older pandas
# rejects the new keyword with TypeError, in which case the legacy spelling
# is used.
try:
    df_fanfooty_player_raw = pd.read_csv(match_data_csv_path, on_bad_lines="warn")
except TypeError:
    df_fanfooty_player_raw = pd.read_csv(match_data_csv_path, error_bad_lines=False)
df_fanfooty_player_raw

b'Skipping line 89678: expected 49 fields, saw 50\nSkipping line 89679: expected 49 fields, saw 50\nSkipping line 89680: expected 49 fields, saw 50\nSkipping line 89681: expected 49 fields, saw 50\nSkipping line 89682: expected 49 fields, saw 50\nSkipping line 89683: expected 49 fields, saw 50\nSkipping line 89684: expected 49 fields, saw 50\nSkipping line 89685: expected 49 fields, saw 50\nSkipping line 89686: expected 49 fields, saw 50\nSkipping line 89687: expected 49 fields, saw 50\nSkipping line 89688: expected 49 fields, saw 50\nSkipping line 89689: expected 49 fields, saw 50\nSkipping line 89690: expected 49 fields, saw 50\nSkipping line 89691: expected 49 fields, saw 50\nSkipping line 89692: expected 49 fields, saw 50\nSkipping line 89693: expected 49 fields, saw 50\nSkipping line 89694: expected 49 fields, saw 50\nSkipping line 89695: expected 49 fields, saw 50\nSkipping line 89696: expected 49 fields, saw 50\nSkipping line 89697: expected 49 fields, saw 50\nSkipping line 8969

Unnamed: 0,Fanfooty Match ID,Fanfooty Match URL,Round,Year,Player ID,First Name,Surname,Team,null,DT,...,AF own %,null12,AF Breakeven,null13,Contested Possessions,Clearances,Clangers,Disposal efficiency,Time on ground,Metres gained
0,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,990020.0,Andrew,Embley,WC,30,111,...,,,,,,,,,,
1,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,230254.0,Adam,Selwood,WC,50,107,...,,,,,,,,,,
2,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,200112.0,Dean,Cox,WC,27,99,...,,,,,,,,,,
3,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,240016.0,Beau,Waters,WC,26,98,...,,,,,,,,,,
4,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,261911.0,Brad,Ebert,WC,26,94,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90859,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,997100.0,Will,Hayward,SY,4,44,...,0.0,,,,3.0,0.0,2.0,84.0,86.0,172.0
90860,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,290722.0,Sam,Gray,SY,4,42,...,0.0,,,,2.0,0.0,0.0,85.0,81.0,70.0
90861,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,1008080.0,Ryley,Stoddart,SY,4,38,...,0.0,,,,4.0,0.0,2.0,70.0,78.0,87.0
90862,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,1012014.0,Chad,Warner,SY,3,29,...,0.0,,,,4.0,0.0,3.0,75.0,78.0,114.0


### Clean player data

In [9]:
# Drop rows whose SC value is the '-' placeholder (presumably players without
# a SuperCoach score - TODO confirm), then store SC as integers.
# .copy() gives an independent frame: assigning a column on the filtered
# slice directly is what raises pandas' SettingWithCopyWarning.
df_fanfooty_player_raw = df_fanfooty_player_raw.loc[df_fanfooty_player_raw['SC'] != '-', :].copy()
df_fanfooty_player_raw['SC'] = df_fanfooty_player_raw['SC'].astype('int64')
df_fanfooty_player_raw

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Fanfooty Match ID,Fanfooty Match URL,Round,Year,Player ID,First Name,Surname,Team,null,DT,...,AF own %,null12,AF Breakeven,null13,Contested Possessions,Clearances,Clangers,Disposal efficiency,Time on ground,Metres gained
0,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,990020.0,Andrew,Embley,WC,30,111,...,,,,,,,,,,
1,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,230254.0,Adam,Selwood,WC,50,107,...,,,,,,,,,,
2,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,200112.0,Dean,Cox,WC,27,99,...,,,,,,,,,,
3,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,240016.0,Beau,Waters,WC,26,98,...,,,,,,,,,,
4,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,261911.0,Brad,Ebert,WC,26,94,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90859,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,997100.0,Will,Hayward,SY,4,44,...,0.0,,,,3.0,0.0,2.0,84.0,86.0,172.0
90860,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,290722.0,Sam,Gray,SY,4,42,...,0.0,,,,2.0,0.0,0.0,85.0,81.0,70.0
90861,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,1008080.0,Ryley,Stoddart,SY,4,38,...,0.0,,,,4.0,0.0,2.0,70.0,78.0,87.0
90862,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,1012014.0,Chad,Warner,SY,3,29,...,0.0,,,,4.0,0.0,3.0,75.0,78.0,114.0


## 3. Match results/fixture

### Save results/fixture csv file

In [10]:

# Download the full Fanfooty draw (fixture and results) and load it into a
# DataFrame with the column names below.
url = "http://www.fanfooty.com.au/resource/draw.php"
headers = ["FanFooty draw ID", "year", "competition", "round", "gametime (AET)", "day", "home team", "away team", "ground", "timeslot", "TV coverage", "home supergoals", "home goals", "home behinds", "home points", "away supergoals", "away goals", "away behinds", "away points", "match status"]
data_list = []

# A timeout stops the cell hanging forever on a dead connection, and
# raise_for_status stops an HTTP error page being parsed as fixture rows.
res = requests.get(url, timeout=60)
res.raise_for_status()

# Rows are separated by a closing quote followed by CRLF.
split_rows_list = res.text.split('"\r\n')
for row in split_rows_list:
    # A blank piece (e.g. after the final separator) is not a fixture row.
    if not row.strip():
        continue
    field_list = row.split(',')
    data_list.append(field_list)

df_fixture = pd.DataFrame(data_list, columns=headers)

# Strip the double quotes that wrap the raw field values.
for col in df_fixture.columns:
    df_fixture[col] = df_fixture[col].str.replace('"', '', regex=False)

df_fixture.to_csv('{}/fanfooty_fixture.csv'.format(destination))
df_fixture

Unnamed: 0,FanFooty draw ID,year,competition,round,gametime (AET),day,home team,away team,ground,timeslot,TV coverage,home supergoals,home goals,home behinds,home points,away supergoals,away goals,away behinds,away points,match status
0,1006,1993,HA,1,1993-03-26 20:08:00,Friday,Western Bulldogs,Collingwood,MCG,N,,,13,17,95,,17,13,115,Full Time
1,1000,1993,HA,1,1993-03-27 14:00:00,Saturday,North Melbourne,Brisbane Bears,MCG,D,,,24,22,166,,22,11,143,Full Time
2,1001,1993,HA,1,1993-03-27 14:00:00,Saturday,Carlton,Fitzroy,Princes Park,D,,,17,10,112,,17,16,118,Full Time
3,1002,1993,HA,1,1993-03-27 14:00:00,Saturday,Hawthorn,Melbourne,Waverley,D,,,13,15,93,,11,4,70,Full Time
4,1005,1993,HA,1,1993-03-27 14:08:00,Saturday,Geelong,St Kilda,Kardinia,D,,,20,16,136,,16,16,112,Full Time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5650,7441,2020,HA,17,2020-09-26 19:25:00,Saturday,Brisbane Lions,Geelong,Gabba,N,,,,,,,,,,
5651,7442,2020,HA,17,2020-09-27 13:10:00,Sunday,Port Adelaide,Gold Coast,Adelaide,D,,,,,,,,,,
5652,7443,2020,HA,17,2020-09-27 15:20:00,Sunday,Carlton,Sydney,Docklands,D,,,,,,,,,,
5653,7444,2020,HA,17,2020-09-27 17:20:00,Sunday,West Coast,Adelaide,Perth,D,,,,,,,,,,


### Get standard team name

In [11]:
df_team_names = pd.read_csv("inputs/all_sources_team_names.csv")

# Lookup from the Fanfooty team name used in the fixture
# ('fanfooty_team_name_mid') to its short code ('fanfooty_team_name').
# Missing and duplicate keys are dropped so that each name maps to exactly
# one code.
team_short_lookup = (
    df_team_names
    .dropna(subset=['fanfooty_team_name_mid'])
    .drop_duplicates(subset='fanfooty_team_name_mid')
    .set_index('fanfooty_team_name_mid')['fanfooty_team_name']
)

# .map adds the column without joining, so the fixture can never gain rows,
# and re-running this cell overwrites the two columns instead of creating
# duplicates. Teams missing from the lookup get NaN, as with the former
# left merge.
for side in ('home', 'away'):
    df_fixture['{}_team_short'.format(side)] = df_fixture['{} team'.format(side)].map(team_short_lookup)
df_fixture

Unnamed: 0,FanFooty draw ID,year,competition,round,gametime (AET),day,home team,away team,ground,timeslot,...,home goals,home behinds,home points,away supergoals,away goals,away behinds,away points,match status,home_team_short,away_team_short
0,1006,1993,HA,1,1993-03-26 20:08:00,Friday,Western Bulldogs,Collingwood,MCG,N,...,13,17,95,,17,13,115,Full Time,WB,CO
1,1000,1993,HA,1,1993-03-27 14:00:00,Saturday,North Melbourne,Brisbane Bears,MCG,D,...,24,22,166,,22,11,143,Full Time,NM,
2,1001,1993,HA,1,1993-03-27 14:00:00,Saturday,Carlton,Fitzroy,Princes Park,D,...,17,10,112,,17,16,118,Full Time,CA,
3,1002,1993,HA,1,1993-03-27 14:00:00,Saturday,Hawthorn,Melbourne,Waverley,D,...,13,15,93,,11,4,70,Full Time,HW,ME
4,1005,1993,HA,1,1993-03-27 14:08:00,Saturday,Geelong,St Kilda,Kardinia,D,...,20,16,136,,16,16,112,Full Time,GE,SK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5650,7441,2020,HA,17,2020-09-26 19:25:00,Saturday,Brisbane Lions,Geelong,Gabba,N,...,,,,,,,,,BL,GE
5651,7442,2020,HA,17,2020-09-27 13:10:00,Sunday,Port Adelaide,Gold Coast,Adelaide,D,...,,,,,,,,,PA,GC
5652,7443,2020,HA,17,2020-09-27 15:20:00,Sunday,Carlton,Sydney,Docklands,D,...,,,,,,,,,CA,SY
5653,7444,2020,HA,17,2020-09-27 17:20:00,Sunday,West Coast,Adelaide,Perth,D,...,,,,,,,,,WC,AD


### Get the total SuperCoach and AFL Fantasy scores for each team, for every match

In [14]:
# Summary of SuperCoach (SC) totals: one row per match, one column per team.
# Only SC is aggregated here. The string "sum" is the spelling recommended by
# current pandas and gives the same result as passing np.sum.
df_match_summary = pd.pivot_table(df_fanfooty_player_raw, index=['Fanfooty Match ID'], values=['SC'], columns=['Team'], aggfunc="sum")
df_match_summary = df_match_summary.reset_index()
# Flatten the ('SC', <team>) column pairs to plain team codes; the first
# column (the former index) has an empty second level, so it is renamed.
headings = [x[1] for x in df_match_summary.columns]
headings[0] = 'Fanfooty Match ID'
df_match_summary.columns = headings


def get_sc_total(row):
    """Look up the SC total of ``row``'s team in ``row``'s match.

    Args:
        row: mapping or Series with 'Fanfooty Match ID' and 'Team' entries.

    Returns:
        The value stored in ``df_match_summary`` for that match and team.

    Kept for any code that calls it per row; the column below is computed
    with a vectorised group-by instead.
    """
    match_id = row['Fanfooty Match ID']
    team_name = row['Team']
    is_match = df_match_summary['Fanfooty Match ID'] == int(match_id)
    return df_match_summary.loc[is_match, team_name].values[0]


# Sum of SC over each (match, team) group, broadcast back to every player
# row. This replaces a row-wise apply() that scanned the whole summary table
# once per player row. Cast to float64 to keep the dtype the pivot-based
# lookup produced.
team_sc_total = (
    df_fanfooty_player_raw
    .groupby(['Fanfooty Match ID', 'Team'])['SC']
    .transform('sum')
    .astype('float64')
)
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# assigning a column on a filtered slice raises.
df_fanfooty_player_raw = df_fanfooty_player_raw.assign(**{'Team SC total': team_sc_total})
df_fanfooty_player_raw

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Fanfooty Match ID,Fanfooty Match URL,Round,Year,Player ID,First Name,Surname,Team,null,DT,...,null12,AF Breakeven,null13,Contested Possessions,Clearances,Clangers,Disposal efficiency,Time on ground,Metres gained,Team SC total
0,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,990020.0,Andrew,Embley,WC,30,111,...,,,,,,,,,,1739.0
1,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,230254.0,Adam,Selwood,WC,50,107,...,,,,,,,,,,1739.0
2,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,200112.0,Dean,Cox,WC,27,99,...,,,,,,,,,,1739.0
3,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,240016.0,Beau,Waters,WC,26,98,...,,,,,,,,,,1739.0
4,3425,http://live.fanfooty.com.au/game/matchcentre.h...,R4,2010,261911.0,Brad,Ebert,WC,26,94,...,,,,,,,,,,1739.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90859,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,997100.0,Will,Hayward,SY,4,44,...,,,,3.0,0.0,2.0,84.0,86.0,172.0,1602.0
90860,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,290722.0,Sam,Gray,SY,4,42,...,,,,2.0,0.0,0.0,85.0,81.0,70.0,1602.0
90861,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,1008080.0,Ryley,Stoddart,SY,4,38,...,,,,4.0,0.0,2.0,70.0,78.0,87.0,1602.0
90862,7353,http://live.fanfooty.com.au/game/matchcentre.h...,R6,2020,1012014.0,Chad,Warner,SY,3,29,...,,,,4.0,0.0,3.0,75.0,78.0,114.0,1602.0
