# Fetch Snooker match data
Load roughly five years' worth of match data (the seasons from `YEAR_START` to `YEAR_END`, i.e. 2014–2019) for modelling and exploration purposes, using the snooker.org API: [http://api.snooker.org/](http://api.snooker.org/)

### Imports

In [1]:
import requests
import json

import pandas as pd

pd.options.display.max_columns = 500

### Constants and utility functions

In [2]:
# First season to download (inclusive); sent to the API as the `s` query parameter.
YEAR_START = 2014

# Last season to download (inclusive) for players, matches and events.
YEAR_END = 2019

In [3]:
def fetch_players(year_start, year_end):
    """Download the player list for every season in an inclusive range.

    One request per season is sent to ``?t=10&st=p&s=<season>`` and the
    decoded JSON lists are concatenated in season order.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to query (both inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``ID``; when the same ``ID`` is returned for
        several seasons, the first occurrence is kept.

    Notes
    -----
    Request and JSON-decoding errors are not caught here; they propagate
    to the caller.
    """
    collected = []
    for season in range(year_start, year_end + 1):
        url = 'http://api.snooker.org/?t=10&st=p&s={}'.format(str(season))
        response = requests.get(url)
        collected = collected + response.json()

    players_frame = pd.DataFrame(collected)
    return players_frame.drop_duplicates('ID')

def fetch_data(year_start, year_end, player_data=None):
    """Download match records for every player in every season of the range.

    One request is sent per (season, player) pair to
    ``?t=8&p=<player id>&s=<season>``. This is a best-effort download: a
    request that fails, or whose body is not a JSON list, is skipped.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to query (both inclusive).
    player_data : pandas.DataFrame, optional
        Frame with an ``ID`` column listing the players to query. When
        omitted, it is obtained from ``fetch_players(year_start, year_end)``.

    Returns
    -------
    pandas.DataFrame
        All downloaded match records with exact duplicate rows removed.

    Notes
    -----
    Only request errors and JSON-decoding errors are swallowed. Unlike a
    bare ``except:``, this lets ``KeyboardInterrupt`` through, so the
    long-running loop can be stopped from the notebook.
    """
    if player_data is None:
        player_data = fetch_players(year_start, year_end)

    # The set of player IDs does not depend on the season, so compute it once.
    player_ids = player_data['ID'].unique()

    matches_json = []
    for year in range(year_start, year_end + 1):
        for player_id in player_ids:
            url = 'http://api.snooker.org/?t=8&p={}&s={}'.format(str(player_id), str(year))
            try:
                payload = requests.get(url).json()
            except (requests.RequestException, ValueError):
                # Network failure or a body that is not valid JSON: skip this pair.
                continue
            # Anything other than a list of records is ignored.
            if isinstance(payload, list):
                matches_json.extend(payload)

    return pd.DataFrame(matches_json).drop_duplicates()

def fetch_season_rankings(year_start, year_end):
    """Download the ``MoneyRankings`` table for the seasons preceding the range.

    The queried seasons are ``year_start - 1`` through ``year_end - 1``
    inclusive, because the ranking of the season before an event is what
    is required. One request per season is sent to
    ``?rt=MoneyRankings&s=<season>``. This is a best-effort download: a
    request that fails, or whose body is not a JSON list, is skipped.

    Parameters
    ----------
    year_start, year_end : int
        First and last event season of interest.

    Returns
    -------
    pandas.DataFrame
        All downloaded ranking records with exact duplicate rows removed.

    Notes
    -----
    Only request errors and JSON-decoding errors are swallowed, so
    ``KeyboardInterrupt`` is no longer silently ignored.
    """
    ranking_json = []
    # Shifted by one: the season before the current event is what is required.
    for year in range(year_start - 1, year_end):
        url = 'http://api.snooker.org/?rt=MoneyRankings&s={}'.format(year)
        try:
            payload = requests.get(url).json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: skip this season.
            continue
        # Anything other than a list of records is ignored.
        if isinstance(payload, list):
            ranking_json.extend(payload)

    return pd.DataFrame(ranking_json).drop_duplicates()


def fetch_events_data(year_start, year_end):
    """Download the event list for every season in an inclusive range.

    One request per season is sent to ``?t=5&s=<season>``. This is a
    best-effort download: a request that fails, or whose body is not a
    JSON list, is skipped.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to query (both inclusive).

    Returns
    -------
    pandas.DataFrame
        All downloaded event records with exact duplicate rows removed.

    Notes
    -----
    Only request errors and JSON-decoding errors are swallowed, so
    ``KeyboardInterrupt`` is no longer silently ignored.
    """
    events_json = []
    for year in range(year_start, year_end + 1):
        url = 'http://api.snooker.org/?t=5&s={}'.format(year)
        try:
            payload = requests.get(url).json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: skip this season.
            continue
        # Anything other than a list of records is ignored.
        if isinstance(payload, list):
            events_json.extend(payload)

    return pd.DataFrame(events_json).drop_duplicates()


def fetch_seedings_data(matches_data):
    """Download the seedings of every event referenced by ``matches_data``.

    One request per distinct ``EventID`` is sent to ``?t=13&e=<event id>``.
    This is a best-effort download: a request that fails, or whose body is
    not a JSON list, is skipped.

    Parameters
    ----------
    matches_data : pandas.DataFrame
        Frame with an ``EventID`` column. The event IDs are read from this
        argument rather than from a notebook-level ``matches`` variable, so
        the function works for any frame passed in.

    Returns
    -------
    pandas.DataFrame
        All downloaded seeding records with exact duplicate rows removed.

    Notes
    -----
    Only request errors and JSON-decoding errors are swallowed, so
    ``KeyboardInterrupt`` is no longer silently ignored.
    """
    seedings_json = []
    for event_id in matches_data['EventID'].unique():
        url = 'http://api.snooker.org/?t=13&e={}'.format(event_id)
        try:
            payload = requests.get(url).json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: skip this event.
            continue
        # Anything other than a list of records is ignored.
        if isinstance(payload, list):
            seedings_json.extend(payload)

    return pd.DataFrame(seedings_json).drop_duplicates()
    
    

### Data Fetching

In [4]:
# Download the player list for every configured season (one request per season).
players = fetch_players(YEAR_START, YEAR_END)

In [5]:
# Preview the player table to check which columns the API returned.
players.head()

Unnamed: 0,ID,Type,FirstName,MiddleName,LastName,TeamName,TeamNumber,TeamSeason,ShortName,Nationality,Sex,BioPage,Born,Twitter,SurnameFirst,License,Club,URL,Photo,PhotoSource,FirstSeasonAsPro,LastSeasonAsPro,Info
0,1,1,Mark,J,Williams,,0,0,M J Williams,Wales,M,http://snooker.org/plr/bio/mwilliams.shtml,1975-03-21,markwil147,False,,,,http://snooker.org/img/players/MarkWilliams.png,,1992,0,
1,2,1,Stephen,,Maguire,,0,0,,Scotland,M,,1981-03-13,,False,,,,http://snooker.org/img/players/Maguire.png,,1998,0,
2,3,1,Jamie,,Cope,,0,0,,England,M,,1985-09-12,JamieCope147,False,,,,http://snooker.org/img/players/JamieCope.png,,2002,2016,
3,4,1,Marco,,Fu,,0,0,,Hong Kong,M,,1978-01-08,Marcofu18,False,,,,http://snooker.org/img/players/mfu.jpg,,1998,0,
4,5,1,Ronnie,,O'Sullivan,,0,0,R O'Sullivan,England,M,http://snooker.org/plr/bio/rosullivan.shtml,1975-12-05,ronnieo147,False,,,,http://snooker.org/img/players/rosullivan.jpg,,1992,0,


In [6]:
%%time
# Long-running: one request per (season, player) pair; the recorded run took ~34 minutes wall time.
# The already-downloaded `players` frame is passed in so the player list is not fetched again.
matches = fetch_data(YEAR_START, YEAR_END, players)

CPU times: user 4.21 s, sys: 317 ms, total: 4.53 s
Wall time: 34min 14s


In [7]:
# Inspect the column names of the raw matches table.
matches.columns

Index(['ID', 'EventID', 'Round', 'Number', 'Player1ID', 'Score1', 'Walkover1',
       'Player2ID', 'Score2', 'Walkover2', 'WinnerID', 'Unfinished', 'OnBreak',
       'WorldSnookerID', 'LiveUrl', 'DetailsUrl', 'PointsDropped',
       'ShowCommonNote', 'Estimated', 'Type', 'TableNo', 'VideoURL',
       'InitDate', 'ModDate', 'StartDate', 'EndDate', 'ScheduledDate',
       'FrameScores', 'Sessions', 'Note', 'ExtendedNote'],
      dtype='object')

In [8]:
# Row/column count of the raw matches table (walkovers are still included at this point).
matches.shape

(18501, 31)

In [9]:
# Preview the raw match records.
matches.head()

Unnamed: 0,ID,EventID,Round,Number,Player1ID,Score1,Walkover1,Player2ID,Score2,Walkover2,WinnerID,Unfinished,OnBreak,WorldSnookerID,LiveUrl,DetailsUrl,PointsDropped,ShowCommonNote,Estimated,Type,TableNo,VideoURL,InitDate,ModDate,StartDate,EndDate,ScheduledDate,FrameScores,Sessions,Note,ExtendedNote
0,1564655,334,1,40,1,5,False,608,3,False,1,False,False,239344,,,False,True,False,1,0,,2014-05-17T09:20:17Z,2014-05-28T13:33:32Z,2014-05-28T13:33:32Z,2014-05-28T16:11:53Z,2014-05-28T13:30:00Z,,,,
1,1590901,331,7,31,1,4,False,1529,0,False,1,False,False,241259,,,False,True,True,1,0,,2014-06-07T13:04:05Z,2014-06-18T11:09:12Z,2014-06-18T11:09:12Z,2014-06-18T11:51:21Z,2014-06-18T11:00:00Z,,,,
2,1589576,331,8,16,1,3,False,30,4,False,30,False,False,355721,,,False,True,True,1,0,,2014-06-07T13:10:41Z,2014-06-19T11:39:24Z,2014-06-19T11:39:24Z,2014-06-19T13:18:41Z,2014-06-19T11:00:00Z,,,,
3,1595583,332,7,20,48,1,False,1,5,False,1,False,False,241071,,,False,False,False,1,0,,2014-05-17T14:17:03Z,2014-06-24T02:23:45Z,2014-06-24T02:23:45Z,2014-06-24T04:00:42Z,2014-06-24T02:00:00Z,,,,
4,1596497,332,8,10,68,5,False,1,2,False,68,False,False,214156,,,False,False,False,1,0,,2014-05-17T14:17:35Z,2014-06-25T11:30:16Z,2014-06-25T11:30:16Z,2014-06-25T13:56:16Z,2014-06-25T11:30:00Z,,,,


In [10]:
# Target flag from Player1's point of view: 1 when WinnerID equals Player1ID, otherwise 0.
matches['WIN'] = (matches['WinnerID'] == matches['Player1ID']).astype(int)

In [11]:
# Persist the downloaded matches (including the WIN flag) so the slow download need not be repeated.
# NOTE(review): assumes the `data/` directory already exists — confirm before a fresh run.
matches.to_csv('data/Matches_data.csv', index=False)

In [12]:
# matches = pd.read_csv('data/Matches_data.csv')

## Fetch and append other data sources

In [13]:
# Count matches without a walkover (True) versus with a walkover for either player (False).
(~matches[['Walkover1', 'Walkover2']].any(axis=1)).value_counts()

True     18032
False      469
dtype: int64

In [14]:
# Keep only the matches in which neither player was given a walkover.
walkover_mask = matches[['Walkover1', 'Walkover2']].any(axis=1)
matches = matches.loc[~walkover_mask]

In [15]:
# Download the supporting tables that are merged onto the matches below.
# Events: one request per season from YEAR_START to YEAR_END.
events = fetch_events_data(YEAR_START, YEAR_END)
# Seedings: one request per distinct event ID.
seedings = fetch_seedings_data(matches)
# Rankings: the helper queries the seasons shifted back by one (previous-season rankings).
rankings = fetch_season_rankings(YEAR_START, YEAR_END)

In [16]:
# Write each supporting table to its own CSV file (row index omitted).
for table, csv_path in (
    (players, 'data/Players_data.csv'),
    (events, 'data/Events_data.csv'),
    (seedings, 'data/Seedings_data.csv'),
    (rankings, 'data/Rankings_data.csv'),
):
    table.to_csv(csv_path, index=False)

In [17]:
# players = pd.read_csv('data/Players_data.csv')
# events = pd.read_csv('data/Events_data.csv')
# seedings = pd.read_csv('data/Seedings_data.csv')
# rankings = pd.read_csv('data/Rankings_data.csv')

### Merge matches data with other sources for a comprehensive raw dataset

Merge events data.

In [19]:
# Attach the event attributes to every match. The event's own `ID` becomes the
# join key, and event columns whose names clash with match columns get the
# `_event` suffix.
events_keyed = events.rename(columns={'ID': 'EventID'})
matches = matches.merge(
    events_keyed,
    how='left',
    on='EventID',
    suffixes=('', '_event'),
)

Merge seedings data.

In [20]:
# Attach each player's seeding for the event: first Player1, then Player2.
for player_id_col, seeding_col in (
    ('Player1ID', 'Player1Seeding'),
    ('Player2ID', 'Player2Seeding'),
):
    seedings_for_slot = seedings.rename(
        columns={'Seeding': seeding_col, 'PlayerID': player_id_col}
    )
    matches = matches.merge(
        seedings_for_slot,
        on=['EventID', player_id_col],
        how='left',
    )

Merge player data.

In [21]:
# Attach first professional season and nationality for Player1, then Player2.
player_attrs = players[['ID', 'FirstSeasonAsPro', 'Nationality']]
for column_map in (
    {'ID': 'Player1ID', 'FirstSeasonAsPro': 'Player1FirstSeasonAsPro', 'Nationality': 'Player1Nationality'},
    {'ID': 'Player2ID', 'FirstSeasonAsPro': 'Player2FirstSeasonAsPro', 'Nationality': 'Player2Nationality'},
):
    matches = matches.merge(
        player_attrs.rename(columns=column_map),
        on=column_map['ID'],
        how='left',
    )

# Seasons elapsed between each player's first professional season and the event's season.
for years_col, first_season_col in (
    ('Player1YearsAsPro', 'Player1FirstSeasonAsPro'),
    ('Player2YearsAsPro', 'Player2FirstSeasonAsPro'),
):
    matches[years_col] = matches['Season'] - matches[first_season_col]

In [22]:
# Season preceding the event's season; used below as the join key for the previous season's rankings.
matches['LastSeason'] = matches['Season'] - 1

Merge rankings data.

In [23]:
# Attach each player's rank and sum from the season before the event:
# first Player1, then Player2.
ranking_cols = rankings[['PlayerID', 'Position', 'Season', 'Sum']]
for column_map in (
    {'PlayerID': 'Player1ID', 'Position': 'Player1LastSeasonRank', 'Season': 'LastSeason', 'Sum': 'Player1LastSeasonSum'},
    {'PlayerID': 'Player2ID', 'Position': 'Player2LastSeasonRank', 'Season': 'LastSeason', 'Sum': 'Player2LastSeasonSum'},
):
    matches = matches.merge(
        ranking_cols.rename(columns=column_map),
        on=[column_map['PlayerID'], 'LastSeason'],
        how='left',
    )

### Save

In [24]:
# Size of the merged table. The row count should still equal the number of non-walkover
# matches, provided each left merge found at most one right-hand row per key.
matches.shape

(18032, 82)

In [25]:
# Preview a few merged rows to eyeball the appended event, seeding, player and ranking columns.
matches.head(3)

Unnamed: 0,ID,EventID,Round,Number,Player1ID,Score1,Walkover1,Player2ID,Score2,Walkover2,WinnerID,Unfinished,OnBreak,WorldSnookerID,LiveUrl,DetailsUrl,PointsDropped,ShowCommonNote,Estimated,Type,TableNo,VideoURL,InitDate,ModDate,StartDate,EndDate,ScheduledDate,FrameScores,Sessions,Note,ExtendedNote,WIN,Name,StartDate_event,EndDate_event,Sponsor,Season,Type_event,Num,Venue,City,Country,Discipline,Main,Sex,AgeGroup,Url,Related,Stage,ValueType,ShortName,WorldSnookerId,RankingType,EventPredictionID,Team,Format,Twitter,HashTag,ConversionRate,AllRoundsAdded,PhotoURLs,NumCompetitors,NumUpcoming,NumActive,NumResults,Note_event,CommonNote,DefendingChampion,PreviousEdition,Player1Seeding,Player2Seeding,Player1FirstSeasonAsPro,Player1Nationality,Player2FirstSeasonAsPro,Player2Nationality,Player1YearsAsPro,Player2YearsAsPro,LastSeason,Player1LastSeasonRank,Player1LastSeasonSum,Player2LastSeasonRank,Player2LastSeasonSum
0,1564655,334,1,40,1,5,False,608,3,False,1,False,False,239344,,,False,True,False,1,0,,2014-05-17T09:20:17Z,2014-05-28T13:33:32Z,2014-05-28T13:33:32Z,2014-05-28T16:11:53Z,2014-05-28T13:30:00Z,,,,,1,Wuxi Classic Qualifiers,2014-05-24,2014-05-28,,2014,Qualifying,0,The Capital Venue,Gloucester,England,snooker,332,Both,O,,wuxi,Q,WUXI,,13742,WR,0,False,1,,WuxiClassic,1.0,True,,0,0,0,64,,"Watch on <a href=""http://www.worldsnooker.live...",0,0,,,1992.0,Wales,2013.0,England,22.0,1.0,2013,18.0,163388.0,112.0,4818.0
1,1590901,331,7,31,1,4,False,1529,0,False,1,False,False,241259,,,False,True,True,1,0,,2014-06-07T13:04:05Z,2014-06-18T11:09:12Z,2014-06-18T11:09:12Z,2014-06-18T11:51:21Z,2014-06-18T11:00:00Z,,,,,1,Asian Tour Event One,2014-06-17,2014-06-21,,2014,Players Tour Championship,1,Yixing Sports Centre,Yixing,China,snooker,331,Both,O,,ptc,F,APTC,Asian Tour 1,13749,WR,0,False,1,,YixingOpen,1.0,True,,141,0,0,140,Also known as the Yixing Open (2014).,"Watch on <a href=""http://www.my147.com/dating/...",0,0,8.0,,1992.0,Wales,,,22.0,,2013,18.0,163388.0,,
2,1589576,331,8,16,1,3,False,30,4,False,30,False,False,355721,,,False,True,True,1,0,,2014-06-07T13:10:41Z,2014-06-19T11:39:24Z,2014-06-19T11:39:24Z,2014-06-19T13:18:41Z,2014-06-19T11:00:00Z,,,,,0,Asian Tour Event One,2014-06-17,2014-06-21,,2014,Players Tour Championship,1,Yixing Sports Centre,Yixing,China,snooker,331,Both,O,,ptc,F,APTC,Asian Tour 1,13749,WR,0,False,1,,YixingOpen,1.0,True,,141,0,0,140,Also known as the Yixing Open (2014).,"Watch on <a href=""http://www.my147.com/dating/...",0,0,8.0,4.0,1992.0,Wales,1995.0,England,22.0,19.0,2013,18.0,163388.0,12.0,276081.0


In [26]:
# List every column of the merged table.
matches.columns

Index(['ID', 'EventID', 'Round', 'Number', 'Player1ID', 'Score1', 'Walkover1',
       'Player2ID', 'Score2', 'Walkover2', 'WinnerID', 'Unfinished', 'OnBreak',
       'WorldSnookerID', 'LiveUrl', 'DetailsUrl', 'PointsDropped',
       'ShowCommonNote', 'Estimated', 'Type', 'TableNo', 'VideoURL',
       'InitDate', 'ModDate', 'StartDate', 'EndDate', 'ScheduledDate',
       'FrameScores', 'Sessions', 'Note', 'ExtendedNote', 'WIN', 'Name',
       'StartDate_event', 'EndDate_event', 'Sponsor', 'Season', 'Type_event',
       'Num', 'Venue', 'City', 'Country', 'Discipline', 'Main', 'Sex',
       'AgeGroup', 'Url', 'Related', 'Stage', 'ValueType', 'ShortName',
       'WorldSnookerId', 'RankingType', 'EventPredictionID', 'Team', 'Format',
       'Twitter', 'HashTag', 'ConversionRate', 'AllRoundsAdded', 'PhotoURLs',
       'NumCompetitors', 'NumUpcoming', 'NumActive', 'NumResults',
       'Note_event', 'CommonNote', 'DefendingChampion', 'PreviousEdition',
       'Player1Seeding', 'Player2Seeding'

In [27]:
# Write the fully merged, walkover-free dataset to disk (row index omitted).
matches.to_csv('data/Raw_data.csv', index=False)