# Fetch Snooker match data
[http://api.snooker.org/](http://api.snooker.org/)

In [71]:
import requests
import json

import pandas as pd

pd.options.display.max_columns = 500

In [72]:
def fetch_players(year_start, year_end):
    """Download the player list for every season in an inclusive range.

    Requests ``http://api.snooker.org/?t=10&st=p&s=<season>`` once per season
    from ``year_start`` through ``year_end`` and pools the JSON records.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to query, both inclusive.

    Returns
    -------
    pandas.DataFrame
        The pooled records with one row per distinct ``ID`` (the first
        occurrence of each ``ID`` is the one kept).

    Notes
    -----
    There is no error handling here: a failed request, or a response that is
    not a JSON list, raises to the caller.
    """
    url_template = 'http://api.snooker.org/?t=10&st=p&s={}'
    collected = []
    for season in range(year_start, year_end + 1):
        response = requests.get(url_template.format(season))
        # List concatenation (not extend) so a non-list payload still raises.
        collected = collected + response.json()
    frame = pd.DataFrame(collected)
    return frame.drop_duplicates('ID')

def fetch_data(year_start, year_end, player_data=None):
    """Download match records for every player in every season of a range.

    Issues one request per (season, player) pair to
    ``http://api.snooker.org/?t=8&p=<player>&s=<season>`` and pools the
    returned JSON records.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to query, both inclusive.
    player_data : pandas.DataFrame, optional
        Frame with an ``ID`` column listing the players to query. When
        omitted it is obtained from ``fetch_players(year_start, year_end)``.

    Returns
    -------
    pandas.DataFrame
        The pooled match records with exact duplicate rows removed (the same
        match is returned once for each of its two players).

    Notes
    -----
    The download is best-effort: a request that fails, or whose body is not
    a JSON list, is skipped silently. ``KeyboardInterrupt`` and
    ``SystemExit`` are *not* swallowed, so a long-running download can still
    be stopped.
    """
    if player_data is None:
        player_data = fetch_players(year_start, year_end)

    # Loop-invariant: the set of players does not change between seasons.
    player_ids = player_data['ID'].unique()
    url_template = 'http://api.snooker.org/?t=8&p={}&s={}'

    matches_json = []
    for year in range(year_start, year_end + 1):
        for player_id in player_ids:
            try:
                payload = requests.get(url_template.format(player_id, year)).json()
            except Exception:
                # Deliberately `Exception`, not a bare except: Ctrl-C must
                # still be able to abort the loop.
                continue
            # Non-list payloads were previously dropped via a swallowed
            # TypeError; keep skipping them, but explicitly.
            if isinstance(payload, list):
                matches_json.extend(payload)

    return pd.DataFrame(matches_json).drop_duplicates()

def fetch_season_rankings(year_start, year_end):
    """Download money rankings for the season preceding each season in a range.

    Requests ``http://api.snooker.org/?rt=MoneyRankings&s=<season>`` once per
    season from ``year_start - 1`` through ``year_end - 1`` (inclusive).

    Parameters
    ----------
    year_start, year_end : int
        First and last *event* season of interest; the seasons actually
        queried are shifted back by one year.

    Returns
    -------
    pandas.DataFrame
        The pooled ranking records with exact duplicate rows removed.

    Notes
    -----
    Best-effort: a request that fails, or whose body is not a JSON list, is
    skipped silently. ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """
    url_template = 'http://api.snooker.org/?rt=MoneyRankings&s={}'
    ranking_json = []
    # Season before the current event is what is required, hence the
    # range shifted back by one.
    for year in range(year_start - 1, year_end):
        try:
            payload = requests.get(url_template.format(year)).json()
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C is not swallowed.
            continue
        if isinstance(payload, list):
            ranking_json.extend(payload)

    return pd.DataFrame(ranking_json).drop_duplicates()


def fetch_events_data(year_start, year_end):
    """Download the event list for every season in an inclusive range.

    Requests ``http://api.snooker.org/?t=5&s=<season>`` once per season from
    ``year_start`` through ``year_end`` and pools the JSON records.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to query, both inclusive.

    Returns
    -------
    pandas.DataFrame
        The pooled event records with exact duplicate rows removed.

    Notes
    -----
    Best-effort: a request that fails, or whose body is not a JSON list, is
    skipped silently. ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """
    url_template = 'http://api.snooker.org/?t=5&s={}'
    events_json = []
    for year in range(year_start, year_end + 1):
        try:
            payload = requests.get(url_template.format(year)).json()
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C is not swallowed.
            continue
        if isinstance(payload, list):
            events_json.extend(payload)

    return pd.DataFrame(events_json).drop_duplicates()


def fetch_seedings_data(matches_data):
    """Download seedings for every event referenced by a matches frame.

    Requests ``http://api.snooker.org/?t=13&e=<event>`` once per distinct
    value in ``matches_data['EventID']`` and pools the JSON records.

    Parameters
    ----------
    matches_data : pandas.DataFrame
        Frame with an ``EventID`` column. Only this argument is consulted;
        no notebook-level variable is read.

    Returns
    -------
    pandas.DataFrame
        The pooled seeding records with exact duplicate rows removed.

    Notes
    -----
    Best-effort: a request that fails, or whose body is not a JSON list, is
    skipped silently. ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """
    url_template = 'http://api.snooker.org/?t=13&e={}'
    seedings_json = []
    # Use the argument, not the global `matches`, so the function works on
    # whichever frame the caller passes.
    for event_id in matches_data['EventID'].unique():
        try:
            payload = requests.get(url_template.format(event_id)).json()
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C is not swallowed.
            continue
        if isinstance(payload, list):
            seedings_json.extend(payload)

    return pd.DataFrame(seedings_json).drop_duplicates()
    
    

In [3]:
# Player list pooled over the 2017-2019 seasons (one row per player ID).
players = fetch_players(2017, 2019)

In [5]:
players.columns

Index(['ID', 'Type', 'FirstName', 'MiddleName', 'LastName', 'TeamName',
       'TeamNumber', 'TeamSeason', 'ShortName', 'Nationality', 'Sex',
       'BioPage', 'Born', 'Twitter', 'SurnameFirst', 'License', 'Club', 'URL',
       'Photo', 'PhotoSource', 'FirstSeasonAsPro', 'LastSeasonAsPro', 'Info'],
      dtype='object')

In [6]:
players.head()

Unnamed: 0,ID,Type,FirstName,MiddleName,LastName,TeamName,TeamNumber,TeamSeason,ShortName,Nationality,Sex,BioPage,Born,Twitter,SurnameFirst,License,Club,URL,Photo,PhotoSource,FirstSeasonAsPro,LastSeasonAsPro,Info
0,1,1,Mark,J,Williams,,0,0,M J Williams,Wales,M,http://snooker.org/plr/bio/mwilliams.shtml,1975-03-21,markwil147,False,,,,http://snooker.org/img/players/MarkWilliams.png,,1992,0,
1,2,1,Stephen,,Maguire,,0,0,,Scotland,M,,1981-03-13,,False,,,,http://snooker.org/img/players/Maguire.png,,1998,0,
2,4,1,Marco,,Fu,,0,0,,Hong Kong,M,,1978-01-08,Marcofu18,False,,,,http://snooker.org/img/players/mfu.jpg,,1998,0,
3,5,1,Ronnie,,O'Sullivan,,0,0,R O'Sullivan,England,M,http://snooker.org/plr/bio/rosullivan.shtml,1975-12-05,ronnieo147,False,,,,http://snooker.org/img/players/rosullivan.jpg,,1992,0,
4,8,1,Tom,,Ford,,0,0,,England,M,,1983-08-17,tomford147,False,,,,http://snooker.org/img/players/TomFord.png,,2001,0,


In [7]:
players['ID'].value_counts().value_counts()

1    163
Name: ID, dtype: int64

Test

In [None]:
# Ad-hoc probe: fetch the raw JSON from the `t=5` endpoint for season 2015
# and display it, to inspect what the API returns.
t = requests.get('http://api.snooker.org/?t=5&s=2015').json()
t

In [52]:
%%time
# Pass the already-downloaded `players` frame so fetch_data does not have to
# fetch the player list again. One request is made per (season, player) pair,
# so this cell is slow.
matches = fetch_data(2017, 2019, players)

CPU times: user 1.77 s, sys: 141 ms, total: 1.91 s
Wall time: 20min 38s


In [53]:
matches.columns

Index(['ID', 'EventID', 'Round', 'Number', 'Player1ID', 'Score1', 'Walkover1',
       'Player2ID', 'Score2', 'Walkover2', 'WinnerID', 'Unfinished', 'OnBreak',
       'WorldSnookerID', 'LiveUrl', 'DetailsUrl', 'PointsDropped',
       'ShowCommonNote', 'Estimated', 'Type', 'TableNo', 'VideoURL',
       'InitDate', 'ModDate', 'StartDate', 'EndDate', 'ScheduledDate',
       'FrameScores', 'Sessions', 'Note', 'ExtendedNote'],
      dtype='object')

In [54]:
matches.shape

(16266, 31)

In [55]:
matches.head()

Unnamed: 0,ID,EventID,Round,Number,Player1ID,Score1,Walkover1,Player2ID,Score2,Walkover2,WinnerID,Unfinished,OnBreak,WorldSnookerID,LiveUrl,DetailsUrl,PointsDropped,ShowCommonNote,Estimated,Type,TableNo,VideoURL,InitDate,ModDate,StartDate,EndDate,ScheduledDate,FrameScores,Sessions,Note,ExtendedNote
0,3759470,621,1,61,1,0,True,184,0,False,1,False,False,453170,,http://cuetracker.net/Tournaments/riga-masters...,False,False,False,1,0,,2017-05-12T17:50:23Z,2017-05-31T15:07:42Z,2017-05-31T15:07:42Z,2017-05-31T15:07:42Z,2017-06-02T15:00:00Z,,,,
1,4219494,622,1,60,1,5,False,90,3,False,1,False,False,453105,,http://cuetracker.net/Tournaments/china-champi...,False,True,False,1,0,https://www.youtube.com/watch?v=jNtP5eMacos,2017-05-12T21:07:27Z,2017-06-03T13:37:44Z,2017-06-03T13:37:44Z,2017-06-03T17:22:56Z,2017-06-03T13:30:00Z,,,,
2,3760426,620,7,31,1,4,False,42,1,False,1,False,False,453311,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://youtu.be/8EK-3PydURY,2017-05-12T21:14:35Z,2017-06-23T16:08:16Z,2017-06-23T16:08:16Z,2017-06-23T18:10:23Z,2017-06-23T16:00:00Z,,,,
3,3759791,620,8,16,1,4,False,101,1,False,1,False,False,506813,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://www.youtube.com/watch?v=3uRQRBKIAaQ,2017-05-12T21:16:41Z,2017-06-25T21:10:53Z,2017-06-24T13:02:31Z,2017-06-24T14:32:50Z,2017-06-24T12:00:00Z,,,,
4,3759799,620,9,8,96,2,False,1,4,False,1,False,False,506818,,http://cuetracker.net/Tournaments/riga-masters...,False,False,True,1,0,,2017-05-12T21:18:00Z,2017-06-24T16:24:22Z,2017-06-24T16:24:22Z,2017-06-24T17:52:23Z,2017-06-24T16:00:00Z,,,,


In [77]:
# Binary target: 1 when WinnerID equals Player1ID, otherwise 0.
matches['WIN'] = matches['WinnerID'].eq(matches['Player1ID']).astype(int)

In [24]:
# Cache the downloaded matches on disk (row index not written).
matches.to_csv('Matches_data.csv', index=False)

In [73]:
# Reload matches from the CSV cache rather than re-running the download.
matches = pd.read_csv('Matches_data.csv')

## Append other data sources

In [74]:
(~matches[['Walkover1', 'Walkover2']].any(axis=1)).value_counts()

True     8728
False     246
dtype: int64

In [75]:
# Keep only rows where neither Walkover1 nor Walkover2 is set.
matches = matches.loc[~matches[['Walkover1', 'Walkover2']].any(axis=1)]

In [76]:
matches.head()

Unnamed: 0,ID,EventID,Round,Number,Player1ID,Score1,Walkover1,Player2ID,Score2,Walkover2,WinnerID,Unfinished,OnBreak,WorldSnookerID,LiveUrl,DetailsUrl,PointsDropped,ShowCommonNote,Estimated,Type,TableNo,VideoURL,InitDate,ModDate,StartDate,EndDate,ScheduledDate,FrameScores,Sessions,Note,ExtendedNote,WIN
1,4219494,622,1,60,1,5,False,90,3,False,1,False,False,453105,,http://cuetracker.net/Tournaments/china-champi...,False,True,False,1,0,https://www.youtube.com/watch?v=jNtP5eMacos,2017-05-12T21:07:27Z,2017-06-03T13:37:44Z,2017-06-03T13:37:44Z,2017-06-03T17:22:56Z,2017-06-03T13:30:00Z,,,,,1
2,3760426,620,7,31,1,4,False,42,1,False,1,False,False,453311,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://youtu.be/8EK-3PydURY,2017-05-12T21:14:35Z,2017-06-23T16:08:16Z,2017-06-23T16:08:16Z,2017-06-23T18:10:23Z,2017-06-23T16:00:00Z,,,,,1
3,3759791,620,8,16,1,4,False,101,1,False,1,False,False,506813,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://www.youtube.com/watch?v=3uRQRBKIAaQ,2017-05-12T21:16:41Z,2017-06-25T21:10:53Z,2017-06-24T13:02:31Z,2017-06-24T14:32:50Z,2017-06-24T12:00:00Z,,,,,1
4,3759799,620,9,8,96,2,False,1,4,False,1,False,False,506818,,http://cuetracker.net/Tournaments/riga-masters...,False,False,True,1,0,,2017-05-12T21:18:00Z,2017-06-24T16:24:22Z,2017-06-24T16:24:22Z,2017-06-24T17:52:23Z,2017-06-24T16:00:00Z,,,,,0
5,3759803,620,13,4,48,1,False,1,4,False,1,False,False,506823,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://www.youtube.com/watch?v=bzPeIGRYroI,2017-05-12T21:18:27Z,2017-06-25T11:04:12Z,2017-06-25T11:04:12Z,2017-06-25T12:43:26Z,2017-06-25T09:00:00Z,,,,,0


In [None]:
# Supplementary tables to join onto `matches`.
# - events: seasons 2017-2019
# - seedings: one request per distinct EventID in `matches`
# - rankings: fetch_season_rankings shifts the range back one year, so this
#   queries seasons 2016-2018
events = fetch_events_data(2017, 2019)
seedings = fetch_seedings_data(matches)
rankings = fetch_season_rankings(2017, 2019)

In [34]:
# Cache each downloaded side table as CSV (row index not written).
players.to_csv('Players_data.csv', index=False)
events.to_csv('Events_data.csv', index=False)
seedings.to_csv('Seedings_data.csv', index=False)
rankings.to_csv('Rankings_data.csv', index=False)

In [77]:
# Reload the side tables from their CSV caches rather than re-downloading.
players = pd.read_csv('Players_data.csv')
events = pd.read_csv('Events_data.csv')
seedings = pd.read_csv('Seedings_data.csv')
rankings = pd.read_csv('Rankings_data.csv')

In [78]:
# Attach event attributes to each match. The events key `ID` is renamed to
# `EventID` for the join; event columns whose names clash with match columns
# get the `_event` suffix, while match columns keep their names.
matches = matches.merge(
    events.rename(columns={'ID': 'EventID'}),
    how='left',
    on='EventID',
    suffixes=('', '_event'),
)

In [79]:
# Attach each player's seeding for the event: the seedings table is joined
# twice, once keyed on Player1ID and once on Player2ID.
matches = (
    matches
    .merge(
        seedings.rename(columns={'PlayerID': 'Player1ID', 'Seeding': 'Player1Seeding'}),
        how='left',
        on=['EventID', 'Player1ID'],
    )
    .merge(
        seedings.rename(columns={'PlayerID': 'Player2ID', 'Seeding': 'Player2Seeding'}),
        how='left',
        on=['EventID', 'Player2ID'],
    )
)

In [80]:
# Attach each player's FirstSeasonAsPro, then derive years as a professional
# at the time of the match as (event Season - first pro season).
matches = (
    matches
    .merge(
        players[['ID', 'FirstSeasonAsPro']].rename(
            columns={'ID': 'Player1ID', 'FirstSeasonAsPro': 'Player1FirstSeasonAsPro'}
        ),
        how='left',
        on='Player1ID',
    )
    .merge(
        players[['ID', 'FirstSeasonAsPro']].rename(
            columns={'ID': 'Player2ID', 'FirstSeasonAsPro': 'Player2FirstSeasonAsPro'}
        ),
        how='left',
        on='Player2ID',
    )
    .assign(
        Player1YearsAsPro=lambda df: df['Season'] - df['Player1FirstSeasonAsPro'],
        Player2YearsAsPro=lambda df: df['Season'] - df['Player2FirstSeasonAsPro'],
    )
)

In [81]:
# Season preceding the event's season; used as the join key for rankings.
matches['LastSeason'] = matches['Season'] - 1

In [82]:
# Attach each player's previous-season ranking position and sum: the rankings
# table is joined twice on (player, LastSeason), once per player slot.
matches = (
    matches
    .merge(
        rankings[['PlayerID', 'Position', 'Season', 'Sum']].rename(columns={
            'PlayerID': 'Player1ID',
            'Position': 'Player1LastSeasonRank',
            'Season': 'LastSeason',
            'Sum': 'Player1LastSeasonSum',
        }),
        how='left',
        on=['Player1ID', 'LastSeason'],
    )
    .merge(
        rankings[['PlayerID', 'Position', 'Season', 'Sum']].rename(columns={
            'PlayerID': 'Player2ID',
            'Position': 'Player2LastSeasonRank',
            'Season': 'LastSeason',
            'Sum': 'Player2LastSeasonSum',
        }),
        how='left',
        on=['Player2ID', 'LastSeason'],
    )
)

In [83]:
matches.head(10)

Unnamed: 0,ID,EventID,Round,Number,Player1ID,Score1,Walkover1,Player2ID,Score2,Walkover2,WinnerID,Unfinished,OnBreak,WorldSnookerID,LiveUrl,DetailsUrl,PointsDropped,ShowCommonNote,Estimated,Type,TableNo,VideoURL,InitDate,ModDate,StartDate,EndDate,ScheduledDate,FrameScores,Sessions,Note,ExtendedNote,WIN,Name,StartDate_event,EndDate_event,Sponsor,Season,Type_event,Num,Venue,City,Country,Discipline,Main,Sex,AgeGroup,Url,Related,Stage,ValueType,ShortName,WorldSnookerId,RankingType,EventPredictionID,Team,Format,Twitter,HashTag,ConversionRate,AllRoundsAdded,PhotoURLs,NumCompetitors,NumUpcoming,NumActive,NumResults,Note_event,CommonNote,DefendingChampion,PreviousEdition,Player1Seeding,Player2Seeding,Player1FirstSeasonAsPro,Player2FirstSeasonAsPro,Player1YearsAsPro,Player2YearsAsPro,LastSeason,Player1LastSeasonRank,Player1LastSeasonSum,Player2LastSeasonRank,Player2LastSeasonSum
0,4219494,622,1,60,1,5,False,90,3,False,1,False,False,453105,,http://cuetracker.net/Tournaments/china-champi...,False,True,False,1,0,https://www.youtube.com/watch?v=jNtP5eMacos,2017-05-12T21:07:27Z,2017-06-03T13:37:44Z,2017-06-03T13:37:44Z,2017-06-03T17:22:56Z,2017-06-03T13:30:00Z,,,,,1,China Championship Qualifiers,2017-06-03,2017-06-06,EverGrande,2017,Qualifying,0,Guild Hall,Preston,England,snooker,623,Both,O,,,Q,CC,,13955,WR,0,False,1,,ChinaChampionship,1.0,True,,0,0,0,64,,"<a href=""http://www.eurosportplayer.com/"">Euro...",237,568,,,1992.0,2010.0,25.0,7.0,2016,15.0,211975.0,96.0,18862.0
1,3760426,620,7,31,1,4,False,42,1,False,1,False,False,453311,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://youtu.be/8EK-3PydURY,2017-05-12T21:14:35Z,2017-06-23T16:08:16Z,2017-06-23T16:08:16Z,2017-06-23T18:10:23Z,2017-06-23T16:00:00Z,,,,,1,Riga Masters,2017-06-23,2017-06-25,Kaspersky,2017,Ranking,0,Arena Riga,Riga,Latvia,snooker,620,Both,O,,riga,F,RM,,13953,WR,2701,False,1,,RigaMasters,1.0,True,,128,0,0,63,,"<a href=""http://www.eurosportplayer.com/"">Euro...",154,515,8.0,33.0,1992.0,1991.0,25.0,26.0,2016,15.0,211975.0,40.0,99387.0
2,3759791,620,8,16,1,4,False,101,1,False,1,False,False,506813,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://www.youtube.com/watch?v=3uRQRBKIAaQ,2017-05-12T21:16:41Z,2017-06-25T21:10:53Z,2017-06-24T13:02:31Z,2017-06-24T14:32:50Z,2017-06-24T12:00:00Z,,,,,1,Riga Masters,2017-06-23,2017-06-25,Kaspersky,2017,Ranking,0,Arena Riga,Riga,Latvia,snooker,620,Both,O,,riga,F,RM,,13953,WR,2701,False,1,,RigaMasters,1.0,True,,128,0,0,63,,"<a href=""http://www.eurosportplayer.com/"">Euro...",154,515,8.0,20.0,1992.0,2011.0,25.0,6.0,2016,15.0,211975.0,27.0,132075.0
3,3759799,620,9,8,96,2,False,1,4,False,1,False,False,506818,,http://cuetracker.net/Tournaments/riga-masters...,False,False,True,1,0,,2017-05-12T21:18:00Z,2017-06-24T16:24:22Z,2017-06-24T16:24:22Z,2017-06-24T17:52:23Z,2017-06-24T16:00:00Z,,,,,0,Riga Masters,2017-06-23,2017-06-25,Kaspersky,2017,Ranking,0,Arena Riga,Riga,Latvia,snooker,620,Both,O,,riga,F,RM,,13953,WR,2701,False,1,,RigaMasters,1.0,True,,128,0,0,63,,"<a href=""http://www.eurosportplayer.com/"">Euro...",154,515,46.0,8.0,2012.0,1992.0,5.0,25.0,2016,53.0,74650.0,15.0,211975.0
4,3759803,620,13,4,48,1,False,1,4,False,1,False,False,506823,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://www.youtube.com/watch?v=bzPeIGRYroI,2017-05-12T21:18:27Z,2017-06-25T11:04:12Z,2017-06-25T11:04:12Z,2017-06-25T12:43:26Z,2017-06-25T09:00:00Z,,,,,0,Riga Masters,2017-06-23,2017-06-25,Kaspersky,2017,Ranking,0,Arena Riga,Riga,Latvia,snooker,620,Both,O,,riga,F,RM,,13953,WR,2701,False,1,,RigaMasters,1.0,True,,128,0,0,63,,"<a href=""http://www.eurosportplayer.com/"">Euro...",154,515,41.0,8.0,2006.0,1992.0,11.0,25.0,2016,48.0,83662.0,15.0,211975.0
5,3759842,620,14,2,68,5,False,1,4,False,68,False,False,506828,,http://cuetracker.net/Tournaments/riga-masters...,False,True,True,1,0,https://www.youtube.com/watch?v=rGUyT6mgGxg,2017-05-12T21:18:54Z,2017-06-25T12:54:21Z,2017-06-25T12:54:21Z,2017-06-25T15:43:41Z,2017-06-25T12:00:00Z,"86-36, 82-41 (70), 0-81 (56), 8-74<br/>5-77 (5...",,,,1,Riga Masters,2017-06-23,2017-06-25,Kaspersky,2017,Ranking,0,Arena Riga,Riga,Latvia,snooker,620,Both,O,,riga,F,RM,,13953,WR,2701,False,1,,RigaMasters,1.0,True,,128,0,0,63,,"<a href=""http://www.eurosportplayer.com/"">Euro...",154,515,11.0,8.0,1998.0,1992.0,19.0,25.0,2016,18.0,197087.0,15.0,211975.0
6,3910645,635,1,37,1,5,False,416,0,False,1,False,False,453638,,http://cuetracker.net/Tournaments/world-open/2...,False,True,False,1,0,https://www.youtube.com/watch?v=orJE8c4YqFs,2017-07-15T09:48:14Z,2017-08-06T13:29:17Z,2017-08-06T13:29:17Z,2017-08-06T15:42:57Z,2017-08-06T13:30:00Z,,,,,1,World Open Qualifiers,2017-08-06,2017-08-09,,2017,Qualifying,0,Guild Hall,Preston,England,snooker,638,Both,O,,hwo,Q,WO,,13971,WR,0,False,1,,,1.0,True,,0,0,0,64,,"Watch on <a href=""http://www.eurosportplayer.c...",0,0,,,1992.0,2016.0,25.0,1.0,2016,15.0,211975.0,119.0,5050.0
7,3851063,623,7,30,218,2,False,1,5,False,1,False,False,517770,,http://cuetracker.net/Tournaments/china-champi...,False,False,False,1,0,,2017-05-12T21:27:18Z,2017-08-17T11:34:16Z,2017-08-17T11:34:16Z,2017-08-17T14:02:40Z,2017-08-17T11:30:00Z,,,,,0,China Championship,2017-08-16,2017-08-22,EverGrande,2017,Ranking,0,Guangzhou Sports Institute Asian Games Venue,Guangzhou,China,snooker,623,Both,O,,,F,CC,,13955,WR,2702,False,1,,ChinaChampionship,1.0,True,,128,0,0,63,,"Watch on <a href=""http://www.eurosportplayer.c...",237,568,50.0,15.0,2006.0,1992.0,11.0,25.0,2016,50.0,77575.0,15.0,211975.0
8,3853804,623,8,15,68,0,False,1,5,False,1,False,False,517790,,http://cuetracker.net/Tournaments/china-champi...,False,False,False,1,0,,2017-05-12T21:28:17Z,2017-08-18T11:22:20Z,2017-08-18T11:22:20Z,2017-08-18T13:20:48Z,2017-08-18T11:30:00Z,,,,,0,China Championship,2017-08-16,2017-08-22,EverGrande,2017,Ranking,0,Guangzhou Sports Institute Asian Games Venue,Guangzhou,China,snooker,623,Both,O,,,F,CC,,13955,WR,2702,False,1,,ChinaChampionship,1.0,True,,128,0,0,63,,"Watch on <a href=""http://www.eurosportplayer.c...",237,568,18.0,15.0,1998.0,1992.0,19.0,25.0,2016,18.0,197087.0,15.0,211975.0
9,3889522,623,9,8,1,5,False,8,3,False,1,False,False,517797,,http://cuetracker.net/Tournaments/china-champi...,False,False,False,1,0,,2017-05-12T21:28:57Z,2017-08-19T06:01:36Z,2017-08-19T06:01:36Z,2017-08-19T08:36:59Z,2017-08-19T06:00:00Z,,,,,1,China Championship,2017-08-16,2017-08-22,EverGrande,2017,Ranking,0,Guangzhou Sports Institute Asian Games Venue,Guangzhou,China,snooker,623,Both,O,,,F,CC,,13955,WR,2702,False,1,,ChinaChampionship,1.0,True,,128,0,0,63,,"Watch on <a href=""http://www.eurosportplayer.c...",237,568,15.0,31.0,1992.0,2001.0,25.0,16.0,2016,15.0,211975.0,31.0,122500.0


In [84]:
matches.columns

Index(['ID', 'EventID', 'Round', 'Number', 'Player1ID', 'Score1', 'Walkover1',
       'Player2ID', 'Score2', 'Walkover2', 'WinnerID', 'Unfinished', 'OnBreak',
       'WorldSnookerID', 'LiveUrl', 'DetailsUrl', 'PointsDropped',
       'ShowCommonNote', 'Estimated', 'Type', 'TableNo', 'VideoURL',
       'InitDate', 'ModDate', 'StartDate', 'EndDate', 'ScheduledDate',
       'FrameScores', 'Sessions', 'Note', 'ExtendedNote', 'WIN', 'Name',
       'StartDate_event', 'EndDate_event', 'Sponsor', 'Season', 'Type_event',
       'Num', 'Venue', 'City', 'Country', 'Discipline', 'Main', 'Sex',
       'AgeGroup', 'Url', 'Related', 'Stage', 'ValueType', 'ShortName',
       'WorldSnookerId', 'RankingType', 'EventPredictionID', 'Team', 'Format',
       'Twitter', 'HashTag', 'ConversionRate', 'AllRoundsAdded', 'PhotoURLs',
       'NumCompetitors', 'NumUpcoming', 'NumActive', 'NumResults',
       'Note_event', 'CommonNote', 'DefendingChampion', 'PreviousEdition',
       'Player1Seeding', 'Player2Seeding'

In [86]:
# Persist the fully merged matches frame (row index not written).
matches.to_csv('Raw_data.csv', index=False)

### Exploratory data analysis

1. Can we get Betfair historical data and compare how biased these "source of truth" probabilities are?
2. Which features matter most in determining who wins a match (e.g. ranking difference, form, stage of competition)?
3. Is there a systematic way of defining features?

### Feature engineering