# Pull in player data from Soccerbase

In [22]:
import pandas as pd

# Read the player-info CSV published on GitHub and keep its player_id column
# as a plain Python list.
sb_df = pd.read_csv(
    'https://raw.githubusercontent.com/petebrown/scrape-player-info/main/data/player-info.csv'
)

sb_plr_ids = sb_df['player_id'].tolist()

In [23]:
import concurrent.futures
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm


# Request headers passed to requests.get() in get_soup(); the User-Agent is a
# desktop Chrome browser string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
}

def get_soup(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the calling worker thread indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.
    """
    # The module-level ``headers`` dict is sent with every request.
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

def get_info(url):
    """Turn the stat/value tables on a player page into a single wide row.

    The tables at positions 1 and 2 of ``pd.read_html(url)`` are stacked,
    rows containing any missing value are dropped, and the result is pivoted
    so each stat becomes a column.

    Args:
        url: Player page address. The text after the last ``=`` is used as
            the ``player_id`` value.

    Returns:
        A DataFrame with a ``player_id`` column plus one column per stat.
        Assumes both tables have exactly two columns (stat, value); the
        column rename fails otherwise.
    """
    # Download and parse the page a single time; both tables come from the
    # same parse result.
    tables = pd.read_html(url)
    pl = pd.concat([tables[1], tables[2]]).dropna()
    pl.columns = ['stat', 'value']
    pl['player_id'] = url.split('=')[-1]
    pl = pl.pivot(index='player_id', columns='stat', values='value').reset_index()
    return pl

def fetch_player_data(id):
    """Scrape one Soccerbase player page.

    Args:
        id: Soccerbase player id, interpolated into the player page URL.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        dict with keys:
            ``player_id``: the id passed in,
            ``player_name``: stripped text of the ``h1`` inside the
                ``imageHead`` table,
            ``soccerbase_pos``: first space-separated token of the text in
                the ``div`` with class ``midfielder bull``, or ``''`` when
                that element is not on the page,
            ``player_info``: the DataFrame returned by ``get_info(url)``.

    Raises:
        AttributeError: if the ``imageHead`` table or its ``h1`` is missing.
    """
    url = f"https://www.soccerbase.com/players/player.sd?player_id={id}"
    soup = get_soup(url)
    name = soup.find('table', {"class": 'imageHead'}).find('h1').text

    # Only a missing element is treated as "no position"; any other failure
    # (including KeyboardInterrupt) is allowed to propagate instead of being
    # swallowed by a bare except.
    position_tag = soup.find('div', class_='midfielder bull')
    if position_tag is None:
        position = ''
    else:
        position = position_tag.get_text(strip=True, separator=' ').split(' ')[0]

    return {
        'player_id': id,
        'player_name': name.strip(),
        'soccerbase_pos': position,
        'player_info': get_info(url)
    }

# Scrape every player page on a thread pool. Results are appended in the order
# the requests finish, so player_info is not ordered like sb_plr_ids.
player_info = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future back to the id it was submitted for.
    futures = {executor.submit(fetch_player_data, plr_id): plr_id for plr_id in sb_plr_ids}
    # as_completed() returns an iterator with no length, so tqdm needs the
    # total passed explicitly to show progress towards completion.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        # result() re-raises any exception raised inside the worker.
        player_data = future.result()
        player_info.append(player_data)

0it [00:00, ?it/s]

In [59]:
# One row per scraped player: player_id, player_name, soccerbase_pos and the
# per-player player_info frame.
df = pd.DataFrame(player_info)

df.player_id = df.player_id.astype(int)

# Remove shirt numbers from scraped player names.
# The dot is escaped so only a literal "<digits>. " prefix is stripped; an
# unescaped "." would match any character after the digits.
df['player_name'] = df['player_name'].str.replace(r'^\d+\. ', '', regex=True)

In [60]:
# Scraped spelling -> replacement spelling for player names. Each entry is
# applied as an exact, whole-string match on df.player_name.
name_fixes = {
    "Anthony Kay": "Antony Kay",
    "Corey Taylor": "Corey Blackett-Taylor",
    "Craig Carl Curran": "Craig Curran",
    "Chris Edwards": "Christian Edwards",
    "Daniel Robert Harrison": "Danny Harrison",
    "Dave Nugent": "David Nugent",
    "Dylan Mottley Henry": "Dylan Mottley-Henry",
    "Jay Turner-Cook": "Jay Turner-Cooke",
    "Jimmy McNulty": "Jim McNulty",
    "John-Louis Akpa Akpro": "Jean-Louis Akpa Akpro",
    "John Morrissey": "Johnny Morrissey",
    "Jonathon Margetts": "Johnny Margetts",
    "Joseph Maguire": "Joe Maguire",
    "Kaylden Brown": "Kayleden Brown",
    "Lewis Sinnot": "Lewis Sinnott",
    "Lateef Elford Alliyu": "Lateef Elford-Alliyu",
    "Matty Kennedy": "Matthew Kennedy",
    "Michael Jackson": "Mike Jackson",
    "Michael Jones": "Mike Jones",
    "Oliver Banks": "Ollie Banks",
    "Ousmane Kane": "Ousmane Kané",
    "Richard Sutton": "Ritchie Sutton",
    "Robert Apter": "Rob Apter",
    "Robert Taylor": "Rob Taylor",
    "Steven O'Leary": "Stephen O'Leary"
}

# Apply the fixes one at a time, in dictionary order.
for old_name, new_name in tqdm(name_fixes.items()):
    rows_to_fix = df['player_name'].eq(old_name)
    df.loc[rows_to_fix, 'player_name'] = new_name

  0%|          | 0/25 [00:00<?, ?it/s]

In [61]:
# Id/name lookup for the scraped players, with player_id cast to int.
sb_plr_names = df[['player_id', 'player_name']].assign(
    player_id=lambda frame: frame['player_id'].astype(int)
)

sb_plr_names.head(5)

Unnamed: 0,player_id,player_name
0,181422,Mikey Davies
1,137278,Jean Belehouan
2,101331,Josh Hawkes
3,162707,Ben Hockenhull
4,76807,Tom Davies


In [62]:
# Hand-set positions, keyed by (already corrected) player name; these
# overwrite whatever position was scraped for the matching rows.
sb_pos_fixes = {
    "Billy Woods": "Midfielder",
    "Dariusz Kubicki": "Defender",
    "Dave Higgins": "Defender",
    "Gary Stevens": "Defender",
    "Andy Thorn": "Defender",
    "James Devine": "Midfielder",
    "Stephen O'Leary": "Midfielder",
    "Steve Mungall": "Defender"
}

# Work on a copy so df keeps the positions exactly as scraped.
sb_plr_pos = df[['player_id', 'player_name', 'soccerbase_pos']].copy()

for plr, pos in sb_pos_fixes.items():
    matches_player = sb_plr_pos['player_name'].eq(plr)
    sb_plr_pos.loc[matches_player, 'soccerbase_pos'] = pos

sb_plr_pos.head(3)

Unnamed: 0,player_id,player_name,soccerbase_pos
0,181422,Mikey Davies,Defender
1,137278,Jean Belehouan,Defender
2,101331,Josh Hawkes,Midfielder


In [63]:
# Stack the per-player stat frames into one table with a fresh 0..n-1 index.
plr_info = pd.concat(list(df['player_info']), ignore_index=True)

# Pull the text between "(Born " and ")" out of the Age column, then parse it
# as a datetime; rows whose Age does not match the pattern end up as NaT.
plr_info['dob'] = plr_info['Age'].str.extract(r'\d+ \(Born (.*)\)', expand=False)
plr_info['dob'] = pd.to_datetime(plr_info['dob'])

# Id/date-of-birth lookup with integer ids.
sb_plr_dobs = plr_info[['player_id', 'dob']].astype({'player_id': int})

sb_plr_dobs.head(5)

stat,player_id,dob
0,181422,NaT
1,137278,2000-09-01
2,101331,1999-01-28
3,162707,2001-09-03
4,76807,1992-04-18


In [64]:
# Join names to dates of birth on player_id (inner join, the merge default).
sb_plrs = pd.merge(sb_plr_names, sb_plr_dobs, on='player_id')

sb_plrs

Unnamed: 0,player_id,player_name,dob
0,181422,Mikey Davies,NaT
1,137278,Jean Belehouan,2000-09-01
2,101331,Josh Hawkes,1999-01-28
3,162707,Ben Hockenhull,2001-09-03
4,76807,Tom Davies,1992-04-18
...,...,...,...
474,5828,Pat Nevin,1963-09-06
475,94139,Sam Ilesanmi,NaT
476,24078,Perry Taylor,NaT
477,102630,Callum Lucy,NaT


In [65]:
# Hand-entered dates of birth (YYYY-MM-DD), keyed by player name. Each one
# replaces the dob of every row whose player_name matches exactly.
dob_fixes = {
    'Akpo Sodje': '1980-01-31',
    'Arnaud Mendy': '1990-02-10',
    'Ben Tomlinson': '1989-10-31',
    'Brad Walker': '1996-04-25',
    'Callum Lucy': '1998-11-09',
    'Chris McCready': '1981-09-05',
    'Clayton McDonald': '1988-12-06',
    'Ernie Davies': '1916-01-31',
    'Emmanuel Dieseruvwe': '1995-02-20',
    'Kane Hemmings': '1991-04-08',
    'Mikey Davies': '2004-09-23',
    'Nathan Blissett': '1990-06-29',
    'Robbie Burns': '1990-11-15',
    'Perry Taylor': '1981-01-29',
    'Rob Apter': '2003-04-23',
    'Ryan Edwards': '1993-10-07',
    'Sam Ilesanmi': '1998-07-13',
    'Scott Davies': '1987-02-23',
    "Stephen O'Leary": '1987-02-02',
    "Seyni N'Diaye": '1973-06-01',
    'Will Aimson': '1994-06-03'
}

for name, dob in dob_fixes.items():
    is_named_player = sb_plrs['player_name'].eq(name)
    sb_plrs.loc[is_named_player, 'dob'] = dob

sb_plrs

Unnamed: 0,player_id,player_name,dob
0,181422,Mikey Davies,2004-09-23
1,137278,Jean Belehouan,2000-09-01
2,101331,Josh Hawkes,1999-01-28
3,162707,Ben Hockenhull,2001-09-03
4,76807,Tom Davies,1992-04-18
...,...,...,...
474,5828,Pat Nevin,1963-09-06
475,94139,Sam Ilesanmi,1998-07-13
476,24078,Perry Taylor,1981-01-29
477,102630,Callum Lucy,1998-11-09


In [66]:
# Players that still have no date of birth after the manual fixes.
sb_plrs[sb_plrs['dob'].isnull()].reset_index(drop=True)

Unnamed: 0,player_id,player_name,dob
0,146009,Dylan Dwyer,NaT
1,134764,Jamie Timlin,NaT
2,111843,Nick Long,NaT
3,111841,James Devine,NaT
4,102625,James Divine,NaT
5,102626,Tom Coughan,NaT
6,102633,Jack Flemming,NaT
7,88395,Darren Askew,NaT
8,86848,Andy Mathieson,NaT
9,61489,John Courtney,NaT


In [67]:
# Keep only players with a known date of birth and renumber the rows.
sb_plrs = sb_plrs.dropna(subset=['dob']).reset_index(drop=True)

sb_plrs

Unnamed: 0,player_id,player_name,dob
0,181422,Mikey Davies,2004-09-23
1,137278,Jean Belehouan,2000-09-01
2,101331,Josh Hawkes,1999-01-28
3,162707,Ben Hockenhull,2001-09-03
4,76807,Tom Davies,1992-04-18
...,...,...,...
458,5828,Pat Nevin,1963-09-06
459,94139,Sam Ilesanmi,1998-07-13
460,24078,Perry Taylor,1981-01-29
461,102630,Callum Lucy,1998-11-09


In [68]:
def split_name(name):
    """Split a full player name into forename and surname.

    The first whitespace-separated word is the forename and everything after
    it is the surname. 'Pedro Miguel Matias' is special-cased because his
    forename is two words.

    Args:
        name: Full player name as a string.

    Returns:
        pd.Series of ``[forename, surname]``. The surname is ``''`` for a
        one-word name, and both are ``''`` for an empty or whitespace-only
        name (which would otherwise raise IndexError).
    """
    if name == 'Pedro Miguel Matias':
        forename = 'Pedro Miguel'
        surname = 'Matias'
    else:
        parts = name.split()
        if not parts:
            # Nothing to split: avoid indexing into an empty list.
            forename = ''
            surname = ''
        else:
            forename = parts[0]
            surname = ' '.join(parts[1:])
    return pd.Series([forename, surname])
    
# Run split_name on every player_name; the two resulting columns are stored
# as forename and surname, in that order.
name_parts = sb_plrs['player_name'].apply(split_name)
sb_plrs[['forename', 'surname']] = name_parts

sb_plrs.head(3)

Unnamed: 0,player_id,player_name,dob,forename,surname
0,181422,Mikey Davies,2004-09-23,Mikey,Davies
1,137278,Jean Belehouan,2000-09-01,Jean,Belehouan
2,101331,Josh Hawkes,1999-01-28,Josh,Hawkes


In [69]:
# Attach the corrected positions; the left join keeps every row of sb_plrs.
sb_plrs = pd.merge(sb_plrs, sb_plr_pos, how='left', on=['player_id', 'player_name'])

In [70]:
# Replace the numeric Soccerbase id with a text key built from
# surname + forename + date of birth.
sb_plrs['player_id'] = sb_plrs.surname + sb_plrs.forename + sb_plrs.dob.astype(str)

# Strip '.0', hyphens, apostrophes and spaces, and replace 'é' with 'e'.
# regex=False is passed explicitly: pandas releases before 2.0 treat the
# pattern as a regular expression by default, where '.0' would match any
# character followed by a zero and remove digits from the date.
sb_plrs['player_id'] = (
    sb_plrs['player_id']
    .str.replace('.0', '', regex=False)
    .str.replace('-', '', regex=False)
    .str.replace("'", '', regex=False)
    .str.replace(" ", '', regex=False)
    .str.replace('é', 'e', regex=False)
)

sb_plrs = sb_plrs[['player_id', 'surname', 'forename', 'player_name', 'dob', 'soccerbase_pos']].copy()

sb_plrs

Unnamed: 0,player_id,surname,forename,player_name,dob,soccerbase_pos
0,DaviesMikey20040923,Davies,Mikey,Mikey Davies,2004-09-23,Defender
1,BelehouanJean20000901,Belehouan,Jean,Jean Belehouan,2000-09-01,Defender
2,HawkesJosh19990128,Hawkes,Josh,Josh Hawkes,1999-01-28,Midfielder
3,HockenhullBen20010903,Hockenhull,Ben,Ben Hockenhull,2001-09-03,Defender
4,DaviesTom19920418,Davies,Tom,Tom Davies,1992-04-18,Defender
...,...,...,...,...,...,...
458,NevinPat19630906,Nevin,Pat,Pat Nevin,1963-09-06,Forward
459,IlesanmiSam19980713,Ilesanmi,Sam,Sam Ilesanmi,1998-07-13,Forward
460,TaylorPerry19810129,Taylor,Perry,Perry Taylor,1981-01-29,Midfielder
461,LucyCallum19981109,Lucy,Callum,Callum Lucy,1998-11-09,Midfielder


In [71]:
# Short code for each Soccerbase position name.
sb_pos_codes = {
    "Goalkeeper": "G",
    "Defender": "D",
    "Midfielder": "MF",
    "Forward": "F",
}

# Positions that are not keys of sb_pos_codes map to NaN.
sb_plrs['position'] = sb_plrs['soccerbase_pos'].map(sb_pos_codes)

In [72]:
# Rename the dob column to player_dob.
sb_plrs = sb_plrs.rename({'dob': 'player_dob'}, axis='columns')

In [73]:
# Final column order, sorted by player_id with a fresh index.
# NOTE(review): the 'position' column created from sb_pos_codes is not in this
# list, so it is dropped here - confirm that is intended.
output_cols = ['player_id', 'surname', 'forename', 'player_name', 'player_dob', 'soccerbase_pos']
sb_plrs = sb_plrs.loc[:, output_cols].sort_values(by='player_id').reset_index(drop=True)

sb_plrs.head(5)

Unnamed: 0,player_id,surname,forename,player_name,player_dob,soccerbase_pos
0,AchterbergJohn19710708,Achterberg,John,John Achterberg,1971-07-08,Goalkeeper
1,AhmedAdnan19840607,Ahmed,Adnan,Adnan Ahmed,1984-06-07,Midfielder
2,AimsonWill19940603,Aimson,Will,Will Aimson,1994-06-03,Defender
3,AistonSam19761121,Aiston,Sam,Sam Aiston,1976-11-21,Midfielder
4,AkammaduFranklyn19980811,Akammadu,Franklyn,Franklyn Akammadu,1998-08-11,Forward


In [74]:
# Write the cleaned table to disk without the index column.
sb_plrs.to_csv(path_or_buf='./output/soccerbase.csv', index=False)

In [40]:
# Reference list of player ids and names read from a local CSV, de-duplicated
# on the (player_id, player_name) pair.
r_ids = (
    pd.read_csv('~/Desktop/player_ids.csv')
    .loc[:, ['player_id', 'player_name']]
    .drop_duplicates()
    .reset_index(drop=True)
)

r_ids

Unnamed: 0,player_id,player_name
0,ACourtAlan19340930,Alan A'Court
1,AchterbergJohn19710708,John Achterberg
2,AdamsArthur,Arthur Adams
3,AdamsEdward19081112,Edward Adams
4,AdkinsNigel19650311,Nigel Adkins
...,...,...
1020,YatesWilf1898,Wilf Yates
1021,YeatsRon19371115,Ron Yeats
1022,YoungJack20001021,Jack Young
1023,YoungTommy19471224,Tommy Young


In [41]:
# Reference players whose id does not appear in the Soccerbase table.
r_ids[~r_ids['player_id'].isin(sb_plrs['player_id'])]

Unnamed: 0,player_id,player_name
0,ACourtAlan19340930,Alan A'Court
2,AdamsArthur,Arthur Adams
3,AdamsEdward19081112,Edward Adams
4,AdkinsNigel19650311,Nigel Adkins
12,AlcockEdward1914,Edward Alcock
...,...,...
1016,WorthingtonFrank19481123,Frank Worthington
1017,YardleyGeorge19421008,George Yardley
1020,YatesWilf1898,Wilf Yates
1021,YeatsRon19371115,Ron Yeats


In [42]:
# Soccerbase players whose id does not appear in the reference list, ordered
# by surname and then forename.
sb_plrs[~sb_plrs['player_id'].isin(r_ids['player_id'])].sort_values(['surname', 'forename'])

Unnamed: 0,player_id,surname,forename,player_name,player_dob,soccerbase_pos
7,AkpanUdi19990210,Akpan,Udi,Udi Akpan,1999-02-10,Forward
32,BensonLiam19920225,Benson,Liam,Liam Benson,1992-02-25,Defender
40,BolandAntonie19941230,Boland,Antonie,Antonie Boland,1994-12-30,Defender
92,DaviesLiam19960702,Davies,Liam,Liam Davies,1996-07-02,Midfielder
100,DevineJay19990726,Devine,Jay,Jay Devine,1999-07-26,Midfielder
127,FisherMax20031003,Fisher,Max,Max Fisher,2003-10-03,Midfielder
149,GouldbourneEthan20001010,Gouldbourne,Ethan,Ethan Gouldbourne,2000-10-10,Midfielder
209,JohnstonMichael19871216,Johnston,Michael,Michael Johnston,1987-12-16,Defender
212,JonesBen20001129,Jones,Ben,Ben Jones,2000-11-29,Goalkeeper
214,JonesEthan20041016,Jones,Ethan,Ethan Jones,2004-10-16,Defender
