In [1]:
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import concurrent.futures
import pandas as pd
import re
import requests


def get_soup(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

In [2]:
def get_seasons(url):
    soup = get_soup(url)
    
    seasons = soup.find('select', {'name': 'saison_id'}).find_all('option')
    
    ssn_list = []
    for season in seasons:
        season_id = season['value'].strip()
        season_name = season.text.strip()
        ssn_list.append({
            'season': season_name,
            'season_id': season_id
        })
    return ssn_list

club_url = 'https://www.transfermarkt.us/tranmere-rovers/kader/verein/1074/saison_id/2024/'

seasons = get_seasons(club_url)

seasons[:3]

[{'season': '24/25', 'season_id': '2024'},
 {'season': '23/24', 'season_id': '2023'},
 {'season': '22/23', 'season_id': '2022'}]

In [3]:
def get_ssn_url(season_id):
    return f'https://www.transfermarkt.us/tranmere-rovers/kader/verein/1074/saison_id/{season_id}'

In [4]:
def get_players(soup):
    return soup.find_all(class_='posrela')

def get_player_link(player):
    p = player.find(class_='hauptlink').find('a')
    player_name = p.text.strip()
    player_url = p['href']
    return {
        'player_name': player_name,
        'player_url': player_url
    }

# Get player names and URLs for every season

In [5]:
def process_season(season):
    season_id = season['season_id']
    season_url = get_ssn_url(season_id)
    soup = get_soup(season_url)
    players = get_players(soup)
    
    player_links = []
    for player in players:
        player_link = get_player_link(player)
        player_links.append(player_link)
    return player_links

plr_list = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_season, s) for s in seasons]
    
    for future in tqdm(concurrent.futures.as_completed(futures)):
        plr_list.extend(future.result())

0it [00:00, ?it/s]

In [6]:
plr_list = pd.DataFrame(plr_list).drop_duplicates().to_dict(orient='records')

plr_list[:3]

[{'player_name': 'Iain Turner',
  'player_url': '/iain-turner/profil/spieler/3582'},
 {'player_name': 'Scott Davies',
  'player_url': '/scott-davies/profil/spieler/67273'},
 {'player_name': 'Paddy Wharton',
  'player_url': '/paddy-wharton/profil/spieler/504952'}]

# Get HTML content of player profile

In [7]:
def process_player(plr):
    plr_url = f"https://www.transfermarkt.us{plr['player_url']}"
    plr_name = plr['player_name']
    plr_soup = get_soup(plr_url)
    return {
        'player_name': plr_name,
        'player_url': plr_url,
        'player_html': plr_soup
    }

plrs_html = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_player, plr) for plr in plr_list]
    
    for future in tqdm(concurrent.futures.as_completed(futures)):
        plrs_html.append(future.result())

0it [00:00, ?it/s]

# Pull player positions from player html

In [8]:
df = pd.DataFrame(plrs_html)

def get_position(soup):
    positions = soup.find_all('dd', class_='detail-position__position')
    return [pos.text.strip() for pos in positions]

df['positions'] = df['player_html'].apply(get_position)

df['positions'][:3]

0                         [Goalkeeper]
1                        [Centre-Back]
2    [Centre-Back, Defensive Midfield]
Name: positions, dtype: object

# Fix names pulled from Transfermarkt to match names in main dataset

In [9]:
name_fixes = {
    'Ken Beamish': 'Kenny Beamish',
    'Dave Burgess': 'David Burgess',
    'Jim Cassidy': 'James Cassidy',
    'James Cumbes': 'Jim Cumbes',
    'Ron Dellow': 'Ronnie Dellow',
    'Stanley Docking': 'Stan Docking',
    'Herbert Hamilton': 'Duke Hamilton',
    'Jim Harvey': 'Jimmy Harvey',
    'Steve Jennings': 'Steven Jennings',
    'Ousmane Kane': 'Ousmane Kané',
    'Matty Kennedy': 'Matthew Kennedy',
    'John King': 'Johnny King',
    'Nathaniel Knight-Percival': 'Nat Knight-Percival',
    'Shay Logan': 'Shaleum Logan',
    'Jonathon Margetts': 'Johnny Margetts',
    'Hugh McAuley': 'Hughie McAuley',
    'Jay McEveley': 'James McEveley',
    'Manny Monthe': 'Emmanuel Monthe',
    'John Morrissey': 'Johnny Morrissey',
    'Joe Starbuck': 'Joseph Starbuck',
    'James Steel': 'Jim Steel',
    "Steven O'Leary": "Stephen O'Leary",
    'Sam Taylor': 'Samuel Taylor',
    'Danny Woodards': 'Dan Woodards',
}

for name in name_fixes:
    df.loc[df['player_name'] == name, 'player_name'] = name_fixes[name]

# Add player positions to dataframe

In [10]:
# Find the maximum length of the lists in 'positions' column
max_len = df['positions'].apply(len).max()

# Create new columns 'position_1', 'position_2', etc.
for i in range(max_len):
    df[f'tm_pos_{i+1}'] = df['positions'].apply(lambda x: x[i] if i < len(x) else None)

# Drop the original 'positions' column
df = df.drop(columns=['positions'])

# Pull players' birth dates from Transfermarky HTML

In [11]:
def get_dob(soup):
    soup = str(soup)
    pattern = r'\d{4}-\d{2}-\d{2}'
    match = re.search(pattern, soup)
    if match:
        return match.group()
    else:
        return None
    
df['dob'] = df['player_html'].apply(get_dob)

# Fix incorrect dates of birth found in Transfermarkt data

In [12]:
dob_fixes = {
    'Ernie Davies': '1916-01-31',
    'Kane Hemmings': '1991-04-08',
    'Rob Apter': '2003-04-23',
    'Ryan Edwards': '1993-10-07',
    'Scott Davies': '1987-02-23',
    "Stephen O'Leary": '1987-02-02',
    'Will Aimson': '1994-06-03',
}

for name in dob_fixes:
    df.loc[df.player_name==name, 'dob'] = dob_fixes[name]

In [13]:
pos_df = df.drop(columns=['player_url', 'player_html']).copy()

In [14]:
pos_df

Unnamed: 0,player_name,tm_pos_1,tm_pos_2,tm_pos_3,dob
0,Paddy Wharton,Goalkeeper,,,2000-05-27
1,Evan Gumbs,Centre-Back,,,1997-07-21
2,Declan Drysdale,Centre-Back,Defensive Midfield,,1999-11-14
3,Michael Ihiekwe,Centre-Back,Right-Back,,1992-11-20
4,Iain Turner,Goalkeeper,,,1984-01-26
...,...,...,...,...,...
582,Mark Duffy,Central Midfield,Attacking Midfield,Right Midfield,1985-10-07
583,John Griffiths,,,,
584,Josh McPake,Right Midfield,Right Winger,Left Winger,2001-08-31
585,Callum McManaman,Right Winger,Left Winger,Second Striker,1991-04-25


# Split names to create forename and surname columns

In [16]:
def split_name(name):
    if name == 'Pedro Miguel Matias':
        forename = 'Pedro Miguel'
        surname = 'Matias'
    else:
        parts = name.split()
        forename = parts[0]
        surname = ' '.join(parts[1:])
    return pd.Series([forename, surname])
    
pos_df[['forename', 'surname']] = pos_df['player_name'].apply(split_name)

pos_df = pos_df.sort_values(['surname', 'forename', 'dob'])

# Fix errors in Transfermarkt player positions

In [17]:
pos_fixes = {
    'Lucas Akins': ['Right Winger', 'Centre-Forward'],
    'Graham Allen': ['Centre-Back', 'Right-Back'],
    'Doug Anderson': ['Winger'],
    'John Aspinall': ['Winger'],
    'Michael Black': ['Left Winger', 'Right Winger'],
    'Paul Black': ['Left-Back', None],
    'Ivano Bonetti': ['Right Winger', 'Left Winger'],
    'Ged Brannan': ['Central Midfield', 'Left-Back'],
    'Paul Brown': ['Left Midfield'],
    'Shane Cansdell-Sherriff': ['Left-Back', 'Centre-Back'],
    'Logan Chalmers': ['Left Winger', 'Right Winger', None],
    'Paul Cook': ['Central Midfield', 'Left Midfield'],
    'Tom Curtis': ['Central Midfield'],
    'Janoi Donacien': ['Centre-Back', 'Right-Back', None],
    'Sean Flynn': ['Central Midfield'],
    'Stephen Frail': ['Right-Back', 'Defensive Midfield'],
    'Wayne Gill': ['Attacking Midfielder'],
    'Jermaine Grandison': ['Centre-Back'],
    'Jimmy Harvey': ['Central Midfield'],
    'Alex Hay': ['Centre-Forward', 'Right Winger'],
    'Dave Higgins': ['Centre-Back'],
    'Matt Hill': ['Left-Back', None],
    'Mark Hughes': ['Centre-Back', 'Sweeper'],
    'Gary Jones': ['Central Midfield', 'Striker', 'Centre-Back'],
    'Dave Martindale': ['Central Midfield', 'Defensive Midfield'],
    'Pedro Miguel Matias': ['Left Winger'],
    'Micky Mellon': ['Central Midfield'],
    "Seyni N'Diaye": ['Center-Forward'],
    'James Olsen': ['Left-Back', 'Left Midfield'],
    'Elliot Osborne': ['Central Midfield'],
    'Andy Thompson': ['Left-Back'],
    'Ryan Williams': ['Attacking Midfielder'],
    'Steve Yates': ['Centre-Back', 'Right-Back', 'Left-Back']
}

for player, positions in pos_fixes.items():
    for i, pos in enumerate(positions):
        pos_df.loc[pos_df.player_name==player, f'tm_pos_{i+1}'] = pos

pos_df = pos_df[['surname', 'forename',  'player_name', 'dob', 'tm_pos_1', 'tm_pos_2', 'tm_pos_3']]

pos_df.head(5)

Unnamed: 0,surname,forename,player_name,dob,tm_pos_1,tm_pos_2,tm_pos_3
262,Achterberg,John,John Achterberg,1971-07-08,Goalkeeper,,
486,Adkins,Nigel,Nigel Adkins,1965-03-11,Goalkeeper,,
277,Ahmed,Adnan,Adnan Ahmed,1984-06-07,Central Midfield,,
193,Aimson,Will,Will Aimson,1994-06-03,Centre-Back,,
404,Aiston,Sam,Sam Aiston,1976-11-21,Left Midfield,,


In [18]:
import re

def view_special_chars(df, col):
    pattern = re.compile(r'[^\x00-\x7F]+')
    return df[df[col].str.contains(pattern)]

view_special_chars(pos_df, 'player_name')

Unnamed: 0,surname,forename,player_name,dob,tm_pos_1,tm_pos_2,tm_pos_3
311,Carole,Sébastien,Sébastien Carole,1982-09-08,Right Winger,Attacking Midfield,Right Midfield
368,Dadi,Eugène,Eugène Dadi,1973-08-20,Centre-Forward,,
131,Fôn Williams,Owain,Owain Fôn Williams,1987-03-17,Goalkeeper,,
325,Golobart,Román,Román Golobart,1992-03-21,Centre-Back,,
223,Gulácsi,Péter,Péter Gulácsi,1990-05-06,Goalkeeper,,
436,Kané,Ousmane,Ousmane Kané,2001-07-23,Defensive Midfield,Central Midfield,
380,Sidibé,Mamady,Mamady Sidibé,1979-12-18,Centre-Forward,,
25,Sousa,Érico,Érico Sousa,1995-03-12,Left Winger,Right Winger,
88,Traoré,Drissa,Drissa Traoré,1992-03-25,Central Midfield,Defensive Midfield,


In [19]:
def replace_special_chars(name):
    name = name.replace('á', 'a')
    name = name.replace('é', 'e')
    name = name.replace('é', 'e')
    name = name.replace('è', 'e')
    name = name.replace('É', 'E')
    name = name.replace('ô', 'o')
    return name

view_special_chars(pos_df, 'player_name')

Unnamed: 0,surname,forename,player_name,dob,tm_pos_1,tm_pos_2,tm_pos_3
311,Carole,Sébastien,Sébastien Carole,1982-09-08,Right Winger,Attacking Midfield,Right Midfield
368,Dadi,Eugène,Eugène Dadi,1973-08-20,Centre-Forward,,
131,Fôn Williams,Owain,Owain Fôn Williams,1987-03-17,Goalkeeper,,
325,Golobart,Román,Román Golobart,1992-03-21,Centre-Back,,
223,Gulácsi,Péter,Péter Gulácsi,1990-05-06,Goalkeeper,,
436,Kané,Ousmane,Ousmane Kané,2001-07-23,Defensive Midfield,Central Midfield,
380,Sidibé,Mamady,Mamady Sidibé,1979-12-18,Centre-Forward,,
25,Sousa,Érico,Érico Sousa,1995-03-12,Left Winger,Right Winger,
88,Traoré,Drissa,Drissa Traoré,1992-03-25,Central Midfield,Defensive Midfield,


# Create `player_id` column

In [20]:
def create_player_id(row):
    forename = row['forename']
    forename = replace_special_chars(forename)
    surname = row['surname']
    surname = replace_special_chars(surname)
    dob = row['dob']
    player_id = f'{surname}{forename}{dob}'.replace('.0', '').replace('-', '').replace("'", '').replace(' ', '')
    return player_id

pos_df['player_id'] = pos_df.apply(create_player_id, axis=1)

pos_df

Unnamed: 0,surname,forename,player_name,dob,tm_pos_1,tm_pos_2,tm_pos_3,player_id
262,Achterberg,John,John Achterberg,1971-07-08,Goalkeeper,,,AchterbergJohn19710708
486,Adkins,Nigel,Nigel Adkins,1965-03-11,Goalkeeper,,,AdkinsNigel19650311
277,Ahmed,Adnan,Adnan Ahmed,1984-06-07,Central Midfield,,,AhmedAdnan19840607
193,Aimson,Will,Will Aimson,1994-06-03,Centre-Back,,,AimsonWill19940603
404,Aiston,Sam,Sam Aiston,1976-11-21,Left Midfield,,,AistonSam19761121
...,...,...,...,...,...,...,...,...
402,Yates,Steve,Steve Yates,1970-01-29,Centre-Back,Right-Back,Left-Back,YatesSteve19700129
509,Yeats,Ron,Ron Yeats,1937-11-15,Centre-Back,,,YeatsRon19371115
65,Young,Jack,Jack Young,2000-10-21,Central Midfield,Defensive Midfield,,YoungJack20001021
512,Young,Tommy,Tommy Young,1947-12-24,Centre-Forward,Attacking Midfield,,YoungTommy19471224


In [21]:
r_ids = pd.read_csv('~/Desktop/player_ids.csv')[['player_name', 'player_id', 'player_dob']].drop_duplicates().reset_index(drop=True)

for name in dob_fixes:
    df.loc[df.player_name==name, 'dob'] = dob_fixes[name]

r_ids

Unnamed: 0,player_name,player_id,player_dob
0,Alan A'Court,ACourtAlan19340930,1934-09-30
1,John Achterberg,AchterbergJohn19710708,1971-07-08
2,Arthur Adams,AdamsArthur,
3,Edward Adams,AdamsEdward19081112,1908-11-12
4,Nigel Adkins,AdkinsNigel19650311,1965-03-11
...,...,...,...
1020,Wilf Yates,YatesWilf1898,
1021,Ron Yeats,YeatsRon19371115,1937-11-15
1022,Jack Young,YoungJack20001021,2000-10-21
1023,Tommy Young,YoungTommy19471224,1947-12-24


In [26]:
errors = pos_df[~pos_df.player_id.isin(r_ids.player_id)][['player_name', 'player_id']]

errors

Unnamed: 0,player_name,player_id
43,Udoyen Akpan,AkpanUdoyen19990210
464,John Aldridge,AldridgeJohn19590918
11,Darren Askew,AskewDarrenNone
326,Tom Baker,BakerTom19850328
455,Gary Bennett,BennettGary19630920
...,...,...
495,Gordon West,WestGordon19430324
147,Jack Williams,WilliamsJack20030516
479,John Williams,WilliamsJohn19611003
376,Josh Williams,WilliamsJosh20040713


In [23]:
errors = pd.read_csv('~/Desktop/error_actions.csv')

errors = errors[~errors.player_id.isin(r_ids.player_id)]

errors

Unnamed: 0,player_name,player_id,action
3,Ernie Davies,DaviesErnie19160131,dob
5,Oliver James,JamesOliver19870130,check dob
6,Ethan Jones,JonesEthan20041016,check dob
7,Gwyn Jones,JonesGwyn19120221,dob
8,Jack Kearns,KearnsJack19140104,dob
10,Clayton McDonald,McDonaldClayton19881206,check dob
11,Arnaud Mendy,MendyArnaud19900210,check dob
12,Ronnie Moore,MooreRonnie19530429,check dob
13,Seyni N'Diaye,NDiayeSeyni19730906,check dob
14,James Norwood,NorwoodJames19900905,check dob


In [24]:
# Transfermarkt DOBs that need to be fixed
tm_dob_fix_names = ['Clayton McDonald', 'Ronnie Moore', 'Seyni N\'Diaye', 'James Norwood']

tm_dob_fixes = r_ids[r_ids.player_name.isin(tm_dob_fix_names)].to_dict(orient='records')

for fix in tm_dob_fixes:
    pos_df.loc[pos_df.player_name==fix['player_name'], 'dob'] = fix['player_dob']

In [25]:
remove = pos_df[~pos_df.player_name.apply(replace_special_chars).isin(r_ids.player_name.apply(replace_special_chars))]

remove

Unnamed: 0,surname,forename,player_name,dob,tm_pos_1,tm_pos_2,tm_pos_3,player_id
43,Akpan,Udoyen,Udoyen Akpan,1999-02-10,Centre-Forward,Second Striker,Right Winger,AkpanUdoyen19990210
11,Askew,Darren,Darren Askew,,,,,AskewDarrenNone
326,Baker,Tom,Tom Baker,1985-03-28,Central Midfield,,,BakerTom19850328
227,Benson,Liam,Liam Benson,1992-02-25,Left-Back,Centre-Back,Right-Back,BensonLiam19920225
160,Boland,Antonie,Antonie Boland,1994-12-30,Centre-Back,Left-Back,Defensive Midfield,BolandAntonie19941230
132,Courtney,John,John Courtney,1994-01-12,Goalkeeper,,,CourtneyJohn19940112
23,Croughan,Tom,Tom Croughan,1999-09-19,Second Striker,Attacking Midfield,Central Midfield,CroughanTom19990919
481,Currie,Tony,Tony Currie,1950-01-01,Attacking Midfield,,,CurrieTony19500101
44,Davies,Liam,Liam Davies,1996-07-02,Left Midfield,Left Winger,Right Midfield,DaviesLiam19960702
452,Davis,Neil,Neil Davis,1973-08-15,,,,DavisNeil19730815
