In [1]:
import concurrent.futures
import re
import unicodedata
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        ``requests`` applies no timeout unless one is passed, so without it
        a stalled connection would block the calling thread indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Raising here prevents
        an error page from being parsed as if it were real content.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, 'html.parser')

In [2]:
def get_seasons(url):
    """List the seasons offered by the season selector on a squad page.

    The page at ``url`` is fetched with ``get_soup`` and every ``<option>``
    inside the ``<select name="saison_id">`` element is read.

    Returns
    -------
    list of dict
        One ``{'season': <option text>, 'season_id': <option value>}``
        entry per option, in page order, with surrounding whitespace
        stripped from both fields.
    """
    selector = get_soup(url).find('select', {'name': 'saison_id'})
    return [
        {
            'season': option.text.strip(),
            'season_id': option['value'].strip(),
        }
        for option in selector.find_all('option')
    ]

# Squad ("kader") page used as the entry point; the URL names the club
# (tranmere-rovers, verein/1074) and the 2024 season.
club_url = 'https://www.transfermarkt.us/tranmere-rovers/kader/verein/1074/saison_id/2024/'

# Read every season listed in that page's season selector.
seasons = get_seasons(club_url)

# Preview the first three entries.
seasons[:3]

[{'season': '24/25', 'season_id': '2024'},
 {'season': '23/24', 'season_id': '2023'},
 {'season': '22/23', 'season_id': '2022'}]

In [3]:
def get_ssn_url(season_id, club_slug='tranmere-rovers', club_id=1074):
    """Build the Transfermarkt squad-page URL for one season of a club.

    Parameters
    ----------
    season_id : str or int
        Value placed after ``saison_id/`` in the URL.
    club_slug : str, optional
        Club name segment of the URL. Defaults to ``'tranmere-rovers'``.
    club_id : int or str, optional
        Value placed after ``verein/`` in the URL. Defaults to ``1074``.

    Returns
    -------
    str
        The squad-page URL. With the default club arguments the result is
        identical to the previous hard-coded version.
    """
    return f'https://www.transfermarkt.us/{club_slug}/kader/verein/{club_id}/saison_id/{season_id}'

In [4]:
def get_players(soup):
    """Return every element in ``soup`` that carries the ``posrela`` class."""
    player_cells = soup.find_all(class_='posrela')
    return player_cells

def get_player_link(player):
    """Extract a player's display name and profile link from one squad entry.

    The first ``<a>`` inside the element with class ``hauptlink`` is used:
    its stripped text becomes ``player_name`` and its ``href`` attribute
    becomes ``player_url`` (returned exactly as found in the page).
    """
    anchor = player.find(class_='hauptlink').find('a')
    return dict(
        player_name=anchor.text.strip(),
        player_url=anchor['href'],
    )

# Get player names and URLs for every season

In [5]:
def process_season(season):
    """Scrape one season's squad page and return its player links.

    ``season`` is a dict with a ``'season_id'`` key. The matching squad URL
    is built with ``get_ssn_url`` and downloaded with ``get_soup``; each
    entry found by ``get_players`` is then converted with
    ``get_player_link``.

    Returns a list of ``{'player_name': ..., 'player_url': ...}`` dicts.
    """
    squad_soup = get_soup(get_ssn_url(season['season_id']))
    return [get_player_link(entry) for entry in get_players(squad_soup)]

# Scrape all seasons concurrently and pool the player links into one list.
# Results arrive in completion order, so the order of plr_list can differ
# from run to run.
plr_list = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_season, s) for s in seasons]

    # as_completed() yields from a generator with no length, so tqdm is told
    # the total explicitly; otherwise the bar cannot show progress.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        plr_list.extend(future.result())

0it [00:00, ?it/s]

In [6]:
# plr_list was pooled across all seasons; round-trip through a DataFrame to
# drop records whose name and URL are both identical, then convert back to
# a list of dicts.
plr_list = pd.DataFrame(plr_list).drop_duplicates().to_dict(orient='records')

# Preview the first three records.
plr_list[:3]

[{'player_name': 'Luke McGee',
  'player_url': '/luke-mcgee/profil/spieler/183301'},
 {'player_name': 'Joe Murphy',
  'player_url': '/joe-murphy/profil/spieler/3655'},
 {'player_name': 'Reuben Egan',
  'player_url': '/reuben-egan/profil/spieler/1138111'}]

# Get HTML content of player profile

In [7]:
def process_player(plr):
    """Download one player's profile page.

    ``plr`` is a dict with ``'player_url'`` (a site-relative path) and
    ``'player_name'``. The path is appended to the site root and fetched
    with ``get_soup``.

    Returns a dict holding the name, the absolute profile URL and the
    parsed page under ``'player_html'``.
    """
    profile_url = f"https://www.transfermarkt.us{plr['player_url']}"
    return {
        'player_name': plr['player_name'],
        'player_url': profile_url,
        'player_html': get_soup(profile_url),
    }

# Download every player profile concurrently. Results are appended in
# completion order, not in plr_list order.
plrs_html = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_player, plr) for plr in plr_list]

    # as_completed() yields from a generator with no length, so tqdm is told
    # the total explicitly; otherwise the bar cannot show progress.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        plrs_html.append(future.result())

0it [00:00, ?it/s]

# Pull player positions from player HTML

In [8]:
# One row per downloaded profile, with the columns produced by
# process_player: player_name, player_url and player_html (the parsed page).
df = pd.DataFrame(plrs_html)

def get_position(soup):
    """Return the position labels listed on a parsed player profile.

    Collects the stripped text of every ``<dd>`` element with class
    ``detail-position__position``, in page order. The list is empty when
    the page has no such element.
    """
    return [
        tag.text.strip()
        for tag in soup.find_all('dd', class_='detail-position__position')
    ]

# Extract the list of position labels from each parsed profile page.
df['positions'] = df['player_html'].apply(get_position)

# Preview the first three lists.
df['positions'][:3]

0                                     [Goalkeeper]
1                       [Left-Back, Left Midfield]
2    [Right-Back, Defensive Midfield, Centre-Back]
Name: positions, dtype: object

# Fix names pulled from Transfermarkt to match names in main dataset

In [9]:
# Mapping from the name as scraped to the spelling to use instead.
name_fixes = {
    'Ken Beamish': 'Kenny Beamish',
    'Dave Burgess': 'David Burgess',
    'Jim Cassidy': 'James Cassidy',
    'Tom Coughan': 'Tom Croughan',
    'James Cumbes': 'Jim Cumbes',
    'Ron Dellow': 'Ronnie Dellow',
    'Jay Devine': 'James Devine',
    'Stanley Docking': 'Stan Docking',
    'Jack Flemming': 'Jack Fleming',
    'Herbert Hamilton': 'Duke Hamilton',
    'Jim Harvey': 'Jimmy Harvey',
    'Steve Jennings': 'Steven Jennings',
    'Ousmane Kane': 'Ousmane Kané',
    'Matty Kennedy': 'Matthew Kennedy',
    'John King': 'Johnny King',
    'Nathaniel Knight-Percival': 'Nat Knight-Percival',
    'Shay Logan': 'Shaleum Logan',
    'Jonathon Margetts': 'Johnny Margetts',
    'Hugh McAuley': 'Hughie McAuley',
    'Jay McEveley': 'James McEveley',
    'Manny Monthe': 'Emmanuel Monthe',
    'John Morrissey': 'Johnny Morrissey',
    'Andrew Ralph': 'Andy Ralph',
    'Joe Starbuck': 'Joseph Starbuck',
    'James Steel': 'Jim Steel',
    "Steven O'Leary": "Stephen O'Leary",
    'Sam Taylor': 'Samuel Taylor',
    'Danny Woodards': 'Dan Woodards',
}

# Swap each listed name for its replacement; names that are not keys of the
# mapping pass through unchanged.
df['player_name'] = df['player_name'].map(lambda scraped: name_fixes.get(scraped, scraped))

# Add player positions to dataframe

In [10]:
# The longest 'positions' list decides how many tm_pos_N columns are needed.
max_len = df['positions'].map(len).max()

# Spread each list over 'tm_pos_1', 'tm_pos_2', ...; players with fewer
# positions get None in the remaining columns.
for i in range(max_len):
    df[f'tm_pos_{i+1}'] = df['positions'].map(
        lambda pos_list, idx=i: pos_list[idx] if idx < len(pos_list) else None
    )

# The list column is redundant once it has been expanded.
df = df.drop(columns=['positions'])

# Pull players' birth dates from Transfermarkt HTML

In [11]:
def get_dob(soup):
    """Return the first ``YYYY-MM-DD`` string found in ``soup``.

    ``soup`` is converted to text with ``str`` and searched for four digits,
    a hyphen, two digits, a hyphen and two digits. The first match is
    returned as-is; ``None`` is returned when nothing matches. The function
    does not check that the match is actually a birth date.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2}', str(soup))
    return found.group() if found else None
    
# get_dob returns the first YYYY-MM-DD string in each page (or None); that
# value is stored as the player's date of birth.
df['player_dob'] = df['player_html'].apply(get_dob)

# Fix incorrect dates of birth found in Transfermarkt data

In [12]:
# Manual date-of-birth overrides keyed by (already corrected) player name.
# Trailing comments record where a corrected date was taken from.
dob_fixes = {
    'Alex Woodyard': '1993-05-03',
    'Callum Lucy': '1998-11-09',
    'Christian Edwards': '1975-11-23',
    'Eddie Bishop': '1962-11-28',
    'Edgar Walkden': '1914-11-04', # Complete Record
    'Elliot Osborne': '1996-05-12', # Unclear, but most common
    'Ernie Davies': '1916-01-31',
    'Ethan Gouldbourne': '2000-10-10',
    'Gilbert Wassell': '1910-04-09',
    'Gordon West': '1943-04-24',
    'John Aldridge': '1958-09-18',
    'John Griffiths': '1916-06-30', # Complete Record
    'John Williams': '1960-10-03',
    'Kane Hemmings': '1991-04-08',
    'Neil Gibson': '1979-10-10', # Complete Record
    'Oliver James': '1987-01-13', # Official site Wayback Machine
    'Rob Apter': '2003-04-23',
    'Robbie Burns': '1990-11-15', # LCFC Wayback Machine
    'Ronnie Moore': '1953-01-29',
    'Ryan Edwards': '1993-10-07',
    'Ryan Shotton': '1988-10-30',
    'Samuel Taylor': '2003-12-23',
    'Scott Davies': '1987-02-23',
    "Seyni N'Diaye": '1973-06-01',
    "Stephen O'Leary": '1987-02-02',
    'Ted Buckley': '1912-09-13',
    'Will Aimson': '1994-06-03',
}

# Overwrite the scraped date for every row whose name has an override. The
# match is on name only, so all rows sharing a listed name get the same date.
needs_fix = df['player_name'].isin(list(dob_fixes))
df.loc[needs_fix, 'player_dob'] = df.loc[needs_fix, 'player_name'].map(dob_fixes)

In [13]:
# Work on an independent copy that keeps only the tabular fields; the URL
# and the parsed HTML are left behind in df.
pos_df = df.drop(columns=['player_url', 'player_html']).copy()

In [14]:
# Display the frame for a visual check.
pos_df

Unnamed: 0,player_name,tm_pos_1,tm_pos_2,tm_pos_3,player_dob
0,Joe Murphy,Goalkeeper,,,1981-08-21
1,Connor Wood,Left-Back,Left Midfield,,1996-07-17
2,Lee O'Connor,Right-Back,Defensive Midfield,Centre-Back,2000-07-28
3,Brad Walker,Defensive Midfield,Central Midfield,Centre-Back,1996-04-25
4,Reuben Egan,Goalkeeper,,,2005-07-27
...,...,...,...,...,...
585,Andy Ralph,Goalkeeper,,,1983-05-28
586,Tolani Omotola,Centre-Forward,,,1998-04-16
587,Danny Johnson,Centre-Forward,,,1993-02-28
588,Guy Madjo,Centre-Forward,,,1984-06-01


# Split names to create forename and surname columns

In [15]:
def split_name(name):
    """Split a full name into forename and surname.

    The first whitespace-separated word is the forename and the remaining
    words, joined by single spaces, form the surname (an empty string for a
    one-word name). 'Pedro Miguel Matias' is special-cased so that
    'Pedro Miguel' is kept together as the forename.

    Returns
    -------
    pandas.Series
        Two elements: ``[forename, surname]``.
    """
    if name == 'Pedro Miguel Matias':
        return pd.Series(['Pedro Miguel', 'Matias'])

    tokens = name.split()
    return pd.Series([tokens[0], ' '.join(tokens[1:])])
    
# split_name returns a two-element Series per name; the two resulting
# columns are stored as forename and surname.
pos_df[['forename', 'surname']] = pos_df['player_name'].apply(split_name)

# Order by surname, then forename, with date of birth as the last sort key.
pos_df = pos_df.sort_values(['surname', 'forename', 'player_dob'])

# Fix errors in Transfermarkt player positions

In [16]:
# Manual position overrides keyed by player name. Entry i of each list is
# written to column tm_pos_{i+1}. Columns beyond the end of a list keep
# their scraped value, so an explicit None is used to blank a slot.
# NOTE(review): 'Attacking Midfielder' differs from the scraped label
# 'Attacking Midfield' seen elsewhere in this notebook; confirm it is
# intended.
pos_fixes = {
    'Lucas Akins': ['Right Winger', 'Centre-Forward'],
    'Graham Allen': ['Centre-Back', 'Right-Back'],
    'Doug Anderson': ['Winger'],
    'John Aspinall': ['Winger'],
    'Michael Black': ['Left Winger', 'Right Winger'],
    'Paul Black': ['Left-Back', None],
    'Ivano Bonetti': ['Right Winger', 'Left Winger'],
    'Ged Brannan': ['Central Midfield', 'Left-Back'],
    'Paul Brown': ['Left Midfield'],
    'Shane Cansdell-Sherriff': ['Left-Back', 'Centre-Back'],
    'Logan Chalmers': ['Left Winger', 'Right Winger', None],
    'Paul Cook': ['Central Midfield', 'Left Midfield'],
    'Tom Curtis': ['Central Midfield'],
    'Janoi Donacien': ['Centre-Back', 'Right-Back', None],
    'Sean Flynn': ['Central Midfield'],
    'Stephen Frail': ['Right-Back', 'Defensive Midfield'],
    'Wayne Gill': ['Attacking Midfielder'],
    'Jermaine Grandison': ['Centre-Back'],
    'Jimmy Harvey': ['Central Midfield'],
    'Alex Hay': ['Centre-Forward', 'Right Winger'],
    'Dave Higgins': ['Centre-Back'],
    'Matt Hill': ['Left-Back', None],
    'Mark Hughes': ['Centre-Back', 'Sweeper'],
    'Gary Jones': ['Central Midfield', 'Striker', 'Centre-Back'],
    'Dave Martindale': ['Central Midfield', 'Defensive Midfield'],
    'Pedro Miguel Matias': ['Left Winger'],
    'Micky Mellon': ['Central Midfield'],
    # Spelled 'Centre-Forward' to match the label used for every other
    # player; the previous 'Center-Forward' created a separate category.
    "Seyni N'Diaye": ['Centre-Forward'],
    'James Olsen': ['Left-Back', 'Left Midfield'],
    'Elliot Osborne': ['Central Midfield'],
    'Andy Thompson': ['Left-Back'],
    'Ryan Williams': ['Attacking Midfielder'],
    'Steve Yates': ['Centre-Back', 'Right-Back', 'Left-Back']
}

# Apply the overrides. Matching is by name only, so every row with a listed
# name is updated.
for player, positions in pos_fixes.items():
    for i, pos in enumerate(positions):
        pos_df.loc[pos_df.player_name==player, f'tm_pos_{i+1}'] = pos

# Put the name and date-of-birth columns ahead of the position columns.
pos_df = pos_df[['surname', 'forename',  'player_name', 'player_dob', 'tm_pos_1', 'tm_pos_2', 'tm_pos_3']]

pos_df.head(5)

Unnamed: 0,surname,forename,player_name,player_dob,tm_pos_1,tm_pos_2,tm_pos_3
277,Achterberg,John,John Achterberg,1971-07-08,Goalkeeper,,
462,Adkins,Nigel,Nigel Adkins,1965-03-11,Goalkeeper,,
283,Ahmed,Adnan,Adnan Ahmed,1984-06-07,Central Midfield,,
567,Aimson,Will,Will Aimson,1994-06-03,Centre-Back,,
335,Aiston,Sam,Sam Aiston,1976-11-21,Left Winger,,


In [17]:
import re

def view_special_chars(df, col):
    """Return the rows of ``df`` whose ``col`` value contains a non-ASCII character.

    A character counts as non-ASCII when it lies outside the range
    ``\\x00``-``\\x7F``. ``col`` must be a string column.
    """
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    has_non_ascii = df[col].str.contains(non_ascii)
    return df[has_non_ascii]

# List the players whose names contain non-ASCII characters.
view_special_chars(pos_df, 'player_name')

Unnamed: 0,surname,forename,player_name,player_dob,tm_pos_1,tm_pos_2,tm_pos_3
306,Carole,Sébastien,Sébastien Carole,1982-09-08,Right Winger,Attacking Midfield,Right Midfield
355,Dadi,Eugène,Eugène Dadi,1973-08-20,Centre-Forward,,
142,Fôn Williams,Owain,Owain Fôn Williams,1987-03-17,Goalkeeper,,
211,Golobart,Román,Román Golobart,1992-03-21,Centre-Back,,
243,Gulácsi,Péter,Péter Gulácsi,1990-05-06,Goalkeeper,,
79,Kané,Ousmane,Ousmane Kané,2001-07-23,Defensive Midfield,Central Midfield,
226,Sidibé,Mamady,Mamady Sidibé,1979-12-18,Centre-Forward,,
183,Sousa,Érico,Érico Sousa,1995-03-12,Left Winger,Right Winger,
360,Traoré,Drissa,Drissa Traoré,1992-03-25,Central Midfield,Defensive Midfield,


In [18]:
def replace_special_chars(name):
    """Return ``name`` with accents and other diacritical marks removed.

    The text is decomposed (Unicode NFD) so that each accented letter
    becomes a base letter followed by combining marks; the combining marks
    are then dropped. This covers every character the earlier hand-written
    list handled (á, é, è, É, ô), works whether the input arrives
    precomposed or already decomposed, and extends to other accented
    letters such as ü or ñ.

    Letters that have no decomposition (for example ø or ß) are returned
    unchanged.

    Parameters
    ----------
    name : str
        Text to clean.

    Returns
    -------
    str
        ``name`` without combining marks.
    """
    decomposed = unicodedata.normalize('NFD', name)
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose whatever is left so untouched characters keep their usual form.
    return unicodedata.normalize('NFC', stripped)

# Create `player_id` column

In [19]:
def create_player_id(row):
    """Build a compact identifier from a row's surname, forename and date of birth.

    Both name parts are passed through ``replace_special_chars`` and then
    concatenated as surname + forename + ``player_dob``. From the combined
    string the substrings '.0', '-', "'", ' ' and 'None' are removed, in
    that order, so a missing date (``None``) contributes nothing.

    Example: surname 'Achterberg', forename 'John', dob '1971-07-08'
    gives 'AchterbergJohn19710708'.
    """
    forename = replace_special_chars(row['forename'])
    surname = replace_special_chars(row['surname'])
    player_id = f"{surname}{forename}{row['player_dob']}"
    for unwanted in ('.0', '-', "'", ' ', 'None'):
        player_id = player_id.replace(unwanted, '')
    return player_id

# Build the identifier row by row from surname, forename and date of birth.
pos_df['player_id'] = pos_df.apply(create_player_id, axis=1)

# Display the result for a visual check.
pos_df

Unnamed: 0,surname,forename,player_name,player_dob,tm_pos_1,tm_pos_2,tm_pos_3,player_id
277,Achterberg,John,John Achterberg,1971-07-08,Goalkeeper,,,AchterbergJohn19710708
462,Adkins,Nigel,Nigel Adkins,1965-03-11,Goalkeeper,,,AdkinsNigel19650311
283,Ahmed,Adnan,Adnan Ahmed,1984-06-07,Central Midfield,,,AhmedAdnan19840607
567,Aimson,Will,Will Aimson,1994-06-03,Centre-Back,,,AimsonWill19940603
335,Aiston,Sam,Sam Aiston,1976-11-21,Left Winger,,,AistonSam19761121
...,...,...,...,...,...,...,...,...
395,Yates,Steve,Steve Yates,1970-01-29,Centre-Back,Right-Back,Left-Back,YatesSteve19700129
496,Yeats,Ron,Ron Yeats,1937-11-15,Centre-Back,,,YeatsRon19371115
191,Young,Jack,Jack Young,2000-10-21,Central Midfield,Defensive Midfield,,YoungJack20001021
501,Young,Tommy,Tommy Young,1947-12-24,Centre-Forward,Attacking Midfield,,YoungTommy19471224


In [20]:
# Put the identifier first and sort by it so the exported rows have a
# stable order.
pos_df = pos_df[['player_id', 'surname', 'forename', 'player_name', 'player_dob', 'tm_pos_1', 'tm_pos_2', 'tm_pos_3']].sort_values(['player_id'])

# to_csv does not create missing directories, so make sure ./output exists
# before writing.
output_path = Path('./output/transfermarkt.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
pos_df.to_csv(output_path, index=False)

In [21]:
# r_ids = pd.read_csv('~/Desktop/player_ids.csv')[['player_id', 'player_name', 'player_dob']].drop_duplicates().reset_index(drop=True)

# r_ids

In [22]:
# Find players in Transfermarkt data but not R dataset
# This most likely means players that did not make a first team appearance
# errors = pos_df[~pos_df.player_id.isin(r_ids.player_id)][['player_id', 'player_name', 'player_dob']]

# errors.sort_values(['player_dob'])

In [23]:
# errors.merge(r_ids[['player_name', 'player_dob']], on='player_name', how='left') \
#     .rename(columns={'player_dob_x': 'tm_dob', 'player_dob_y': 'r_dob'}) \
#     .query("~r_dob.isnull()")