### Description

This notebook scrapes the `official` player name, height, weight and handedness data from each player's profile page on the ATP website.

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 100)

import requests

from bs4 import BeautifulSoup
from dateutil.parser import parse


In [2]:
def _atp_element_text(player_html, tag, css_class):
    """Return the text of the first `tag` element carrying `css_class`, or None if absent."""
    element = player_html.find(tag, attrs={"class": css_class})
    if element is None:
        return None
    return element.text


def collect_atp_official_player_data(player_profile_url):
    """Scrape name, height, weight and handedness from one ATP player profile page.

    Parameters
    ----------
    player_profile_url : str
        URL of the player's profile page to download.

    Returns
    -------
    dict
        Keys 'player_name', 'player_height_inches', 'player_height_cm',
        'player_weight_lbs', 'player_weight_kg' and 'player_handedness'.
        Every value is a string, or None when the corresponding element is
        not present in the page.

    Raises
    ------
    Any exception raised by `requests.get` is propagated unchanged.
    """
    # Fetch the page named by the argument. (Previously this read a notebook
    # global called `link`, so the parameter was silently ignored.)
    player_page = requests.get(player_profile_url).text

    # Get (entire) HTML File
    player_html = BeautifulSoup(player_page, 'lxml')

    # -- Player name: both parts are required, otherwise the name is None.
    # Plain str concatenation; mixing encoded bytes with str raises TypeError
    # on Python 3.
    first_name = _atp_element_text(player_html, "div", "first-name")
    surname = _atp_element_text(player_html, "div", "last-name")
    if first_name is None or surname is None:
        player_name = None
    else:
        player_name = first_name.strip() + ' ' + surname.strip()

    # -- Player weight in imperial units
    player_weight_lbs = _atp_element_text(player_html, "span", "table-weight-lbs")
    if player_weight_lbs is not None:
        player_weight_lbs = player_weight_lbs.strip()

    # -- Player weight in kg: the page wraps the value as "(NNkg)"
    player_weight_kg = _atp_element_text(player_html, "span", "table-weight-kg-wrapper")
    if player_weight_kg is not None:
        player_weight_kg = player_weight_kg.lower().replace('(', '').replace('kg)', '')

    # -- Player height in feet & inches (text is kept exactly as found)
    player_height_imp = _atp_element_text(player_html, "span", "table-height-ft")

    # -- Player height in cm: the page wraps the value as "(NNNcm)"
    player_height_cm = _atp_element_text(player_html, "span", "table-height-cm-wrapper")
    if player_height_cm is not None:
        player_height_cm = player_height_cm.lower().replace('(', '').replace('cm)', '')

    # -- Player handedness: text before the first comma of any "table-value"
    # div mentioning "handed". If several match, the last one wins.
    player_handedness = None
    for item in player_html.find_all("div", attrs={"class": "table-value"}):
        item_text = item.text.lower().strip()
        if 'handed' in item_text:
            player_handedness = item_text.split(',')[0]

    player_dictionary = {
        'player_name': player_name,
        'player_height_inches': player_height_imp,
        'player_height_cm': player_height_cm,
        'player_weight_lbs': player_weight_lbs,
        'player_weight_kg': player_weight_kg,
        'player_handedness': player_handedness
    }

    return player_dictionary

In [3]:
# Read the list of profile URLs, then scrape every one that points at an
# "overview" page, collecting one record (dict) per player.
atp_link_data = pd.read_csv('./data/raw_data/atp_official_site_urls.csv')

atp_data_list = []
overview_links = (url for url in atp_link_data['bio_link'] if 'overview' in url)
for link in overview_links:
    print(link)
    player_record = collect_atp_official_player_data(link)
    atp_data_list.append(player_record)

https://www.atptour.com/en/players/novak-djokovic/d643/overview
https://www.atptour.com/en/players/rafael-nadal/n409/overview
https://www.atptour.com/en/players/dominic-thiem/tb69/overview
https://www.atptour.com/en/players/roger-federer/f324/overview
https://www.atptour.com/en/players/daniil-medvedev/mm58/overview
https://www.atptour.com/en/players/stefanos-tsitsipas/te51/overview
https://www.atptour.com/en/players/alexander-zverev/z355/overview
https://www.atptour.com/en/players/matteo-berrettini/bk40/overview
https://www.atptour.com/en/players/gael-monfils/mc65/overview
https://www.atptour.com/en/players/david-goffin/gb88/overview
https://www.atptour.com/en/players/fabio-fognini/f510/overview
https://www.atptour.com/en/players/roberto-bautista-agut/bd06/overview
https://www.atptour.com/en/players/diego-schwartzman/sm37/overview
https://www.atptour.com/en/players/andrey-rublev/re44/overview
https://www.atptour.com/en/players/karen-khachanov/ke29/overview
https://www.atptour.com/en/pl

https://www.atptour.com/en/players/andy-murray/mc10/overview
https://www.atptour.com/en/players/federico-gaio/gb53/overview
https://www.atptour.com/en/players/mohamed-safwat/sk47/overview
https://www.atptour.com/en/players/prajnesh-gunneswaran/ga94/overview
https://www.atptour.com/en/players/cedrik-marcel-stebe/sk94/overview
https://www.atptour.com/en/players/facundo-bagnis/bf23/overview
https://www.atptour.com/en/players/juan-pablo-varillas/v836/overview
https://www.atptour.com/en/players/antoine-hoang/ha71/overview
https://www.atptour.com/en/players/henri-laaksonen/l949/overview
https://www.atptour.com/en/players/ilya-ivashka/i305/overview
https://www.atptour.com/en/players/guido-andreozzi/a887/overview
https://www.atptour.com/en/players/zhizhen-zhang/z371/overview
https://www.atptour.com/en/players/blaz-rola/ra51/overview
https://www.atptour.com/en/players/hyeon-chung/ch27/overview
https://www.atptour.com/en/players/yannick-hanfmann/h997/overview
https://www.atptour.com/en/players/j

https://www.atptour.com/en/players/goran-ivanisevic/i034/overview
https://www.atptour.com/en/players/carlos-moya/m605/overview
https://www.atptour.com/en/players/brad-gilbert/g016/overview
https://www.atptour.com/en/players/jan-kodes/k049/overview
https://www.atptour.com/en/players/guillermo-perez-roldan/p190/overview
https://www.atptour.com/en/players/jay-berger/b281/overview
https://www.atptour.com/en/players/todd-martin/m442/overview
https://www.atptour.com/en/players/harold-solomon/s065/overview
https://www.atptour.com/en/players/richard-pancho-gonzales/g077/overview
https://www.atptour.com/en/players/wojtek-fibak/f020/overview
https://www.atptour.com/en/players/johan-kriek/k022/overview
https://www.atptour.com/en/players/balazs-taroczy/t007/overview
https://www.atptour.com/en/players/tommy-haas/h355/overview
https://www.atptour.com/en/players/tim-mayotte/m041/overview
https://www.atptour.com/en/players/martin-jaite/j004/overview
https://www.atptour.com/en/players/bob-hewitt/h058/o

https://www.atptour.com/en/players/shlomo-glickstein/g019/overview
https://www.atptour.com/en/players/ulrich-pinner/p034/overview
https://www.atptour.com/en/players/andrew-pattison/p074/overview
https://www.atptour.com/en/players/alexandr-dolgopolov/d801/overview
https://www.atptour.com/en/players/sjeng-schalken/s572/overview
https://www.atptour.com/en/players/charlie-pasarell/p072/overview
https://www.atptour.com/en/players/gilles-muller/ma30/overview
https://www.atptour.com/en/players/darren-cahill/c001/overview
https://www.atptour.com/en/players/victor-amaya/a044/overview
https://www.atptour.com/en/players/slobodan-zivojinovic/z006/overview
https://www.atptour.com/en/players/kim-warwick/w006/overview
https://www.atptour.com/en/players/mariano-puerta/p372/overview
https://www.atptour.com/en/players/mark-dickson/d028/overview
https://www.atptour.com/en/players/nicolas-massu/m655/overview
https://www.atptour.com/en/players/david-pate/p012/overview
https://www.atptour.com/en/players/fer

https://www.atptour.com/en/players/shuzo-matsuoka/m338/overview
https://www.atptour.com/en/players/albert-montanes/m824/overview
https://www.atptour.com/en/players/eduardo-bengoechea/b035/overview
https://www.atptour.com/en/players/marty-davis/d005/overview
https://www.atptour.com/en/players/steve-darcis/d632/overview
https://www.atptour.com/en/players/peter-lundgren/l044/overview
https://www.atptour.com/en/players/michiel-schapers/s021/overview
https://www.atptour.com/en/players/gilbert-schaller/s315/overview
https://www.atptour.com/en/players/chip-hooper/h033/overview
https://www.atptour.com/en/players/henrik-holm/h191/overview
https://www.atptour.com/en/players/tomas-carbonell/c216/overview
https://www.atptour.com/en/players/vincent-spadea/s544/overview
https://www.atptour.com/en/players/jacco-eltingh/e106/overview
https://www.atptour.com/en/players/olivier-rochus/r397/overview
https://www.atptour.com/en/players/ronald-agenor/a006/overview
https://www.atptour.com/en/players/patrick-

https://www.atptour.com/en/players/colin-dowdeswell/d037/overview
https://www.atptour.com/en/players/antony-dupuis/d272/overview
https://www.atptour.com/en/players/juan-antonio-marin/m578/overview
https://www.atptour.com/en/players/nicola--spear/s146/overview
https://www.atptour.com/en/players/nduka-odizor/o006/overview
https://www.atptour.com/en/players/simon-youl/y002/overview
https://www.atptour.com/en/players/eddie-edwards/e006/overview
https://www.atptour.com/en/players/donald-young/y124/overview
https://www.atptour.com/en/players/john-yuill/y008/overview
https://www.atptour.com/en/players/lars-jonsson/j084/overview
https://www.atptour.com/en/players/jaime-pinto-bravo/p067/overview
https://www.atptour.com/en/players/diego-nargiso/n109/overview
https://www.atptour.com/en/players/nicolas-pereira/p218/overview
https://www.atptour.com/en/players/belus-prajoux/p046/overview
https://www.atptour.com/en/players/trey-waltke/w045/overview
https://www.atptour.com/en/players/john-james/j029/o

https://www.atptour.com/en/players/stefano-napolitano/n679/overview
https://www.atptour.com/en/players/akira-santillan/sq80/overview
https://www.atptour.com/en/players/bjorn-fratangelo/f811/overview
https://www.atptour.com/en/players/tristan-lamasine/lc11/overview
https://www.atptour.com/en/players/viktor-galovic/gb61/overview
https://www.atptour.com/en/players/geoffrey-blancaneaux/bu54/overview
https://www.atptour.com/en/players/darian-king/kc86/overview
https://www.atptour.com/en/players/matthias-bachinger/bc65/overview
https://www.atptour.com/en/players/guilherme-clezar/cc32/overview
https://www.atptour.com/en/players/shuichi-sekiguchi/sl92/overview
https://www.atptour.com/en/players/james-ward/w503/overview
https://www.atptour.com/en/players/goncalo-oliveira/o482/overview
https://www.atptour.com/en/players/riccardo-bonadio/bi47/overview
https://www.atptour.com/en/players/pavel-kotov/k09f/overview
https://www.atptour.com/en/players/roberto-ortega-olmedo/o340/overview
https://www.atp

https://www.atptour.com/en/players/genaro-alberto-olivieri/o660/overview
https://www.atptour.com/en/players/rio-noguchi/n09u/overview
https://www.atptour.com/en/players/ze-zhang/z272/overview
https://www.atptour.com/en/players/alexey-zakharov/z0ab/overview
https://www.atptour.com/en/players/filip-cristian-jianu/j09x/overview
https://www.atptour.com/en/players/jiri-lehecka/l0bv/overview
https://www.atptour.com/en/players/eduardo-struvay/si36/overview
https://www.atptour.com/en/players/facundo-diaz-acosta/d0cg/overview
https://www.atptour.com/en/players/yshai-oliel/o674/overview
https://www.atptour.com/en/players/michael-geerts/gg65/overview
https://www.atptour.com/en/players/jonas-forejtek/f0bt/overview
https://www.atptour.com/en/players/evgenii-tiurnev/td47/overview
https://www.atptour.com/en/players/evan-king/kb37/overview
https://www.atptour.com/en/players/harold-mayot/m0g4/overview
https://www.atptour.com/en/players/alexander-sarkissian/sl07/overview
https://www.atptour.com/en/playe

https://www.atptour.com/en/players/vladyslav-manafov/mm54/overview
https://www.atptour.com/en/players/gerardo-lopez-villasenor/lg87/overview
https://www.atptour.com/en/players/simon-carr/co11/overview
https://www.atptour.com/en/players/gastao-elias/e698/overview
https://www.atptour.com/en/players/peter-nagy/n729/overview
https://www.atptour.com/en/players/marco-bortolotti/bh08/overview
https://www.atptour.com/en/players/nick-hardt/h0a4/overview
https://www.atptour.com/en/players/thomas-fancutt/fa50/overview
https://www.atptour.com/en/players/kyrian-jacquet/j0az/overview
https://www.atptour.com/en/players/vitaliy-sachko/ss25/overview
https://www.atptour.com/en/players/alexandar-lazarov/li39/overview
https://www.atptour.com/en/players/aidan-mchugh/m0cw/overview
https://www.atptour.com/en/players/jurgen-zopp/z254/overview
https://www.atptour.com/en/players/lucas-poullain/pj74/overview
https://www.atptour.com/en/players/juan-ignacio-galarza/gd91/overview
https://www.atptour.com/en/players/

https://www.atptour.com/en/players/benjamin-dhoe/dh54/overview
https://www.atptour.com/en/players/andrey-chepelev/co01/overview
https://www.atptour.com/en/players/alexander-brown/bv13/overview
https://www.atptour.com/en/players/thiemo-de-bakker/d776/overview
https://www.atptour.com/en/players/rinky-hijikata/h0bh/overview
https://www.atptour.com/en/players/gauthier-onclin/o0a2/overview
https://www.atptour.com/en/players/luca-giacomini/gk24/overview
https://www.atptour.com/en/players/kelsey-stevenson/sm18/overview
https://www.atptour.com/en/players/ronan-joncour/j709/overview
https://www.atptour.com/en/players/jakub-paul/p0d0/overview
https://www.atptour.com/en/players/michal-konecny/k906/overview
https://www.atptour.com/en/players/karl-friberg/f09j/overview
https://www.atptour.com/en/players/patrik-rikl/rh13/overview
https://www.atptour.com/en/players/artem-smirnov/sh77/overview
https://www.atptour.com/en/players/gustav-hansson/hb55/overview
https://www.atptour.com/en/players/valentin-r

https://www.atptour.com/en/players/dragos-dima/da13/overview
https://www.atptour.com/en/players/benjamin-pietri/p09s/overview
https://www.atptour.com/en/players/matteo-donati/da67/overview
https://www.atptour.com/en/players/javier-marti/mi08/overview
https://www.atptour.com/en/players/franco-agamenone/aa27/overview
https://www.atptour.com/en/players/justin-roberts/rh17/overview
https://www.atptour.com/en/players/billy-harris/hd68/overview
https://www.atptour.com/en/players/nicolae-frunza/f952/overview
https://www.atptour.com/en/players/gergely-madarasz/mo76/overview
https://www.atptour.com/en/players/hiroyasu-ehara/e757/overview
https://www.atptour.com/en/players/mwendwa-mbithi/mw69/overview
https://www.atptour.com/en/players/mariano-navone/n0bs/overview
https://www.atptour.com/en/players/damien-wenger/w09b/overview
https://www.atptour.com/en/players/takashi-saito/sr78/overview
https://www.atptour.com/en/players/soichiro-moritani/mo12/overview
https://www.atptour.com/en/players/federic

https://www.atptour.com/en/players/luca-nardi/n0bg/overview
https://www.atptour.com/en/players/nathan-seateun/s09y/overview
https://www.atptour.com/en/players/julien-cagnina/cg32/overview
https://www.atptour.com/en/players/sumit-sarkar/s0j5/overview
https://www.atptour.com/en/players/jung-woong-na/n653/overview
https://www.atptour.com/en/players/arthur-cazaux/c0h0/overview
https://www.atptour.com/en/players/christian-samuelsson/sr97/overview
https://www.atptour.com/en/players/gianni-mina/mj89/overview
https://www.atptour.com/en/players/milan-welte/w0an/overview
https://www.atptour.com/en/players/jacob-dunbar/dg86/overview
https://www.atptour.com/en/players/alan-kohen/kd99/overview
https://www.atptour.com/en/players/marco-brugnerotto/bo07/overview
https://www.atptour.com/en/players/dostanbek-tashbulatov/t09l/overview
https://www.atptour.com/en/players/ajeet-rai/r0dc/overview
https://www.atptour.com/en/players/koray-kirci/ki91/overview
https://www.atptour.com/en/players/valentin-guenther

https://www.atptour.com/en/players/yanais-laurent/lc18/overview
https://www.atptour.com/en/players/aron-pierce/pl48/overview
https://www.atptour.com/en/players/tom-pisane/p0d1/overview
https://www.atptour.com/en/players/luis-pati%C3%B1o/pe42/overview
https://www.atptour.com/en/players/sagadat-ayap/ab19/overview
https://www.atptour.com/en/players/michael-zhu/z350/overview
https://www.atptour.com/en/players/isaac-stoute/so99/overview
https://www.atptour.com/en/players/maksim-tikhomirov/ta66/overview
https://www.atptour.com/en/players/christian-harrison/hb05/overview
https://www.atptour.com/en/players/ryohei-tagata/t0cl/overview
https://www.atptour.com/en/players/timo-stodder/s0ar/overview
https://www.atptour.com/en/players/dejan-katic/k767/overview
https://www.atptour.com/en/players/alon-elia/bk54/overview
https://www.atptour.com/en/players/lucas-renard/rb49/overview
https://www.atptour.com/en/players/florent-diep/da59/overview
https://www.atptour.com/en/players/alexandre-peyrot/p0bx/ove

In [None]:
# -- Profiles that are scraped in addition to the ones collected so far;
# -- their records are appended to the same list.
missing_links = [
    'https://www.atptour.com/en/players/sam-groth/g940/overview',
    'https://www.atptour.com/en/players/pere-riba/r750/overview',
    'https://www.atptour.com/en/players/jan-hajek/h571/overview',
    'https://www.atptour.com/en/players/flavio-cipolla/c723/overview',
    'https://www.atptour.com/en/players/michal-przysiezny/p701/overview',
    'https://www.atptour.com/en/players/ryan-sweeting/sg87/overview',
    'https://www.atptour.com/en/players/grega-zemlja/z189/overview',
    'https://www.atptour.com/en/players/maximo-gonzalez/g585/overview',
    'https://www.atptour.com/en/players/eduardo-schwank/se66/overview',
]

for link in missing_links:
    print(link)
    missing_player_record = collect_atp_official_player_data(link)
    atp_data_list.append(missing_player_record)

In [11]:
# Assemble the scraped per-player records (one dict each) into a single table
atp_height_weight_data = pd.DataFrame(data=atp_data_list)
atp_height_weight_data

Unnamed: 0,player_handedness,player_height_cm,player_height_inches,player_name,player_weight_kg,player_weight_lbs
0,right-handed,188,"6'2""",Novak Djokovic,77,170
1,left-handed,185,"6'1""",Rafael Nadal,85,187
2,right-handed,185,"6'1""",Dominic Thiem,79,174
3,right-handed,185,"6'1""",Roger Federer,85,187
4,right-handed,198,"6'6""",Daniil Medvedev,83,182
5,right-handed,193,"6'4""",Stefanos Tsitsipas,89,196
6,right-handed,198,"6'6""",Alexander Zverev,90,198
7,right-handed,196,"6'5""",Matteo Berrettini,95,209
8,right-handed,193,"6'4""",Gael Monfils,85,187
9,right-handed,180,"5'11""",David Goffin,70,154


In [12]:
# -- Swap every hyphen in a name for a space (believed, but not verified, to be
# -- closer to how Tennis Abstract codes names)
atp_height_weight_data['player_name'] = atp_height_weight_data['player_name'].str.replace('-', ' ')

# -- Hand-maintained overrides, applied in order: name as it appears after the
# -- hyphen removal above -> name to store instead
manual_name_overrides = {
    'Jo Wilfried Tsonga': 'Jo-Wilfried Tsonga',
    'Albert Ramos Vinolas': 'Albert Ramos',
    'Thiemo de Bakker': 'Thiemo De Bakker',
    'Alex Bogomolov Jr.': 'Alex Bogomolov Jr',
    'Alex de Minaur': 'Alex De Minaur',
    'Andreas Maurer': 'Andreas Haider Maurer',
    'Victor Estrella Burgos': 'Victor Estrella',
    'Cristian Garin': 'Christian Garin',
    'Fred Gil': 'Frederico Gil',
}
for scraped_name, corrected_name in manual_name_overrides.items():
    name_matches = atp_height_weight_data['player_name'] == scraped_name
    atp_height_weight_data.loc[name_matches, 'player_name'] = corrected_name


In [13]:
# Write the cleaned table to disk without the DataFrame index column
processed_csv_path = './data/processed_data/official_atp_height_2020.csv'
atp_height_weight_data.to_csv(processed_csv_path, index=False)

### Appendix: Prototype Code

In [None]:
# Try the scraper on a single player profile and display the resulting record
link = 'https://www.atptour.com/en/players/joao-domingues/d985/overview'
collect_atp_official_player_data(player_profile_url=link)

In [None]:
link = 'https://www.atptour.com/en/players/benoit-paire/pd31/overview'

# Download the profile page and keep its raw markup
player_response = requests.get(link)
player_page = player_response.text

# Parse the whole document; the cells below query this object
player_html = BeautifulSoup(player_page, 'lxml')
player_html

In [None]:
# -- Get Player Name
# Locate both name elements first, then tidy their text
player_first_name_html = player_html.find("div", attrs={"class": "first-name"})
player_surname_html = player_html.find("div", attrs={"class": "last-name"})

player_first_name = player_first_name_html.text.strip()
player_surname = player_surname_html.text.strip()

# Full name is "<first> <last>"
player_name = ' '.join([player_first_name, player_surname])
player_name

In [None]:
# -- Get player weight in imperial units
# Show the raw text of the first span tagged with the lbs class
weight_lbs_attrs = {"class": "table-weight-lbs"}
player_weight_lbs = player_html.find("span", attrs=weight_lbs_attrs)
player_weight_lbs.text

In [None]:
# -- Get player weight in kg
# Lower-case the span text, then strip the "(" prefix and "kg)" suffix
weight_kg_attrs = {"class": "table-weight-kg-wrapper"}
player_weight_kg = player_html.find("span", attrs=weight_kg_attrs)
weight_kg_text = player_weight_kg.text.lower()
weight_kg_text.replace('(', '').replace('kg)', '')

In [None]:
# -- Get player height in feet & inches
# Show the raw text of the first span tagged with the feet/inches class
height_ft_attrs = {"class": "table-height-ft"}
player_height_imp = player_html.find("span", attrs=height_ft_attrs)
player_height_imp.text

In [None]:
# -- Get player height in cm
# Lower-case the span text, then strip the "(" prefix and "cm)" suffix
height_cm_attrs = {"class": "table-height-cm-wrapper"}
player_height_cm = player_html.find("span", attrs=height_cm_attrs)
height_cm_text = player_height_cm.text.lower()
height_cm_text.replace('(', '').replace('cm)', '')

In [None]:
# -- Get player handedness
# Reset first: without this, a page with no "handed" entry raises NameError on
# a fresh kernel, or silently shows the value left over from an earlier run.
player_handedness = None
for item in player_html.find_all("div", attrs={"class": "table-value"}):
    item_text = item.text.lower().strip()
    if 'handed' in item_text:
        # Keep only the part before the first comma; the last match wins
        player_handedness = item_text.split(',')[0]
player_handedness