### Description

This notebook scrapes each player's official height, weight, and handedness from their ATP Tour profile page and saves the results to a CSV file.

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 100)

import requests

from bs4 import BeautifulSoup
from dateutil.parser import parse


In [32]:
def _find_text(player_html, tag_name, css_class):
    """Return the stripped text of the first `tag_name` element with class `css_class`, or None if there is none."""
    tag = player_html.find(tag_name, attrs={"class": css_class})
    if tag is None:
        return None
    return tag.text.strip()


def _strip_unit(text, unit):
    """Turn a wrapped measurement such as '(80kg)' into '80'; None passes through unchanged."""
    if text is None:
        return None
    return text.lower().replace('(', '').replace(unit + ')', '').strip()


def collect_atp_official_player_data(player_profile_url):
    """Scrape name, height, weight and handedness from one ATP player profile page.

    Parameters
    ----------
    player_profile_url : str
        URL of the player's profile page; this is the page that gets downloaded.

    Returns
    -------
    dict
        Keys: 'player_name', 'player_height_inches', 'player_height_cm',
        'player_weight_lbs', 'player_weight_kg', 'player_handedness'.
        Every value is a string taken from the page with surrounding whitespace
        removed. Apart from 'player_name', a value is None when the page has no
        matching element.

    Raises
    ------
    AttributeError
        If the page has no "first-name" or "last-name" div.
    """
    # Connect to player page.
    # Use the argument, not a notebook-level variable, so the function does not
    # depend on what the calling cell happened to name its URL.
    player_page = requests.get(player_profile_url).text

    # Get (entire) HTML File
    player_html = BeautifulSoup(player_page, 'lxml')

    # Player Name -- a missing name element is deliberately left to raise, since a
    # page without a name is unlikely to be a usable profile.
    player_first_name = player_html.find("div", attrs={"class": "first-name"}).text.strip()
    player_surname = player_html.find("div", attrs={"class": "last-name"}).text.strip()
    player_name = player_first_name + ' ' + player_surname

    # -- Get player weight in imperial units
    player_weight_lbs = _find_text(player_html, "span", "table-weight-lbs")

    # -- Get player weight in kg (page text is wrapped like "(80kg)")
    player_weight_kg = _strip_unit(
        _find_text(player_html, "span", "table-weight-kg-wrapper"), 'kg'
    )

    # -- Get player height in feet & inches
    player_height_imp = _find_text(player_html, "span", "table-height-ft")

    # -- Get player height in cm (page text is wrapped like "(196cm)")
    player_height_cm = _strip_unit(
        _find_text(player_html, "span", "table-height-cm-wrapper"), 'cm'
    )

    # -- Get player handedness
    # Default to None so a profile without a "handed" entry yields a missing value
    # instead of an UnboundLocalError. If several entries match, the last one wins.
    player_handedness = None
    for item in player_html.find_all("div", attrs={"class": "table-value"}):
        item_text = item.text.lower().strip()
        if 'handed' in item_text:
            player_handedness = item_text.split(',')[0]

    player_dictionary = {
        'player_name': player_name,
        'player_height_inches': player_height_imp,
        'player_height_cm': player_height_cm,
        'player_weight_lbs': player_weight_lbs,
        'player_weight_kg': player_weight_kg,
        'player_handedness': player_handedness
    }

    return player_dictionary

In [34]:
# Loop through to collect all data
atp_link_data = pd.read_csv('./data/raw_data/atp_official_site_urls.csv')

# Scrape every player whose stored link points at an "overview" profile page.
# Rows with a missing bio_link are dropped first: pandas loads them as float NaN,
# and `'overview' in NaN` raises TypeError.
atp_data_list = []
for link in atp_link_data['bio_link'].dropna():
    if 'overview' in link:
        print(link)
        atp_data_list.append(collect_atp_official_player_data(link))

# Build the table once from the list of per-player dictionaries.
atp_height_weight_data = pd.DataFrame(atp_data_list)

https://www.atptour.com/en/players/novak-djokovic/d643/overview
https://www.atptour.com/en/players/rafael-nadal/n409/overview
https://www.atptour.com/en/players/dominic-thiem/tb69/overview
https://www.atptour.com/en/players/roger-federer/f324/overview
https://www.atptour.com/en/players/daniil-medvedev/mm58/overview
https://www.atptour.com/en/players/stefanos-tsitsipas/te51/overview
https://www.atptour.com/en/players/alexander-zverev/z355/overview
https://www.atptour.com/en/players/matteo-berrettini/bk40/overview
https://www.atptour.com/en/players/gael-monfils/mc65/overview
https://www.atptour.com/en/players/david-goffin/gb88/overview
https://www.atptour.com/en/players/fabio-fognini/f510/overview
https://www.atptour.com/en/players/roberto-bautista-agut/bd06/overview
https://www.atptour.com/en/players/diego-schwartzman/sm37/overview
https://www.atptour.com/en/players/andrey-rublev/re44/overview
https://www.atptour.com/en/players/karen-khachanov/ke29/overview
https://www.atptour.com/en/pl

https://www.atptour.com/en/players/andy-murray/mc10/overview
https://www.atptour.com/en/players/federico-gaio/gb53/overview
https://www.atptour.com/en/players/mohamed-safwat/sk47/overview
https://www.atptour.com/en/players/prajnesh-gunneswaran/ga94/overview
https://www.atptour.com/en/players/cedrik-marcel-stebe/sk94/overview
https://www.atptour.com/en/players/facundo-bagnis/bf23/overview
https://www.atptour.com/en/players/juan-pablo-varillas/v836/overview
https://www.atptour.com/en/players/antoine-hoang/ha71/overview
https://www.atptour.com/en/players/henri-laaksonen/l949/overview
https://www.atptour.com/en/players/ilya-ivashka/i305/overview
https://www.atptour.com/en/players/guido-andreozzi/a887/overview
https://www.atptour.com/en/players/zhizhen-zhang/z371/overview
https://www.atptour.com/en/players/blaz-rola/ra51/overview
https://www.atptour.com/en/players/hyeon-chung/ch27/overview
https://www.atptour.com/en/players/yannick-hanfmann/h997/overview
https://www.atptour.com/en/players/j

In [35]:
# Preview the scraped table (at most the first 100 rows).
atp_height_weight_data.head(100)

Unnamed: 0,player_name,player_height_inches,player_height_cm,player_weight_lbs,player_weight_kg,player_handedness
0,Novak Djokovic,"6'2""",188,170,77,right-handed
1,Rafael Nadal,"6'1""",185,187,85,left-handed
2,Dominic Thiem,"6'1""",185,174,79,right-handed
3,Roger Federer,"6'1""",185,187,85,right-handed
4,Daniil Medvedev,"6'6""",198,182,83,right-handed
5,Stefanos Tsitsipas,"6'4""",193,196,89,right-handed
6,Alexander Zverev,"6'6""",198,198,90,right-handed
7,Matteo Berrettini,"6'5""",196,209,95,right-handed
8,Gael Monfils,"6'4""",193,187,85,right-handed
9,David Goffin,"5'11""",180,154,70,right-handed


In [36]:
# Save the scraped table as CSV; index=False keeps the DataFrame row index out of the file.
atp_height_weight_data.to_csv('./data/processed_data/official_atp_height_2020.csv', index = False)

### Appendix: Prototype Code

In [33]:
# Try the scraper on a single profile URL and display the returned dictionary.
link = 'https://www.atptour.com/en/players/joao-domingues/d985/overview'
collect_atp_official_player_data(link)

{'player_name': 'Joao Domingues',
 'player_height_inches': None,
 'player_height_cm': None,
 'player_weight_lbs': '154',
 'player_weight_kg': '70',
 'player_handedness': 'right-handed'}

In [3]:
# Profile used for the step-by-step prototype cells that follow.
link = 'https://www.atptour.com/en/players/benoit-paire/pd31/overview'

# Connect to player page and keep the response body as text
player_page = requests.get(link).text

# Get (entire) HTML File, parsed with the lxml parser
player_html = BeautifulSoup(player_page, 'lxml')
# Display the parsed document
player_html

<!DOCTYPE html>
<!-- START : /modules/global/head --><!--[if lt IE 7]>
    <html class="no-js lt-ie10 lt-ie9 lt-ie8 lt-ie7 ">
<![endif]--><!--[if IE 7]>
    <html class="no-js lt-ie10 lt-ie9 lt-ie8 ">
<![endif]--><!--[if IE 8]>
    <html class="no-js lt-ie10 lt-ie9 ">
<![endif]--><!--[if IE 9]>
    <html class="no-js lt-ie10 ">
<![endif]--><!--[if gt IE 9]><!--><html class="no-js" translate="no">
<!--<![endif]-->
<head>
<!-- disable auto format for telephone numbers -->
<meta content="telephone=no" name="format-detection"/>
<title>
	Benoit Paire | Overview | ATP Tour | Tennis
</title>
<meta content="initial-scale=1.0, width=768, user-scalable=yes, minimum-scale=1.0, maximum-scale=1.25" name="viewport"/>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="" name="keywords"/>
<meta content="Official tennis player profile of Benoit Paire on the ATP Tour. Featuring news, bio, rankings, playing activity, coach, stats, win-loss, points break

In [18]:
# -- Get Player Name
# First name: text of the first div with class "first-name", whitespace stripped.
player_first_name_html = player_html.find("div", attrs={"class": "first-name"})
player_first_name = player_first_name_html.text.strip()

# Surname: text of the first div with class "last-name", whitespace stripped.
player_surname_html = player_html.find("div", attrs={"class": "last-name"})
player_surname = player_surname_html.text.strip()

# Full name is "<first name> <surname>".
player_name = player_first_name + ' ' + player_surname
player_name

'Benoit Paire'

In [5]:
# -- Get player weight in imperial units
# Note: player_weight_lbs holds the matched <span> element; its text is what gets displayed.
player_weight_lbs = player_html.find("span", attrs={"class": "table-weight-lbs"})
player_weight_lbs.text

'176'

In [10]:
# -- Get player weight in kg
# player_weight_kg holds the matched <span> element; the "(" prefix and "kg)" suffix
# are removed from its lower-cased text for display.
player_weight_kg = player_html.find("span", attrs={"class": "table-weight-kg-wrapper"})
player_weight_kg.text.lower().replace('(','').replace('kg)','')

'80'

In [11]:
# -- Get player height in feet & inches
# player_height_imp holds the matched <span> element; its text is displayed as-is.
player_height_imp = player_html.find("span", attrs={"class": "table-height-ft"})
player_height_imp.text

'6\'5"'

In [15]:
# -- Get player height in cm
# player_height_cm holds the matched <span> element; the "(" prefix and "cm)" suffix
# are removed from its lower-cased text for display.
player_height_cm = player_html.find("span", attrs={"class": "table-height-cm-wrapper"})
player_height_cm.text.lower().replace('(','').replace('cm)','')

'196'

In [24]:
# -- Get player handedness
# Start from None so a page without a "handed" entry shows a missing value rather
# than raising NameError or re-displaying a value left over from an earlier run.
player_handedness = None
for item in player_html.find_all("div", attrs={"class": "table-value"}):
    item_text = item.text.lower().strip()
    if 'handed' in item_text:
        # Keep only the part before the first comma.
        player_handedness = item_text.split(',')[0]
player_handedness

'right-handed'