### Description

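This script scrapes official player height and weight data from the WTA and ATP websites. Note: the cells below cover the WTA site only and collect height (imperial and metric), handedness, date of birth and hometown; no weight field is extracted.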

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 100)

import requests

from bs4 import BeautifulSoup
from dateutil.parser import parse


In [5]:
def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    The check delegates to ``dateutil.parser.parse``; any input the parser
    rejects is reported as "not a date" rather than raising.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True if ``parse`` accepts the input, False otherwise
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError, TypeError):
        # ValueError    -> not a recognisable date (includes dateutil's ParserError)
        # OverflowError -> numeric token too large for a date component
        # TypeError     -> input was not a string / character stream (e.g. None)
        return False

    return True

def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag was not found."""
    return tag.text.strip() if tag is not None else None


def _height_in_metres(text):
    """
    Return the numeric part of a metric height such as '1.68m' (-> '1.68').

    Returns None for anything that is not "<number>m", so free text that merely
    contains an 'm' and a '.' (e.g. 'Mt. Pleasant, SC, USA') is not mistaken
    for a height.
    """
    lowered = text.lower().strip()
    if not lowered.endswith('m'):
        return None

    number = lowered[:-1].strip()
    # Require a leading digit so float() spellings like 'nan' / 'inf' never qualify.
    if not number[:1].isdigit():
        return None

    try:
        float(number)
    except ValueError:
        return None

    return number


def collect_official_player_data(player_profile_url, timeout=30):
    """
    Scrape one player's profile page and return the headline bio fields.

    :param player_profile_url: str, URL of the player's official profile page
    :param timeout: float or tuple, seconds to wait for the server (passed to
        ``requests.get``); without it a stalled connection would hang the scrape
    :return: dict with keys ``player_name``, ``player_height_inches``,
        ``player_height_metres``, ``player_handedness``, ``player_birthday`` and
        ``player_hometown``; any field not found on the page is None
    :raises requests.exceptions.RequestException: on connection failure, timeout
        or a non-2xx HTTP status
    """
    # Connect to player page; fail loudly on HTTP errors instead of parsing an error page
    response = requests.get(player_profile_url, timeout=timeout)
    response.raise_for_status()

    # Get (entire) HTML File
    player_html = BeautifulSoup(response.text, 'lxml')

    # -- Get Player Name (either part may be absent from the page)
    player_first_name = _text_or_none(
        player_html.find("span", attrs={"class": "profile-header-info__firstname"})
    )
    player_surname = _text_or_none(
        player_html.find("span", attrs={"class": "profile-header-info__surname"})
    )
    name_parts = [part for part in (player_first_name, player_surname) if part]
    player_name = ' '.join(name_parts) if name_parts else None

    # -- Get player height in imperial units
    player_height_inches = _text_or_none(
        player_html.find("span", attrs={"class": "profile-header-info__detail-height"})
    )

    # -- Get player height in metres, handedness, date of birth and hometown
    player_handedness = None
    player_hometown = None
    player_height_metres = None
    player_birthday = None

    for item in player_html.find_all("div", attrs={"class": "profile-header-info__detail-stat--small"}):
        detail = item.text.strip()
        if not detail:
            continue

        # Classify each entry exactly once, most specific test first, so that a
        # value recognised as one field is never also stored as another (e.g. a
        # comma-formatted date being recorded as the hometown as well).
        height = _height_in_metres(detail)
        if height is not None:
            player_height_metres = height
        elif 'handed' in detail.lower():
            player_handedness = detail.lower()
        elif is_date(detail, fuzzy=False):
            player_birthday = detail
        elif ',' in detail:
            player_hometown = detail

    player_dictionary = {
        'player_name': player_name,
        'player_height_inches': player_height_inches,
        'player_height_metres': player_height_metres,
        'player_handedness': player_handedness,
        'player_birthday': player_birthday,
        'player_hometown': player_hometown
        }

    return player_dictionary
    
    

In [6]:
# Load the table of official WTA profile URLs (the 'bio_link' column holds one URL per player).
wta_link_data = pd.read_csv('./data/raw_data/wta_official_site_urls.csv')
wta_link_data

# Scrape every profile in turn, echoing each URL so progress is visible while the cell runs.
wta_data_list = []
for link in wta_link_data['bio_link']:
    print(link)
    player_record = collect_official_player_data(link)
    wta_data_list.append(player_record)

# One row per player, one column per scraped field.
wta_height_weight_data = pd.DataFrame(wta_data_list)

https://www.wtatennis.com/players/318033/ashleigh-barty
https://www.wtatennis.com/players/314320/simona-halep
https://www.wtatennis.com/players/313974/karolina-pliskova
https://www.wtatennis.com/players/320942/sofia-kenin
https://www.wtatennis.com/players/316738/elina-svitolina
https://www.wtatennis.com/players/325088/bianca-andreescu
https://www.wtatennis.com/players/314584/kiki-bertens
https://www.wtatennis.com/players/319001/belinda-bencic
https://www.wtatennis.com/players/230234/serena-williams
https://www.wtatennis.com/players/319998/naomi-osaka
https://www.wtatennis.com/players/320760/aryna-sabalenka
https://www.wtatennis.com/players/314206/petra-kvitova
https://www.wtatennis.com/players/316959/madison-keys
https://www.wtatennis.com/players/313711/petra-martic
https://www.wtatennis.com/players/313907/johanna-konta
https://www.wtatennis.com/players/316239/garbi-e-muguruza
https://www.wtatennis.com/players/324166/elena-rybakina
https://www.wtatennis.com/players/323027/marketa-vondr

https://www.wtatennis.com/players/312536/yanina-wickmayer
https://www.wtatennis.com/players/319489/harriet-dart
https://www.wtatennis.com/players/324028/leonie-kung
https://www.wtatennis.com/players/310431/mandy-minella
https://www.wtatennis.com/players/315296/lara-arruabarrena
https://www.wtatennis.com/players/310761/sara-errani
https://www.wtatennis.com/players/315279/nicole-gibbs
https://www.wtatennis.com/players/316266/martina-trevisan
https://www.wtatennis.com/players/312413/arina-rodionova
https://www.wtatennis.com/players/318913/varvara-flink
https://www.wtatennis.com/players/310915/giulia-gatto-monticone
https://www.wtatennis.com/players/160515/pauline-parmentier
https://www.wtatennis.com/players/320277/magdalena-frech
https://www.wtatennis.com/players/314793/veronica-cepede-royg
https://www.wtatennis.com/players/317443/sachia-vickery
https://www.wtatennis.com/players/318352/irina-bara
https://www.wtatennis.com/players/319042/allie-kiick
https://www.wtatennis.com/players/318494

https://www.wtatennis.com/players/311579/timea-bacsinszky
https://www.wtatennis.com/players/313215/alexandra-cadantu
https://www.wtatennis.com/players/314981/martina-caregaro
https://www.wtatennis.com/players/321743/hanna-chang
https://www.wtatennis.com/players/320329/shilin-xu
https://www.wtatennis.com/players/319856/zoe-hives
https://www.wtatennis.com/players/319250/valentini-grammatikopoulou
https://www.wtatennis.com/players/320421/francoise-abanda
https://www.wtatennis.com/players/327077/diane-parry
https://www.wtatennis.com/players/313485/rebecca-marino
https://www.wtatennis.com/players/325940/jule-niemeier
https://www.wtatennis.com/players/320218/louisa-chirico
https://www.wtatennis.com/players/320497/irina-fetecau
https://www.wtatennis.com/players/321193/elizabeth-halbauer
https://www.wtatennis.com/players/323171/lea-boskovic
https://www.wtatennis.com/players/318488/jesika-maleckova
https://www.wtatennis.com/players/317784/stefania-rubini
https://www.wtatennis.com/players/320862

https://www.wtatennis.com/players/314181/chieh-yu-hsu
https://www.wtatennis.com/players/324782/yuliya-hatouka
https://www.wtatennis.com/players/321329/wushuang-zheng
https://www.wtatennis.com/players/322704/alexandra-bozovic
https://www.wtatennis.com/players/312582/federica-di-sarra
https://www.wtatennis.com/players/318907/rutuja-bhosale
https://www.wtatennis.com/players/320039/olivia-tjandramulia
https://www.wtatennis.com/players/318480/victoria-bosio
https://www.wtatennis.com/players/318201/camilla-rosatello
https://www.wtatennis.com/players/314563/tara-moore
https://www.wtatennis.com/players/322417/ye-xin-ma
https://www.wtatennis.com/players/312964/angelina-gabueva
https://www.wtatennis.com/players/321271/gozal-ainitdinova
https://www.wtatennis.com/players/314346/yuuki-tanaka
https://www.wtatennis.com/players/318181/estelle-cascino
https://www.wtatennis.com/players/324236/rosa-vicens-mas
https://www.wtatennis.com/players/314419/paula-cristina-goncalves
https://www.wtatennis.com/play

In [7]:
# Preview the scraped table (at most the first 100 rows).
wta_height_weight_data.head(100)

Unnamed: 0,player_name,player_height_inches,player_height_metres,player_handedness,player_birthday,player_hometown
0,Ashleigh Barty,"5' 5""",1.66,right-handed,Apr 24 1996,"Ipswich, Australia"
1,Simona Halep,"5' 6""",1.68,right-handed,Sep 27 1991,"Constanta, Romania"
2,Karolina Pliskova,"6' 1""",1.86,right-handed,Mar 21 1992,"Louny, Czech Republic"
3,Sofia Kenin,5’ 7”,1.7,right-handed,Nov 14 1998,"Moscow, Russia"
4,Elina Svitolina,"5' 9""",1.74,right-handed,Sep 12 1994,"Odessa, Ukraine"
5,Bianca Andreescu,5' 7'',1.7,right-handed,Jun 16 2000,"Mississauga, Ontario, Canada"
6,Kiki Bertens,"6' 0""",1.82,right-handed,Dec 10 1991,"Wateringen, Netherlands"
7,Belinda Bencic,"5' 9""",1.75,right-handed,Mar 10 1997,"Flawil, Switzerland"
8,Serena Williams,"5' 9""",1.75,right-handed,Sep 26 1981,"Saginaw, MI, USA"
9,Naomi Osaka,"5' 11""",1.8,right-handed,Oct 16 1997,"Osaka, Japan"


In [9]:
# Persist the scraped WTA table; index=False keeps the DataFrame's row index out of the CSV.
wta_height_weight_data.to_csv(
    './data/processed_data/official_wta_height_2020.csv',
    index=False,
)

### Testing portion of script:

In [2]:
# Fetch a single known profile page to inspect the markup by hand.
test_link = 'https://www.wtatennis.com/players/314610/camila-giorgi'
page = requests.get(test_link).text
html_soup = BeautifulSoup(page, 'lxml')

# The small "detail stat" <div>s hold the bio fields the scraper reads.
detail_stat_divs = html_soup.find_all("div", attrs={"class": "profile-header-info__detail-stat--small"})
player_raw_info_list = [stat_div.text.strip() for stat_div in detail_stat_divs]

player_raw_info_list

# Raw tags, to see the surrounding whitespace and markup of each entry.
detail_stat_divs

[<div class="profile-header-info__detail-stat--small">
                                         1.68m
                                     </div>,
 <div class="profile-header-info__detail-stat--small">
                                         Right-Handed
                                     </div>,
 <div class="profile-header-info__detail-stat--small">
                                             Dec 30 1991
                                         </div>,
 <div class="profile-header-info__detail-stat--small">
                                         Macerata, Italy
                                     </div>]

In [4]:
# Locate the two name <span>s on the test page and read their stripped text.
player_first_name_html = html_soup.find("span", class_="profile-header-info__firstname")
player_surname_html = html_soup.find("span", class_="profile-header-info__surname")

player_first_name = player_first_name_html.text.strip()
player_surname = player_surname_html.text.strip()

# Full display name: first name and surname separated by a single space.
player_name = f"{player_first_name} {player_surname}"

player_name

'Camila Giorgi'