In [1]:
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

# Scrape the Data
Obtain all of the match scores for the European Champions League water polo tournaments from the 2008/2009 season through the 2021/2022 season from flashscore.com.

In [2]:
# Competition landing page; the scraping loop appends a season suffix and '/results/' to it.
base_url = 'https://www.flashscore.com/water-polo/europe/champions-league'

# URL suffix of every completed season, '-2008-2009' through '-2021-2022'.
# Per the original note, the in-progress season has no year in its URL; it is not part of this list.
years = ['-{}-{}'.format(start, start + 1) for start in range(2008, 2022)]

# XPath of the "Show more matches" link that is often at the bottom of the results table,
# hiding some of the match scores until it is clicked.
show_more_xpath = '/html/body/div[3]/div[1]/div/div/main/div[4]/div[2]/div[1]/div[1]/div/div/a'

# function to determine if the tag is part of the data targeted for extraction
def target_tags(c):
    """Return True if CSS class name `c` is one of the row classes targeted for extraction.

    Intended as the `class_` filter for `soup.find_all`; any value that is not
    one of the three target class names (including None) yields False.
    """
    return c in ("event__header", "event__round", "event__match")


# Seconds to wait after clicking "Show more matches" so the page can load/update.
SHOW_MORE_WAIT_SECONDS = 10

# Columns of the resulting data frame; one row per match.
MATCH_COLUMNS = ["event_name", "event_round", "match_time", "home_team",
                 "away_team", "home_score", "away_score", "season"]

# Class patterns for the home/away participant cells (compiled once, reused for every row).
HOME_TEAM_CLASS = re.compile("^(event__participant event__participant--home)")
AWAY_TEAM_CLASS = re.compile("^(event__participant event__participant--away)")


def _text_or_none(tag, name, class_):
    """Return the text of the first `name` element inside `tag` whose class matches `class_`.

    Returns None when no such element exists, instead of raising.
    """
    found = tag.find(name, {"class": class_})
    return None if found is None else found.text


def _fetch_results_html(url):
    """Open `url` in Firefox, reveal hidden matches if possible, and return the page source.

    The browser is always closed, even when loading the page fails.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # Many (but not all) pages hide matches behind a "Show more matches" link.
        # Clicking it is best effort: a missing or unclickable link is not an error.
        # NOTE(review): the link is clicked only once; if a page needs several
        # clicks to reveal everything, later matches would be missed - confirm.
        try:
            element = driver.find_element("xpath", show_more_xpath)
            driver.execute_script("arguments[0].scrollIntoView();", element)
            element.click()
            time.sleep(SHOW_MORE_WAIT_SECONDS)
        except WebDriverException:
            pass

        return driver.page_source
    finally:
        driver.quit()


def _parse_matches(html, fallback_season):
    """Parse one results page into a list of match dicts keyed by MATCH_COLUMNS.

    `fallback_season` is used as the season label when the page has no
    "heading__info" element.
    """
    soup = BeautifulSoup(html, 'lxml')

    heading = soup.find("div", {"class": "heading__info"})
    season = fallback_season if heading is None else heading.text

    matches = []
    event_name = None
    event_round = None

    for row in soup.find_all(class_=target_tags):
        # Header and round rows carry context that applies to the match rows after them.
        # Membership (rather than only the first class) mirrors how target_tags selects rows.
        row_classes = row['class']
        if 'event__round' in row_classes:
            event_round = row.text
        if 'event__header' in row_classes:
            event_name = _text_or_none(row, "span", "event__title--name")

        # Only rows with a home participant are matches; everything else is skipped.
        home_team = _text_or_none(row, "div", HOME_TEAM_CLASS)
        if home_team is None:
            continue

        matches.append({
            "event_name": event_name,
            "event_round": event_round,
            "match_time": _text_or_none(row, "div", "event__time"),
            "home_team": home_team,
            "away_team": _text_or_none(row, "div", AWAY_TEAM_CLASS),
            "home_score": _text_or_none(row, "div", "event__score event__score--home"),
            "away_score": _text_or_none(row, "div", "event__score event__score--away"),
            "season": season,
        })

    return matches


# Collect every season's matches first and build the data frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
records = []

for y in years:
    url = base_url + y + '/results/'
    # '-2008-2009' -> '2008/2009'; only used if the page heading is missing.
    season_from_url = y.strip('-').replace('-', '/')
    records.extend(_parse_matches(_fetch_results_html(url), season_from_url))

# Explicit columns keep the frame's shape stable even when nothing was scraped.
df = pd.DataFrame(records, columns=MATCH_COLUMNS)

Unnamed: 0,away_score,away_team,event_name,event_round,home_score,home_team,match_time,season
0,8,Primorac,Champions League - Play Offs,Final,7,Pro Recco,23.05. 11:00AET,2008/2009
1,13,Mladost,Champions League - Play Offs,3rd place,14,Jug Dubrovnik,23.05. 09:30,2008/2009
2,11,Primorac,Champions League - Play Offs,Semi-finals,9,Mladost,22.05. 11:00,2008/2009
3,9,Pro Recco,Champions League - Play Offs,Semi-finals,6,Jug Dubrovnik,22.05. 09:30,2008/2009
4,9,Primorac,Champions League - Play Offs,Quarter-finals,8,Jadran HN,22.04. 10:00,2008/2009
...,...,...,...,...,...,...,...,...
1544,12,OSC Ujbuda,Champions League,Round 3,12,Hannover,20.11. 05:00,2021/2022
1545,14,Brescia,Champions League,Round 3,8,Radnicki,19.11. 11:30,2021/2022
1546,17,Novi Beograd,Champions League,Round 3,12,Jadran ST,19.11. 10:00,2021/2022
1547,9,Olympiacos,Champions League,Round 3,9,Ferencvarosi,19.11. 09:30,2021/2022


In [3]:
# Write the scraped matches to a CSV file, leaving out the data frame's row index.
df.to_csv(path_or_buf='champions_league_rawData_completedSeasons.csv', index=False)