In [1]:
from selenium import webdriver
import re
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape the Data
Obtain all of the match scores for the European Champions League water polo tournaments for the current season (2022-2023) from flashscore.com.

In [3]:
# Results page of the Champions League water polo competition on flashscore.com.
url = 'https://www.flashscore.com/water-polo/europe/champions-league/results/'

# Absolute XPath of the "Show more matches" link. The link often sits at the
# bottom of the results table and hides some of the match scores until clicked.
show_more_xpath = (
    '/html/body/div[3]/div[1]/div/div/main'
    '/div[4]/div[2]/div[1]/div[1]/div/div/a'
)

def target_tags(c):
    """Tell whether CSS class ``c`` belongs to the data targeted for extraction.

    Used as the ``class_`` filter for BeautifulSoup's ``find_all``.

    Parameters
    ----------
    c : str or None
        A CSS class value to test.

    Returns
    -------
    bool
        True if ``c`` is one of the header, round, or match row classes.
    """
    return c in ("event__header", "event__round", "event__match")


# Open a Firefox session and load the results page. The whole session is
# wrapped in try/finally so the browser is shut down even when loading the
# page or reading its source fails; otherwise the browser process would be
# left running.
driver = webdriver.Firefox()
try:
    driver.get(url)

    # Many (but not all) pages have matches hidden behind a "Show more matches"
    # link. Clicking it is best-effort: if the link is absent or cannot be
    # clicked, carry on with whatever the page already shows.
    try:
        element = driver.find_element("xpath", show_more_xpath)
        driver.execute_script("arguments[0].scrollIntoView();", element)
        element.click()
        time.sleep(10)  # wait 10 seconds for the page to load/update after the click
    except Exception:
        # Deliberately ignored (see above). `Exception` rather than a bare
        # `except` so that an interrupt during the sleep still stops the cell.
        pass

    sauce = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(sauce, 'lxml')

# Every element carrying one of the classes accepted by target_tags.
games = soup.find_all(class_=target_tags)

# State used by the parsing and data-frame-building steps that follow.
df = pd.DataFrame()
l = []              # collected match records (one dict per match)
obj = {}
event_round = None  # most recently seen round label
event_name = None   # most recently seen event name
# Text of the page's "heading__info" element, stored as the season label.
season = soup.find("div", {"class": "heading__info"}).text


def _text_or_none(tag, name, class_):
    """Return the text of the first `name` element inside `tag` whose class
    matches `class_`, or None when `tag` contains no such element."""
    found = tag.find(name, {"class": class_})
    return found.text if found is not None else None


# Class patterns for the participant cells, compiled once instead of on every
# iteration. The pattern strings are unchanged.
home_team_class = re.compile("^(event__participant event__participant--home)")
away_team_class = re.compile("^(event__participant event__participant--away)")

for game in games:
    # Header and round rows carry no match data themselves; they update the
    # labels that are attached to every match row that follows them.
    # Membership is tested (rather than only the first class) so the check
    # agrees with target_tags, which accepts the class in any position.
    classes = game['class']
    if 'event__round' in classes:
        event_round = game.text
    if 'event__header' in classes:
        event_name = game.find("span", {"class": "event__title--name"}).text

    # A missing cell yields None via an explicit check instead of a bare
    # `except`, so unrelated errors are no longer silently swallowed.
    record = {
        "event_name": event_name,
        "event_round": event_round,
        "match_time": _text_or_none(game, "div", "event__time"),
        "home_team": _text_or_none(game, "div", home_team_class),
        "away_team": _text_or_none(game, "div", away_team_class),
        "home_score": _text_or_none(game, "div", "event__score event__score--home"),
        "away_score": _text_or_none(game, "div", "event__score event__score--away"),
    }

    # Only rows that actually contain a home team are matches; header and
    # round rows are dropped here.
    if record["home_team"] is not None:
        l.append(record)
    
    
# Build the data frame from the collected match records in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration.
# Passing `columns` keeps the schema stable even when no matches were found.
match_columns = [
    "event_name",
    "event_round",
    "match_time",
    "home_team",
    "away_team",
    "home_score",
    "away_score",
]
df = pd.DataFrame(l, columns=match_columns)
df["season"] = season  # the same season label applies to every scraped match

Unnamed: 0,away_score,away_team,event_name,event_round,home_score,home_team,match_time,season
0,10,Olympiacos,Champions League,Round 8,10,Radnicki,18.02. 11:30,2022/2023
1,5,Dinamo Tbilisi,Champions League,Round 8,20,Pro Recco,18.02. 10:30,2022/2023
2,9,Hannover,Champions League,Round 8,10,Vouliagmeni,18.02. 09:00,2022/2023
3,6,Jadran ST,Champions League,Round 8,8,Barceloneta,18.02. 04:00,2022/2023
4,13,Marseille,Champions League,Round 8,15,Brescia,17.02. 11:30,2022/2023
...,...,...,...,...,...,...,...,...
103,17,Jadran ST,Champions League - Qualification - Second stage,Quarter-finals,7,Steaua Bucuresti,13.10. 11:30,2022/2023
104,16,Panionios,Champions League - Qualification - Second stage,Quarter-finals,9,Zaibas,13.10. 10:30,2022/2023
105,8,CSM Oradea,Champions League - Qualification - Second stage,Quarter-finals,9,Jadran HN,13.10. 10:15,2022/2023
106,13,Vasas,Champions League - Qualification - Second stage,Quarter-finals,4,Sabac,13.10. 09:30,2022/2023


In [4]:
# Import the data from previous seasons and append the freshly scraped
# current-season rows. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. As before, each frame
# keeps its own index labels.
df_old = pd.concat(
    [pd.read_csv('champions_league_rawData_completedSeasons.csv'), df]
)

Unnamed: 0,away_score,away_team,event_name,event_round,home_score,home_team,match_time,season
0,8,Primorac,Champions League - Play Offs,Final,7,Pro Recco,23.05. 11:00AET,2008/2009
1,13,Mladost,Champions League - Play Offs,3rd place,14,Jug Dubrovnik,23.05. 09:30,2008/2009
2,11,Primorac,Champions League - Play Offs,Semi-finals,9,Mladost,22.05. 11:00,2008/2009
3,9,Pro Recco,Champions League - Play Offs,Semi-finals,6,Jug Dubrovnik,22.05. 09:30,2008/2009
4,9,Primorac,Champions League - Play Offs,Quarter-finals,8,Jadran HN,22.04. 10:00,2008/2009
...,...,...,...,...,...,...,...,...
103,17,Jadran ST,Champions League - Qualification - Second stage,Quarter-finals,7,Steaua Bucuresti,13.10. 11:30,2022/2023
104,16,Panionios,Champions League - Qualification - Second stage,Quarter-finals,9,Zaibas,13.10. 10:30,2022/2023
105,8,CSM Oradea,Champions League - Qualification - Second stage,Quarter-finals,9,Jadran HN,13.10. 10:15,2022/2023
106,13,Vasas,Champions League - Qualification - Second stage,Quarter-finals,4,Sabac,13.10. 09:30,2022/2023


In [5]:
# Save the combined data (previous seasons plus the current season) by
# exporting it to a CSV file; the DataFrame index is not written.
df_old.to_csv(
    'champions_league_rawData.csv',
    index=False,
)