In [6]:
from selenium import webdriver
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

# Scrape the Data
Obtain all scheduled but not-yet-played matchups in the European Champions League water polo tournament for the current season (2022-2023) from flashscore.com.

In [2]:
# Fixtures page for the water polo Champions League on flashscore.com.
url = 'https://www.flashscore.com/water-polo/europe/champions-league/fixtures'

# Open a Firefox session, load the page, and capture its HTML.
# The try/finally guarantees the browser is shut down even when loading the
# page or reading its source raises, so no orphaned Firefox process is left.
driver = webdriver.Firefox()
try:
    driver.get(url)
    sauce = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML with the lxml parser.
soup = BeautifulSoup(sauce, 'lxml')

In [3]:
def target_tags(c):
    """Return True when ``c`` is one of the CSS class names targeted for extraction.

    The targeted names are "event__header", "event__round", and
    "event__match"; any other value (including None) yields False.
    """
    wanted_classes = ("event__header", "event__round", "event__match")
    return c in wanted_classes

# Every element whose CSS class is accepted by target_tags, in document order.
games = soup.find_all(class_=target_tags)

df = pd.DataFrame()
l = []  # one dict per match row; later turned into the data frame
event_round = None
event_name = None
# Season label read from the page heading. This raises AttributeError when the
# heading element is missing from the page.
season = soup.find("div", {"class": "heading__info"}).text

# Compile the participant class patterns once instead of on every iteration.
home_pattern = re.compile("^(event__participant event__participant--home)")
away_pattern = re.compile("^(event__participant event__participant--away)")


def _div_text(tag, class_filter):
    """Return the text of the first <div> under ``tag`` whose class matches
    ``class_filter``, or None when there is no such <div>."""
    node = tag.find("div", {"class": class_filter})
    return None if node is None else node.text


for game in games:
    first_class = game['class'][0]

    # Round and header elements only update the running context that is
    # recorded on the match rows that follow them.
    if first_class == 'event__round':
        event_round = game.text
    if first_class == 'event__header':
        event_name = game.find("span", {"class": "event__title--name"}).text

    # A missing <div> is reported as None by the explicit check in _div_text,
    # rather than by a bare `except:` that would also hide unrelated errors.
    row = {
        "event_name": event_name,
        "event_round": event_round,
        "match_time": _div_text(game, "event__time"),
        "home_team": _div_text(game, home_pattern),
        "away_team": _div_text(game, away_pattern),
    }

    # Elements without a home team (headers, round labels) are not kept.
    if row["home_team"] is not None:
        l.append(row)
    
    
# Build the data frame from the collected match records in a single call.
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the whole frame on every iteration.
# Naming the columns explicitly fixes their order and keeps them present even
# when no matches were collected.
df = pd.DataFrame(
    [{**i, "season": season} for i in l],
    columns=[
        "event_name",
        "event_round",
        "match_time",
        "home_team",
        "away_team",
        "season",
    ],
)

In [4]:
# Separate the match day, month, and start time.
# match_time is removed from the frame and split on '.', which yields the day,
# the month, and the remaining time text as three columns.
match_parts = df.pop('match_time').str.split('.', expand=True)
df[['day', 'month', 'start_time']] = match_parts
df[['day', 'month']] = df[['day', 'month']].astype('int64')
# The time fragment keeps any whitespace that followed the second '.'
# (presumably the text looks like "DD.MM. HH:MM" -- confirm against the page),
# so trim it to stop stray spaces from reaching the exported file.
df['start_time'] = df['start_time'].str.strip()

In [7]:
# Work out the year of each match from the season label.
# Note: July (month 7) always separates the beginning and end of successive
# seasons in the water polo Champions League, so months after July take the
# first part of the season label and all other months take the second part.
season_years = df['season'].str.split('/', expand=True)
# Unpacking requires the split to produce exactly two parts.
first_year, second_year = (season_years[col] for col in season_years)
df['year'] = np.where(df['month'] > 7, first_year, second_year)

In [8]:
# Assemble a single datetime column from the day, month, and year columns.
date_parts = df[['day', 'month', 'year']]
df['date'] = pd.to_datetime(date_parts)

In [10]:
# Save the data by exporting it to a CSV file, without the row index.
output_path = 'champions_league_scheduledGames.csv'
df.to_csv(output_path, index=False)