In [1]:
import requests
from bs4 import BeautifulSoup

# First, try to scrape the attendance value from a specific match URL

In [2]:
# These test URLs cover a 22/23 Belgian 1st division game, a 07/08 English League One game,
# a 20/21 (COVID!) English League One game, and a 16/17 Cypriot 1st division game

urls = [
    "https://www.worldfootball.net/report/eerste-klasse-a-2022-2023-krc-genk-sv-zulte-waregem/",
    "https://www.worldfootball.net/report/league-one-2007-2008-millwall-fc-tranmere-rovers/",
    "https://www.worldfootball.net/report/league-one-2020-2021-ipswich-town-fleetwood-town/",
    "https://www.worldfootball.net/report/first-division-2016-2017-apoel-nikosia-anorthosis-famagusta-fc/"
]

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() never times out and a stalled connection would
# hang the cell.
REQUEST_TIMEOUT = 30

# Loop to scrape attendance figures from URL list:

for URL in urls:
    print(f"Fetching data for: {URL}")

    # Fetch the webpage
    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find() returns None when the page has no id="site" container;
    # treat that as "no candidate cells" instead of crashing on .find_all().
    site = soup.find(id="site")
    results = site.find_all("td", class_="dunkel") if site is not None else []

    # The attendance text is read from the "dunkel" cell that directly follows
    # (in the results list) the cell holding the img titled "Attendance".
    attendance_td = None
    for i, td in enumerate(results):
        if td.find('img', title='Attendance') is not None:
            # Only step to the next cell if one exists; the icon being in the
            # last matched cell would otherwise raise IndexError.
            if i + 1 < len(results):
                attendance_td = results[i + 1]
            break

    if attendance_td is not None:
        attendance = attendance_td.get_text(strip=True)
        print(f"Attendance: {attendance}")
    else:
        print("Attendance not found.")
    print("--------------------")  # To separate results for clarity


Fetching data for: https://www.worldfootball.net/report/eerste-klasse-a-2022-2023-krc-genk-sv-zulte-waregem/
Attendance: 14.111
--------------------
Fetching data for: https://www.worldfootball.net/report/league-one-2007-2008-millwall-fc-tranmere-rovers/
Attendance: 8.925
--------------------
Fetching data for: https://www.worldfootball.net/report/league-one-2020-2021-ipswich-town-fleetwood-town/
Attendance: without spectators.
--------------------
Fetching data for: https://www.worldfootball.net/report/first-division-2016-2017-apoel-nikosia-anorthosis-famagusta-fc/
Attendance not found.
--------------------


# Next, try to scrape attendance values from all matches in a specific game week

In [3]:
URL = "https://www.worldfootball.net/schedule/fra-ligue-2-2018-2019-spieltag/10/"

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() never times out and a stalled connection would
# hang the cell.
REQUEST_TIMEOUT = 30

# Fetch the gameweek schedule page
response = requests.get(URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

# Parse the content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# soup.find() returns None when the page has no id="site" container;
# fall back to an empty list rather than crashing on .find_all().
site = soup.find(id="site")
results = site.find_all('td', align='center') if site is not None else []

base_url = "https://www.worldfootball.net"

# Collect the match-report links: centred cells whose anchor href contains
# "report". The hrefs are site-relative, so prefix them with base_url.
urls = []
for td in results:
    a_tag = td.find('a', href=True)
    if a_tag and 'report' in a_tag['href']:
        urls.append(base_url + a_tag['href'])

# Use a separate loop variable so the schedule URL above is not overwritten.
for report_url in urls:
    print(f"Fetching data for: {report_url}")

    # Fetch the match report page
    response = requests.get(report_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    site = soup.find(id="site")
    results = site.find_all("td", class_="dunkel") if site is not None else []

    # The attendance text is read from the "dunkel" cell that directly follows
    # (in the results list) the cell holding the img titled "Attendance".
    attendance_td = None
    for i, td in enumerate(results):
        if td.find('img', title='Attendance') is not None:
            # Only step to the next cell if one exists; the icon being in the
            # last matched cell would otherwise raise IndexError.
            if i + 1 < len(results):
                attendance_td = results[i + 1]
            break

    if attendance_td is not None:
        attendance = attendance_td.get_text(strip=True)
        print(f"Attendance: {attendance}")
    else:
        print("Attendance not found.")
    print("--------------------")  # To separate results for clarity

Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-gfc-ajaccio-us-orleans/
Attendance: 2.498
--------------------
Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-lb-chateauroux-estac-troyes/
Attendance: 6.861
--------------------
Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-grenoble-foot-38-clermont-foot/
Attendance: 6.389
--------------------
Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-le-havre-ac-as-beziers/
Attendance: 5.297
--------------------
Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-fc-lorient-as-nancy/
Attendance: 6.952
--------------------
Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-chamois-niortais-paris-fc/
Attendance: 3.397
--------------------
Fetching data for: https://www.worldfootball.net/report/ligue-2-2018-2019-red-star-fc-ac-ajaccio/
Attendance: 1.863
--------------------
Fetching data for: https://w

# Next, also try to fetch the weekday, date, gameweek (GW), home team, and away team

In [92]:
import requests
from bs4 import BeautifulSoup
import re

URL = "https://www.worldfootball.net/schedule/fra-ligue-2-2018-2019-spieltag/10/"

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() never times out and a stalled connection would
# hang the cell.
REQUEST_TIMEOUT = 30

# Use regex to extract league, season, and GW number directly from URL.
# re.search() returns None when the URL does not have the expected shape,
# so fail with a clear message instead of an AttributeError on .group().
url_match = re.search(r"https://www\.worldfootball\.net/schedule/(.+?)-(\d{4}-\d{4})-.*?/(\d+)/$", URL)
if url_match is None:
    raise ValueError(f"Unrecognised schedule URL format: {URL}")
league = url_match.group(1)  # The league is in the first group of the regex match
season = url_match.group(2)  # The season is in the second group of the regex match
gw = url_match.group(3)  # The gameweek is in the third group of the regex match

# Regular expression to extract weekday, date, and time from the report header.
# Compiled once here because it does not change between matches.
kickoff_pattern = re.compile(r"(\w+day), (\d{1,2}\. \w+ \d{4})(\d{2}:\d{2})")

# Fetch the gameweek schedule page
response = requests.get(URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

# Parse the content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# soup.find() returns None when the page has no id="site" container;
# fall back to an empty list rather than crashing on .find_all().
site = soup.find(id="site")
results = site.find_all('td', align='center') if site is not None else []

base_url = "https://www.worldfootball.net"

# Collect the match-report links: centred cells whose anchor href contains
# "report". The hrefs are site-relative, so prefix them with base_url.
urls = []
for td in results:
    a_tag = td.find('a', href=True)
    if a_tag and 'report' in a_tag['href']:
        urls.append(base_url + a_tag['href'])

# Loop through each URL in the list
for url in urls:

    print(f"League: {league}")
    print(f"Season: {season}")
    print(f"Round: {gw}")

    # Fetch the match report page
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    site = soup.find(id="site")
    results1 = site.find_all("th", align="center") if site is not None else []
    results2 = site.find_all("td", class_="dunkel") if site is not None else []

    # Centred header cells: index 0 carries the home team link, index 2 the
    # away team link (index 1 is the date/time cell used below).
    for label, idx in (("Home team:", 0), ("Away team:", 2)):
        if idx < len(results1):
            a_tag = results1[idx].find('a', href=True)
            if a_tag:
                print(label, a_tag['title'])

    # Only search the date cell if it exists; a report page with fewer than
    # two centred header cells would otherwise raise IndexError.
    kickoff_match = kickoff_pattern.search(results1[1].text) if len(results1) > 1 else None

    if kickoff_match:
        weekday = kickoff_match.group(1)  # Extracting the weekday
        date = kickoff_match.group(2)    # Extracting the date
        time = kickoff_match.group(3)    # Extracting the time

        print(f"Weekday: {weekday}")
        print(f"Date: {date}")
        print(f"Time: {time}")
    else:
        print("Pattern not found in the text.")

    # The attendance text is read from the "dunkel" cell that directly follows
    # (in the results2 list) the cell holding the img titled "Attendance".
    attendance_td = None
    for i, td in enumerate(results2):
        if td.find('img', title='Attendance') is not None:
            # Only step to the next cell if one exists; the icon being in the
            # last matched cell would otherwise raise IndexError.
            if i + 1 < len(results2):
                attendance_td = results2[i + 1]
            break

    if attendance_td is not None:
        attendance = attendance_td.get_text(strip=True)
        print(f"Attendance: {attendance}")
    else:
        print("Attendance not found.")

    print("\n" + "="*40 + "\n")  # To separate the results from different URLs

League: fra-ligue-2
Season: 2018-2019
Round: 10
Home team: Gazélec FC Ajaccio
Away team: US Orléans
Weekday: Friday
Date: 5. October 2018
Time: 19:00
Attendance: 2.498


League: fra-ligue-2
Season: 2018-2019
Round: 10
Home team: LB Châteauroux
Away team: ESTAC Troyes
Weekday: Friday
Date: 5. October 2018
Time: 19:00
Attendance: 6.861


League: fra-ligue-2
Season: 2018-2019
Round: 10
Home team: Grenoble Foot 38
Away team: Clermont Foot 63
Weekday: Friday
Date: 5. October 2018
Time: 19:00
Attendance: 6.389


League: fra-ligue-2
Season: 2018-2019
Round: 10
Home team: Havre AC
Away team: AS Béziers
Weekday: Friday
Date: 5. October 2018
Time: 19:00
Attendance: 5.297


League: fra-ligue-2
Season: 2018-2019
Round: 10
Home team: FC Lorient
Away team: AS Nancy Lorraine
Weekday: Friday
Date: 5. October 2018
Time: 19:00
Attendance: 6.952


League: fra-ligue-2
Season: 2018-2019
Round: 10
Home team: Chamois Niortais
Away team: Paris FC
Weekday: Friday
Date: 5. October 2018
Time: 19:00
Attendance: 3

# Next, try to iterate through the gameweeks for a specific league and season

In [99]:
import requests
from bs4 import BeautifulSoup
import re

base_url = "https://www.worldfootball.net"

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() never times out and a stalled connection would
# hang the cell.
REQUEST_TIMEOUT = 30

# Both regexes are loop-invariant, so compile them once up front. Using two
# distinct names also avoids one pattern silently overwriting the other.
# Extracts league, season, and GW number directly from a schedule URL.
schedule_url_pattern = re.compile(r"https://www\.worldfootball\.net/schedule/(.+?)-(\d{4}-\d{4})-.*?/(\d+)/$")
# Extracts weekday, date, and time from the match report header cell.
kickoff_pattern = re.compile(r"(\w+day), (\d{1,2}\. \w+ \d{4})(\d{2}:\d{2})")

# Iterate through gameweeks 1 to 10
for num in range(1, 11):

    URL = f"https://www.worldfootball.net/schedule/fra-ligue-2-2018-2019-spieltag/{num}/"

    # re.search() returns None when the URL does not have the expected shape,
    # so fail with a clear message instead of an AttributeError on .group().
    url_match = schedule_url_pattern.search(URL)
    if url_match is None:
        raise ValueError(f"Unrecognised schedule URL format: {URL}")
    league = url_match.group(1)  # The league is in the first group of the regex match
    season = url_match.group(2)  # The season is in the second group of the regex match
    gw = url_match.group(3)  # The gameweek is in the third group of the regex match

    # Fetch the gameweek schedule page
    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find() returns None when the page has no id="site" container;
    # fall back to an empty list rather than crashing on .find_all().
    site = soup.find(id="site")
    results = site.find_all('td', align='center') if site is not None else []

    # Collect the match-report links: centred cells whose anchor href contains
    # "report". The hrefs are site-relative, so prefix them with base_url.
    urls = []
    for td in results:
        a_tag = td.find('a', href=True)
        if a_tag and 'report' in a_tag['href']:
            urls.append(base_url + a_tag['href'])

    # Loop through each URL in the list
    for url in urls:

        print(f"League: {league}")
        print(f"Season: {season}")
        print(f"Round: {gw}")

        # Fetch the match report page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raises requests.HTTPError on a 4xx/5xx response

        # Parse the content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        site = soup.find(id="site")
        results1 = site.find_all("th", align="center") if site is not None else []
        results2 = site.find_all("td", class_="dunkel") if site is not None else []

        # Centred header cells: index 0 carries the home team link, index 2
        # the away team link (index 1 is the date/time cell used below).
        for label, idx in (("Home team:", 0), ("Away team:", 2)):
            if idx < len(results1):
                a_tag = results1[idx].find('a', href=True)
                if a_tag:
                    print(label, a_tag['title'])

        # Only search the date cell if it exists; a report page with fewer
        # than two centred header cells would otherwise raise IndexError.
        kickoff_match = kickoff_pattern.search(results1[1].text) if len(results1) > 1 else None

        if kickoff_match:
            weekday = kickoff_match.group(1)  # Extracting the weekday
            date = kickoff_match.group(2)    # Extracting the date
            time = kickoff_match.group(3)    # Extracting the time

            print(f"Weekday: {weekday}")
            print(f"Date: {date}")
            print(f"Time: {time}")
        else:
            print("Pattern not found in the text.")

        # The attendance text is read from the "dunkel" cell that directly
        # follows (in the results2 list) the cell holding the img titled
        # "Attendance".
        attendance_td = None
        for i, td in enumerate(results2):
            if td.find('img', title='Attendance') is not None:
                # Only step to the next cell if one exists; the icon being in
                # the last matched cell would otherwise raise IndexError.
                if i + 1 < len(results2):
                    attendance_td = results2[i + 1]
                break

        if attendance_td is not None:
            attendance = attendance_td.get_text(strip=True)
            print(f"Attendance: {attendance}")
        else:
            print("Attendance not found.")

        print("\n" + "="*40 + "\n")  # To separate the results from different URLs


League: fra-ligue-2
Season: 2018-2019
Round: 1
Home team: AC Ajaccio
Away team: ESTAC Troyes
Weekday: Friday
Date: 27. July 2018
Time: 19:00
Attendance: without spectators.


League: fra-ligue-2
Season: 2018-2019
Round: 1
Home team: Gazélec FC Ajaccio
Away team: Paris FC
Weekday: Friday
Date: 27. July 2018
Time: 19:00
Attendance: 2.803


League: fra-ligue-2
Season: 2018-2019
Round: 1
Home team: Clermont Foot 63
Away team: LB Châteauroux
Weekday: Friday
Date: 27. July 2018
Time: 19:00
Attendance: 2.873


League: fra-ligue-2
Season: 2018-2019
Round: 1
Home team: Grenoble Foot 38
Away team: FC Sochaux
Weekday: Friday
Date: 27. July 2018
Time: 19:00
Attendance: 7.032


League: fra-ligue-2
Season: 2018-2019
Round: 1
Home team: AS Nancy Lorraine
Away team: AS Béziers
Weekday: Friday
Date: 27. July 2018
Time: 19:00
Attendance: 10.205


League: fra-ligue-2
Season: 2018-2019
Round: 1
Home team: US Orléans
Away team: RC Lens
Weekday: Friday
Date: 27. July 2018
Time: 19:00
Attendance: 5.000


Lea

League: fra-ligue-2
Season: 2018-2019
Round: 5
Home team: AC Ajaccio
Away team: FC Lorient
Weekday: Friday
Date: 24. August 2018
Time: 19:00
Attendance: 2.882


League: fra-ligue-2
Season: 2018-2019
Round: 5
Home team: Stade Brestois
Away team: Havre AC
Weekday: Friday
Date: 24. August 2018
Time: 19:00
Attendance: 7.056


League: fra-ligue-2
Season: 2018-2019
Round: 5
Home team: Clermont Foot 63
Away team: FC Sochaux
Weekday: Friday
Date: 24. August 2018
Time: 19:00
Attendance: 2.544


League: fra-ligue-2
Season: 2018-2019
Round: 5
Home team: Grenoble Foot 38
Away team: LB Châteauroux
Weekday: Friday
Date: 24. August 2018
Time: 19:00
Attendance: 6.522


League: fra-ligue-2
Season: 2018-2019
Round: 5
Home team: AS Nancy Lorraine
Away team: Chamois Niortais
Weekday: Friday
Date: 24. August 2018
Time: 19:00
Attendance: 9.096


League: fra-ligue-2
Season: 2018-2019
Round: 5
Home team: US Orléans
Away team: Paris FC
Weekday: Friday
Date: 24. August 2018
Time: 19:00
Attendance: 3.420


Leagu

League: fra-ligue-2
Season: 2018-2019
Round: 9
Home team: AC Ajaccio
Away team: Valenciennes FC
Weekday: Friday
Date: 28. September 2018
Time: 19:00
Attendance: 2.368


League: fra-ligue-2
Season: 2018-2019
Round: 9
Home team: AS Béziers
Away team: Chamois Niortais
Weekday: Friday
Date: 28. September 2018
Time: 19:00
Attendance: 2.439


League: fra-ligue-2
Season: 2018-2019
Round: 9
Home team: Stade Brestois
Away team: LB Châteauroux
Weekday: Friday
Date: 28. September 2018
Time: 19:00
Attendance: 6.469


League: fra-ligue-2
Season: 2018-2019
Round: 9
Home team: Clermont Foot 63
Away team: Gazélec FC Ajaccio
Weekday: Friday
Date: 28. September 2018
Time: 19:00
Attendance: 2.493


League: fra-ligue-2
Season: 2018-2019
Round: 9
Home team: RC Lens
Away team: Paris FC
Weekday: Friday
Date: 28. September 2018
Time: 19:00
Attendance: 26.027


League: fra-ligue-2
Season: 2018-2019
Round: 9
Home team: AS Nancy Lorraine
Away team: FC Sochaux
Weekday: Friday
Date: 28. September 2018
Time: 19:00


# Next, store data in dataframe