In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides library deprecation warnings;
# consider narrowing it to a specific category/module once the noisy source is known.
warnings.filterwarnings("ignore")

In [2]:
base_url = "https://www.worldfootball.net"
league = "fra-ligue-2"  # league slug as it appears in the schedule URL path
REQUEST_TIMEOUT = 30    # seconds; without a timeout requests.get can block forever

# Column order of the resulting DataFrame.
columns = ['League', 'Season', 'Round', 'Home Team', 'Away Team', 'Weekday', 'Date', 'Time', 'Attendance']


def fetch_site(url):
    """Download ``url`` and return the parsed element with ``id="site"``.

    Returns None (after printing the reason) when the request fails, the
    response status is not 200, or the page has no ``id="site"`` element, so
    callers can skip the page instead of crashing mid-scrape.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return None
    site = BeautifulSoup(response.content, 'html.parser').find(id="site")
    if site is None:
        print(f"Skipping {url}: no element with id='site'")
    return site


def parse_report(site):
    """Extract the match fields from a report page's ``id="site"`` element.

    Returns a dict with the keys 'Home Team', 'Away Team', 'Weekday', 'Date',
    'Time' and 'Attendance' (individual values are None when not found), or
    None when the page does not contain the three centered header cells the
    parser relies on.
    """
    header_cells = site.find_all("th", align="center")
    if len(header_cells) < 3:
        return None
    info_cells = site.find_all("td", class_="dunkel")

    # Header cells 0 and 2 carry the team links; cell 1 carries the kickoff text.
    home_link = header_cells[0].find('a', href=True)
    away_link = header_cells[2].find('a', href=True)

    kickoff = re.search(r"(\w+day), (\d{1,2}\. \w+ \d{4})(\d{2}:\d{2})", header_cells[1].text)
    weekday, date, time = kickoff.groups() if kickoff else (None, None, None)

    # The attendance value is the cell right after the one holding the
    # 'Attendance' icon; stopping one short of the end keeps i + 1 in range.
    attendance = None
    for i, td in enumerate(info_cells[:-1]):
        if td.find('img', title='Attendance'):
            attendance = info_cells[i + 1].get_text(strip=True)
            break

    return {
        'Home Team': home_link.get('title') if home_link else None,
        'Away Team': away_link.get('title') if away_link else None,
        'Weekday': weekday,
        'Date': date,
        'Time': time,
        'Attendance': attendance,
    }


# Collect one dict per match and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
records = []

# Iterate through seasons
for year in range(2010, 2011):
    season = f"{year}-{year+1}"

    # Iterate through numbers from 1 to 38 (for gameweeks)
    for num in range(1, 39):
        URL = f"{base_url}/schedule/{league}-{season}-spieltag/{num}/"
        gw = str(num)  # kept as text, as it was when parsed back out of the URL

        schedule = fetch_site(URL)
        if schedule is None:  # If not a successful request, skip this iteration
            continue

        # Keep only the centered cells whose link points at a match report.
        urls = []
        for td in schedule.find_all('td', align='center'):
            link = td.find('a', href=True)
            if link and 'report' in link['href']:
                urls.append(base_url + link['href'])

        for url in urls:
            report = fetch_site(url)
            if report is None:
                continue

            fields = parse_report(report)
            if fields is None:
                print(f"Skipping {url}: unexpected report layout")
                continue

            records.append({'League': league, 'Season': season, 'Round': gw, **fields})

        print(f"Season {season} Round {num} done.")

df = pd.DataFrame(records, columns=columns)
print("Dataframe is created.")

Season 2010-2011 Round 1 done.
Season 2010-2011 Round 2 done.
Season 2010-2011 Round 3 done.
Season 2010-2011 Round 4 done.
Season 2010-2011 Round 5 done.
Season 2010-2011 Round 6 done.
Season 2010-2011 Round 7 done.
Season 2010-2011 Round 8 done.
Season 2010-2011 Round 9 done.
Season 2010-2011 Round 10 done.
Season 2010-2011 Round 11 done.
Season 2010-2011 Round 12 done.
Season 2010-2011 Round 13 done.
Season 2010-2011 Round 14 done.
Season 2010-2011 Round 15 done.
Season 2010-2011 Round 16 done.
Season 2010-2011 Round 17 done.
Season 2010-2011 Round 18 done.
Season 2010-2011 Round 19 done.
Season 2010-2011 Round 20 done.
Season 2010-2011 Round 21 done.
Season 2010-2011 Round 22 done.
Season 2010-2011 Round 23 done.
Season 2010-2011 Round 24 done.
Season 2010-2011 Round 25 done.
Season 2010-2011 Round 26 done.
Season 2010-2011 Round 27 done.
Season 2010-2011 Round 28 done.
Season 2010-2011 Round 29 done.
Season 2010-2011 Round 30 done.
Season 2010-2011 Round 31 done.
Season 2010-2011 

In [3]:
# Scraped club name -> short club name. The short names are the ones that
# appear in the HomeTeam/AwayTeam columns of the odds CSV joined later on.
rename_teams_dict = {
    'AC Ajaccio': 'Ajaccio',
    'Nîmes Olympique': 'Nimes',
    'US Boulogne': 'Boulogne',
    'Clermont Foot Auvergne 63': 'Clermont',
    'LB Châteauroux': 'Chateauroux',
    'FC Istres': 'Istres',
    'Dijon FCO': 'Dijon',
    'Angers SCO': 'Angers',
    'Grenoble Foot 38': 'Grenoble',
    'Havre AC': 'Le Havre',
    'FC Metz': 'Metz',
    'Évian Thonon Gaillard': 'Evian Thonon Gaillard',
    'Stade Reims': 'Reims',
    'Stade Laval': 'Laval',
    'Tours FC': 'Tours',
    'CS Sedan': 'Sedan',
    'Vannes OC': 'Vannes',
    'ESTAC Troyes': 'Troyes',
    'FC Nantes': 'Nantes',
    'Le Mans FC': 'Le Mans',
}

# Build df1 as a new frame. The previous `df1 = df` was only an alias, so the
# column assignments that followed rewrote the scraped frame `df` as well.
# Every step below returns a new object, which leaves `df` untouched and makes
# the cell safe to re-run.
df1 = (
    df
    .assign(**{
        column: df[column].replace(rename_teams_dict)
        for column in ('Home Team', 'Away Team')
    })
    .rename(columns={'Home Team': 'HomeTeam', 'Away Team': 'AwayTeam'})
    # The scraped Date column is dropped here, so only team names remain as join keys.
    .drop(columns=['Date'])
)

df1

Unnamed: 0,League,Season,Round,HomeTeam,AwayTeam,Weekday,Time,Attendance
0,fra-ligue-2,2010-2011,1,Ajaccio,Nimes,Friday,19:00,5.000
1,fra-ligue-2,2010-2011,1,Boulogne,Clermont,Friday,19:00,7.963
2,fra-ligue-2,2010-2011,1,Chateauroux,Istres,Friday,19:00,4.993
3,fra-ligue-2,2010-2011,1,Dijon,Angers,Friday,19:00,5.111
4,fra-ligue-2,2010-2011,1,Grenoble,Le Havre,Friday,19:00,5.135
...,...,...,...,...,...,...,...,...
375,fra-ligue-2,2010-2011,38,Le Havre,Grenoble,Friday,19:30,6.943
376,fra-ligue-2,2010-2011,38,Le Mans,Nantes,Friday,19:30,23.572
377,fra-ligue-2,2010-2011,38,Nimes,Ajaccio,Friday,19:30,5.799
378,fra-ligue-2,2010-2011,38,Sedan,Tours,Friday,19:30,9.766


In [4]:
# Load the season's odds file and keep only the match keys plus the odds
# columns used further down; .loc with a list preserves the listed order.
df2 = pd.read_csv("data/fra-ligue-2-2010-2011.csv").loc[
    :,
    [
        'Date', 'HomeTeam', 'AwayTeam',
        'B365H', 'B365D', 'B365A',
        'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA',
        'BbMx>2.5', 'BbAv>2.5',
    ],
]
df2.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,B365H,B365D,B365A,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbMx>2.5,BbAv>2.5
0,06/08/10,Ajaccio,Nimes,2.15,3.1,3.6,2.25,2.1,3.1,2.98,3.9,3.57,2.49,2.32
1,06/08/10,Boulogne,Clermont,2.0,3.1,4.0,2.1,2.01,3.1,3.05,4.0,3.74,2.33,2.22
2,06/08/10,Chateauroux,Istres,2.0,3.1,4.0,2.08,1.96,3.1,3.05,4.2,3.95,2.34,2.26
3,06/08/10,Dijon,Angers,2.2,3.0,3.6,2.3,2.22,3.0,2.95,3.6,3.29,2.5,2.36
4,06/08/10,Grenoble,Le Havre,2.1,3.1,3.75,2.28,2.08,3.1,3.01,3.8,3.6,2.4,2.31
5,06/08/10,Metz,Evian Thonon Gaillard,1.8,3.3,4.75,2.13,2.02,3.2,3.07,4.0,3.85,2.38,2.26
6,06/08/10,Reims,Laval,2.4,3.0,3.1,2.5,2.37,3.05,2.95,3.3,3.02,2.35,2.28
7,06/08/10,Tours,Sedan,2.15,3.1,3.6,2.4,2.24,3.1,2.98,3.5,3.24,2.35,2.27
8,06/08/10,Vannes,Troyes,2.15,3.1,3.6,2.28,2.12,3.1,2.99,3.7,3.5,2.37,2.26
9,09/08/10,Nantes,Le Mans,2.55,3.0,2.9,2.75,2.6,3.1,2.94,2.93,2.7,2.36,2.23


In [5]:
# Join scraped match details with the odds on the (home, away) pairing.
# how="inner" is the pandas default, spelled out here: rows present in only
# one of the two frames are dropped from the result.
merged_df = df1.merge(df2, how="inner", on=['HomeTeam', 'AwayTeam'])
merged_df

Unnamed: 0,League,Season,Round,HomeTeam,AwayTeam,Weekday,Time,Attendance,Date,B365H,B365D,B365A,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbMx>2.5,BbAv>2.5
0,fra-ligue-2,2010-2011,1,Ajaccio,Nimes,Friday,19:00,5.000,06/08/10,2.15,3.10,3.60,2.25,2.10,3.10,2.98,3.90,3.57,2.49,2.32
1,fra-ligue-2,2010-2011,1,Boulogne,Clermont,Friday,19:00,7.963,06/08/10,2.00,3.10,4.00,2.10,2.01,3.10,3.05,4.00,3.74,2.33,2.22
2,fra-ligue-2,2010-2011,1,Chateauroux,Istres,Friday,19:00,4.993,06/08/10,2.00,3.10,4.00,2.08,1.96,3.10,3.05,4.20,3.95,2.34,2.26
3,fra-ligue-2,2010-2011,1,Dijon,Angers,Friday,19:00,5.111,06/08/10,2.20,3.00,3.60,2.30,2.22,3.00,2.95,3.60,3.29,2.50,2.36
4,fra-ligue-2,2010-2011,1,Grenoble,Le Havre,Friday,19:00,5.135,06/08/10,2.10,3.10,3.75,2.28,2.08,3.10,3.01,3.80,3.60,2.40,2.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,fra-ligue-2,2010-2011,38,Le Havre,Grenoble,Friday,19:30,6.943,27/05/11,1.80,3.50,4.50,1.80,1.74,3.96,3.48,4.94,4.47,1.70,1.62
376,fra-ligue-2,2010-2011,38,Le Mans,Nantes,Friday,19:30,23.572,27/05/11,1.33,4.75,10.00,1.36,1.32,5.52,4.70,10.34,8.77,1.70,1.59
377,fra-ligue-2,2010-2011,38,Nimes,Ajaccio,Friday,19:30,5.799,27/05/11,11.00,5.00,1.29,11.31,9.49,6.02,5.07,1.32,1.28,1.63,1.57
378,fra-ligue-2,2010-2011,38,Sedan,Tours,Friday,19:30,9.766,27/05/11,1.62,4.00,5.00,1.73,1.58,4.35,3.88,6.00,5.14,1.63,1.54
