# Web Scrapping from a Public Football Website using BeautifulSoup
website:  www.worldfootball.net
Description: Football scores, Leaderboard etc

We begin loading libraries

In [1]:
# loading libraries
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re

headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

### Parse the url and then scrap to a DataFrame

In [2]:
# Loop to dataframe
# We are interested in the fisrt 16 matchdays
for i in range(1,int(17)):
    url="https://www.worldfootball.net/schedule/eng-premier-league-2022-2023-spieltag/" + str(i)
    req = requests.get(url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    
    lst=[]
    table_body=soup.find('table',{'class':'standard_tabelle'})
    rows = table_body.find_all('tr')
    for row in rows:
        cols=row.find_all('td')
        for col in cols:
            lst.append(col.text.strip())
        lst[-1]=i
    if i==1:
        EPL_data=pd.DataFrame(np.array_split(lst,len(lst)/7))[[0,1,2,4,5,6]]
    else:
        EPL_data=pd.concat([EPL_data,pd.DataFrame(np.array_split(lst,len(lst)/7))[[0,1,2,4,5,6]]])

In [3]:
#View dataframe
Raw_EPL_2022_23 = EPL_data
Raw_EPL_2022_23

Unnamed: 0,0,1,2,4,5,6
0,05/08/2022,20:00,Crystal Palace,Arsenal FC,0:2 (0:1),1
1,06/08/2022,12:30,Fulham FC,Liverpool FC,2:2 (1:0),1
2,,15:00,AFC Bournemouth,Aston Villa,2:0 (1:0),1
3,,15:00,Leeds United,Wolverhampton Wanderers,2:1 (1:1),1
4,,15:00,Newcastle United,Nottingham Forest,2:0 (0:0),1
...,...,...,...,...,...,...
5,,15:00,West Ham United,Leicester City,0:2 (0:1),16
6,,17:30,Newcastle United,Chelsea FC,1:0 (0:0),16
7,,19:45,Wolverhampton Wanderers,Arsenal FC,0:2 (0:0),16
8,13/11/2022,14:00,Brighton & Hove Albion,Aston Villa,1:2 (1:1),16


### Data Cleaning

In [4]:
# Fill down date
EPL_2022_23=EPL_data.replace({'':np.NaN})
EPL_2022_23[0].ffill(inplace=True)
EPL_2022_23.reset_index(drop=True, inplace=True)

# rename columns
EPL_2022_23.columns=['Date','Time','Home Team','Away Team', 'FT HT','MatchDay']
EPL_2022_23

Unnamed: 0,Date,Time,Home Team,Away Team,FT HT,MatchDay
0,05/08/2022,20:00,Crystal Palace,Arsenal FC,0:2 (0:1),1
1,06/08/2022,12:30,Fulham FC,Liverpool FC,2:2 (1:0),1
2,06/08/2022,15:00,AFC Bournemouth,Aston Villa,2:0 (1:0),1
3,06/08/2022,15:00,Leeds United,Wolverhampton Wanderers,2:1 (1:1),1
4,06/08/2022,15:00,Newcastle United,Nottingham Forest,2:0 (0:0),1
...,...,...,...,...,...,...
155,12/11/2022,15:00,West Ham United,Leicester City,0:2 (0:1),16
156,12/11/2022,17:30,Newcastle United,Chelsea FC,1:0 (0:0),16
157,12/11/2022,19:45,Wolverhampton Wanderers,Arsenal FC,0:2 (0:0),16
158,13/11/2022,14:00,Brighton & Hove Albion,Aston Villa,1:2 (1:1),16


In [7]:
# Expand columns_5

EPL_Expand=EPL_2022_23['FT HT'].str.split(' ',expand=True)
EPL_Expand.columns=['FT','HT']
EPL_Expand=EPL_Expand.join(EPL_Expand['FT'].str.split(':',expand=True))
EPL_Expand.columns=['FT','HT','GF','GA']
EPL_Expand['HT']=EPL_Expand['HT'].str.extract('(\d+:\d+)')
EPL_Expand=EPL_Expand.join(EPL_Expand['HT'].str.split(':',expand=True))
EPL_Expand.columns=['FT','HT','GF','GA','HGF','HGA']
EPL_Expand

Unnamed: 0,FT,HT,GF,GA,HGF,HGA
0,0:2,0:1,0,2,0,1
1,2:2,1:0,2,2,1,0
2,2:0,1:0,2,0,1,0
3,2:1,1:1,2,1,1,1
4,2:0,0:0,2,0,0,0
...,...,...,...,...,...,...
155,0:2,0:1,0,2,0,1
156,1:0,0:0,1,0,0,0
157,0:2,0:0,0,2,0,0
158,1:2,1:1,1,2,1,1


In [8]:
# join dataframes
final_epl_dataset = EPL_2022_23.join(EPL_Expand)

In [9]:
final_epl_dataset

Unnamed: 0,Date,Time,Home Team,Away Team,FT HT,MatchDay,FT,HT,GF,GA,HGF,HGA
0,05/08/2022,20:00,Crystal Palace,Arsenal FC,0:2 (0:1),1,0:2,0:1,0,2,0,1
1,06/08/2022,12:30,Fulham FC,Liverpool FC,2:2 (1:0),1,2:2,1:0,2,2,1,0
2,06/08/2022,15:00,AFC Bournemouth,Aston Villa,2:0 (1:0),1,2:0,1:0,2,0,1,0
3,06/08/2022,15:00,Leeds United,Wolverhampton Wanderers,2:1 (1:1),1,2:1,1:1,2,1,1,1
4,06/08/2022,15:00,Newcastle United,Nottingham Forest,2:0 (0:0),1,2:0,0:0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
155,12/11/2022,15:00,West Ham United,Leicester City,0:2 (0:1),16,0:2,0:1,0,2,0,1
156,12/11/2022,17:30,Newcastle United,Chelsea FC,1:0 (0:0),16,1:0,0:0,1,0,0,0
157,12/11/2022,19:45,Wolverhampton Wanderers,Arsenal FC,0:2 (0:0),16,0:2,0:0,0,2,0,0
158,13/11/2022,14:00,Brighton & Hove Albion,Aston Villa,1:2 (1:1),16,1:2,1:1,1,2,1,1


In [10]:
# save dataset
final_epl_dataset.to_csv('EPL 2022_23 Scrapped.csv', index=False)