<a href="https://colab.research.google.com/github/mateuszandzelak01/football_web_scraping/blob/main/football.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Transfermarkt PKO Ekstraklasa overview page, requested with saison_id=2021.
url = 'https://www.transfermarkt.pl/pko-ekstraklasa/startseite/wettbewerb/PL1/plus/?saison_id=2021'
# A browser-like User-Agent header is sent with the request.
headers = {"User-Agent":"Mozilla/5.0"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept
# to preserve existing behaviour, but confirm it is really required.
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, verify=False, timeout=30)
# Fail here, with the HTTP status, instead of later on a missing table.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# First <table> element carrying the CSS class "items" (None when absent).
table = soup.find('table', class_='items')

In [None]:
# Every <tr> element of the table, in document order.
row = table.find_all('tr')

## **LECH POZNAŃ EXAMPLE**

In [None]:
# Concatenated text of the third <tr>; shown as the cell output.
row[2].get_text()

'\nLech Poznań Lech Poznań3225,2151,03 mln €32,80 mln €1,03 mln €32,80 mln €'

## **Read the whole table with a loop**

In [None]:
# One list per scraped column; every accepted table row adds one entry to each.
team = []
squad = []
age = []
foreigners = []
total_market_value = []
average_market_value = []

# A separate loop variable keeps the `row` list built earlier intact.
for tr in table.find_all('tr'):
    col = tr.find_all('td')
    # Cells 2..7 are read below, so a row needs at least eight <td> cells.
    # Shorter rows (presumably the header row, which has none) are skipped as a
    # whole; this keeps all six lists the same length.
    if len(col) < 8:
        continue
    team.append(col[2].text)
    squad.append(col[3].text)
    age.append(col[4].text)
    foreigners.append(col[5].text)
    # Cell 6 is the per-player average and cell 7 the squad total: for a squad
    # of 32 the page shows "1,03 mln €" in cell 6 and "32,80 mln €" in cell 7.
    average_market_value.append(col[6].text)
    total_market_value.append(col[7].text)

## **BUILD DATAFRAME**

In [None]:
# Column label -> scraped list, in the order the columns should appear.
scraped_columns = {
    'Team': team,
    'Squad': squad,
    'Age': age,
    'Foreigners': foreigners,
    'Total Value': total_market_value,
    'Average value': average_market_value,
}

# The first entry of every list is dropped before building the frame
# (presumably a non-club summary row — TODO confirm against the page).
df_ekstraklasa = pd.DataFrame(
    {label: values[1:] for label, values in scraped_columns.items()}
)

In [None]:
# Last expression of the cell: the notebook renders the frame as a rich table
# instead of the plain-text dump produced by print().
df_ekstraklasa

              Team Squad   Age Foreigners Total Value Average value
0      Lech Poznań    32  25,2         15  1,03 mln €   32,80 mln €
1   Legia Warszawa    31  25,0         15  894 tys. €   27,73 mln €
2   Pogoń Szczecin    29  25,8          8  852 tys. €   24,70 mln €
3            Raków    30  25,1         17  645 tys. €   19,35 mln €
4    Lechia Gdańsk    29  25,0         14  616 tys. €   17,85 mln €
5      Jagiellonia    38  24,5         11  384 tys. €   14,60 mln €
6   Zagłębie Lubin    31  24,3          9  440 tys. €   13,63 mln €
7    Górnik Zabrze    27  24,5         10  468 tys. €   12,63 mln €
8         Cracovia    36  23,8         17  340 tys. €   12,25 mln €
9    Śląsk Wrocław    32  25,8          9  373 tys. €   11,95 mln €
10    Wisła Kraków    30  25,7         12  397 tys. €   11,90 mln €
11   Piast Gliwice    27  26,8         10  434 tys. €   11,73 mln €
12     Wisła Płock    29  25,4          8  363 tys. €   10,53 mln €
13        Radomiak    29  26,3         11  282 t

## **SEASONS 2012–2018**

In [None]:
# One empty placeholder frame per season id 2012..2018, keyed 'df_pl_<season>'.
dct_pl = {'df_pl_%s' % season: pd.DataFrame() for season in range(2012, 2019)}

## **List with our needed URLs for each league**

In [None]:
# Base URL per league; the season id is appended to it when scraping.
league_urls = [
    'https://www.transfermarkt.pl/pko-ekstraklasa/startseite/wettbewerb/PL1/plus/?saison_id=',
]

## **Scraping all data**

In [None]:
def _cells_by_index(container, indices):
    """Return one list of cell texts per entry of ``indices``.

    Every ``<tr>`` inside ``container`` is inspected and contributes the
    ``.text`` of its ``<td>`` cells at the requested positions. A row with too
    few ``<td>`` cells is skipped as a whole, so the returned lists always have
    the same length.
    """
    needed = max(indices) + 1
    columns = [[] for _ in indices]
    for tr in container.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < needed:
            continue
        for values, idx in zip(columns, indices):
            values.append(cells[idx].text)
    return columns


def _clean_names(names):
    """Drop newlines and non-breaking spaces from each name and trim it."""
    return [elem.replace('\n','').replace('\xa0','').strip() for elem in names]


# Request headers do not change between pages, so they are built once.
headers = {"User-Agent":"Mozilla/5.0"}

#Store all dictionaries in a list
dct_all = [dct_pl]

# Each league URL is paired with the dictionary that receives its season frames.
for league_url, season_frames in zip(league_urls, dct_all):
    for n in range(2012,2019):
        url = league_url + str(n)
        # NOTE(review): verify=False turns off TLS certificate verification; it
        # is kept to preserve existing behaviour, but confirm it is required.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        # Fail with the HTTP status instead of later on a missing table.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        #Table 1 with information about the value
        table = soup.find("table", {"class" : "items"})

        # Cell 6 holds the per-player average and cell 7 the squad total, so
        # the two value lists are unpacked in that order.
        (team, squad, age, foreigners,
         average_market_value, total_market_value) = _cells_by_index(table, [2, 3, 4, 5, 6, 7])
        team = _clean_names(team)

        #Table 2 with information about placement, goals and points
        table2 = soup.findAll("div", {"class" : "responsive-table"})

        #Sometimes the information you need is in another table
        standings = table2[1] if len(table2) <= 2 else table2[2]

        team2, place, matches, difference, pts = _cells_by_index(standings, [2, 0, 3, 4, 5])
        team2 = _clean_names(team2)

        # The first entry of every table-1 list is dropped
        # (presumably a non-club summary row — TODO confirm against the page).
        df_soccer1 = pd.DataFrame({'Team': team[1:], 'Season': n, 'Squad': squad[1:], 'Age': age[1:], 'Foreigners': foreigners[1:],
                                   'Total Value': total_market_value[1:], 'Average value': average_market_value[1:]})

        df_soccer2 = pd.DataFrame({'Team': team2, 'Place': place, 'Matches': matches, 'Difference': difference,
                                   'Points': pts})

        #Merge df_soccer1 and df_soccer2 for each season
        season_frames['df_pl_%s' % n] = pd.merge(df_soccer1, df_soccer2, how="inner", on="Team")