In [1]:
# libraries and stuff
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv
import os
import time

In [2]:
def get_mv(page):
    """ Get the market value table from transfermarkt
    
    Args:
        page::[str]
            the url of the page that contains the table
    Returns:
        season::[pandas dataframe]
            the market value table
    """
    # get the html for the page
    headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    tree = requests.get(page, headers=headers)
    soup = BeautifulSoup(tree.content, 'html.parser')
    
    # get the content of the market_value_table
    tables = soup.findAll("tbody")
    mv_table = tables[1]
    
    rows = mv_table.findAll('tr')
    season = pd.DataFrame()
    col_list = ['team', 'squad', 'age', 'foreigners', 'total_market_value']

    for i in range(len(rows)):
        fields = rows[i].findAll('td')
        temp_list = []
        for field in fields:
            a = field.text.strip().encode()
            text = a.decode("utf-8")
            temp_list.append(text)
        season = season.append(pd.DataFrame(np.array(temp_list[2:7]).reshape(1, -1), columns=[col_list]))

    season = season.reset_index(drop=True)
    return season

In [3]:
# dataframe to store the final result with all season
mv_final = pd.DataFrame()

# get data for pl season 2005 to 2019
for i in range(2005, 2020):
    url = 'https://www.transfermarkt.co.uk/premier-league/startseite/wettbewerb/GB1/plus/?saison_id={}'.format(i)
    temp_df = get_mv(url)
    temp_df['season'] = i
    mv_final = mv_final.append(temp_df)
    time.sleep(10)
    
mv_final = mv_final.reset_index(drop=True)

In [6]:
def get_standings(page):
    """ Get the standings for a particular season from transfermarkt
    
    Args:
        page::[str]
            the url of the page that contains the table
    Returns:
        season::[pandas dataframe]
            the standings
    """
    # get the html for the page
    headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    tree = requests.get(page, headers=headers)
    soup = BeautifulSoup(tree.content, 'html.parser')
    
    # get the content of the market_value_table
    tables = soup.findAll("tbody")
    standings_table = tables[3]
    
    rows = standings_table.findAll('tr')
    season = pd.DataFrame()
    col_list = ['position', 'useless', 'team', 'matches', 'gd', 'pts']

    for i in range(len(rows)):
        fields = rows[i].findAll('td')
        temp_list = []
        for field in fields:
            a = field.text.strip().encode()
            text = a.decode("utf-8")
            temp_list.append(text)
        season = season.append(pd.DataFrame(np.array(temp_list).reshape(1, -1), columns=[col_list]))

    season = season.reset_index(drop=True)
    return season

In [7]:
standings_final = pd.DataFrame()

# get data for pl season 2005 to 2019
for i in range(2005, 2020):
    url = 'https://www.transfermarkt.co.uk/premier-league/startseite/wettbewerb/GB1/plus/?saison_id={}'.format(i)
    temp_df = get_standings(url)
    temp_df['season'] = i
    standings_final = standings_final.append(temp_df)
    time.sleep(10)

standings_final.drop('useless', axis=1, inplace=True)
standings_final = standings_final.reset_index(drop=True)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [8]:
# export to csv
if not os.path.isdir('data'):
    os.mkdir('data')

standings_final.to_csv("data/pl_standings.csv", index=False)
mv_final.to_csv("data/pl_mv.csv", index=False)

In [9]:
print(standings_final.shape)
print(mv_final.shape)

(300, 6)
(300, 6)
