# Transfermarkt Scraper

In [1]:
from urllib.request import *
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
import requests
import json
import os
%matplotlib inline

In [2]:
# Root URL of the Transfermarkt site; the scraper functions append page paths to it.
base_url = "https://www.transfermarkt.com"

In [3]:
def _parse_market_value(text):
    """Convert a Transfermarkt value string into a number of euros.

    Values are rendered as ``"<amount> <unit> ..."`` with a decimal comma,
    e.g. ``"5,79 Bill. €"`` or ``"799,53 Mill. €"``.

    Args:
        text: Raw text of the value cell.

    Returns:
        The value in euros as a float, or ``0`` when the text carries no
        amount/unit pair or uses a unit other than "Bill"/"Mill".
    """
    parts = text.replace(",", ".").split()
    if len(parts) < 2:
        # No unit token (e.g. "-"); indexing parts[1] would raise IndexError.
        return 0
    amount, unit = parts[0], parts[1]
    for marker, factor in (("Bill", 10**9), ("Mill", 10**6)):
        if marker in unit:
            try:
                return float(amount) * factor
            except ValueError:
                return 0
    return 0


def parseContinent(ref, min_value=200 * 10**6):
    """Scrape the leagues listed on a Transfermarkt competition overview page.

    Args:
        ref: Page path relative to ``base_url``, with or without a leading
            slash (e.g. ``"wettbewerbe/europa"``).
        min_value: Only leagues whose total market value is strictly greater
            than this many euros are returned. Defaults to 200 million.

    Returns:
        A list of dicts with the keys ``"href"``, ``"name"``, ``"country"``
        and ``"tot_value"`` (euros). Rows with class ``odd`` come first,
        followed by rows with class ``even``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``items`` table.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + ref.lstrip("/")

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    response = BeautifulSoup(r.text, 'html.parser')

    table = response.find("table", {"class": "items"})
    if table is None:
        raise ValueError("no 'items' table found at " + url)

    # Attribute filters must be dicts; a set literal such as {"class", "odd"}
    # only works by accident because bs4 treats non-dict attrs as class names.
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})

    leagues = []
    for row in rows:
        value_cell = row.find("td", {"class": "rechts hauptlink"})
        if value_cell is None:
            # Row without a total-value column: nothing to compare against.
            continue
        val = _parse_market_value(value_cell.text)
        if val <= min_value:
            continue
        leagues.append({
            "href": row.find_all('a')[1]['href'],
            "name": row.find("img")["title"],
            "country": row.find("td", {"class": "zentriert"}).find("img")["title"],
            "tot_value": val,
        })

    return leagues

In [4]:
def _stadium_name(stadium_info):
    """Extract the stadium name from the club header's stadium field.

    Non-breaking spaces and newlines are removed, then everything before the
    first digit is returned (the digits presumably start the seat count, so
    a stadium name that itself contains a digit is cut short).
    """
    cleaned = stadium_info.replace(u'\xa0', u'').replace(u'\n', u'')
    return re.split(r'(\d+)', cleaned)[0]


def parse_league(league_ref):
    """Scrape the clubs of one league, including each club's stadium name.

    One extra request is made per club to read the stadium from the club's
    own page.

    Args:
        league_ref: League page path relative to ``base_url``, with or
            without a leading slash
            (e.g. ``"/premier-league/startseite/wettbewerb/GB1"``).

    Returns:
        A list of dicts with the keys ``"name"``, ``"href"``, ``"squad"``
        (text of the club-link cell), ``"market_value"`` (raw cell text) and
        ``"stadium"``. ``"market_value"`` and ``"stadium"`` are ``None`` when
        the page does not contain them. Rows with class ``odd`` come first,
        followed by rows with class ``even``.

    Raises:
        requests.HTTPError: If the league page or a club page answers with
            an error status.
        ValueError: If the league page contains no ``items`` table.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + league_ref.lstrip("/")

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    response = BeautifulSoup(r.text, 'html.parser')

    table = response.find("table", {"class": "items"})
    if table is None:
        raise ValueError("no 'items' table found at " + url)

    # Attribute filters must be dicts; a set literal such as {"class", "odd"}
    # only works by accident because bs4 treats non-dict attrs as class names.
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})

    teams = []
    for row in rows:
        # The second centred cell holds the club link; look it up once.
        centered = row.find_all("td", {"class": "zentriert"})
        link = centered[1].find('a') if len(centered) > 1 else None
        if link is None:
            continue

        r_team = requests.get(base_url + link['href'], headers=HEADERS, timeout=30)
        r_team.raise_for_status()
        response_team = BeautifulSoup(r_team.text, 'html.parser')

        main = response_team.find("div", {"id": "main"})
        data_values = main.find_all("span", {"class": "dataValue"}) if main is not None else []
        # The stadium field is the fifth "dataValue" span; the seat count
        # that follows the name is not parsed.
        stadium = _stadium_name(data_values[4].text) if len(data_values) > 4 else None

        value_cell = row.find("td", {"class": "rechts show-for-small show-for-pad nowrap"})

        teams.append({
            "name": link['title'],
            "href": link['href'],
            "squad": centered[1].text,
            "market_value": value_cell.text if value_cell is not None else None,
            "stadium": stadium,
        })

    return teams

    

In [5]:
# Gather the clubs of both leagues into one flat list, Premier League first.
teams = [
    team
    for league_path in ('/premier-league/startseite/wettbewerb/GB1',
                        '/serie-a/startseite/wettbewerb/IT1')
    for team in parse_league(league_path)
]


In [6]:
# Show the scraped team records (a bare last expression is rendered as the cell output).
teams

[{'href': '/chelsea-fc/kader/verein/631/saison_id/2017',
  'market_value': '631,90 Mill. €',
  'name': 'Chelsea FC',
  'squad': '26',
  'stadium': 'Stamford Bridge'},
 {'href': '/manchester-united/kader/verein/985/saison_id/2017',
  'market_value': '592,75 Mill. €',
  'name': 'Manchester United',
  'squad': '25',
  'stadium': 'Old Trafford'},
 {'href': '/liverpool-fc/kader/verein/31/saison_id/2017',
  'market_value': '514,50 Mill. €',
  'name': 'Liverpool FC',
  'squad': '29',
  'stadium': 'Anfield'},
 {'href': '/everton-fc/kader/verein/29/saison_id/2017',
  'market_value': '335,25 Mill. €',
  'name': 'Everton FC',
  'squad': '31',
  'stadium': 'Goodison Park'},
 {'href': '/leicester-city/kader/verein/1003/saison_id/2017',
  'market_value': '243,50 Mill. €',
  'name': 'Leicester City',
  'squad': '26',
  'stadium': 'King Power Stadium'},
 {'href': '/crystal-palace/kader/verein/873/saison_id/2017',
  'market_value': '190,60 Mill. €',
  'name': 'Crystal Palace',
  'squad': '27',
  'stadi

In [47]:
# Collect the leagues of Europe, America and Asia into one flat list, in that order.
leagues = [
    league
    for continent_path in ("wettbewerbe/europa",
                           "wettbewerbe/amerika",
                           "wettbewerbe/asien")
    for league in parseContinent(continent_path)
]

In [48]:
# Persist the scraped leagues as JSON. The output directory is created first
# so the cell also runs on a fresh checkout where "data/" does not exist yet
# (open(..., "w") does not create missing parent directories).
os.makedirs("data", exist_ok=True)
with open("data/leagues.json", "w") as out:
    json.dump(leagues, out)

In [10]:
def _info_cell(info_html, label):
    """Return the first ``<td>`` that follows ``label`` in the info-table HTML.

    Args:
        info_html: HTML source of the player's info table as a string.
        label: Row label to look for, e.g. ``"Position"``.

    Returns:
        The matching tag, or ``None`` when the label or a following cell is
        absent.
    """
    _, found, tail = info_html.partition(label)
    if not found:
        return None
    return BeautifulSoup(tail, 'html.parser').find("td")


def parsePlayer(player_ref):
    """Scrape a player's profile data and transfer history.

    Fields that are missing from the page (for example the shirt number, or
    the whole transfer-history box) yield ``None`` values or an empty
    transfer list instead of an exception.

    Args:
        player_ref: Profile path relative to ``base_url``, with or without a
            leading slash (e.g. ``"ivan-rakitic/profil/spieler/32467"``).

    Returns:
        A tuple ``(player, transfers)``. ``player`` is a dict with the keys
        ``"href"``, ``"number"``, ``"name"``, ``"player_id"``,
        ``"position"``, ``"birthdate"``, ``"nationality"`` and
        ``"current_club"``. ``transfers`` is a list of dicts with the keys
        ``"player"``, ``"date"``, ``"from"``, ``"to"`` and ``"fee"``;
        ``"from"``/``"to"`` are the ``id`` attributes of the club links and
        ``"fee"`` is the raw cell text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    # Accept refs with or without a leading slash so hrefs scraped from other
    # pages can be passed in directly without producing a "//" in the URL.
    url = base_url + "/" + player_ref.lstrip("/")

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    response = BeautifulSoup(r.text, 'html.parser')

    # The id is the last path segment of the profile reference.
    player_id = player_ref.rstrip("/").split("/")[-1]

    # When the table is absent this is the string "None"; no label matches
    # then and every table-derived field ends up as None.
    playerInfos = str(response.find("table", {"class": "auflistung"}))

    number_tag = response.find("span", {"class": "dataRN"})
    name_tag = response.find("h1", {"itemprop": "name"})
    position_cell = _info_cell(playerInfos, "Position")
    birth_cell = _info_cell(playerInfos, "Date of birth")
    nationality_cell = _info_cell(playerInfos, "Nationality")
    club_cell = _info_cell(playerInfos, "Current club")

    flag = nationality_cell.find("img") if nationality_cell is not None else None
    club_links = club_cell.find_all("a") if club_cell is not None else []

    position = None
    if position_cell is not None:
        # Keep only letters, spaces and hyphens, dropping layout whitespace
        # such as newlines around the position text.
        position = "".join(re.findall("[a-zA-Z -]", position_cell.text))

    player = {
        "href": player_ref,
        "number": number_tag.text if number_tag is not None else None,
        "name": name_tag.text if name_tag is not None else None,
        "player_id": player_id,
        "position": position,
        "birthdate": birth_cell.text if birth_cell is not None else None,
        "nationality": flag.get("title") if flag is not None else None,
        "current_club": club_links[-1].text if club_links else None,
    }

    # Walk down to the transfer rows, tolerating a missing level at any step.
    box = response.find("div", {"class": "box transferhistorie"})
    table = box.find("table") if box is not None else None
    tbody = table.find("tbody") if table is not None else None
    trans = tbody.find_all("tr", {"class": "zeile-transfer"}) if tbody is not None else []

    transfers = []
    for t in trans:
        dates = t.find_all("td", {"class": "zentriert hide-for-small"})
        clubs = t.find_all("td", {"class": "no-border-rechts vereinswappen"})
        fee = t.find("td", {"class": "zelle-mw"})
        if len(dates) < 2 or len(clubs) < 2 or fee is None:
            # Row does not have the expected transfer layout.
            continue
        from_link = clubs[0].find("a")
        to_link = clubs[1].find("a")
        if from_link is None or to_link is None:
            continue
        transfers.append({
            "player": player_id,
            "date": dates[1].text,
            "from": from_link.get("id"),
            "to": to_link.get("id"),
            "fee": fee.text,
        })

    return player, transfers

In [11]:
# Try the player parser on one profile; the path is relative to base_url
# and is passed without a leading slash.
href = "ivan-rakitic/profil/spieler/32467"
player, transfers = parsePlayer(href)

In [12]:
# Show the profile dict returned by parsePlayer.
player

{'birthdate': 'Mar 10, 1988 ',
 'current_club': 'FC Barcelona',
 'href': 'ivan-rakitic/profil/spieler/32467',
 'name': 'Ivan Rakitic',
 'nationality': 'Croatia',
 'number': '#4',
 'player_id': '32467',
 'position': 'Midfield - Central Midfield'}

In [23]:
# Show the transfer records returned by parsePlayer.
transfers

[{'date': 'Jul 1, 2014',
  'fee': '20,00 Mill. €',
  'from': '368',
  'player': '32467',
  'to': '131'},
 {'date': 'Jan 28, 2011',
  'fee': '9,00 Mill. €',
  'from': '33',
  'player': '32467',
  'to': '368'},
 {'date': 'Jul 1, 2007',
  'fee': '5,00 Mill. €',
  'from': '26',
  'player': '32467',
  'to': '33'},
 {'date': 'Jul 1, 2005',
  'fee': '-',
  'from': '5299',
  'player': '32467',
  'to': '26'},
 {'date': 'Jul 1, 2004',
  'fee': '-',
  'from': '14322',
  'player': '32467',
  'to': '5299'}]

In [14]:
# Check the container type of the transfer history.
type(transfers)

list

In [49]:
# Show the leagues collected by the parseContinent calls.
leagues

[{'country': 'England',
  'href': '/premier-league/startseite/wettbewerb/GB1',
  'name': 'Premier League',
  'tot_value': 5790000000.0},
 {'country': 'Italy',
  'href': '/serie-a/startseite/wettbewerb/IT1',
  'name': 'Serie A',
  'tot_value': 3160000000.0},
 {'country': 'France',
  'href': '/ligue-1/startseite/wettbewerb/FR1',
  'name': 'Ligue 1',
  'tot_value': 2080000000.0},
 {'country': 'Portugal',
  'href': '/liga-nos/startseite/wettbewerb/PO1',
  'name': 'Liga NOS',
  'tot_value': 799530000.0},
 {'country': 'Netherlands',
  'href': '/eredivisie/startseite/wettbewerb/NL1',
  'name': 'Eredivisie',
  'tot_value': 611600000.0},
 {'country': 'Greece',
  'href': '/super-league/startseite/wettbewerb/GR1',
  'name': 'Super League',
  'tot_value': 313880000.0},
 {'country': 'Switzerland',
  'href': '/raiffeisen-super-league/startseite/wettbewerb/C1',
  'name': 'Raiffeisen Super League',
  'tot_value': 207050000.0},
 {'country': 'Spain',
  'href': '/laliga/startseite/wettbewerb/ES1',
  'nam