# Transfermarkt Scraper

In [1]:
from urllib.request import *
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
import requests
import json
import os
import time
%matplotlib inline

In [2]:
# Root URL of the Transfermarkt site; the scraping functions below build
# request URLs by appending site-relative paths to it.
base_url = "https://www.transfermarkt.com"

In [3]:
def _parse_market_value(text):
    """Convert a market-value label such as "5,79 Bill." into a number.

    The decimal comma is replaced by a decimal point, the first
    whitespace-separated token is read as the amount and the second one as
    the unit ("Bill..." or "Mill...").

    Returns:
        The amount scaled by 10**9 or 10**6, or 0 when the label has no unit
        token or the unit is not recognised.
    """
    tokens = text.replace(",", ".").split()
    if len(tokens) < 2:
        return 0
    amount, unit = tokens[0], tokens[1]
    if "Bill" in unit:
        return float(amount) * 10**9
    if "Mill" in unit:
        return float(amount) * 10**6
    return 0


def parseContinent(ref, min_value=200 * 10**6, timeout=30):
    """Return the leagues listed on a continent page above a value threshold.

    Downloads ``base_url + "/" + ref``, reads every "odd" and "even" row of
    the table with class "items" and keeps the rows whose total market value
    is strictly greater than ``min_value``.

    Args:
        ref: Site-relative path of the continent page, without a leading
            slash (e.g. "wettbewerbe/europa").
        min_value: Leagues must be worth more than this to be returned.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys "href", "name", "country" and
        "tot_value". All "odd" rows come first, followed by all "even" rows.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + ref

    r = requests.get(url, headers=HEADERS, timeout=timeout)
    response = BeautifulSoup(r.text, 'html.parser')

    # Look the table up once; attribute filters must be dicts, not sets.
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})

    leagues = []
    for row in rows:
        value_cell = row.find("td", {"class": "rechts hauptlink"})
        if value_cell is None:
            # Row without a market-value cell: nothing to compare against.
            continue
        val = _parse_market_value(value_cell.text)
        if val > min_value:
            league = {}
            # The second link of the row points to the league page.
            league["href"] = row.find_all('a')[1]['href']
            league["name"] = row.find("img")["title"]
            league["country"] = row.find("td", {"class": "zentriert"}).find("img")["title"]
            league["tot_value"] = val
            leagues.append(league)

    return leagues

In [4]:
def getPlayers(club_page):
    """Extract the squad listed on a parsed club page.

    Reads the direct ``tr`` children of the ``tbody`` of the table with class
    "items" inside the element with id "yw1". For every row the link with
    class "spielprofil_tooltip" supplies the player's data.

    Args:
        club_page: Parsed club page exposing ``find``/``find_all``
            (a BeautifulSoup document in this notebook).

    Returns:
        A list of dicts with the keys "name", "id" and "href", one per row
        that contains a player profile link, in table order.
    """
    squad_table = club_page.find("div", {"id": "yw1"}).find("table", {"class": "items"})
    player_rows = squad_table.find("tbody").find_all("tr", recursive=False)

    players = []
    for row in player_rows:
        profile_link = row.find("a", {"class": "spielprofil_tooltip"})
        if profile_link is None:
            # A row without a profile link carries no player; skip it rather
            # than fail on subscripting None.
            continue
        players.append({
            "name": profile_link["title"],
            "id": profile_link["id"],
            "href": profile_link["href"],
        })
    return players

In [5]:
def parse_league(league_ref, timeout=30):
    """Return the clubs of a league, each with its stadium and squad.

    Downloads the league page at ``base_url + league_ref`` and, for every
    "odd" and "even" row of its "items" table, downloads the page linked
    from the row's second "zentriert" cell to read the stadium and players.

    Args:
        league_ref: Site-relative path of the league page, starting with "/".
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys "name", "href", "squad",
        "market_value", "stadium" and "players" (as returned by
        ``getPlayers``). All "odd" rows come first, then all "even" rows.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + league_ref
    clubs = []

    r = requests.get(url, headers=HEADERS, timeout=timeout)
    response = BeautifulSoup(r.text, 'html.parser')
    # Look the table up once; attribute filters must be dicts, not sets.
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})

    for row in rows:
        # The second "zentriert" cell holds the club link; its text is
        # stored as the squad value.
        squad_cell = row.find_all("td", {"class": "zentriert"})[1]
        club_link = squad_cell.find('a')

        r_club = requests.get(base_url + club_link['href'], headers=HEADERS, timeout=timeout)
        response_club = BeautifulSoup(r_club.text, 'html.parser')

        # The fifth "dataValue" span of the club page carries the stadium
        # text; keep only what precedes its first digit.
        # NOTE(review): presumably the digits are the seat count, which is
        # not extracted here - confirm against a live page.
        stadium_info = response_club.find("div", {"id": "main"}).find_all("span", {"class": "dataValue"})[4].text
        stadium_info = stadium_info.replace(u'\xa0', u'').replace(u'\n', u'')
        stadium = re.split(r'(\d+)', stadium_info)[0]

        club = {}
        club["name"] = club_link['title']
        club["href"] = club_link['href']
        club["squad"] = squad_cell.text
        club["market_value"] = row.find("td", {"class": "rechts show-for-small show-for-pad nowrap"}).text
        club["stadium"] = stadium
        # Reuse the already parsed club page instead of parsing it again.
        club["players"] = getPlayers(response_club)
        clubs.append(club)

    return clubs

In [42]:
def parsePlayer(player_page, player_ref):
    """Extract a player's profile data and transfer history from a page.

    Args:
        player_page: HTML source of the player's profile page.
        player_ref: Site-relative path of the profile; its last "/"-separated
            component is used as the player id.

    Returns:
        A ``(player, transfers)`` tuple. ``player`` is a dict with the keys
        "href", "number", "name", "player_id", "position", "birthdate",
        "nationality" and "current_club" ("number" is None when the page has
        no shirt-number element). ``transfers`` is a list of dicts with the
        keys "player", "date", "from", "to" and "fee", or None when the
        transfer table is missing or cannot be read completely.

    Raises:
        IndexError, AttributeError, TypeError or KeyError when a mandatory
        profile field cannot be found in the page.
    """
    response = BeautifulSoup(player_page, 'html.parser')

    playerInfos = str(response.find("table", {"class": "auflistung"}))
    player_id = player_ref.split("/")[-1]

    def info_cell(label):
        # First <td> that follows `label` in the markup of the info table.
        return BeautifulSoup(playerInfos.split(label)[1], 'html.parser').find("td")

    number_tag = response.find("span", {"class": "dataRN"})

    player = {}
    player["href"] = player_ref
    player["number"] = number_tag.text if number_tag is not None else None
    player["name"] = response.find("h1", {"itemprop": "name"}).text
    player["player_id"] = player_id
    # Keep only ASCII letters, spaces and hyphens of the position text.
    player["position"] = "".join(re.findall("[a-zA-Z -]", info_cell("Position").text))
    player["birthdate"] = info_cell("Date of birth").text
    player["nationality"] = info_cell("Nationality").find("img")["title"]
    player["current_club"] = info_cell("Current club").find_all("a")[-1].text

    transfers = []
    try:
        history = response.find("div", {"class": "box transferhistorie"})
        trans = history.find("table").find("tbody").find_all("tr", {"class": "zeile-transfer"})

        for t in trans:
            club_cells = t.find_all("td", {"class": "no-border-rechts vereinswappen"})
            fee_text = t.find("td", {"class": "zelle-abloese"}).text

            transfer = {}
            transfer["player"] = player_id
            transfer["date"] = t.find_all("td", {"class": "zentriert hide-for-small"})[1].text
            transfer["from"] = club_cells[0].find("a")["id"]
            transfer["to"] = club_cells[1].find("a")["id"]
            if fee_text == "End of loan" or fee_text == "Loan":
                # Loan rows carry no fee; store the "zelle-mw" cell instead.
                transfer["fee"] = t.find("td", {"class": "zelle-mw"}).text
            else:
                transfer["fee"] = fee_text

            transfers.append(transfer)
    except (AttributeError, IndexError, KeyError, TypeError):
        # Missing or unexpected transfer markup: report "no history" rather
        # than a partial list. Only lookup failures are caught, so
        # KeyboardInterrupt and other unrelated errors still propagate.
        transfers = None

    return player, transfers

In [7]:
def getPlayersPage(player_ref, timeout=30):
    """Download a player's profile page and return its HTML source.

    Args:
        player_ref: Site-relative path of the profile, starting with "/".
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The response body as text. The HTTP status code is not checked.
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + player_ref

    r = requests.get(url, headers=HEADERS, timeout=timeout)
    return r.text

In [9]:
# Disabled cell: the code is wrapped in a string literal, so it is not run.
# When enabled, it collects the leagues of three continent pages with
# parseContinent and writes them to data/leagues.json.
'''
leagues = parseContinent("wettbewerbe/europa")
leagues += parseContinent("wettbewerbe/amerika")
leagues += parseContinent("wettbewerbe/asien")
with open("data/leagues.json", "w") as out:
    json.dump(leagues, out)
'''

'\nleagues = parseContinent("wettbewerbe/europa")\nleagues += parseContinent("wettbewerbe/amerika")\nleagues += parseContinent("wettbewerbe/asien")\nwith open("data/leagues.json", "w") as out:\n    json.dump(leagues, out)\n'

In [8]:
# Load the league records stored in data/leagues.json and list their names.
with open("data/leagues.json", "r") as leagues_file:
    leagues = json.load(leagues_file)

print("Number of leagues:", len(leagues))
for league in leagues:
    league_name = league["name"]
    print(league_name)

Number of leagues: 23
Premier League
Serie A
Ligue 1
Liga NOS
Eredivisie
Super League
Raiffeisen Super League
LaLiga
1.Bundesliga
Süper Lig
Premier Liga
Jupiler Pro League
Premier Liga
HET Liga
Campeonato Brasileiro Série A
Liga MX Clausura
Major League Soccer
Campeonato Brasileiro Série B
Primera División
Liga MX Apertura
Liga Águila I
Chinese Super League
J1 League


In [9]:
# Display the first league record to inspect its fields.
leagues[0]

{'country': 'England',
 'href': '/premier-league/startseite/wettbewerb/GB1',
 'name': 'Premier League',
 'tot_value': 5790000000.0}

In [10]:
# Disabled cell: the code is wrapped in a string literal, so it is not run.
# When enabled, it calls parse_league for every league and writes the
# combined club list to data/clubs.json.
'''
clubs = []
for league in leagues:
    clubs += parse_league(league["href"])
    
with open("data/clubs.json", "w") as out:
    json.dump(clubs, out)
'''

'\nclubs = []\nfor league in leagues:\n    clubs += parse_league(league["href"])\n    \nwith open("data/clubs.json", "w") as out:\n    json.dump(clubs, out)\n'

In [11]:
# Load the club records stored in data/clubs.json.
with open("data/clubs.json", "r") as clubs_file:
    clubs = json.load(clubs_file)

In [12]:
# Number of club records loaded from data/clubs.json.
len(clubs)

418

In [13]:
# Number of league records loaded from data/leagues.json.
len(leagues)

23

In [14]:
# Disabled cell: the code is wrapped in a string literal, so it is not run.
# When enabled, it gathers the "href" of every player of every club and
# writes the flat list to data/players_ref.json.
'''
player_list = []

for club in clubs:
    players = club["players"]
    for player in players:
        player_list.append(player["href"])

with open("data/players_ref.json", "w") as out:
    json.dump(player_list, out)
'''

'\nplayer_list = []\n\nfor club in clubs:\n    players = club["players"]\n    for player in players:\n        player_list.append(player["href"])\n\nwith open("data/players_ref.json", "w") as out:\n    json.dump(player_list, out)\n'

In [15]:
# Load the list of player profile paths stored in data/players_ref.json.
with open("data/players_ref.json", "r") as refs_file:
    players_list = json.load(refs_file)

In [16]:
# Disabled cell: the code is wrapped in a string literal, so it is not run.
# When enabled, it downloads every player page that is not yet cached under
# data/players/<player_id>/page.html, pausing 0.5 s after each download.
# The page text is written with json.dump, so page.html holds a JSON-encoded
# string and must be read back with json.load.
'''
for player_ref in players_list:
    player_id = player_ref.split("/")[-1]
     
    directory = 'data/players/' + player_id + "/"
    fname = directory + "page.html"

    if os.path.isfile(fname) == False:
        if os.path.exists(directory) == False:
            os.makedirs(directory)
        page = getPlayersPage(player_ref)
        with open(fname, "w")as out:
            json.dump(page, out)  
        time.sleep(0.5)
'''

'\nfor player_ref in players_list:\n    player_id = player_ref.split("/")[-1]\n     \n    directory = \'data/players/\' + player_id + "/"\n    fname = directory + "page.html"\n\n    if os.path.isfile(fname) == False:\n        if os.path.exists(directory) == False:\n            os.makedirs(directory)\n        page = getPlayersPage(player_ref)\n        with open(fname, "w")as out:\n            json.dump(page, out)  \n        time.sleep(0.5)\n'

In [18]:
# Parse every cached player page and store the result next to it as
# info.json, reporting progress every 1000 players.
total_players = len(players_list)
for i, player_ref in enumerate(players_list, start=1):
    player_dir = 'data/players/' + player_ref.split("/")[-1] + "/"

    # page.html holds a JSON-encoded string, hence json.load.
    with open(player_dir + "page.html", "r") as page_file:
        player_page = json.load(page_file)

    # parsePlayer returns a (player, transfers) pair; both are stored.
    parsed = parsePlayer(player_page, player_ref)
    with open(player_dir + "info.json", "w") as info_file:
        json.dump(parsed, info_file)

    if i % 1000 == 0:
        print("Scraped " + str(i) + "/" + str(total_players) + " players.")

Scraped 1000/12075 players.
Scraped 2000/12075 players.


KeyboardInterrupt: 

In [40]:
# player_url = "https://www.transfermarkt.co.uk/gary-cahill/profil/spieler/27511"
# HEADERS = {'User-Agent': 'Mozilla/5.0'}
# r = requests.get(player_url, headers=HEADERS)

# response = BeautifulSoup(r.text, 'html.parser')
# trans = response.find("div",{"class" : "box transferhistorie"}).find("table").find("tbody").find_all("tr", {"class":"zeile-transfer"})
# transfers = []
# for t in trans:
#             transfer = {}
#             transfer["player"] = player_ref.split("/")[-1]
#             transfer["date"] = t.find_all("td", {"class":"zentriert hide-for-small"})[1].text
#             transfer["from"] = t.find_all("td", {"class":"no-border-rechts vereinswappen"})[0].find("a")["id"]
#             transfer["to"] = t.find_all("td", {"class":"no-border-rechts vereinswappen"})[1].find("a")["id"]
#             if (t.find("td", {"class":"zelle-abloese"}).text) == "End of loan" or t.find("td", {"class":"zelle-abloese"}).text =="Loan":
#                 transfer["fee"] = t.find("td", {"class":"zelle-mw"}).text
#             else: 
#                 transfer["fee"] = t.find("td",{"class":"zelle-abloese"}).text

#             transfers.append(transfer)
# transfers

[{'date': 'Jan 16, 2012',
  'fee': '£7.56m',
  'from': '355',
  'player': '66587',
  'to': '631'},
 {'date': 'Jan 30, 2008',
  'fee': '£5.40m',
  'from': '405',
  'player': '66587',
  'to': '355'},
 {'date': 'Dec 31, 2007',
  'fee': '£1.80m',
  'from': '350',
  'player': '66587',
  'to': '405'},
 {'date': 'Sep 19, 2007',
  'fee': '£1.80m',
  'from': '405',
  'player': '66587',
  'to': '350'},
 {'date': 'May 9, 2005',
  'fee': '£45k',
  'from': '1132',
  'player': '66587',
  'to': '405'},
 {'date': 'Nov 9, 2004',
  'fee': '-',
  'from': '405',
  'player': '66587',
  'to': '1132'},
 {'date': 'Jul 1, 2004',
  'fee': '-',
  'from': '6933',
  'player': '66587',
  'to': '405'}]