# Transfermarkt Scraper

In [1]:
from urllib.request import *
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
import requests
import json
import os
%matplotlib inline

In [2]:
# Root URL of the site; the parse functions below prepend it (plus "/") to a relative page reference.
base_url = "https://www.transfermarkt.com"

In [211]:
def _parse_market_value(text):
    """Convert a market-value string such as "1,23 Bill. €" into a plain number.

    The text is expected to be "<amount> <unit> ..." with a decimal comma.
    A unit containing "Bill" scales the amount by 10**9 and one containing
    "Mill" by 10**6. Anything else (another unit, a lone "-", an empty cell)
    yields 0 so the caller can simply filter it out.
    """
    parts = text.replace(",", ".").split()
    if len(parts) < 2:
        # No unit present (e.g. "-"): the original indexing would raise here.
        return 0.0
    amount, unit = parts[0], parts[1]
    if "Bill" in unit:
        return float(amount) * 10**9
    if "Mill" in unit:
        return float(amount) * 10**6
    return 0.0


def parseContinent(ref, min_value=200 * 10**6):
    """Scrape a competitions overview page and return its most valuable leagues.

    Args:
        ref: Page path relative to ``base_url`` (e.g. "wettbewerbe/europa").
        min_value: Only leagues whose total value is strictly greater than
            this are returned. Defaults to 200 million.

    Returns:
        A list of dicts with the keys "href", "name", "country" and
        "tot_value". Rows with class "odd" come first, followed by rows with
        class "even".

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``<table class="items">``.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + ref

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    response = BeautifulSoup(r.text, 'html.parser')

    table = response.find("table", {"class": "items"})
    if table is None:
        raise ValueError("no table with class 'items' found at " + url)

    # Use real attribute dicts: the former set literals ({"class", "odd"})
    # only worked because bs4 treats a non-dict as a list of class names.
    rows = table.find_all("tr", {"class": "odd"}) + table.find_all("tr", {"class": "even"})

    leagues = []
    for row in rows:
        value_cell = row.find("td", {"class": "rechts hauptlink"})
        if value_cell is None:
            # Row without a value column; nothing to compare against.
            continue
        val = _parse_market_value(value_cell.text)
        if val > min_value:
            leagues.append({
                "href": row.find("a")["href"],
                "name": row.find("img")["title"],
                "country": row.find("td", {"class": "zentriert"}).find("img")["title"],
                "tot_value": val,
            })

    return leagues

In [215]:
# Collect the qualifying leagues of every continent page into one list.
leagues = []
for continent_ref in ("wettbewerbe/europa", "wettbewerbe/amerika", "wettbewerbe/asien"):
    leagues += parseContinent(continent_ref)

In [218]:
# open(..., "w") does not create missing directories, so make sure the
# output folder exists before writing the scraped leagues.
os.makedirs("data", exist_ok=True)
with open("data/leagues.json", "w") as out:
    json.dump(leagues, out)

In [192]:
def _info_cell(info_html, label):
    """Return the first ``<td>`` that follows ``label`` in the info-table HTML.

    Raises IndexError if ``label`` does not occur in ``info_html``.
    """
    return BeautifulSoup(info_html.split(label)[1], 'html.parser').find("td")


def parsePlayer(player_ref):
    """Scrape a player profile page and return the player plus transfer history.

    Args:
        player_ref: Profile path relative to ``base_url``, e.g.
            "ivan-rakitic/profil/spieler/32467". Its last path segment is
            used as the player id.

    Returns:
        A ``(player, transfers)`` tuple. ``player`` is a dict with the keys
        "href", "number", "name", "player_id", "position", "birthdate",
        "nationality" and "current_club"; "number" is None when the page has
        no shirt-number element. ``transfers`` is a list of dicts with the
        keys "player", "date", "from", "to" and "fee", in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + player_ref

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    response = BeautifulSoup(r.text, 'html.parser')

    # The info table is handled as raw HTML so each field can be located by
    # splitting on its label text.
    playerInfos = str(response.find("table", {"class": "auflistung"}))
    player_id = player_ref.split("/")[-1]

    # NOTE: these lookups previously went through an undefined name `test`
    # (a leftover notebook global); they must use the page parsed above.
    number_tag = response.find("span", {"class": "dataRN"})

    player = {}
    player["href"] = player_ref
    player["number"] = number_tag.text if number_tag is not None else None
    player["name"] = response.find("h1", {"itemprop": "name"}).text
    player["player_id"] = player_id

    # Keep only letters, spaces and hyphens from the position cell to drop
    # the surrounding whitespace/newline noise.
    position = _info_cell(playerInfos, "Position").text
    reg = re.compile("[a-zA-Z -]")
    player["position"] = "".join(reg.findall(position))
    player["birthdate"] = _info_cell(playerInfos, "Date of birth").text
    player["nationality"] = _info_cell(playerInfos, "Nationality").find("img")["title"]
    player["current_club"] = _info_cell(playerInfos, "Current club").find_all("a")[-1].text

    history = response.find("div", {"class": "box transferhistorie"})
    trans = history.find("table").find("tbody").find_all("tr", {"class": "zeile-transfer"})

    transfers = []
    for t in trans:
        # First club cell is the selling side, second the buying side.
        clubs = t.find_all("td", {"class": "no-border-rechts vereinswappen"})
        transfers.append({
            "player": player_id,
            "date": t.find_all("td", {"class": "zentriert hide-for-small"})[1].text,
            "from": clubs[0].find("a")["id"],
            "to": clubs[1].find("a")["id"],
            "fee": t.find("td", {"class": "zelle-mw"}).text,
        })

    return player, transfers

In [164]:
# Example run: scrape a single player profile to inspect the parsed output.
href = "ivan-rakitic/profil/spieler/32467"
player, transfers = parsePlayer(href)

In [165]:
player

{'birthdate': 'Mar 10, 1988 ',
 'current_club': 'FC Barcelona',
 'name': 'Ivan Rakitic',
 'nationality': 'Croatia',
 'number': '#4',
 'player_id': '32467',
 'position': 'Midfield - Central Midfield'}

In [166]:
transfers

[{'date': 'Jul 1, 2014',
  'fee': '20,00 Mill. €',
  'from': '368',
  'player': '32467',
  'to': '131'},
 {'date': 'Jan 28, 2011',
  'fee': '9,00 Mill. €',
  'from': '33',
  'player': '32467',
  'to': '368'},
 {'date': 'Jul 1, 2007',
  'fee': '5,00 Mill. €',
  'from': '26',
  'player': '32467',
  'to': '33'},
 {'date': 'Jul 1, 2005',
  'fee': '-',
  'from': '5299',
  'player': '32467',
  'to': '26'},
 {'date': 'Jul 1, 2004',
  'fee': '-',
  'from': '14322',
  'player': '32467',
  'to': '5299'}]