In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
from urllib.parse import urljoin
from itertools import chain

In [284]:
def try_get(url, headers={}):
    print('Getting %s' % (url))
    try:
        res = requests.get(url, headers=headers)
        if not res.ok:
            res.raise_for_status()
        return res
    except Exception as e:
        print(e)
        return None

def links(url, xpath='', css='', headers={}):
    res = try_get(url)
    if not res:
        return []
    parser = etree.HTMLParser()
    parsed = etree.fromstring(res.text, parser)
    a_tags = parsed.cssselect(css) if css else parsed.xpath(xpath)
    return (urljoin(url, a.get("href")) for a in a_tags)

def player_links(url):
    res = try_get(url)
    if not res:
        return []
    soup = BeautifulSoup(res.text, "lxml")
    spans = soup.find("tr", attrs={"style":"background-color:#AAD0FF"}).find_parent("tr").find_all("span", class_="fn")
    return (urljoin(url, span.find("a").get("href")) for span in spans if span.find("a"))

## Wikipedia

In [285]:
premier_league = links("https://en.wikipedia.org/wiki/2016%E2%80%9317_Premier_League", xpath='//table[2]/*/td[1]/a')
la_liga = links("https://en.wikipedia.org/wiki/2016%E2%80%9317_La_Liga", xpath='//table[2]/*/td[1]/a')
player_urls = (chain.from_iterable((player_links(link)) for link in la_liga))

Getting https://en.wikipedia.org/wiki/2016%E2%80%9317_Premier_League
Getting https://en.wikipedia.org/wiki/2016%E2%80%9317_La_Liga


In [286]:
list(player_urls)

Getting https://en.wikipedia.org/wiki/Deportivo_Alav%C3%A9s
Getting https://en.wikipedia.org/wiki/Athletic_Bilbao
Getting https://en.wikipedia.org/wiki/Atl%C3%A9tico_Madrid
Getting https://en.wikipedia.org/wiki/FC_Barcelona
Getting https://en.wikipedia.org/wiki/Celta_de_Vigo
Getting https://en.wikipedia.org/wiki/Deportivo_de_La_Coru%C3%B1a
Getting https://en.wikipedia.org/wiki/SD_Eibar
Getting https://en.wikipedia.org/wiki/RCD_Espanyol
Getting https://en.wikipedia.org/wiki/Granada_CF
Getting https://en.wikipedia.org/wiki/UD_Las_Palmas
Getting https://en.wikipedia.org/wiki/CD_Legan%C3%A9s
Getting https://en.wikipedia.org/wiki/M%C3%A1laga_CF
Getting https://en.wikipedia.org/wiki/CA_Osasuna
Getting https://en.wikipedia.org/wiki/Real_Betis
Getting https://en.wikipedia.org/wiki/Real_Madrid_C.F.
Getting https://en.wikipedia.org/wiki/Real_Sociedad
Getting https://en.wikipedia.org/wiki/Sevilla_FC
Getting https://en.wikipedia.org/wiki/Sporting_de_Gij%C3%B3n
Getting https://en.wikipedia.org/wiki

['https://en.wikipedia.org/wiki/Fernando_Pacheco_Flores',
 'https://en.wikipedia.org/wiki/Ra%C3%BAl_Garc%C3%ADa_Carnero',
 'https://en.wikipedia.org/wiki/V%C3%ADctor_Laguardia',
 'https://en.wikipedia.org/wiki/Christian_Santos',
 'https://en.wikipedia.org/wiki/Manuel_Barreiro_Bustelo',
 'https://en.wikipedia.org/wiki/Gaizka_Toquero',
 'https://en.wikipedia.org/wiki/Manu_Garc%C3%ADa_(footballer,_born_1986)',
 'https://en.wikipedia.org/wiki/Kiko_Femen%C3%ADa',
 'https://en.wikipedia.org/wiki/Einar_Galilea',
 'https://en.wikipedia.org/wiki/Sergio_Llamas',
 'https://en.wikipedia.org/wiki/Asier_Benito',
 'https://en.wikipedia.org/wiki/Adri%C3%A1n_Ortol%C3%A1',
 'https://en.wikipedia.org/wiki/Alexis_(footballer)',
 'https://en.wikipedia.org/wiki/Zouhair_Feddal',
 'https://en.wikipedia.org/wiki/%C3%89dgar_M%C3%A9ndez',
 'https://en.wikipedia.org/wiki/Ibai_G%C3%B3mez',
 'https://en.wikipedia.org/wiki/Nenad_Krsti%C4%8Di%C4%87',
 'https://en.wikipedia.org/wiki/Daniel_Alejandro_Torres',
 'https:/

In [287]:
len(_)

486

In [70]:
url = "https://en.wikipedia.org/wiki/Ander_Herrera"
def get_player_info(url):
    player = {}
    print('Getting %s' % (url))
    res = requests.get(url)
    parser = etree.HTMLParser()
    parsed = etree.fromstring(res.text, parser)
    player['wikidata'] = parsed.xpath("//li[@id='t-wikibase']/a/@href")
    infobox = parsed.xpath("//table[@class='infobox vcard']")[0].getchildren()
    player['name'] = infobox.pop(0).text
    return player
    

## Transfermarkt

In [40]:
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}
url = "http://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1"
xpath = '//*[@id="yw1"]/table/tbody/tr/td[2]/a'
team_links = links(url, xpath=xpath, headers=headers)
list(team_links)

Getting http://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1


['http://www.transfermarkt.com/real-madrid/startseite/verein/418/saison_id/2016',
 'http://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1',
 'http://www.transfermarkt.com/fc-barcelona/startseite/verein/131/saison_id/2016',
 'http://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1',
 'http://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1',
 'http://www.transfermarkt.com/atletico-madrid/startseite/verein/13/saison_id/2016',
 'http://www.transfermarkt.com/sevilla-fc/startseite/verein/368/saison_id/2016',
 'http://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1',
 'http://www.transfermarkt.com/valencia-cf/startseite/verein/1049/saison_id/2016',
 'http://www.transfermarkt.com/athletic-bilbao/startseite/verein/621/saison_id/2016',
 'http://www.transfermarkt.com/villarreal-cf/startseite/verein/1050/saison_id/2016',
 'http://www.transfermarkt.com/real-sociedad/startseite/verein/681/saison_id/2016',
 'http://www.transfermarkt.