# Scraping team lineups

This notebook will cover scraping player statistics from https://www.transfermarkt.co.uk/. These player statistics will later be used as features in combination with data pertaining to individual matches.

In [8]:
from selenium import webdriver
import pandas as pd

In [2]:
browser = webdriver.Chrome("./chromedriver")

In [3]:
browser.get("https://www.transfermarkt.co.uk/")

In [49]:
# search for a country
search_base_url = "https://www.transfermarkt.co.uk/schnellsuche/ergebnis/schnellsuche?query="
country = "ivory coast"
year = "2014"
browser.get(search_base_url + country.lower().replace(" ","+"))

In [50]:
# get first result
team_base_url = browser.find_element_by_class_name("odd").find_element_by_css_selector('a[class="vereinprofil_tooltip tooltipstered"]').get_attribute("href")

In [51]:
# go to specific year
browser.get(team_base_url + "?saison_id=" + str(int(year) - 1))

In [62]:
# search roster for player 
players1 = browser.find_element_by_class_name("items").find_elements_by_css_selector('tr[class="odd"]')
players2 = browser.find_element_by_class_name("items").find_elements_by_css_selector('tr[class="even"]')
players = players1 + players2

In [3]:
def get_name_link_age_value(player_element):
    name = player_element.find_element_by_class_name("hauptlink").text
    link = player_element.find_element_by_class_name("hauptlink").find_element_by_tag_name("a").get_attribute("href")
    age = player_element.find_element_by_css_selector('td[class="zentriert"]').text
    value = player_element.find_element_by_css_selector('td[class="rechts hauptlink"]').text
    return [name, link, age, value]

In [90]:
player_data = list(map(get_name_link_age_value, players))

In [9]:

header = ["name","link","age","value"]

In [94]:


df = pd.DataFrame(player_data, columns=header)

In [95]:
df

Unnamed: 0,name,link,age,value
0,Boubacar Copa,https://www.transfermarkt.co.uk/boubacar-copa/...,33,£1.35m
1,Badra Ali Sangaré,https://www.transfermarkt.co.uk/badra-ali-sang...,27,
2,Kolo Touré,https://www.transfermarkt.co.uk/kolo-toure/pro...,32,£5.40m
3,Sol Bamba,https://www.transfermarkt.co.uk/sol-bamba/prof...,28,£1.80m
4,Arthur Boka,https://www.transfermarkt.co.uk/arthur-boka/pr...,30,£1.80m
5,Constant Djakpa,https://www.transfermarkt.co.uk/constant-djakp...,26,£630k
6,Brice Dja Djédjé,https://www.transfermarkt.co.uk/brice-dja-djed...,22,£3.15m
7,Didier Zokora,https://www.transfermarkt.co.uk/didier-zokora/...,32,£3.60m
8,Jean-Jacques Gosso,https://www.transfermarkt.co.uk/jean-jacques-g...,30,£1.35m
9,Ismaël Diomandé,https://www.transfermarkt.co.uk/ismael-diomand...,20,£450k


In [161]:
all_matches_df = pd.read_csv("data/espn_data_230618_all.csv")

In [237]:
world_cup_years = ["2014","2010","2006","2002","1998","1994","1990","1986","1982","1978"]

for year in world_cup_years:
    print(year)
    # filter dataframe
    year_matches_df = all_matches_df[all_matches_df.year == int(year)]
    team_list = list(set(list(year_matches_df.Team1.values)+list(year_matches_df.Team2.values)))
    get_player_stats_for_year(team_list,year)

1998
Yugoslavia
Paraguay
England
Chile
Nigeria
Croatia
Italy
Argentina
Cameroon
South Africa
Norway
United States
Mexico
South Korea
Jamaica
Saudi Arabia
Colombia
France
Netherlands
Germany
Bulgaria
Romania
Austria
Denmark
Scotland
Morocco
Spain
Japan
Iran
Belgium
Tunisia
Brazil
1994
Greece
Nigeria
Italy
Argentina
Cameroon
Norway
United States
Mexico
South Korea
Saudi Arabia
Colombia
Netherlands
Germany
Bulgaria
Republic of Ireland
Romania
Russia
Morocco
Switzerland
Spain
Bolivia
Sweden
Belgium
Brazil
1990
Yugoslavia
England
Costa Rica
Czechoslovakia
Soviet Union
Egypt
Italy
Cameroon
Argentina
United States
South Korea
Uruguay
Colombia
United Arab Emirates
Netherlands
Republic of Ireland
Romania
Austria
Scotland
Spain
Germany FR
cant find: Germany FR, 1990
Sweden
Belgium
Brazil
1986
Paraguay
England
Soviet Union
Canada
Italy
Argentina
Algeria
Mexico
South Korea
Uruguay
Northern Ireland
Portugal
France
Poland
Bulgaria
Denmark
Scotland
Morocco
Spain
Germany FR
cant find: Germany FR, 1986

In [214]:
team_list

['Spain',
 'Germany FR',
 'Hungary',
 'Sweden',
 'France',
 'Brazil',
 'Poland',
 'Netherlands',
 'Scotland',
 'Mexico',
 'Austria',
 'Peru',
 'Iran',
 'Tunisia',
 'Italy',
 'Argentina']

In [4]:
def get_player_stats_for_year(team_list, year):
    year_stats = []
    for team in team_list:
        print(team)
        try:
            year_stats += get_player_stats_for_team(team, year)
        except:
            print("cant find: "+ team + ", " + year)
            continue
    df = pd.DataFrame(year_stats, columns=["name","link","age","value"])
    df.to_csv("data/teams/"+year+".csv")

In [5]:
def get_player_stats_for_team(team, year):
    # search for a country
    search_base_url = "https://www.transfermarkt.co.uk/schnellsuche/ergebnis/schnellsuche?query="
    browser.get(search_base_url + team.lower().replace(" ","+"))
    # get first result
    clubs_div = get_clubs_div()
    team_base_url = clubs_div.find_element_by_class_name("odd").find_element_by_css_selector('a[class="vereinprofil_tooltip tooltipstered"]').get_attribute("href")
    # go to specific year
    browser.get(team_base_url + "?saison_id=" + str(int(year) - 1))
    # get player data
    players1 = browser.find_element_by_class_name("items").find_elements_by_css_selector('tr[class="odd"]')
    players2 = browser.find_element_by_class_name("items").find_elements_by_css_selector('tr[class="even"]')
    players = players1 + players2
    player_data = list(map(get_name_link_age_value, players))
    return player_data

In [195]:
"clubs" in browser.find_elements_by_class_name("box")[3].find_element_by_class_name("table-header").text.lower()

True

In [196]:
browser.find_elements_by_class_name("box")[3].text.lower()

'search results: clubs - 11 hits\n  club\ncountry squad\ntotal market value\ntransfers stadium forum\nbrazil\n23 £882.90m    \nbrazil u20\n22 £45.88m    \nsanat naft fc\npersian gulf pro league\n26 £6.23m    \nbrazil u17\n8 £4.55m    \nball school moskau\n2 £90k    \nbrazil u23\n0 -    \nsv brazil juniors\n1 -    \nbrazil u15\n20 -    \nbrazil olympic team\n0 -    \nbrazil international football academy\n0 -    \n1 2      '

In [6]:
def get_clubs_div():
    divs = browser.find_elements_by_class_name("box")
    for div in divs:
        try:
            if "clubs" in div.find_element_by_class_name("table-header").text.lower():
                return div
        except:
            continue
    return

### 2018 data

In [10]:
all_matches_2018_df = pd.read_csv("2018.csv")

In [11]:
all_matches_2018_df

Unnamed: 0,Date,Team1,Team1_score,Team2,Team2_score,team1_fouls,team2_fouls,team1_yellow_cards,team2_yellow_cards,team1_red_cards,...,team1_shots,team2_shots,team1_avg_wins,team1_avg_draws,team1_avg_gd,team2_avg_wins,team2_avg_draws,team2_avg_gd,team1_lineup,team2_lineup
0,20180614,Russia,5,Saudi Arabia,0,22,10,1,1,0,...,13 (7),6 (0),0.0,0.4,-1.2,0.4,0.0,-0.2,1 Igor Akinfeev_4 Sergey Ignashevich_3 Ilya Ku...,1 Abdullah Al-Mayouf_5 Omar Hawsawi_3 Osama Ha...
1,20180615,Egypt,0,Uruguay,1,12,6,2,0,0,...,8 (3),14 (4),0.0,0.4,-1.0,0.6,0.2,1.0,23 Mohamed El-Shenawy_6 Ahmed Hegazi _2 Ali G...,1 Fernando Muslera_3 Diego Godín_2 José Giméne...
2,20180615,Morocco,0,Iran,1,22,14,1,3,0,...,13 (3),8 (2),0.8,0.2,1.2,0.6,0.0,0.2,12 Munir_6 Romain Saïss_5 Medhi Benatia_2 Achr...,1 Alireza Beiranvand_4 Roozbeh Cheshmi_8 Morte...
3,20180615,Portugal,3,Spain,3,12,10,1,1,0,...,8 (3),12 (5),0.4,0.4,0.2,0.4,0.6,1.2,1 Rui Patrício_6 José Fonte_3 Pepe_5 Raphaël G...,1 David De Gea_15 Sergio Ramos_3 Gerard Piqué_...
4,20180616,France,2,Australia,1,16,19,1,3,0,...,13 (5),4 (1),0.6,0.2,1.0,0.6,0.2,0.8,1 Hugo Lloris_5 Samuel Umtiti_4 Raphaël Varane...,1 Mathew Ryan_5 Mark Milligan_20 Trent Sainsbu...
5,20180616,Argentina,1,Iceland,1,10,15,0,0,0,...,26 (7),9 (3),0.6,0.0,0.0,0.2,0.2,-0.6,23 Wilfredo Caballero_16 Marcos Rojo_17 Nicolá...,1 Hannes Halldórsson_6 Ragnar Sigurdsson_14 Ká...
6,20180616,Peru,0,Denmark,1,10,18,1,2,0,...,17 (6),10 (3),0.8,0.2,1.8,0.4,0.4,0.4,1 Pedro Gallese_2 Alberto Rodríguez_15 Christi...,1 Kasper Schmeichel_6 Andreas Christensen_4 Si...
7,20180616,Croatia,2,Nigeria,0,20,16,2,1,0,...,11 (2),14 (2),0.4,0.2,-0.4,0.2,0.2,-0.6,23 Danijel Subasic_21 Domagoj Vida_6 Dejan Lov...,23 Francis Uzoho_6 Leon Balogun_5 William Troo...
8,20180617,Costa Rica,0,Serbia,1,18,15,2,2,0,...,10 (3),10 (3),0.4,0.0,-0.4,0.4,0.2,0.8,1 Keylor Navas_3 Giancarlo González_6 Óscar Du...,1 Vladimir Stojkovic_3 Dusko Tosic_15 Nikola M...
9,20180617,Germany,0,Mexico,1,10,15,2,2,0,...,25 (9),12 (4),0.2,0.4,-0.2,0.4,0.2,0.2,1 Manuel Neuer_5 Mats Hummels _17 Jérôme Boat...,13 Guillermo Ochoa_15 Héctor Moreno _2 Hugo A...


In [13]:
year = "2018"
team_list = list(set(list(all_matches_2018_df.Team1.values)+list(all_matches_2018_df.Team2.values)))
get_player_stats_for_year(team_list,year)

Australia
Russia
Colombia
Senegal
Saudi Arabia
Costa Rica
Tunisia
Germany
Poland
France
Brazil
Peru
Croatia
Portugal
Egypt
Belgium
Switzerland
Sweden
Iceland
Japan
Morocco
Argentina
Panama
England
Iran
Nigeria
Serbia
Mexico
Uruguay
Spain
South Korea
Denmark
