---
title: Data extraction
description: We proceed to extract the dataset using the worldfootballR library from FBref and ...
---

In [51]:
# Install any dependency that is missing, then attach both packages.
# requireNamespace() only checks whether a package is available (it does not
# attach it), so the library() calls below are the single place where the
# packages are loaded, and they fail with an error if an install did not work.
if (!requireNamespace("worldfootballR", quietly = TRUE)) {
  install.packages("worldfootballR")
}

if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}

library(worldfootballR)
library(data.table)

## Get match results

In [52]:
# Request the men's first-tier match results for five country codes and for
# the seasons ending in 2018 through 2022.
country <- c("ENG", "ESP", "ITA", "GER", "FRA")
year <- as.numeric(2018:2022)
result <- fb_match_results(
  country = country,
  gender = "M",
  season_end_year = year,
  tier = "1st"
)

In [53]:
# Drop the columns that are not needed downstream.
column_to_drop <- c(
  "Gender", "Day", "Wk", "Time", "Venue", "Referee", "Attendance",
  "Home_xG", "Away_xG", "Notes", "Round"
)
result <- result[, !(names(result) %in% column_to_drop)]

# Rename the German league to 'Bundesliga'. The sharp s is written as a
# Unicode escape so the literal is UTF-8 whatever the session's native
# encoding is; raw byte escapes would only match in a UTF-8 locale.
result$Competition_Name[
  result$Competition_Name == "Fu\u00dfball-Bundesliga"
] <- "Bundesliga"

# Rename the remaining columns. The assignment is positional, so stop early if
# the table does not have exactly as many columns as there are new names.
new_names <- c(
  "league", "country", "season_year", "date", "home", "home_goals",
  "away", "away_goals", "match_url"
)
stopifnot(ncol(result) == length(new_names))
names(result) <- new_names

# Replace country codes by country names. unname() prevents the names of the
# lookup vector from being carried into the data frame column.
match_country <- c(
  "ENG" = "England", "ITA" = "Italy", "FRA" = "France",
  "GER" = "Germany", "ESP" = "Spain"
)
result$country <- unname(match_country[result$country])


In [54]:
# Quick inspection of the cleaned table: first rows, row count and the
# distinct league names it contains.
head(result, n = 6L)
paste(dim(result)[1L], "rows")
print("leagues of interests:")
unique(result[["league"]])

Unnamed: 0_level_0,league,country,season_year,date,home,home_goals,away,away_goals,match_url
Unnamed: 0_level_1,<chr>,<chr>,<int>,<date>,<chr>,<dbl>,<chr>,<dbl>,<chr>
1,Premier League,England,2018,2017-08-11,Arsenal,4,Leicester City,3,https://fbref.com/en/matches/e3c3ddf0/Arsenal-Leicester-City-August-11-2017-Premier-League
2,Premier League,England,2018,2017-08-12,Watford,3,Liverpool,3,https://fbref.com/en/matches/60f6cc1d/Watford-Liverpool-August-12-2017-Premier-League
3,Premier League,England,2018,2017-08-12,Crystal Palace,0,Huddersfield,3,https://fbref.com/en/matches/2d369d17/Crystal-Palace-Huddersfield-Town-August-12-2017-Premier-League
4,Premier League,England,2018,2017-08-12,West Brom,1,Bournemouth,0,https://fbref.com/en/matches/684f704a/West-Bromwich-Albion-Bournemouth-August-12-2017-Premier-League
5,Premier League,England,2018,2017-08-12,Chelsea,2,Burnley,3,https://fbref.com/en/matches/71b00bca/Chelsea-Burnley-August-12-2017-Premier-League
6,Premier League,England,2018,2017-08-12,Everton,1,Stoke City,0,https://fbref.com/en/matches/7c834541/Everton-Stoke-City-August-12-2017-Premier-League


[1] "leagues of interests:"


In [55]:
# Save result in 'data/match_results.csv'.
# fwrite() does not create missing directories, so make sure the target
# folder exists first (nothing happens when it is already there).
dir.create("data", showWarnings = FALSE, recursive = TRUE)
fwrite(result, file = "data/match_results.csv", quote = "auto")

## Get head coaches

In [56]:
country <- c("England", "Spain", "Italy", "Germany", "France")

# Build the teams_url vector: one lookup per country, collected in a list and
# flattened once instead of growing a vector inside a loop.
# NOTE(review): only start_year = 2018 is queried, so clubs that were not in
# the league that season are presumably missing - confirm this is intended.
teams_url <- unlist(lapply(country, function(country_name) {
  team_url <- tm_league_team_urls(
    country_name = country_name,
    start_year = 2018
  )
  # Progress line: number of team URLs found for this country.
  print(paste(country_name, ":", length(team_url), "teams"))
  team_url
}))

[1] "England : 20 teams"
[1] "Spain : 20 teams"
[1] "Italy : 20 teams"
[1] "Germany : 18 teams"
[1] "France : 20 teams"


In [57]:
# Fetch the "Manager" staff history for every collected team URL and report
# how many records were returned.
head_coach <- tm_team_staff_history(
  team_urls = teams_url,
  staff_role = "Manager"
)
print(sprintf("%d head coaches records", nrow(head_coach)))

[1] "4852 head coaches records"


In [58]:
# League names in the match results, for comparison:
# 'Premier League', 'La Liga', 'Ligue 1', 'Bundesliga', 'Serie A'.
# The filter below uses 'LaLiga' (no space) for the head-coach table.
unique(head_coach[["league"]])
league <- c("Premier League", "LaLiga", "Ligue 1", "Bundesliga", "Serie A")
head_coach_bis <- head_coach[is.element(head_coach[["league"]], league), ]
paste(nrow(head_coach_bis), "head coaches records for leagues of interests")

In [70]:
# Select head coaches that have been active between 2018 and 2022.
# First condition: still in post (no end date) or left on/after 2018-01-01.
# The is.na() term makes this mask free of NA values.
head_coach_bis <- head_coach_bis[
  is.na(head_coach_bis$end_date) | head_coach_bis$end_date >= "2018-01-01",
]
# Second condition: appointed on/before 2022-12-31. which() drops records
# whose appointment date is missing; a bare logical index would instead turn
# every NA into a row filled with NA values.
head_coach_bis <- head_coach_bis[
  which(head_coach_bis$appointed <= "2022-12-31"),
]
paste(nrow(head_coach_bis), "head coaches records for leagues of interests active between 2018 and 2022")
# Drop the columns that are not needed downstream.
column_to_drop <- c("staff_role", "ppg")
head_coach_bis <- head_coach_bis[, !(names(head_coach_bis) %in% column_to_drop)]
# Rename the staff_name column to coach_name.
names(head_coach_bis)[names(head_coach_bis) == "staff_name"] <- "coach_name"

In [71]:
# Save in 'data/head_coach.csv'.
# fwrite() does not create missing directories, so make sure the target
# folder exists first (nothing happens when it is already there).
dir.create("data", showWarnings = FALSE, recursive = TRUE)
fwrite(head_coach_bis, file = "data/head_coach.csv", quote = "auto")