---
title: Data extraction
description: We extract the datasets from FBref and Transfermarkt using the worldfootballR library.
---

In [None]:
# Install any missing dependency, then attach both packages.
# requireNamespace() only checks availability: unlike require(), it does not
# attach the package as a side effect of the check, and with quietly = TRUE it
# stays silent when the package is absent.
if (!requireNamespace("worldfootballR", quietly = TRUE)) {
  install.packages("worldfootballR")
}

if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}

# library() errors out if a package is still unavailable after the install
# attempt, so a failed installation is noticed here rather than later.
library(worldfootballR)
library(data.table)

### Les données sur les matchs

In [None]:
# Scope of the extraction: men's first-tier leagues of five countries, for the
# seasons ending in 2018 through 2022.
year <- as.numeric(2018:2022)
country <- c("ENG", "ESP", "ITA", "GER", "FRA")

# One worldfootballR call retrieves the results for every country/season pair.
result <- fb_match_results(
  country = country,
  gender = "M",
  season_end_year = year,
  tier = "1st"
)

In [None]:
# Clean the raw match results: drop unused columns, shorten the Bundesliga
# label, rename the remaining columns and expand the country codes.
column_to_drop <- c(
  "Gender", "Day", "Wk", "Time", "Venue", "Referee", "Attendance",
  "Home_xG", "Away_xG", "Notes", "Round"
)
result <- result[, !(names(result) %in% column_to_drop)]

# Shorten the German league label (written with an eszett) to "Bundesliga".
# "\u00df" produces a string marked as UTF-8, so the comparison matches in any
# locale; raw byte escapes would only match when the session locale is UTF-8.
result$Competition_Name[
  result$Competition_Name == "Fu\u00dfball-Bundesliga"
] <- "Bundesliga"

# Rename the remaining columns. The renaming is positional, so check the
# column count first: assigning too few names would silently leave NA names.
new_column_names <- c(
  "league", "country", "season_year", "date", "home", "home_goals",
  "away", "away_goals", "match_url"
)
stopifnot(ncol(result) == length(new_column_names))
names(result) <- new_column_names

# Replace country codes with country names. as.character() guards against a
# factor column being used as integer positions; unname() drops the code
# names that the lookup would otherwise attach to the column.
match_country <- c(
  "ENG" = "England", "ITA" = "Italy", "FRA" = "France",
  "GER" = "Germany", "ESP" = "Spain"
)
result$country <- unname(match_country[as.character(result$country)])

Nous avons récupéré les données pour les ligues suivantes : {eval}`paste(unique(result$league), collapse = ", ")`. Cela concerne {eval}`nrow(result)` matchs pour la période de {eval}`min(year)` à {eval}`max(year)`.

In [None]:
# Preview the first four cleaned match records.
head(result, n = 4L)

In [None]:
# Save the cleaned match results to "data/extracted_match_results.csv".
# fwrite() does not create missing directories, so make sure "data" exists
# (no warning is raised when it is already there).
dir.create("data", showWarnings = FALSE, recursive = TRUE)
fwrite(result, file = "data/extracted_match_results.csv", quote = "auto")

### Les données sur les entraîneurs-chefs

In [None]:
country <- c("England", "Spain", "Italy", "Germany", "France")

# Build the teams_url vector: query the team URLs of each country and
# concatenate them. lapply() + unlist() avoids growing the vector with c()
# on every iteration.
# NOTE(review): only start_year = 2018 is queried, while the rest of the
# notebook covers 2018 to 2022 -- clubs that appear only in other seasons are
# presumably not collected; confirm this is intended.
teams_url <- unlist(lapply(country, function(country_name) {
  team_url <- tm_league_team_urls(
    country_name = country_name,
    start_year = 2018
  )
  # Report how many team URLs were returned for this country.
  print(paste(country_name, ":", length(team_url), "teams"))
  team_url
}))

In [None]:
# Retrieve the history of staff with the "Manager" role for every collected
# team URL, then report how many records came back.
head_coach <- tm_team_staff_history(
  staff_role = "Manager",
  team_urls = teams_url
)
print(paste(nrow(head_coach), "head coaches records"))

In [None]:
# For reference, the match results use the league labels "Premier League",
# "La Liga", "Ligue 1", "Bundesliga" and "Serie A"; in this dataset the
# Spanish league is spelled "LaLiga".
# Show every league label present in the staff history.
unique(head_coach$league)

# Keep only the records attached to the five leagues of interest.
league <- c("Premier League", "LaLiga", "Ligue 1", "Bundesliga", "Serie A")
head_coach_bis <- head_coach[is.element(head_coach$league, league), ]
paste(nrow(head_coach_bis), "head coaches records for leagues of interests")

Leagues present in the collected head-coach data: {eval}`unique(head_coach$league)`

{eval}`nrow(head_coach_bis)` head-coach records for the leagues of interest

In [None]:
# Select head-coach that have been active between 2018 and 2022
head_coach_bis <- head_coach_bis[is.na(head_coach_bis$end_date) | head_coach_bis$end_date >= "2018-01-01",]
head_coach_bis <- head_coach_bis[head_coach_bis$appointed <= "2022-12-31",]
paste(nrow(head_coach_bis), "head coaches records for leagues of interests active between 2018 and 2022")
# Drop column
column_to_drop <- c("staff_role", "ppg")
head_coach_bis <- head_coach_bis[, !(names(head_coach_bis) %in% column_to_drop)]
# Rename staff_name column to coach_name
names(head_coach_bis)[names(head_coach_bis) == "staff_name"] <- "coach_name"

In [None]:
# Save the filtered head-coach records to "data/extracted_head_coach.csv".
# fwrite() does not create missing directories, so make sure "data" exists
# (no warning is raised when it is already there).
dir.create("data", showWarnings = FALSE, recursive = TRUE)
fwrite(head_coach_bis, file = "data/extracted_head_coach.csv", quote = "auto")