# Football Player Injury 2016-2021 Dataset

In [1]:
#link to the Kaggle dataset -> https://www.kaggle.com/datasets/kolambekalpesh/football-player-injury-data

import os

import numpy as np
import pandas as pd

## Data Exploration

In [2]:
# Location of the Kaggle CSV. The default is the Colab path this notebook was
# written against; set the PLAYER_INJURY_CSV environment variable to point at
# the file when running the notebook anywhere else.
PLAYER_INJURY_CSV = os.environ.get(
    "PLAYER_INJURY_CSV", "/content/football_player_injury.csv"
)

# Raw injury table, read with pandas' default parsing options.
df_player_injury = pd.read_csv(PLAYER_INJURY_CSV)

In [3]:
# Print the frame's schema summary: index range, column names, non-null counts and dtypes.
df_player_injury.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301 entries, 0 to 1300
Data columns (total 30 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   p_id2                              1301 non-null   object 
 1   start_year                         1301 non-null   int64  
 2   season_days_injured                1301 non-null   int64  
 3   total_days_injured                 1301 non-null   int64  
 4   season_minutes_played              1301 non-null   float64
 5   season_games_played                1301 non-null   int64  
 6   season_matches_in_squad            1301 non-null   int64  
 7   total_minutes_played               1301 non-null   float64
 8   total_games_played                 1301 non-null   int64  
 9   dob                                1301 non-null   object 
 10  height_cm                          1301 non-null   float64
 11  weight_kg                          1301 non-null   float

In [None]:
'''
About Dataset
Data Sources:
The following data sources were used for this model:
Player attributes - FIFA 16-21 data
Injury history - Transfermarkt injury history data.
Pulled and scraped from there using worldfootballR R package

Players/seasons in scope:
Original scope was all players who have played in the British Premier League at
any point between 2016/17 season and 2020/21 season
Due to complications and difficulties in joining 3 datasets from entirely
different sources, this came out to a total of 685 rows of data, consisting of 317 players

Training Data:
3 separate data sources were combined to create a dataset which included player attributes
(i.e. - pace, height, weight), player injury history and player game time
Data was grouped on a player-year level
'''

In [5]:
# Summary statistics for every column. include='all' reports count/unique/top/freq
# for the object columns alongside mean/std/min/quantiles for the numeric ones.
df_player_injury.describe(include='all')

Unnamed: 0,p_id2,start_year,season_days_injured,total_days_injured,season_minutes_played,season_games_played,season_matches_in_squad,total_minutes_played,total_games_played,dob,...,cumulative_games_played,minutes_per_game_prev_seasons,avg_days_injured_prev_seasons,avg_games_per_season_prev_seasons,bmi,work_rate_numeric,position_numeric,significant_injury_prev_season,cumulative_days_injured,season_days_injured_prev_season
count,1301,1301.0,1301.0,1301.0,1301.0,1301.0,1301.0,1301.0,1301.0,1301,...,697.0,685.0,697.0,697.0,1301.0,1301.0,1299.0,697.0,697.0,697.0
unique,604,,,,,,,,,588,...,,,,,,,,,,
top,bendavies,,,,,,,,,1993-06-22,...,,,,,,,,,,
freq,24,,,,,,,,,9,...,,,,,,,,,,
mean,,2018.099923,79.05073,260.710992,1483.156034,19.509608,25.089931,5533.59339,71.638739,,...,53.27977,74.541947,79.501987,21.952038,23.03931,3.346272,1.823711,0.225251,786.351506,81.27977
std,,1.38481,84.578169,202.741896,1014.01392,11.099727,10.157698,4095.421082,46.85426,,...,63.111519,17.856054,71.05764,9.408793,1.470255,0.392971,0.999472,0.418048,1064.323175,84.353614
min,,2016.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,,...,0.0,7.0,2.0,0.0,18.792451,2.5,0.0,0.0,7.0,0.0
25%,,2017.0,24.0,111.0,612.0,10.0,19.0,2012.0,31.0,,...,23.0,65.5,29.0,16.0,22.09317,3.0,1.0,0.0,191.0,26.0
50%,,2018.0,49.0,210.0,1440.0,21.0,28.0,5023.0,69.0,,...,35.0,80.080645,60.5,23.0,23.07483,3.5,2.0,0.0,427.0,50.0
75%,,2019.0,103.0,370.0,2311.0,29.0,33.0,8387.0,107.0,,...,64.0,87.925926,111.5,29.0,23.947116,3.5,3.0,0.0,903.0,109.0


In [6]:
# Columns to audit, listed explicitly in the order the report is printed.
columns_player_injury = [
    'p_id2',
    'start_year',
    'season_days_injured',
    'total_days_injured',
    'season_minutes_played',
    'season_games_played',
    'season_matches_in_squad',
    'total_minutes_played',
    'total_games_played',
    'dob',
    'height_cm',
    'weight_kg',
    'nationality',
    'work_rate',
    'pace',
    'physic',
    'fifa_rating',
    'position',
    'age',
    'cumulative_minutes_played',
    'cumulative_games_played',
    'minutes_per_game_prev_seasons',
    'avg_days_injured_prev_seasons',
    'avg_games_per_season_prev_seasons',
    'bmi',
    'work_rate_numeric',
    'position_numeric',
    'significant_injury_prev_season',
    'cumulative_days_injured',
    'season_days_injured_prev_season',
]

# Per-column audit: how many entries are null, how many repeat a value already
# seen earlier in the same column, and how many are non-null.
for col in columns_player_injury:
    series = df_player_injury[col]

    missing = series.isna().sum()
    duplicated = series.duplicated().sum()
    total_non_null = series.count()

    # One print per column; print()'s own trailing newline supplies the blank
    # separator line between column reports.
    print(
        f"--- {col} ---\n"
        f"Missing values:        {missing}\n"
        f"Duplicated values:     {duplicated}\n"
        f"Total non-null values: {total_non_null}\n"
    )

--- p_id2 ---
Missing values:        0
Duplicated values:     697
Total non-null values: 1301

--- start_year ---
Missing values:        0
Duplicated values:     1296
Total non-null values: 1301

--- season_days_injured ---
Missing values:        0
Duplicated values:     1036
Total non-null values: 1301

--- total_days_injured ---
Missing values:        0
Duplicated values:     952
Total non-null values: 1301

--- season_minutes_played ---
Missing values:        0
Duplicated values:     351
Total non-null values: 1301

--- season_games_played ---
Missing values:        0
Duplicated values:     1262
Total non-null values: 1301

--- season_matches_in_squad ---
Missing values:        0
Duplicated values:     1262
Total non-null values: 1301

--- total_minutes_played ---
Missing values:        0
Duplicated values:     747
Total non-null values: 1301

--- total_games_played ---
Missing values:        0
Duplicated values:     1140
Total non-null values: 1301

--- dob ---
Missing values:     

## Data Cleaning

In [7]:
# p_id2 column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     697
#   Total non-null values: 1301
#
# NOTE(review): start_year takes only five distinct values, yet the most frequent
# p_id2 occurs 24 times, so some ids have several rows per season. Confirm whether
# those are repeated rows or different players sharing one id before relying on
# the data being one row per player-year.

# Number of rows per player id, most frequent first.
df_player_injury['p_id2'].value_counts()

Unnamed: 0_level_0,count
p_id2,Unnamed: 1_level_1
bendavies,24
dannyrose,18
dannyward,12
adamsmith,12
callumwilson,8
...,...
timowerner,1
fikayotomori,1
willkeane,1
wilfriedbony,1


In [8]:
# start_year column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1296
#   Total non-null values: 1301

# Number of rows per season start year, most frequent first.
df_player_injury['start_year'].value_counts()

Unnamed: 0_level_0,count
start_year,Unnamed: 1_level_1
2018,298
2019,279
2020,269
2016,232
2017,223


In [9]:
# season_days_injured column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1036
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
season_days_injured_counts = df_player_injury['season_days_injured'].value_counts()
display(season_days_injured_counts)

# Range check -> (0, 702)
# NOTE(review): the maximum of 702 is more than the number of days in a year;
# confirm how one season's injury days can exceed 365.
df_player_injury['season_days_injured'].min(), df_player_injury['season_days_injured'].max()

(0, 702)

In [10]:
# total_days_injured column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     952
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
total_days_injured_counts = df_player_injury['total_days_injured'].value_counts()
display(total_days_injured_counts)

# Range check -> (3, 2398)
df_player_injury['total_days_injured'].min(), df_player_injury['total_days_injured'].max()

(3, 2398)

In [11]:
# total_days_injured column
#
# NOTE(review): this cell repeats the total_days_injured check from the preceding
# cell and adds no new information; it is a candidate for removal.

#Missing values:        0
#Duplicated values:     952
#Total non-null values: 1301

# Not the cell's final expression, so this frequency table is not displayed.
df_player_injury['total_days_injured'].value_counts()

#range check -> (3, 2398)
df_player_injury['total_days_injured'].min(), df_player_injury['total_days_injured'].max()

(3, 2398)

In [12]:
# season_minutes_played column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     351
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
season_minutes_played_counts = df_player_injury['season_minutes_played'].value_counts()
display(season_minutes_played_counts)

# Range check -> (0.0, 3610.0)
df_player_injury['season_minutes_played'].min(), df_player_injury['season_minutes_played'].max()

(0.0, 3610.0)

In [13]:
# season_games_played column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1262
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
season_games_played_counts = df_player_injury['season_games_played'].value_counts()
display(season_games_played_counts)

# Range check -> (0, 38)
df_player_injury['season_games_played'].min(), df_player_injury['season_games_played'].max()

(0, 38)

In [14]:
# season_matches_in_squad column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1262
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
season_matches_in_squad_counts = df_player_injury['season_matches_in_squad'].value_counts()
display(season_matches_in_squad_counts)

# Range check -> (1, 54)
# NOTE(review): the maximum of 54 is higher than the season_games_played maximum
# of 38; presumably the two columns count different sets of fixtures - confirm.
df_player_injury['season_matches_in_squad'].min(), df_player_injury['season_matches_in_squad'].max()

(1, 54)

In [15]:
# total_minutes_played column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     747
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
total_minutes_played_counts = df_player_injury['total_minutes_played'].value_counts()
display(total_minutes_played_counts)

# Range check -> (0.0, 16768.0)
df_player_injury['total_minutes_played'].min(), df_player_injury['total_minutes_played'].max()

(0.0, 16768.0)

In [16]:
# total_games_played column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1140
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
total_games_played_counts = df_player_injury['total_games_played'].value_counts()
display(total_games_played_counts)

# Range check -> (0, 179)
df_player_injury['total_games_played'].min(), df_player_injury['total_games_played'].max()

(0, 179)

In [24]:
# dob column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     713
#   Total non-null values: 1301

# Range check -> ('1979-12-05', '2002-06-27')
# dob is an object (text) column, so min/max compare strings; for the
# YYYY-MM-DD values seen in this notebook that matches chronological order.
# The tuple is not the cell's final expression, so it has to be displayed
# explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
dob_range = (df_player_injury['dob'].min(), df_player_injury['dob'].max())
display(dob_range)

# Check that every entry parses as a date: with errors='coerce' anything
# unparseable becomes NaT, which would make the result False.
pd.to_datetime(df_player_injury['dob'], errors='coerce').notna().all()

np.True_

In [23]:
# height_cm column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1136
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
height_cm_counts = df_player_injury['height_cm'].value_counts()
display(height_cm_counts)

# Range check -> (163.0, 203.0)
df_player_injury['height_cm'].min(), df_player_injury['height_cm'].max()

(163.0, 203.0)

In [27]:
# weight_kg column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1116
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
weight_kg_counts = df_player_injury['weight_kg'].value_counts()
display(weight_kg_counts)

# Range check -> (58.0, 99.0)
df_player_injury['weight_kg'].min(), df_player_injury['weight_kg'].max()

(58.0, 99.0)

In [32]:
# nationality column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1240
#   Total non-null values: 1301

# A notebook cell only echoes its final expression, so the frequency table must
# be displayed explicitly; as a bare statement it was computed and thrown away.
# `display` is provided by the IPython kernel without an import.
nationality_counts = df_player_injury['nationality'].value_counts()
display(nationality_counts)

# Check that every entry is made up only of ASCII letters, whitespace and
# hyphens, i.e. contains no digits or other special characters.
df_player_injury['nationality'].str.match(r'^[A-Za-z\s\-]+$').all()

np.True_

In [31]:
# work_rate column
#
# Audit results copied from the per-column report above:
#   Missing values:        0
#   Duplicated values:     1293
#   Total non-null values: 1301

# Number of rows per work-rate label, most frequent first.
df_player_injury['work_rate'].value_counts()

Unnamed: 0_level_0,count
work_rate,Unnamed: 1_level_1
Medium/Medium,434
High/Medium,337
Medium/High,209
High/High,206
Low/High,29
Low/Medium,29
High/Low,29
Medium/Low,28
