In [1]:
# DEPENDENCIES
import pandas as pd
import numpy as np
import glob

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
# from os import walk

In [2]:
# NOTE: FIFA TYPICALLY RELEASES RATINGS IN SEPTEMBER OF EACH YEAR. 
        # WITH 2022 BEING A NOTABLE EXCEPTION, THE WORLD CUP IS HELD IN JUNE EVERY FOUR YEARS

# CSV FILES FOR FIFA PLAYERS IN THE LAST DECADE
# Before the 2020 World Cup
players_2021 = "Data/Players_EAFIFA/2021.csv"
players_2020 = "Data/Players_EAFIFA/2020.csv"
players_2019 = "Data/Players_EAFIFA/2019.csv"
players_2018 = "Data/Players_EAFIFA/2018.csv"
# Before the 2018 World Cup
players_2017 = "Data/Players_EAFIFA/2017.csv"
players_2016 = "Data/Players_EAFIFA/2016.csv"
players_2015 = "Data/Players_EAFIFA/2015.csv"
players_2014 = "Data/Players_EAFIFA/2014.csv"
# Before the 2014 World Cup
players_2013 = "Data/Players_EAFIFA/2013.csv"
players_2012 = "Data/Players_EAFIFA/2012.csv"

# PLAYER CSV FILES AS LIST (FOR LOOPS LATER IF NEEDED)
player_csvs = [players_2021,players_2020, players_2019,players_2018, players_2017, players_2016, players_2015, players_2014, players_2013, players_2012]

# HISOTRIC PLAYER APPEARANCES CSV FILE
file1_18624records = "Data/player_appearances.csv"
playerApps_df = pd.read_csv(file1_18624records)
playerApps_df["Name"] = playerApps_df["given_name"] + " " + playerApps_df["family_name"]

# PLAYER DATA CSV FILE
# file2_7908records = "Data/players.csv"
# playerData_df = pd.read_csv(file2_7908records)


# CHANGING MATCH_DATE TO DATETIME IN ORDER TO EXTRACT THE MATCH YEAR
playerApps_df["match_date"] = pd.to_datetime(playerApps_df["match_date"])
playerApps_df["match_year"] = playerApps_df["match_date"].dt.year


players_2021_df = pd.read_csv(players_2021)

players_2021_df.columns
playerApps_df.columns

Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
       'match_date', 'stage_name', 'group_name', 'team_id', 'team_name',
       'team_code', 'home_team', 'away_team', 'player_id', 'family_name',
       'given_name', 'shirt_number', 'position_name', 'position_code',
       'starter', 'substitute', 'captain', 'Name', 'match_year'],
      dtype='object')

In [3]:
playerApps_df_grp = playerApps_df[["match_id", "player_id", "Name"]].groupby(["player_id", "Name"]).nunique("match_id")
playerApps_df_grp 

# df_wcPlayers.groupby(["Nationality", "Any_WC_Apperance"]).nunique("ID").head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,match_id
player_id,Name,Unnamed: 2_level_1
P-00002,Javad Zarincheh,3
P-00004,Krasimir Balakov,10
P-00007,Stephen Malcolm,2
P-00008,not applicable Rivaldo,14
P-00015,Vladimir Soria,3
...,...,...
P-09989,Ziad Jaziri,6
P-09990,Bernardo Redín,3
P-09993,Francisco Silva,3
P-09994,Maciej Rybus,2


In [4]:
# TEAMS THAT QUALIFIED FOR THE 2022 WORLD CUP
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']

In [5]:
# CREATING DATAFRAME OF ALL PLAYER CSV FILES

df_list = []
for file in glob.glob("Data/Players_EAFIFA/*.csv"):
    year = file.split(".")[0]
    df = pd.read_csv(file).assign(year = year)
    df_list.append(df)

df = pd.concat(df_list)

df["year2"] = [x.strip()[-4:] for x in df["year"]]
df.drop(columns=["year"])

df = df.rename(columns={"Natinality":"Nationality","Overal":"Overall", "year":"file_source", "year2":"data_year"})

# df.info()
# df["Nation_KitNumber"].unique()
df["Name"].tail(50)

19067           Matteo Politano
19068                 Cucurella
19069                  Angeliño
19070            James Maddison
19071          Davinson Sánchez
19072            Manuel Lazzari
19073              Tomáš Souček
19074         Dominik Livaković
19075                Rúben Dias
19076              Moussa Diaby
19077                Renan Lodi
19078           Steven Berghuis
19079          Gianluigi Buffon
19080            Danilo Pereira
19081               Lucas Moura
19082         Agustín Marchesín
19083          Gabriel Paulista
19084              Tomáš Vaclík
19085                 Nick Pope
19086              Diego Carlos
19087           Martin Dúbravka
19088           Lucas Hernández
19089         Donny van de Beek
19090           Martin Ødegaard
19091              Marcos Acuña
19092       Georginio Wijnaldum
19093           Alejandro Gómez
19094          Leonardo Bonucci
19095             Péter Gulácsi
19096               Kai Havertz
19097            Erling Haaland
19098   

In [6]:
# playerApps_df["Name"]
# playerApps_df.info()

The raw dataset for 2012-2022 reveals 50,234 unique players from 188 countries who played for 1,366 different clubs during the period. In total, there were 172,125 records in the raw dataset. 

In [7]:
# MODIFICATIONS FOR MACHINE LEARNING

# adding a binary column indicating the preferred foot
df["PreferredFoot"].unique()
df.loc[df["PreferredFoot"] == "L", "PreferredFootNum"]=1
df.loc[df["PreferredFoot"] == "R", "PreferredFootNum"]=0

# dropping columns
df2 = df.drop(columns=["PreferredFoot","BirthDate", "Nation", "Nation_Position", "Nation_KitNumber", 
                       "Club","Club_Position", "Club_KitNumber", "Club_JoinedClub", "Club_ContractLength",
                       "Traits", "Specialities", "file_source"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172125 entries, 0 to 19116
Data columns (total 50 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ID                  172125 non-null  int64  
 1   Name                172125 non-null  object 
 2   Nationality         172125 non-null  object 
 3   Overall             172125 non-null  int64  
 4   Potential           172125 non-null  int64  
 5   Height              172125 non-null  int64  
 6   Weight              172125 non-null  int64  
 7   Age                 172125 non-null  int64  
 8   PreferredPositions  172125 non-null  object 
 9   PlayerWorkRate      172125 non-null  object 
 10  WeakFoot            172125 non-null  int64  
 11  SkillMoves          172125 non-null  int64  
 12  Value               169528 non-null  float64
 13  Wage                169528 non-null  float64
 14  BallControl         172125 non-null  int64  
 15  Dribbling           172125 non-null

In [8]:
df["Club"].nunique()

1366

In [11]:
df_wc = pd.merge(df2,players_2021_df["Name"], on="Name", how="left")

In [14]:
# merging on the name created an additioanl 29,172 records (+16.9%). 
# The player_id is not null for 38,066 records
# df_wc = pd.merge(df2,playerApps_df[["player_id","Name"]], on="Name", how="outer")
# WC_2022_Country = df_wc["Nationality"].isin(teams_2022)
# df_wc2["WC_2022_Country"]=df_wc[WC_2022_Country]
# df_wc2.info()






# df_wc.drop_duplicates()

df_wcPlayers = df_wc.drop_duplicates()

df_wcPlayers.loc[df_wcPlayers["player_id"].isnull(), "Any_WC_Apperance"]= "False"
df_wcPlayers.loc[df_wcPlayers["player_id"].notnull(), "Any_WC_Apperance"]= "True"
df_wcPlayers.loc[df_wcPlayers["Nationality"].isin(teams_2022), "WC_2022_Country"] = "True"

# df_wcPlayers.groupby(["Nationality", "Any_WC_Apperance"]).nunique("ID").head(40)

# df_wcPlayers["Nationality"].drop("Nationality").isnull()
# df_wcPlayers[df_wcPlayers.Nationality.notna()]
df_wcPlayers

# Nationality_notnull = df_wcPlayers[df_wcPlayers["Nationality"].notnull()]
# Nationality_notnull = df_wcPlayers[df_wcPlayers["WC_2022_Country"].notnull()]

# Nationality_notnull = Nationality_notnull.drop(columns = ["Composure", "PreferredPositions", "PlayerWorkRate", "data_year", "Value", "Wage"])
# Nationality_notnull.info()


KeyError: 'player_id'

In [None]:
# MODEL

X = Nationality_notnull.drop("Any_WC_Apperance", axis = 1)
X_dummies = pd.get_dummies(X)
X_dummies

In [None]:
y_label = LabelEncoder().fit_transform(Nationality_notnull["Any_WC_Apperance"])
y_label

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label, random_state=13)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

In [None]:
df_wc["Prev_WC_Appearance"] = np.where(df_wc["player_id"].notnull(), "True", "False")

df_wc.drop_duplicates(keep=False)
df_wc[["Name", "Prev_WC_Appearance", "Wage", "data_year"]].head(20)

In [None]:
df_wc2[["Prev_WC_Appearance", "Nationality", "ID"]].groupby(["Nationality", "Prev_WC_Appearance"]).nunique("ID")

In [None]:
df_wc3 = np.where(df_wc2[df_wc2["Prev_WC_Appearance"]=="False"])
df_wc3