In [1]:
import pandas as pd

import numpy as np

import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the raw FIFA 21 player sheet and preview its first rows
raw_dataset_path = './Dataset/Dataset-FIFA-21.xlsx'

df = pd.read_excel(raw_dataset_path)

df.head()

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Composure,Defensive Awareness,Standing Tackle,Sliding Tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Loaned From
0,206517,Jack Grealish,24,https://cdn.sofifa.com/players/206/517/21_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,83,86,Aston Villa,https://cdn.sofifa.com/teams/2/30.png,...,83.0,48.0,49.0,43.0,11.0,6,11,10,13.0,
1,208532,Kohei Yamada,23,https://cdn.sofifa.com/players/208/532/13_60.png,Japan,https://cdn.sofifa.com/flags/jp.png,52,59,Colorado Rapids,https://cdn.sofifa.com/teams/694/30.png,...,,,40.0,41.0,5.0,6,6,15,10.0,
2,21146,Michael Carrick,35,https://cdn.sofifa.com/players/021/146/18_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,79,79,Manchester United,https://cdn.sofifa.com/teams/11/30.png,...,88.0,,76.0,69.0,13.0,12,11,15,9.0,
3,225863,Olivier Boscagli,22,https://cdn.sofifa.com/players/225/863/21_60.png,France,https://cdn.sofifa.com/flags/fr.png,75,80,PSV,https://cdn.sofifa.com/teams/247/30.png,...,71.0,76.0,74.0,75.0,8.0,7,14,8,12.0,
4,212273,Clinton Njie,26,https://cdn.sofifa.com/players/212/273/21_60.png,Cameroon,https://cdn.sofifa.com/flags/cm.png,73,73,,https://cdn.sofifa.com/flags/cm.png,...,71.0,24.0,30.0,27.0,13.0,10,9,9,11.0,


#### Let's have a quick look at the data

In [3]:
# Print each column's non-null count and dtype for the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 60 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        420 non-null    int64  
 1   Name                      420 non-null    object 
 2   Age                       420 non-null    int64  
 3   Photo                     420 non-null    object 
 4   Nationality               420 non-null    object 
 5   Flag                      420 non-null    object 
 6   Overall                   420 non-null    int64  
 7   Potential                 420 non-null    int64  
 8   Club                      419 non-null    object 
 9   Club Logo                 420 non-null    object 
 10  Value (€))                420 non-null    object 
 11  Wage (€)                  420 non-null    object 
 12  Special                   420 non-null    int64  
 13  Preferred Foot            420 non-null    object 
 14  Weak Foot 

In [4]:
# Summary statistics of the numeric columns, one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,420.0,226847.652381,29768.468915,41.0,216379.0,233845.0,243816.0,261050.0
Age,420.0,22.592857,4.140544,16.0,20.0,22.0,24.0,38.0
Overall,420.0,78.164286,6.19486,52.0,75.0,79.0,82.0,93.0
Potential,420.0,85.130952,3.395841,59.0,84.0,85.0,87.0,95.0
Special,420.0,1923.690476,219.176304,918.0,1817.75,1952.5,2084.0,2347.0
International Reputation,420.0,1.638095,1.016558,1.0,1.0,1.0,2.0,5.0
Jersey Number,420.0,16.861905,13.99062,1.0,8.0,14.0,21.25,98.0
Weight (lbs),420.0,163.661905,16.603907,123.0,152.0,163.0,174.0,209.0
Likes,420.0,202.595238,176.723434,0.0,89.0,161.5,258.0,1296.0
Dislikes,420.0,23.904762,34.093588,0.0,8.0,16.0,27.0,426.0


#### We now have a first glimpse of the data. We can tell that there are many null values, special characters (which changed the data type of some columns), and some unnecessary columns (which are not correlated). Thus, we will drop those columns and remove all special characters.

In [5]:
# Columns that are removed before cleaning (image links, loan and contract details)
unneeded_columns = [
    'Photo',
    'Flag',
    'Club Logo',
    'Loaned From',
    'Release Clause',
    'Joined',
    'Contract Valid Until',
]

df.drop(columns=unneeded_columns, inplace=True)

In [6]:
# Removing all special characters, so we can interpret them

def value_to_int(df_value):
    """Convert a formatted money value such as '€1.5M' into a plain number.

    Parameters
    ----------
    df_value : str or number
        A cell value. Strings may start with one currency symbol and may end
        with an 'M' (millions) or 'K' (thousands) suffix, e.g. '€110.5M',
        '€500K' or '€500'. Numeric cells are passed through as floats.

    Returns
    -------
    float or int
        The numeric amount. 0 is returned when the value is missing or
        cannot be parsed as a number.
    """
    multipliers = {'M': 1000000, 'K': 1000}

    if not isinstance(df_value, str):
        # Already-numeric cells pass through; missing (NaN/None) cells become 0.
        try:
            number = float(df_value)
        except (TypeError, ValueError):
            return 0
        # NaN is the only float that is not equal to itself.
        return 0 if number != number else number

    text = df_value.strip()

    # Drop a single leading currency symbol, but never a digit, sign or dot,
    # so values without a symbol keep their first digit.
    if text and not (text[0].isdigit() or text[0] in '+-.'):
        text = text[1:]

    suffix = text[-1:]

    try:
        if suffix in multipliers:
            value = float(text[:-1]) * multipliers[suffix]
        else:
            # No K/M suffix: the last character is part of the number itself.
            value = float(text)
    except ValueError:
        value = 0

    return value

In [7]:
# Convert both money columns from formatted strings to plain numbers
for money_column in ('Value (€))', 'Wage (€)'):

    df[money_column] = df[money_column].apply(value_to_int)

#### Now, we will check whether any further cleaning is needed.

In [8]:
# Re-inspect dtypes and non-null counts after dropping columns and converting the money columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 53 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        420 non-null    int64  
 1   Name                      420 non-null    object 
 2   Age                       420 non-null    int64  
 3   Nationality               420 non-null    object 
 4   Overall                   420 non-null    int64  
 5   Potential                 420 non-null    int64  
 6   Club                      419 non-null    object 
 7   Value (€))                420 non-null    float64
 8   Wage (€)                  420 non-null    float64
 9   Special                   420 non-null    int64  
 10  Preferred Foot            420 non-null    object 
 11  Weak Foot                 420 non-null    object 
 12  Skill Moves               420 non-null    object 
 13  International Reputation  420 non-null    int64  
 14  Work Rate 

 11  Weak Foot                 420 non-null    object 
 
 12  Skill Moves               420 non-null    object

#### We need to find out why these columns have the object data type, because they are supposed to be numeric.

In [9]:
# List the 'Skill Moves' entries that are not made up of digits only
skill_moves = df['Skill Moves']

is_not_digit = ~skill_moves.str.isdigit()

skill_moves[is_not_digit].tolist()

['3+1']

In [10]:
# List the 'Weak Foot' entries that are not made up of digits only
weak_foot = df['Weak Foot']

is_not_digit = ~weak_foot.str.isdigit()

weak_foot[is_not_digit].tolist()

['3+1']

In [11]:
# Replacing '3+1' with 4 and changing the data type of the 'Skill Moves'
# and 'Weak Foot' columns.
#
# Series.replace is used instead of Series.map: mapping with a dict turns every
# value that is not a key of the dict into NaN, which would erase all the valid
# ratings and keep only the corrected '3+1' entry.

for rating_column in ['Skill Moves', 'Weak Foot']:

    corrected = df[rating_column].replace('3+1', 4)

    # Cast every present value to int; missing values are left untouched
    df[rating_column] = corrected.apply(lambda row: int(row) if not pd.isnull(row) else row)

#### We will save the cleaned file for future use.

In [12]:
# Write the cleaned frame to a new workbook, leaving out the index column
cleaned_dataset_path = './Dataset/Dataset-FIFA-21-cleaned.xlsx'

df.to_excel(cleaned_dataset_path, index=False)