The FIFA 19 dataset contains over 18,000 rows and 89 columns. The dataset can be found here: https://www.kaggle.com/karangadiya/fifa19
<br><br><br>

---   

Taken from https://www.kaggle.com/ap1495/fifa-19-classification-regression

### Import required libraries

In [10]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
import warnings
warnings.filterwarnings('ignore')
import pathlib
import pandas_profiling

In [11]:
# Initialise plotly's offline notebook mode so figures can be rendered inside the notebook.
init_notebook_mode(connected=True)
# Put cufflinks into offline mode as well (cufflinks is imported as `cf` above).
cf.go_offline()

### Read the data

In [12]:
# The CSV is expected under ./fifa_19_dataset/ relative to the current working directory.
cwd = pathlib.Path.cwd()
data = pd.read_csv(cwd / 'fifa_19_dataset' / 'data.csv')

# Split the columns by dtype: object columns are reported as categorical,
# every other dtype as numerical.
categorical_columns = data.select_dtypes(include=object).columns
numerical_columns = data.select_dtypes(exclude=object).columns
print('Number of Categorical Columns: ', len(categorical_columns))
print('Number of Numerical Columns: ', len(numerical_columns))

Number of Categorical Columns:  45
Number of Numerical Columns:  44


In [13]:
# Build a profiling report of the raw, uncleaned data.
# NOTE(review): profile_report() is presumably the DataFrame method registered by the
# pandas_profiling import above - confirm.
report = data.profile_report()

# Save the report as an HTML file (relative path).
report.to_file(output_file="FIFA19_original.html")

In [3]:
# We have a dataset with 18,207 rows which includes 45 categorical features and 44 numerical features.

# Dealing with unnecessary features and missing values

# Drop columns that are of very little significance for the analysis.
data.drop(columns=['Unnamed: 0', 'ID', 'Photo', 'Flag', 'Club Logo', 'Special', 'Real Face', 'Release Clause',
                   'Joined', 'Contract Valid Until'], inplace=True)

# Drop 'Loaned From' as more than half of its values are missing.
data.drop(columns=['Loaned From'], inplace=True)

# There are still a lot of missing values to deal with. Fill them in where a sensible
# default exists and drop the affected rows otherwise.

# Players who are not part of any club.
# Assign the result back rather than calling fillna(inplace=True) on data['Club']:
# an in-place call on a selected column is not guaranteed to write through to `data`
# (it is deprecated and has no effect under pandas Copy-on-Write).
data['Club'] = data['Club'].fillna('No Club')

# Rows without a 'Preferred Foot' are NaN for many other features as well, so drop them.
data.dropna(subset=['Preferred Foot'], inplace=True)

# Position could be filled in manually, but the LS, RS, CF, etc. features have no values
# for these rows, so drop them.
data.dropna(subset=['Position'], inplace=True)

# The per-position features (LS, RS, CF, ...) are not set for goalkeepers. Dropping those
# rows would remove every goalkeeper from the dataset, so fill the remaining gaps with 0.
data.fillna(value=0, inplace=True)

Number of Categorical Columns:  45
Number of Numerical Columns:  44


### Converting categorical features to appropriate numerical features

In [4]:
#Function to convert value and wage of the player.
def currencyConverter(val):
    """Convert a formatted money string such as '€110.5M' or '€565K' to a number.

    A trailing 'M' multiplies the amount by 1,000,000 and a trailing 'K' by
    1,000.  A single leading non-numeric character (the currency symbol) is
    ignored.

    Args:
        val: The formatted amount.  Non-string values (for example the 0
            written by an earlier ``fillna``) are returned unchanged.

    Returns:
        The amount as a float, or 0 when the string is empty or a
        suffix-less string cannot be parsed as a number.

    Raises:
        ValueError: If the string ends in 'M' or 'K' but the part before the
            suffix is not a valid number.
    """
    if not isinstance(val, str):
        # Already numeric (or a filled-in missing value): nothing to parse.
        return val

    text = val.strip()
    if not text:
        return 0

    multipliers = {'M': 1000000, 'K': 1000}
    factor = multipliers.get(text[-1])
    number = text if factor is None else text[:-1]

    # Strip the currency symbol only when one is actually present, so that
    # plain inputs like '5M' are parsed as well.
    if number and not (number[0].isdigit() or number[0] in '+-.'):
        number = number[1:]

    if factor is None:
        # No unit suffix: keep the plain amount (e.g. '€500' -> 500.0) and
        # fall back to 0 for anything unrecognised.
        try:
            return float(number)
        except ValueError:
            return 0

    return float(number) * factor

# Derive numeric amount columns from the formatted 'Value' and 'Wage' strings,
# then remove the original text columns.
for source_col, target_col in (('Value', 'Value in Pounds'), ('Wage', 'Wage in Pounds')):
    data[target_col] = data[source_col].apply(currencyConverter)

data.drop(columns=['Value', 'Wage'], inplace=True)

# Value and Wage have been converted.

#Function to convert skill rating at each position.
def skillConverter(val):
    """Convert a positional rating such as '88+2' into its integer total.

    The string is split on the first '+' and both sides are summed, so the
    base and the bonus may each have any number of digits.  A string without
    a '+' is parsed as a plain integer.

    Args:
        val: The rating string.  Non-string values (for example the 0 used
            to fill missing ratings) are returned unchanged.

    Returns:
        The summed rating as an int, or ``val`` itself when it is not a
        string.

    Raises:
        ValueError: If either side of the '+' is not a valid integer.
    """
    if not isinstance(val, str):
        return val

    base, _, bonus = val.partition('+')
    return int(base) + (int(bonus) if bonus else 0)

# Per-position rating columns that hold 'NN+N' style strings.
skill_columns = [
    'LS', 'ST', 'RS',
    'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM',
    'LM', 'LCM', 'CM', 'RCM', 'RM',
    'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
    'LB', 'LCB', 'CB', 'RCB', 'RB',
]

# Replace each rating column with its numeric form, one column at a time.
for col in skill_columns:
    converted = data[col].apply(skillConverter)
    data[col] = converted

def height_converter(val):
    """Convert a feet'inches height string (e.g. "5'7") to centimetres.

    The text before the first apostrophe is read as whole feet and the text
    after it as whole inches.
    """
    parts = val.split("'")
    feet_text, inches_text = parts[0], parts[1]
    # 1 ft = 30.48 cm, 1 in = 2.54 cm
    return (int(feet_text) * 30.48) + (int(inches_text) * 2.54)

def weight_converter(val):
    """Return the integer number of pounds from a string such as '159lbs'."""
    # Everything before the first 'lbs' is the numeric part.
    pounds_text, _, _ = val.partition('lbs')
    return int(pounds_text)

# Add numeric height/weight columns, then remove the original text columns.
for source_col, target_col, converter in (
    ('Height', 'Height in Cms', height_converter),
    ('Weight', 'Weight in Pounds', weight_converter),
):
    data[target_col] = data[source_col].apply(converter)

data.drop(columns=['Height', 'Weight'], inplace=True)


# For the remaining columns - **Work Rate, Body Type, Position**, we will not be converting them to numerical features right now. Here, we ensure these features have appropriate values and they will be converted to numerical features when feeding this data to our machine learning models.

# A few rows carry a one-off 'Body Type' label instead of Lean / Normal / Stocky;
# map those labels onto the regular categories.
# Series.replace is used instead of `data['Body Type'][mask] = ...`: that form is
# chained assignment, which may write to a temporary copy rather than `data`
# (and never updates `data` under pandas Copy-on-Write).
body_type_fixes = {
    'Messi': 'Lean',
    'C. Ronaldo': 'Normal',
    'Neymar': 'Lean',
    'Courtois': 'Lean',
    # PLAYER_BODY_TYPE_25 is the body type of Mohammed Salah who has a Normal body type.
    'PLAYER_BODY_TYPE_25': 'Normal',
    'Shaqiri': 'Stocky',
    'Akinfenwa': 'Stocky',
}
data['Body Type'] = data['Body Type'].replace(body_type_fixes)

# Let us simplify the above positions into 4 simple categories: **F**orwards, **M**idfielders, **D**efenders and **G**oal**K**eepers

def position_simplifier(val):
    """Collapse a detailed position code into a broad group.

    Forward codes map to 'F', midfield codes to 'M' and defensive codes to
    'D'.  Any other value (for example 'GK') is returned unchanged.
    """
    forwards = ('RF', 'ST', 'LF', 'RS', 'LS', 'CF')
    midfielders = ('LW', 'RCM', 'LCM', 'LDM', 'CAM', 'CDM', 'RM',
                   'LAM', 'LM', 'RDM', 'RW', 'CM', 'RAM')
    defenders = ('RCB', 'CB', 'LCB', 'LB', 'RB', 'RWB', 'LWB')

    for label, codes in (('F', forwards), ('M', midfielders), ('D', defenders)):
        if val in codes:
            return label

    # Not a known outfield code: leave the value as it is.
    return val
        

# Overwrite 'Position' with its simplified group: F, M or D, with other values (e.g. GK) left as they are.
data['Position'] = data['Position'].apply(position_simplifier)

# Profiling

In [7]:
# Build a profiling report of the cleaned and converted data (rebinds `report`).
report = data.profile_report()

In [9]:
# Save the report as an HTML file (relative path).
report.to_file(output_file="FIFA19.html")