# Study case | FIFA

## 01 - Problem (case study)

### Data Description

The data is provided as a CSV table with the following characteristics:

* Number of rows: 13700
* Number of columns: 102 <br>
<br>
* Number of numerical variables: 54
* Number of categorical/non-numerical variables: 48

## 01 - Goal

The objective of this case study is to understand how FIFA players are scored. We will use predictive analytics to build a model that predicts a player's overall score (OVA) from multiple variables.

## 02 - Getting Data and Preparing Notebook

In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats as stats
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [184]:
# Display the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [185]:
# Load the FIFA 21 training set and show its shape.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your local copy of the CSV before running.
data = pd.read_csv('C:/Student/IRONHACK/Week2/D1/Case_study/fifa21_trainning.csv')
data.shape

# Split the columns by dtype. The builtin `object` is used instead of the
# `np.object` alias, which was deprecated in NumPy 1.20 and removed in 1.24
# (it raises AttributeError there).
numerical = data.select_dtypes(include=np.number)
categorical = data.select_dtypes(include=object)
numerical.shape
categorical.shape
data.head(2)

(13700, 102)

(13700, 54)

(13700, 48)

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Position,Team & Contract,Height,...,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,OVA
0,1954,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,FC Lausanne-Sport 2015 ~ 2020,"5'9""",...,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,2225,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,"Beijing Sinobo Guoan FC Dec 31, 2020 On Loan","6'0""",...,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77


## 03 - Cleaning/Wrangling/EDA

In [186]:
# List the column names of each dtype group (both results are displayed)
numerical.columns
categorical.columns

Index(['Unnamed: 0', 'ID', 'Age', 'Growth', 'Attacking', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys', 'Skill',
       'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control',
       'Movement', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions',
       'Balance', 'Power', 'Shot Power', 'Jumping', 'Stamina', 'Strength',
       'Long Shots', 'Mentality', 'Aggression', 'Interceptions', 'Positioning',
       'Vision', 'Penalties', 'Composure', 'Defending', 'Marking',
       'Standing Tackle', 'Sliding Tackle', 'Goalkeeping', 'GK Diving',
       'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes',
       'Total Stats', 'Base Stats', 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY',
       'OVA'],
      dtype='object')

Index(['Name', 'Nationality', 'Club', 'BP', 'Position', 'Team & Contract',
       'Height', 'Weight', 'foot', 'Joined', 'Loan Date End', 'Value', 'Wage',
       'Release Clause', 'Contract', 'W/F', 'SM', 'A/W', 'D/W', 'IR', 'Hits',
       'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM',
       'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB',
       'LCB', 'CB', 'RCB', 'RB', 'GK'],
      dtype='object')

In [187]:
# Summary statistics: count/mean/std/quartiles for the numerical columns,
# count/unique/top/freq for the non-numerical ones
numerical.describe()
categorical.describe()

Unnamed: 0.1,Unnamed: 0,ID,Age,Growth,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,...,GK Reflexes,Total Stats,Base Stats,PAC,SHO,PAS,DRI,DEF,PHY,OVA
count,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13656.0,...,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0,13700.0
mean,8561.641971,219378.929708,25.286058,5.509927,258.313431,51.585766,47.996934,53.460292,60.427956,44.986965,...,15.830292,1630.103942,361.317007,68.108832,55.056423,58.931679,64.236277,50.123577,64.860219,66.945912
std,4954.144383,37445.343736,4.944498,5.797176,72.412379,17.89993,19.41153,17.010578,13.975634,17.760924,...,17.276491,260.986045,40.258414,11.183342,13.839958,10.159851,9.816203,16.81511,9.736964,6.860189
min,1.0,16.0,16.0,-1.0,33.0,6.0,3.0,5.0,8.0,4.0,...,1.0,731.0,228.0,26.0,17.0,25.0,28.0,12.0,27.0,38.0
25%,4269.75,203979.75,21.0,0.0,231.0,41.0,33.0,45.0,56.0,32.0,...,8.0,1491.0,333.0,62.0,46.0,52.0,59.0,35.0,59.0,62.0
50%,8555.5,228927.0,25.0,4.0,271.0,56.0,52.0,56.0,63.0,47.0,...,11.0,1659.0,362.0,69.0,58.0,60.0,65.0,53.0,66.0,67.0
75%,12887.25,244043.5,29.0,9.0,306.0,65.0,64.0,65.0,69.0,59.0,...,14.0,1811.25,389.0,75.0,65.0,66.0,71.0,64.0,72.0,72.0
max,17122.0,259091.0,47.0,26.0,437.0,94.0,95.0,93.0,94.0,90.0,...,90.0,2304.0,498.0,96.0,93.0,93.0,95.0,90.0,93.0,93.0


Unnamed: 0,Name,Nationality,Club,BP,Position,Team & Contract,Height,Weight,foot,Joined,...,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK
count,13700,13700,13679,13700,13357,13700,13700,13700,13700,13656,...,13700,13700,13700,13700,13700,13700,13700,13700,13700,13700
unique,13064,160,897,15,583,8524,21,55,2,1803,...,301,301,301,271,265,321,321,321,265,158
top,J. Rojas,England,Everton,CB,CB,India Free,"6'0""",154lbs,Right,"Jul 1, 2019",...,59+2,59+2,59+2,59+2,62+2,61+2,61+2,61+2,62+2,16+2
freq,8,1342,38,2553,1595,30,2053,1073,10322,940,...,410,410,410,514,454,360,360,360,454,2510


In [188]:
# Preview the first rows of the numerical columns
numerical.head()

Unnamed: 0.1,Unnamed: 0,ID,Age,Growth,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,...,GK Reflexes,Total Stats,Base Stats,PAC,SHO,PAS,DRI,DEF,PHY,OVA
0,1954,184383,26,1,258,54,47,43,70,44.0,...,6,1682,357,69,51,63,63,51,60,64
1,2225,188044,30,0,365,66,79,76,68,76.0,...,16,1961,412,83,75,68,82,33,71,77
2,1959,184431,33,0,336,73,76,34,78,75.0,...,3,1925,404,80,77,78,86,27,56,80
3,9815,233796,22,13,242,44,42,58,62,36.0,...,12,1527,329,57,44,54,57,57,60,59
4,10074,234799,23,8,249,49,37,61,68,34.0,...,15,1664,360,66,44,60,64,60,66,65


#### Data cleaning and formatting

In [189]:
# Remove the columns that are not used for modelling (both numerical and
# non-numerical ones), then show the resulting shape.
columns_to_drop = [
    # leftover index column from the CSV export
    'Unnamed: 0',
    # per-position rating strings such as "59+1"
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM',
    'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB',
    'LCB', 'CB', 'RCB', 'RB', 'GK',
    # position list and contract / transfer details
    'Position', 'Team & Contract', 'Joined', 'Loan Date End', 'Release Clause',
    'Contract',
    # category and overall totals
    'Attacking', 'Skill', 'Movement', 'Power', 'Mentality', 'Defending',
    'Goalkeeping', 'Total Stats', 'Base Stats',
]
data = data.drop(columns=columns_to_drop)
data.shape

(13700, 59)

In [190]:
# Recompute the dtype groups now that columns have been dropped.
# The builtin `object` replaces the `np.object` alias (deprecated in
# NumPy 1.20, removed in 1.24).
numerical = data.select_dtypes(include=np.number)
categorical = data.select_dtypes(include=object)
numerical.shape
categorical.head(2)
data.head(2)

(13700, 44)

Unnamed: 0,Name,Nationality,Club,BP,Height,Weight,foot,Value,Wage,W/F,SM,A/W,D/W,IR,Hits
0,A. Pasche,Switzerland,FC Lausanne-Sport,CM,"5'9""",161lbs,Right,€525K,€4K,4 ★,2★,High,Medium,1 ★,3
1,Alan Carvalho,China PR,Beijing Sinobo Guoan FC,ST,"6'0""",159lbs,Right,€8.5M,€23K,3 ★,4★,High,Low,2 ★,44


Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Height,Weight,foot,Growth,...,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits,OVA
0,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,"5'9""",161lbs,Right,1,...,Medium,1 ★,69,51,63,63,51,60,3,64
1,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,"6'0""",159lbs,Right,0,...,Low,2 ★,83,75,68,82,33,71,44,77


In [191]:

# NOTE(review): disabled draft of a string-to-number parser for Value, Wage,
# Weight and the star-rating columns. Known problems to fix before enabling it:
#   * in the 'K' branch, replace('€','') is called after X has already become
#     a float, and in both the 'K' and 'M' branches float() runs while the
#     '€' sign is still part of X;
#   * a wage without a suffix (e.g. '€700') matches no branch and stays a string;
#   * the "'" branch turns 5'9" into 5.9" and float() then fails on the quote.
#X = '20M'
# def formating(X):
#     if 'K' in X:
#         X = X.replace('K','')
#         X = float(X)
#         X = X*1000
#         X = X.replace('€','')
#     elif 'M' in X:
#         X = X.replace('M','')
#         X = float(X)
#         X = X*1000000
#     elif 'lbs' in X:
#         X = X.replace('lbs','')
#         X = float(X)
#     elif '★' in X:
#         X = X.replace('★','')
#         X = float(X)
#     elif "'" in X:
#         X = X.replace("'",".")
#         X = float(X)
#     elif '"'in X:
#         X = X.replace('"','')
#         X = float(X)
#     return X

# data['Weight'] = list(map(formating,data['Weight']))
# # data['Height'] = list(map(formating,data['Height']))
# data['SM'] = list(map(formating,data['SM']))
# data['IR'] = list(map(formating,data['IR']))
# data['Wage'] = list(map(formating,data['Wage']))
# data['Value'] = list(map(formating,data['Value']))
# data['W/F'] = list(map(formating,data['W/F']))

#### Drop NaN values

In [192]:
# Keep only the rows that have no missing value in any column
data = data.dropna()
data.shape

(13358, 59)

#### Identify again numerical and categorical variables

In [193]:
# Dtype groups of the cleaned frame (rows with missing values removed).
# The builtin `object` replaces the `np.object` alias (deprecated in
# NumPy 1.20, removed in 1.24).
numerical1 = data.select_dtypes(include=np.number)
categorical1 = data.select_dtypes(include=object)
numerical1
categorical1

Unnamed: 0,ID,Age,Growth,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Dribbling,Curve,...,GK Kicking,GK Positioning,GK Reflexes,PAC,SHO,PAS,DRI,DEF,PHY,OVA
0,184383,26,1,54,47,43,70,44.0,61,44.0,...,14,9,6,69,51,63,63,51,60,64
1,188044,30,0,66,79,76,68,76.0,83,78.0,...,14,7,16,83,75,68,82,33,71,77
2,184431,33,0,73,76,34,78,75.0,85,89.0,...,6,3,3,80,77,78,86,27,56,80
3,233796,22,13,44,42,58,62,36.0,54,41.0,...,6,7,12,57,44,54,57,57,60,59
4,234799,23,8,49,37,61,68,34.0,64,44.0,...,15,5,15,66,44,60,64,60,66,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13695,239074,21,11,59,23,42,51,22.0,51,30.0,...,7,10,13,76,28,46,55,53,57,60
13696,241223,21,9,13,9,14,34,7.0,9,13.0,...,57,54,62,60,55,57,62,30,54,59
13697,210930,27,0,76,72,34,79,71.0,77,76.0,...,16,9,6,65,69,78,77,39,60,76
13698,162993,34,0,58,44,61,62,43.0,56,47.0,...,9,9,8,53,47,58,58,61,69,63


Unnamed: 0,Name,Nationality,Club,BP,Height,Weight,foot,Value,Wage,W/F,SM,A/W,D/W,IR,Hits
0,A. Pasche,Switzerland,FC Lausanne-Sport,CM,"5'9""",161lbs,Right,€525K,€4K,4 ★,2★,High,Medium,1 ★,3
1,Alan Carvalho,China PR,Beijing Sinobo Guoan FC,ST,"6'0""",159lbs,Right,€8.5M,€23K,3 ★,4★,High,Low,2 ★,44
2,S. Giovinco,Italy,Al Hilal,CAM,"5'4""",134lbs,Right,€9M,€49K,4 ★,4★,High,Medium,2 ★,73
3,J. Evans,Wales,Swansea City,CDM,"5'10""",152lbs,Right,€275K,€4K,2 ★,2★,Medium,Medium,1 ★,7
4,Y. Demoncy,France,US Orléans Loiret Football,CDM,"5'11""",150lbs,Right,€725K,€2K,2 ★,3★,Low,Medium,1 ★,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13695,S. Aw,Senegal,Gil Vicente FC,LB,"5'8""",143lbs,Left,€325K,€1K,3 ★,2★,High,Medium,1 ★,4
13696,S. Mogi,Japan,Cerezo Osaka,GK,"6'5""",176lbs,Right,€190K,€700,2 ★,1★,Medium,Medium,1 ★,3
13697,Carles Gil,Spain,New England Revolution,RM,"5'7""",146lbs,Left,€8M,€9K,4 ★,4★,High,Medium,2 ★,15
13698,J. Perch,England,Mansfield Town,CDM,"5'11""",176lbs,Right,€140K,€4K,3 ★,2★,Medium,Medium,1 ★,4


## 04 - Processing Data

In [194]:
# Pairwise correlation between all numerical columns (the target OVA included)
corr_matrix = numerical1.corr()
# Uncomment to inspect the matrix or to plot it as an annotated heatmap
# corr_matrix
# sns.heatmap(corr_matrix, annot=True)
# plt.show()

In [195]:
# Reactions is removed because it is too strongly correlated with the target OVA.
# NOTE(review): a high correlation with the target is normally a reason to keep
# a feature - confirm that dropping it is intended.
data = data.drop(columns=['Reactions'])
data.shape

(13358, 58)

#### Separate x and y

In [196]:
data.shape

# Target: the overall rating. Features: every remaining column.
y = data['OVA']
x = data.drop(columns=['OVA'])

# Split the features by dtype. The builtin `object` replaces the `np.object`
# alias (deprecated in NumPy 1.20, removed in 1.24).
# NOTE(review): X_num still contains the player `ID` column, which looks like an
# identifier rather than a predictor - confirm it should be a model feature.
X_num = x.select_dtypes(include=np.number)
X_cat = x.select_dtypes(include=object)

X_num.shape
X_cat.shape

(13358, 58)

(13358, 42)

(13358, 15)

#### Normalize (numerical)

In [197]:
# Min-max scale every numerical feature.
MinMaxtransformer = MinMaxScaler().fit(X_num)
x_normalized = MinMaxtransformer.transform(X_num)
print(x_normalized.shape)
# Rebuild a DataFrame that keeps X_num's row labels. After dropna the index has
# gaps, so a fresh 0..n-1 index would no longer match `y` in any label-based
# operation (concat, join, comparing predictions with y by index).
x_normalized = pd.DataFrame(x_normalized, columns=X_num.columns, index=X_num.index)
x_normalized.head()

(13358, 42)


Unnamed: 0,ID,Age,Growth,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Dribbling,Curve,...,GK Handling,GK Kicking,GK Positioning,GK Reflexes,PAC,SHO,PAS,DRI,DEF,PHY
0,0.711636,0.322581,0.038462,0.545455,0.478261,0.431818,0.710843,0.465116,0.615385,0.444444,...,0.111111,0.141304,0.078652,0.045455,0.614286,0.447368,0.558824,0.522388,0.5,0.515625
1,0.725767,0.451613,0.0,0.681818,0.826087,0.806818,0.686747,0.837209,0.857143,0.822222,...,0.055556,0.141304,0.05618,0.159091,0.814286,0.763158,0.632353,0.80597,0.269231,0.6875
2,0.711821,0.548387,0.0,0.761364,0.793478,0.329545,0.807229,0.825581,0.879121,0.944444,...,0.011111,0.054348,0.011236,0.011364,0.771429,0.789474,0.779412,0.865672,0.192308,0.453125
3,0.902364,0.193548,0.5,0.431818,0.423913,0.602273,0.614458,0.372093,0.538462,0.411111,...,0.077778,0.054348,0.05618,0.113636,0.442857,0.355263,0.426471,0.432836,0.576923,0.515625
4,0.906236,0.225806,0.307692,0.488636,0.369565,0.636364,0.686747,0.348837,0.648352,0.444444,...,0.077778,0.152174,0.033708,0.147727,0.571429,0.355263,0.514706,0.537313,0.615385,0.609375


#### One Hot/Label Encoding (categorical).

In [198]:
# # Encoding Categorical Data (currently disabled).
# # Use the two categorical variables expected to have the most influence in predicting the overall score (OVA)
# X_cat = data[['Club','BP']]
# #One Hot Encoding 
# encoder = OneHotEncoder().fit(X_cat)
# X_cat_encoded = encoder.transform(X_cat).toarray()
# enc = [columname for sublist in encoder.categories_ for columname in sublist]
# onehot_encoded = pd.DataFrame(X_cat_encoded,columns=enc)
# onehot_encoded.head()
# onehot_encoded.shape

#### Concatenation and x,y split

In [199]:
# Disabled together with the one-hot encoding step: joins the scaled numerical
# features with the encoded categorical ones into a single feature frame X.
# x_normalized.shape
# onehot_encoded.shape
# X = pd.concat([x_normalized, onehot_encoded],axis=1)
# X.head(2)

#### Train, test split

In [200]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x_normalized,
    y,
    test_size=0.25,
    random_state=100,
)
X_train.head()

Unnamed: 0,ID,Age,Growth,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Dribbling,Curve,...,GK Handling,GK Kicking,GK Positioning,GK Reflexes,PAC,SHO,PAS,DRI,DEF,PHY
12989,0.76391,0.322581,0.038462,0.636364,0.326087,0.715909,0.783133,0.627907,0.736264,0.633333,...,0.022222,0.032609,0.022472,0.022727,0.357143,0.447368,0.676471,0.656716,0.74359,0.6875
3228,0.818616,0.258065,0.153846,0.590909,0.597826,0.386364,0.722892,0.686047,0.714286,0.6,...,0.133333,0.108696,0.146067,0.068182,0.571429,0.552632,0.588235,0.626866,0.653846,0.65625
10311,0.861009,0.322581,0.076923,0.056818,0.076087,0.068182,0.228916,0.093023,0.076923,0.144444,...,0.666667,0.641304,0.685393,0.738636,0.585714,0.592105,0.514706,0.58209,0.423077,0.5625
6694,0.936358,0.129032,0.384615,0.204545,0.293478,0.693182,0.53012,0.325581,0.296703,0.266667,...,0.1,0.119565,0.134831,0.136364,0.385714,0.236842,0.294118,0.134328,0.679487,0.734375
10723,0.928059,0.129032,0.384615,0.420455,0.228261,0.704545,0.554217,0.267442,0.637363,0.266667,...,0.088889,0.076087,0.123596,0.090909,0.585714,0.144737,0.294118,0.507463,0.692308,0.734375


## 05 - Modeling

In [201]:
# Report the dimensions of each split, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

# Fit a linear regression on the training split
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)

(10018, 42)
(3340, 42)
(10018,)
(3340,)


## 06 - Model Validation

In [202]:
# Predict OVA for the held-out test rows with the fitted model
predictions = lm.predict(X_test)

In [203]:
# Evaluate the predictions on the test split.
# R2: coefficient of determination between true and predicted OVA.
r2 = r2_score(y_test, predictions)
print(r2)
# MSE and its square root (RMSE, expressed in the same units as OVA).
# Both were previously computed but never shown, so they are reported here.
mse = mean_squared_error(y_test, predictions)
rmse = math.sqrt(mse)
print("MSE:", mse)
print("RMSE:", rmse)


0.8640699106927789


In [204]:
# Show the R2 score as the cell result
r2

0.8640699106927789