# FIFA 21 MoneyBall

Use a Linear Regression Model to predict the Value of FIFA players.

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
from scipy.stats.mstats import winsorize

import math
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.filterwarnings('ignore')

## Import Dataset

In [4]:
data = pd.read_csv(r"Data/fifa21_male2.csv")

## Review and Clean Data

In [5]:
data.shape

(17125, 107)

In [8]:
data.head()

Unnamed: 0,ID,Name,Age,OVA,Nationality,Club,BOV,BP,Position,Player Photo,...,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,Gender
0,2,G. Pasquale,33,69,Italy,Udinese,71,LWB,LM,https://cdn.sofifa.com/players/000/002/16_120.png,...,70+-1,70+-1,71+-2,70+-1,69+0,69+0,69+0,70+-1,17+0,Male
1,16,Luis García,37,71,Spain,KAS Eupen,70,CM,CM CAM CDM,https://cdn.sofifa.com/players/000/016/19_120.png,...,66+1,66+1,62+1,60+1,60+1,60+1,60+1,60+1,17+1,Male
2,27,J. Cole,33,71,England,Coventry City,71,CAM,CAM RM RW LM,https://cdn.sofifa.com/players/000/027/16_120.png,...,54+0,54+0,52+0,47+0,46+0,46+0,46+0,47+0,15+0,Male
3,36,D. Yorke,36,68,Trinidad &amp; Tobago,Sunderland,70,ST,,https://cdn.sofifa.com/players/000/036/09_120.png,...,65+0,65+0,56+0,57+0,51+0,51+0,51+0,57+0,22+0,Male
4,41,Iniesta,36,81,Spain,Vissel Kobe,82,CAM,CM CAM,https://cdn.sofifa.com/players/000/041/20_120.png,...,73+3,73+3,70+3,67+3,64+3,64+3,64+3,67+3,17+3,Male


### Set Option Display max rows/columns (due to size of Dataset)

In [9]:
pd.set_option('display.max_rows', 110)
pd.set_option('display.max_columns', 110)

### Cleaning Column Names

In [16]:
list(data.columns)

['id',
 'name',
 'age',
 'ova',
 'nationality',
 'club',
 'bov',
 'bp',
 'position',
 'player_photo',
 'club_logo',
 'flag_photo',
 'pot',
 'team_&_contract',
 'height',
 'weight',
 'foot',
 'growth',
 'joined',
 'loan_date_end',
 'value',
 'wage',
 'release_clause',
 'contract',
 'attacking',
 'crossing',
 'finishing',
 'heading_accuracy',
 'short_passing',
 'volleys',
 'skill',
 'dribbling',
 'curve',
 'fk_accuracy',
 'long_passing',
 'ball_control',
 'movement',
 'acceleration',
 'sprint_speed',
 'agility',
 'reactions',
 'balance',
 'power',
 'shot_power',
 'jumping',
 'stamina',
 'strength',
 'long_shots',
 'mentality',
 'aggression',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'composure',
 'defending',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'goalkeeping',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'total_stats',
 'base_stats',
 'w_f',
 'sm',
 'a_w',
 'd_w',
 'ir',
 'pac',
 'sho',
 'pas',
 'dri',
 'def',
 'phy',
 

In [13]:
data.columns = data.columns.str.lower()
data.columns = [column.lower().replace(' ', '_').replace('/', '_') for column in data.columns]

In [15]:
data.columns

Index(['id', 'name', 'age', 'ova', 'nationality', 'club', 'bov', 'bp',
       'position', 'player_photo',
       ...
       'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'gender'],
      dtype='object', length=107)

### Drop Columns

In [19]:
data.shape

(17125, 107)

**drop columns that cannot be used for analysis:** <br>
- name and id
- player_photo, club_logo, flag_photo are links
- there is only one gender so no added information
- loan_date_end are mainly null values or unstructured dates
- position because of several positions per row, we also have the column best position to use for position
- stats per position are dropped for now because I could not find a reason to keep them

In [23]:
data = data.drop(data.loc[:, "id":"name"].columns, axis = 1)
data = data.drop(['player_photo', "club_logo", "flag_photo", "gender", "loan_date_end", "position", "team_&_contract", "joined"], axis=1)
data = data.drop(data.loc[:, "ls":"gk"].columns, axis = 1)
data = data.drop(data.loc[:, "height":"weight"].columns, axis = 1)
data = data.drop(data.loc[:, "contract":"sliding_tackle"].columns, axis = 1)
data = data.drop(data.loc[:, "gk_diving":"base_stats"].columns, axis = 1)

KeyError: "['player_photo' 'club_logo' 'flag_photo' 'gender' 'loan_date_end'\n 'position' 'team_&_contract' 'joined'] not found in axis"

In [24]:
data.shape

(17125, 25)

### Review DTypes

In [25]:
data.dtypes

age                int64
ova                int64
nationality       object
club              object
bov                int64
bp                object
pot                int64
foot              object
growth             int64
value             object
wage              object
release_clause    object
goalkeeping        int64
w_f               object
sm                object
a_w               object
d_w               object
ir                object
pac                int64
sho                int64
pas                int64
dri                int64
def                int64
phy                int64
hits              object
dtype: object

**changes of dtypes I want to perform:** <br>
- 

In [None]:


data = data.astype({'death_event':'boolean', 'sex':'boolean', 'smoking':'boolean', 'high_blood_pressure':'boolean','diabetes':'boolean','anaemia':'boolean','age':'int64'})

### Drop Duplicates

In [None]:
data.shape

In [None]:
data = data.drop_duplicates()

In [None]:
data.shape

##  Review Cleaned Dataset

In [18]:
round(data.corr()['value'].sort_values(ascending=False),2)

KeyError: 'value'

In [None]:
data.describe()

In [None]:
data.corr()

In [None]:
# create correlation heatmap 

mask = np.zeros_like(data.corr())

mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.heatmap(data.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')

plt.show()