In [206]:
##mport necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix

In [207]:
#load the dataset
data_players =pd.read_csv("players_20.csv")

In [208]:
## check column names
data_players.columns

Index(['sofifa_id', 'player_url', 'short_name', 'long_name', 'age', 'dob',
       'height_cm', 'weight_kg', 'nationality', 'club',
       ...
       'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb'],
      dtype='object', length=104)

In [209]:
##check for missing data
data_players.isnull().sum()

sofifa_id        0
player_url       0
short_name       0
long_name        0
age              0
              ... 
lb            2036
lcb           2036
cb            2036
rcb           2036
rb            2036
Length: 104, dtype: int64

In [210]:
### Dealing with categorical fetures missing values
##handling missing values

data_players['loaned_from']=data_players['loaned_from'].fillna('Unknown')
data_players['nation_position']=data_players['nation_position'].fillna('Unknown')
data_players['player_traits']=data_players['player_traits'].fillna('Unknown')
data_players['joined']=data_players['joined'].fillna('2019-07-10')


In [211]:
####fill na in numerical data with 0

cleaned_data=data_players.fillna(0)

In [212]:
cleaned_data.isnull().sum()

sofifa_id     0
player_url    0
short_name    0
long_name     0
age           0
             ..
lb            0
lcb           0
cb            0
rcb           0
rb            0
Length: 104, dtype: int64

In [213]:
####remove unncessarry features for the model
reduntant_data=cleaned_data.drop(['age','height_cm','weight_kg','overall','potential','value_eur','international_reputation'
,'weak_foot','skill_moves','release_clause_eur','work_rate','preferred_foot',          
'pace','shooting','passing','dribbling','defending','physic','gk_diving','gk_handling','gk_kicking',                   
'gk_reflexes','gk_speed','gk_positioning','attacking_crossing','attacking_finishing','attacking_heading_accuracy'    
,'attacking_short_passing','attacking_volleys','skill_dribbling','skill_curve','skill_fk_accuracy'
,'skill_long_passing','skill_ball_control','movement_acceleration','movement_sprint_speed','movement_agility'         
,'movement_reactions','movement_balance','power_shot_power','power_jumping'                 
,'power_stamina','power_strength','power_long_shots','mentality_aggression','mentality_interceptions' 
,'mentality_positioning','mentality_vision','mentality_penalties','mentality_composure','defending_marking',            
'defending_standing_tackle','defending_sliding_tackle','goalkeeping_diving','goalkeeping_handling',        
'goalkeeping_kicking','goalkeeping_positioning','goalkeeping_reflexes'], axis=1) 

In [214]:
###Select useful features for model
fifa_20 = cleaned_data.drop(reduntant_data, axis=1)

In [215]:
### Encoding categorical features which have effect on the model

In [216]:
##encoding work_rate
lebel_encoder = LabelBinarizer()
fifa_20.work_rate= lebel_encoder.fit_transform(fifa_20.work_rate.values)

In [217]:
###encoding preferred foot
lebe_encoder = LabelBinarizer()
fifa_20.preferred_foot= lebe_encoder.fit_transform(fifa_20.preferred_foot.values)

In [218]:
###create a subset of features with maximum correlation with the dependent variable

In [219]:
#### compute correlation

correlation=fifa_20.corr()['overall'].sort_values(ascending=False)

##absolute value
abs_corr=abs(correlation)

###selecting relavant feature
relavant_features = pd.DataFrame(abs_corr[abs_corr>0.49])
relavant_features

Unnamed: 0,overall
overall,1.0
movement_reactions,0.864526
mentality_composure,0.719992
potential,0.64665
value_eur,0.63823
release_clause_eur,0.60713
power_shot_power,0.567504
mentality_vision,0.509251
attacking_short_passing,0.503157


In [220]:
#### TRAIN MODEL

In [221]:
#### select featutres

X=fifa_20.drop(['overall'], axis=1)
y=fifa_20['overall']


In [222]:
### split data
x_train,x_test,y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 42)

In [223]:
#####XGB BOOST

from xgboost import XGBRegressor

model = XGBRegressor()
model.fit(x_train, y_train)




XGBRegressor()

In [224]:
result = model.predict(x_test)
result

array([64.02376 , 74.225555, 68.52398 , ..., 58.849716, 53.854813,
       67.88531 ], dtype=float32)

In [225]:
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(result, y_test)))

Mean Absolute Error : 0.4922191642828009


In [226]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import absolute

cross_val =RepeatedKFold(n_splits=20, n_repeats=5, random_state=1)

##evaluation of model
scores = cross_val_score(model, x_test, y_test, scoring='neg_mean_absolute_error',cv=cross_val, n_jobs=-1)
scores= absolute(scores)
print('mean MAE:%.3f (%.3f)'%(scores.mean(), scores.std()) )

mean MAE:0.550 (0.034)


In [227]:
my_model = XGBRegressor(n_estimators=100)
my_model.fit(x_train, y_train, early_stopping_rounds=7, 
             eval_set=[(x_test, y_test)], verbose=False)



XGBRegressor()

In [239]:
## Links

https://colab.research.google.com/drive/1y-5SrqwQCUEgiwJC0GywP0uGbjvR1Ya2?usp=sharing