In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import joblib

In [2]:
df = pd.read_csv("Datasets/clean_fifa18.csv")
df.head()

Unnamed: 0,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Height_cm,Weight_kg,...,RS,RW,RWB,ST,Continent,FieldPositions,Atk,Mid,Def,Position
0,Cristiano Ronaldo,32,Portugal,94,94,Real Madrid CF,95500000.0,565000.0,185,80,...,92.0,91.0,66.0,92.0,Europe,Attack,91,79,59,ST
1,L. Messi,30,Argentina,93,93,FC Barcelona,105000000.0,565000.0,170,72,...,88.0,91.0,62.0,88.0,South America,Attack,90,80,53,RW
2,Neymar,25,Brazil,92,94,Paris Saint-Germain,123000000.0,280000.0,175,68,...,84.0,89.0,64.0,84.0,South America,Attack,86,77,54,LW
3,L. Suarez,30,Uruguay,92,92,FC Barcelona,97000000.0,510000.0,182,86,...,88.0,87.0,68.0,88.0,South America,Attack,87,78,62,ST
4,M. Neuer,31,Germany,92,92,FC Bayern Munich,61000000.0,230000.0,193,92,...,58.20405,59.359265,57.698721,58.20405,Europe,Attack,58,58,56,GK


In [3]:
input_col = ['Age','Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Crossing',
           'Curve', 'Dribbling', 'Finishing', 'Free kick accuracy', 'GK diving', 'GK handling',
           'GK kicking', 'GK positioning', 'GK reflexes', 'Heading accuracy', 'Interceptions', 
           'Jumping', 'Long passing', 'Long shots',	'Marking', 'Penalties', 'Positioning', 
           'Reactions', 'Short passing', 'Shot power', 'Sliding tackle', 'Sprint speed',	'Stamina',
           'Standing tackle', 'Strength', 'Vision', 'Volleys']

output_col=['ST', 'CF', 'RF', 'LF', 'RW', 'LW', 'RS', 'LS','CAM', 'RAM', 'LAM', 'CM', 'RCM', 'LCM', 'LM', 'RM', 'CDM', 'RDM', 'LDM' 
           ,'CB', 'RB', 'LB', 'RCB', 'LCB', 'RWB', 'LWB']
X = pd.DataFrame(df, columns=input_col)

In [4]:
from sklearn.model_selection import cross_val_score

def cv_results(model, X, y):
    scores = cross_val_score(model, X, y, cv = 5, scoring="neg_mean_squared_error", n_jobs=-1)
    rmse_scores = np.sqrt(-scores)
    rmse_scores = np.round(rmse_scores, 2)
    print('CV Scores: ', rmse_scores)
    print('rmse: {},  S.D.:{} '.format(np.mean(rmse_scores), np.std(rmse_scores)))

In [5]:
for pos in output_col:
    print("Training model for ", pos)
    y=df[pos]
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)    
    model = XGBRegressor(objective='reg:squarederror', n_estimators=300, max_depth=8, learning_rate=0.1, n_jobs=-1, random_state=42)
    model = model.fit(x_train,y_train)
    cv_results(model, x_test, y_test)
    name="models/"+pos+".pkl"
    joblib.dump(model, name)

Training model for  ST
CV Scores:  [0.86 0.87 0.85 0.82 0.93]
rmse: 0.866,  S.D.:0.03611094017053561 
Training model for  CF
CV Scores:  [0.81 0.86 0.88 0.85 0.99]
rmse: 0.8779999999999999,  S.D.:0.060464865831323884 
Training model for  RF
CV Scores:  [0.81 0.86 0.88 0.85 0.99]
rmse: 0.8779999999999999,  S.D.:0.060464865831323884 
Training model for  LF
CV Scores:  [0.81 0.86 0.88 0.85 0.99]
rmse: 0.8779999999999999,  S.D.:0.060464865831323884 
Training model for  RW
CV Scores:  [0.86 0.84 0.93 0.89 0.94]
rmse: 0.892,  S.D.:0.03867815921162744 
Training model for  LW
CV Scores:  [0.86 0.84 0.93 0.89 0.94]
rmse: 0.892,  S.D.:0.03867815921162744 
Training model for  RS
CV Scores:  [0.86 0.87 0.85 0.82 0.93]
rmse: 0.866,  S.D.:0.03611094017053561 
Training model for  LS
CV Scores:  [0.86 0.87 0.85 0.82 0.93]
rmse: 0.866,  S.D.:0.03611094017053561 
Training model for  CAM
CV Scores:  [0.85 0.85 0.87 0.84 1.02]
rmse: 0.8859999999999999,  S.D.:0.06770524351924305 
Training model for  RAM
CV