In [28]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("Datasets/clean_fifa18.csv")
df.head()

Unnamed: 0,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Height_cm,Weight_kg,...,RS,RW,RWB,ST,Continent,FieldPositions,Atk,Mid,Def,Position
0,Cristiano Ronaldo,32,Portugal,94,94,Real Madrid CF,95500000.0,565000.0,185,80,...,92.0,91.0,66.0,92.0,Europe,Attack,91,79,59,ST
1,L. Messi,30,Argentina,93,93,FC Barcelona,105000000.0,565000.0,170,72,...,88.0,91.0,62.0,88.0,South America,Attack,90,80,53,RW
2,Neymar,25,Brazil,92,94,Paris Saint-Germain,123000000.0,280000.0,175,68,...,84.0,89.0,64.0,84.0,South America,Attack,86,77,54,LW
3,L. Suarez,30,Uruguay,92,92,FC Barcelona,97000000.0,510000.0,182,86,...,88.0,87.0,68.0,88.0,South America,Attack,87,78,62,ST
4,M. Neuer,31,Germany,92,92,FC Bayern Munich,61000000.0,230000.0,193,92,...,58.20405,59.359265,57.698721,58.20405,Europe,Attack,58,58,56,GK


In [5]:
columns = ['Age','Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Crossing',
           'Curve', 'Dribbling', 'Finishing', 'Free kick accuracy', 'GK diving', 'GK handling',
           'GK kicking', 'GK positioning', 'GK reflexes', 'Heading accuracy', 'Interceptions', 
           'Jumping', 'Long passing', 'Long shots',	'Marking', 'Penalties', 'Positioning', 
           'Reactions', 'Short passing', 'Shot power', 'Sliding tackle', 'Sprint speed',	'Stamina',
           'Standing tackle', 'Strength', 'Vision', 'Volleys','Overall']

df = pd.DataFrame(df, columns=columns)
X = df.drop("Overall", axis=True)
y = df["Overall"].copy()

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape

((14384, 35), (3597, 35))

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

def cv_results(model, X, y):
    scores = cross_val_score(model, X, y, cv = 5, scoring="neg_mean_squared_error", n_jobs=-1)
    rmse_scores = np.sqrt(-scores)
    rmse_scores = np.round(rmse_scores, 2)
    print('CV Scores: ', rmse_scores)
    print('rmse: {},  S.D.:{} '.format(np.mean(rmse_scores), np.std(rmse_scores)))

In [13]:
from sklearn.ensemble import RandomForestRegressor

rf_grid_parm=[{'n_estimators':[100, 200, 300], 'max_depth':[8, 16, 24]}]
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42, n_jobs=-1), 
                              rf_grid_parm, cv=5, scoring="neg_mean_squared_error", return_train_score=True, n_jobs=-1)
rf_grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=-

In [14]:
rf_grid_search.best_params_, -rf_grid_search.best_score_

({'max_depth': 24, 'n_estimators': 300}, 1.9677200496729186)

In [15]:
best_forest_reg = rf_grid_search.best_estimator_
best_forest_reg

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=24, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [16]:
feature_imp = [ col for col in zip(columns, best_forest_reg.feature_importances_)]
feature_imp.sort(key=lambda x:x[1], reverse=True)
feature_imp

[('Reactions', 0.6661712491008235),
 ('Ball control', 0.07902454717301723),
 ('Standing tackle', 0.041271237107203984),
 ('Marking', 0.033895931925873854),
 ('GK positioning', 0.016313110755370116),
 ('GK diving', 0.015580049214622051),
 ('Composure', 0.015553692541627143),
 ('Positioning', 0.011906081214023917),
 ('GK reflexes', 0.010865347528235028),
 ('Heading accuracy', 0.010512656577005335),
 ('Age', 0.009252584880509496),
 ('GK handling', 0.00893879685384652),
 ('Finishing', 0.008060501337852632),
 ('Short passing', 0.006637639941266197),
 ('Crossing', 0.006093097598394463),
 ('Interceptions', 0.005944899192756001),
 ('Sliding tackle', 0.005877748037749478),
 ('Dribbling', 0.005774073657371888),
 ('Stamina', 0.004711408108870309),
 ('Shot power', 0.004604047632259242),
 ('Sprint speed', 0.004297312368757145),
 ('Strength', 0.0036719589156561246),
 ('Acceleration', 0.0033751099080805798),
 ('Aggression', 0.002941180147310276),
 ('Long shots', 0.0028399432384606407),
 ('Vision', 0.

In [29]:
cv_results(best_forest_reg, x_test, y_test)

CV Scores:  [1.7  1.72 1.67 1.64 1.55]
rmse: 1.656,  S.D.:0.05953150426454885 


In [19]:
from xgboost import XGBRegressor

xgb_grid_parm=[{'n_estimators':[50, 150, 300], 'max_depth':[8, 12, 16]}]
xgb_grid_search = GridSearchCV(XGBRegressor(objective='reg:squarederror', learning_rate=0.1, n_jobs=-1, random_state=42), 
                               xgb_grid_parm, cv=5, scoring="neg_mean_squared_error", return_train_score=True, n_jobs=-1)
xgb_grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=-1, nthread=None,
                                    objective='reg:squarederror',
                                    random_state=42, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'max_depth': [8, 12, 16],
                          'n_estimators': [50, 150, 300]}],
             p

In [20]:
xgb_grid_search.best_params_, -xgb_grid_search.best_score_

({'max_depth': 8, 'n_estimators': 300}, 1.1793248664677791)

In [21]:
best_xgb_reg = xgb_grid_search.best_estimator_
best_xgb_reg

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=None, n_estimators=300,
             n_jobs=-1, nthread=None, objective='reg:squarederror',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [22]:
feature_imp = [ col for col in zip(columns, best_xgb_reg.feature_importances_)]
feature_imp.sort(key=lambda x:x[1], reverse=True)
feature_imp

[('Reactions', 0.54808146),
 ('Composure', 0.059436966),
 ('Short passing', 0.04758273),
 ('Ball control', 0.046574604),
 ('Standing tackle', 0.033853028),
 ('Marking', 0.027569791),
 ('GK positioning', 0.02616392),
 ('Heading accuracy', 0.024192471),
 ('Sliding tackle', 0.020523261),
 ('GK diving', 0.019943161),
 ('GK handling', 0.018225001),
 ('GK reflexes', 0.017667912),
 ('Dribbling', 0.015156828),
 ('Sprint speed', 0.012356945),
 ('Crossing', 0.0091408985),
 ('Positioning', 0.009069782),
 ('Interceptions', 0.008742967),
 ('Finishing', 0.008640404),
 ('Strength', 0.0086384425),
 ('Shot power', 0.007229518),
 ('Stamina', 0.0072103557),
 ('Age', 0.0043569403),
 ('Acceleration', 0.00308806),
 ('Long shots', 0.0027603956),
 ('Vision', 0.0026855657),
 ('Jumping', 0.0019018641),
 ('Long passing', 0.0017663072),
 ('GK kicking', 0.0016112288),
 ('Aggression', 0.0013377392),
 ('Volleys', 0.0009136426),
 ('Penalties', 0.0007880709),
 ('Curve', 0.00073065714),
 ('Free kick accuracy', 0.000702

In [30]:
cv_results(best_xgb_reg, x_test, y_test)

CV Scores:  [1.25 1.33 1.33 1.29 1.23]
rmse: 1.286,  S.D.:0.04079215610874232 


In [23]:
import joblib

In [24]:
joblib.dump(best_xgb_reg, "models/overall.pkl")

['overall.pkl']