In [4]:
import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv("Datasets/clean_fifa18.csv")
df.head()

Unnamed: 0,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Height_cm,Weight_kg,...,RS,RW,RWB,ST,Continent,FieldPositions,Atk,Mid,Def,Position
0,Cristiano Ronaldo,32,Portugal,94,94,Real Madrid CF,95500000.0,565000.0,185,80,...,92.0,91.0,66.0,92.0,Europe,Attack,91,79,59,ST
1,L. Messi,30,Argentina,93,93,FC Barcelona,105000000.0,565000.0,170,72,...,88.0,91.0,62.0,88.0,South America,Attack,90,80,53,RW
2,Neymar,25,Brazil,92,94,Paris Saint-Germain,123000000.0,280000.0,175,68,...,84.0,89.0,64.0,84.0,South America,Attack,86,77,54,LW
3,L. Suarez,30,Uruguay,92,92,FC Barcelona,97000000.0,510000.0,182,86,...,88.0,87.0,68.0,88.0,South America,Attack,87,78,62,ST
4,M. Neuer,31,Germany,92,92,FC Bayern Munich,61000000.0,230000.0,193,92,...,58.20405,59.359265,57.698721,58.20405,Europe,Attack,58,58,56,GK


In [6]:
columns = ['Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure', 'Crossing',
           'Curve', 'Dribbling', 'Finishing', 'Free kick accuracy', 'GK diving', 'Heading accuracy', 'Interceptions', 
           'Jumping', 'Long passing', 'Long shots',	'Marking', 'Penalties', 'Positioning', 
           'Reactions', 'Short passing', 'Shot power', 'Sliding tackle', 'Sprint speed',	'Stamina',
           'Standing tackle', 'Strength', 'Vision', 'Volleys','FieldPositions']

df = pd.DataFrame(df, columns=columns)
X = df.drop("FieldPositions", axis=True)
y = df["FieldPositions"].copy()

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape

((14384, 30), (3597, 30))

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [9]:
def grid_search(model, grid_param):
    print("Obtaining Best Model for {}".format(model.__class__.__name__))
    grid_search = GridSearchCV(model, grid_param, cv=5, scoring='accuracy', return_train_score=True, n_jobs=-1)
    grid_search.fit(x_train, y_train)
    print("Best Parameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_*100)
        
    return grid_search.best_estimator_

def cv_results(model, X, y):
    scores = cross_val_score(model, X, y, cv = 5, scoring="accuracy", n_jobs=-1)*100
    acc_scores = np.round(scores, 2)
    print('CV Scores: ', acc_scores)
    print('Mean Accuracy: {},  S.D.:{} '.format(np.mean(acc_scores), np.std(acc_scores)))

In [10]:
from sklearn.linear_model import LogisticRegression

logistic_clf = LogisticRegression(penalty='l2', random_state=42, n_jobs=-1)
logistic_param_grid = [{'C':[0.1, 1, 10], 'penalty':['l1', 'l2']}]
best_logistic_clf = grid_search(logistic_clf, logistic_param_grid)

Obtaining Best Model for LogisticRegression
Best Parameters:  {'C': 10, 'penalty': 'l2'}
Best Score:  98.21326447393673


In [26]:
cv_results(best_logistic_clf, x_test, y_test)

CV Scores:  [97.36 97.36 97.77 97.91 98.89]
Mean Accuracy: 97.85799999999999,  S.D.:0.5606210841557785 


In [33]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(max_features='auto', random_state=42, n_jobs=-1)
forest_param_grid = [{'n_estimators':[50, 100, 200], 'max_depth':[8, 12, 16]}]
best_forest_clf = grid_search(forest_clf, forest_param_grid)

Obtaining Best Model for RandomForestClassifier
Best Parameters:  {'max_depth': 16, 'max_features': 'auto', 'n_estimators': 100}
Best Score:  95.97467420620015


In [34]:
cv_results(best_forest_clf, x_test, y_test)

CV Scores:  [94.17 94.58 92.35 94.85 95.41]
Mean Accuracy: 94.272,  S.D.:1.0419673699305565 


In [38]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(objective="multi:softmax", learning_rate=0.1, n_jobs=-1, random_state=42)
xgb_param_grid=[{'n_estimators':[100, 200, 300], 'max_depth':[3, 8, 12]}]
best_xgb_clf = grid_search(xgb_clf, xgb_param_grid)

Obtaining Best Model for XGBClassifier
Best Parameters:  {'max_depth': 3, 'n_estimators': 300}
Best Score:  97.05923145681327


In [39]:
cv_results(best_xgb_clf, x_test, y_test)

CV Scores:  [96.67 95.56 96.24 96.11 96.38]
Mean Accuracy: 96.19200000000001,  S.D.:0.36668242390384537 


In [11]:
import joblib

In [13]:
joblib.dump(best_logistic_clf, "models/field_position.pkl")

['models/field_position.pkl']