In [1]:
# import libraries
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
try:
    # joblib is distributed as its own package; newer scikit-learn releases
    # no longer expose it under sklearn.externals
    import joblib
except ImportError:
    # fall back to the copy bundled with older scikit-learn releases
    from sklearn.externals import joblib

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

sns.set_style('whitegrid')

In [2]:
# Load the processed player table; the path is resolved relative to the
# kernel's current working directory.
csv_path = '../Final Project/Data/Processed/fifa19data_clean.csv'
df = pd.read_csv(csv_path)
print(df.columns)

FileNotFoundError: [Errno 2] File b'../Final Project/Data/Processed/fifa19data_clean.csv' does not exist: b'../Final Project/Data/Processed/fifa19data_clean.csv'

In [None]:
# Raw 'Position' labels bucketed into the groups that are modelled together.

# attack
forward = 'RS LS RF LF CF ST'.split()
attack_mid = 'RAM LAM CAM'.split()
wings = 'RM RW LM LW'.split()

# midfield
central_mid = 'CM LCM RCM'.split()
defensive_mid = 'CDM LDM RDM'.split()

# defence
fullback = 'RB RWB LB LWB'.split()
cb_def = 'CB LCB RCB'.split()

# goalkeeper (single label)
gk = 'GK'.split()

In [None]:
# functions to run models

def model_training(model_name, model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    ``model_name`` is accepted for symmetry with the other helpers but is
    not used here. The return value is ``model`` itself, not whatever
    ``fit`` returns.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator
    
def model_prediction(model, X_test):
    """Return ``model.predict(X_test)`` unchanged."""
    return model.predict(X_test)

def model_evaluation(model_name, y_test, y_pred):
    """Print error metrics and show a true-vs-predicted scatter plot.

    Parameters
    ----------
    model_name : str
        Printed above the metrics and used as the plot title.
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values for the same rows as ``y_test``.

    Prints the mean absolute error and the root mean squared error, then
    draws the scatter together with a dashed red y = x reference line.
    Returns nothing.
    """
    print(model_name)
    print('MAE', mean_absolute_error(y_test, y_pred))
    print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))
    plt.scatter(y_test, y_pred, alpha=0.3)

    # Stretch the y = x reference line across the values actually plotted.
    # A fixed 0..9 segment only lines up with the points when the target
    # happens to live in that range.
    true_vals = np.asarray(y_test)
    pred_vals = np.asarray(y_pred)
    lo = min(true_vals.min(), pred_vals.min())
    hi = max(true_vals.max(), pred_vals.max())
    plt.plot([lo, hi], [lo, hi], '--r', alpha=0.3, label='Line1')

    plt.title(model_name)
    plt.xlabel('True Value')
    plt.ylabel('Predicted Value')
    plt.show()
    print('')

def run_experiment(model_name, model, X_train, y_train, X_test, y_test):
    """Train, predict and evaluate a model in a single call.

    Fits ``model`` on the training split, predicts on ``X_test``, reports
    the metrics/plot via ``model_evaluation`` and returns the pair
    ``(fitted_model, test_predictions)``.
    """
    fitted = model_training(model_name, model, X_train, y_train)
    y_hat = model_prediction(fitted, X_test)
    model_evaluation(model_name, y_test, y_hat)
    return fitted, y_hat

In [None]:
# create dataframes for each group of positions
# Goalkeepers carry a single position label, so one boolean filter is enough
df_gk = df.loc[df['Position'] == 'GK']

# Seven independent empty frames, one per multi-label position group
df_fw, df_am, df_wings, df_cm, df_dm, df_fullback, df_cb = (
    pd.DataFrame() for _ in range(7)
)

def aggregatePositions(df_pos, df_data, positions):
    """Append to ``df_pos`` every row of ``df_data`` whose 'Position' is listed.

    Parameters
    ----------
    df_pos : pandas.DataFrame
        Frame to extend (may be empty).
    df_data : pandas.DataFrame
        Source rows; must have a 'Position' column.
    positions : iterable of str
        Position labels to collect.

    Returns
    -------
    pandas.DataFrame
        ``df_pos`` followed by the matching rows, grouped label by label in
        the order of ``positions``; original index values are kept. When
        ``positions`` is empty, ``df_pos`` itself is returned.
    """
    # Build all slices first and concatenate once: concatenating inside the
    # loop copies the ever-growing accumulated frame on each iteration.
    pieces = [df_data[df_data['Position'] == pos] for pos in positions]
    if not pieces:
        return df_pos
    return pd.concat([df_pos] + pieces)

# Fill every group frame with the rows of all position labels in its group
df_fw, df_am, df_wings, df_cm, df_dm, df_fullback, df_cb = (
    aggregatePositions(group_frame, df, group_labels)
    for group_frame, group_labels in (
        (df_fw, forward),
        (df_am, attack_mid),
        (df_wings, wings),
        (df_cm, central_mid),
        (df_dm, defensive_mid),
        (df_fullback, fullback),
        (df_cb, cb_def),
    )
)

In [None]:
# Column names used as model inputs and as the regression target
X_columns, y_column = ['Age', 'Potential'], ['Overall']

In [None]:
# Input columns in the order listed in X_columns, with the target removed.
# An ordered list is used instead of a set difference because set iteration
# order for strings is not stable between interpreter sessions, which would
# let the column order of the X frames (and of any saved model) vary per run.
feature_columns = [col for col in X_columns if col not in y_column]

# Feature frames, one per position group
X_fw = df_fw[feature_columns]
X_am = df_am[feature_columns]
X_wings = df_wings[feature_columns]
X_cm = df_cm[feature_columns]
X_dm = df_dm[feature_columns]
X_fullback = df_fullback[feature_columns]
X_cb = df_cb[feature_columns]
X_gk = df_gk[feature_columns]

# Target frames (single-column DataFrames because y_column is a list)
y_fw = df_fw[y_column]
y_am = df_am[y_column]
y_wings = df_wings[y_column]
y_cm = df_cm[y_column]
y_dm = df_dm[y_column]
y_fullback = df_fullback[y_column]
y_cb = df_cb[y_column]
y_gk = df_gk[y_column]

In [None]:
threshold = 0.8  # fraction of each group's rows kept for training
SEED = 42        # fixed seed so the splits and the forests repeat across runs


def run_position_group(title, tag, X, y):
    """Split one position group, print the split shapes and fit both models.

    Parameters
    ----------
    title : str
        Group name used in the printed header and in the model names,
        e.g. 'Forward'.
    tag : str
        Short label embedded in the shape printout, e.g. 'forward' gives
        'X_forward_train'.
    X, y : pandas.DataFrame
        Feature frame and single-column target frame of the group.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test,
        linreg_model, linreg_pred, rf_model, rf_pred)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1.0-threshold, shuffle=True, random_state=SEED)

    print(title + ':')
    print('')
    print('X_' + tag + '_train', X_train.shape)
    print('y_' + tag + '_train', y_train.shape)
    print('X_' + tag + '_test', X_test.shape)
    print('y_' + tag + '_test', y_test.shape)
    print('')

    linreg_model, linreg_pred = run_experiment(
        'Linear Regression - ' + title, LinearRegression(),
        X_train, y_train, X_test, y_test)
    # ravel() flattens the single-column target frame to a 1-D array for the forest
    rf_model, rf_pred = run_experiment(
        'Random Forest Regression - ' + title,
        RandomForestRegressor(100, random_state=SEED),
        X_train, y_train.values.ravel(), X_test, y_test)

    return (X_train, X_test, y_train, y_test,
            linreg_model, linreg_pred, rf_model, rf_pred)


# Forward
(X_fw_train, X_fw_test, y_fw_train, y_fw_test,
 fw_LinReg_model, fw_LinReg_pred,
 fw_RF_model, fw_RF_pred) = run_position_group('Forward', 'forward', X_fw, y_fw)

# Attacking Midfield
(X_am_train, X_am_test, y_am_train, y_am_test,
 am_LinReg_model, am_LinReg_pred,
 am_RF_model, am_RF_pred) = run_position_group('Attacking Midfield', 'AM', X_am, y_am)

# Wings
(X_wings_train, X_wings_test, y_wings_train, y_wings_test,
 wings_LinReg_model, wings_LinReg_pred,
 wings_RF_model, wings_RF_pred) = run_position_group('Wings', 'wings', X_wings, y_wings)

# Central Midfield
(X_cm_train, X_cm_test, y_cm_train, y_cm_test,
 cm_LinReg_model, cm_LinReg_pred,
 cm_RF_model, cm_RF_pred) = run_position_group('Central Midfield', 'CM', X_cm, y_cm)

# Defensive Midfield
(X_dm_train, X_dm_test, y_dm_train, y_dm_test,
 dm_LinReg_model, dm_LinReg_pred,
 dm_RF_model, dm_RF_pred) = run_position_group('Defensive Midfield', 'DM', X_dm, y_dm)

# Fullback
(X_fullback_train, X_fullback_test, y_fullback_train, y_fullback_test,
 fullback_LinReg_model, fullback_LinReg_pred,
 fullback_RF_model, fullback_RF_pred) = run_position_group('Fullback', 'fullback', X_fullback, y_fullback)

# Centre Back
(X_cb_train, X_cb_test, y_cb_train, y_cb_test,
 cb_LinReg_model, cb_LinReg_pred,
 cb_RF_model, cb_RF_pred) = run_position_group('Centre Back', 'CB', X_cb, y_cb)

# Goalkeeper (the shape printout now reports the goalkeeper split itself)
(X_gk_train, X_gk_test, y_gk_train, y_gk_test,
 gk_LinReg_model, gk_LinReg_pred,
 gk_RF_model, gk_RF_pred) = run_position_group('Goalkeeper', 'GK', X_gk, y_gk)

In [None]:
# Serialize the fitted forward-group random forest to a pickle file
forward_model_path = '../Final Project/Data/Model/Forward_Model.pkl'
joblib.dump(fw_RF_model, forward_model_path)