# Data Preprocessing and Feature Engineering

## Imports

In [334]:
# Essential Imports
import pandas as pd
import numpy as np
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import pickle as pkl
from sklearn.model_selection import cross_val_score
import joblib
pd.options.mode.copy_on_write = True

In [335]:
drive.mount('/content/drive')
legacy_data = pd.read_csv('/content/drive/My Drive/male_players (legacy).csv')
players_22 = pd.read_csv('/content/drive/My Drive/players_22.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  legacy_data = pd.read_csv('/content/drive/My Drive/male_players (legacy).csv')
  players_22 = pd.read_csv('/content/drive/My Drive/players_22.csv')


## Exploratory Data Analysis

In [336]:
# Gives an overview about data
print(legacy_data.describe())

# Gives a count of missing values
print(legacy_data.isnull().sum())

# Gives a description of the data
print(legacy_data.describe())

           player_id   fifa_version  fifa_update        overall  \
count  161583.000000  161583.000000     161583.0  161583.000000   
mean   214484.722353      19.125514          2.0      65.699071   
std     34928.608856       2.559318          0.0       7.040855   
min         2.000000      15.000000          2.0      40.000000   
25%    199159.000000      17.000000          2.0      61.000000   
50%    220621.000000      19.000000          2.0      66.000000   
75%    236958.000000      21.000000          2.0      70.000000   
max    271817.000000      23.000000          2.0      94.000000   

           potential     value_eur       wage_eur            age  \
count  161583.000000  1.595300e+05  159822.000000  161583.000000   
mean       70.744008  2.326770e+06   10855.409768      25.123181   
std         6.259121  6.005746e+06   21941.656285       4.670207   
min        40.000000  1.000000e+03     500.000000      16.000000   
25%        66.000000  3.250000e+05    2000.000000      2

## Imputation Function

In [337]:
# Cleans the data pased to it
def data_cleaning(data):

  # Drops data that has more 30% missing values or more
  threshold = len(data) * 0.7
  data = data.dropna(axis = 1, thresh = threshold)

  # Separates the data into numeric
  numeric = data.select_dtypes(include = np.number)

  # Separates the data into non-numeric
  non_numeric = data.select_dtypes(include = ['object'])

  # Calculates the mean values
  mean_values = numeric.mean()

  # Imputers missing values in numeric data and rounds to
  # the nearest integer
  data.loc[:, numeric.columns] = numeric.fillna(mean_values)

  # Finds the mode of the data and uses it
  # for the NA values
  for x in non_numeric:
    mode_value = non_numeric[x].mode()[0]
    non_numeric[x].fillna(mode_value, inplace = True)

  # Updates the main cleaned data with the new
  # cleaned non-numeric data
  data.update(non_numeric)

  # The columns that hold little influence over the
  # data are deleted here
  to_be_dropped = ['player_url', 'fifa_version', 'fifa_update',
    'fifa_update_date', 'player_face_url', 'short_name',
    'long_name', 'player_id', 'dob', 'league_name', 'league_id',
    'club_team_id', 'nationality_name', 'real_face',
    'club_jersey_number', 'club_position', 'player_positions',
    'club_contract_valid_until_year', 'club_name','club_joined_date'
    ]

  # Creates a loop to check if the column is in the new data
  # and deletes it
  for col in data.columns:

    # Checks for the column name from the list of names above
    if col in to_be_dropped:

      # drops the column once it's found in the list
      data.drop(col, axis = 1, inplace = True)

  # Returns the cleaned data after
  # cleaning is complete
  return pd.DataFrame(data)

## Removing Unwanted Columns

In [338]:
# Removing the data about players that shows what they could have as points
# if they had specific positions as it doesn't have a significant impact on
# the overall value the player has
def remove_extra(data):
  # Saves the filtered out columns
  filtered_columns = []

  # Loops through the columns
  for col in data.columns:

    # Removes the columns with more than 3 letters
    if len(col) > 3 or col == 'age':

      # Appends the column with the columns having more than 3 letters
      filtered_columns.append(col)

  # Gets the filtered data
  filtered_data = data[filtered_columns]

  # Returns the filtered data
  return filtered_data

In [339]:
def categorical_cleaner(data):

  # Gets the non-numeric data
  non_numeric = data.select_dtypes(include = ['object'])

  # Greates a dataframe for the categorical data
  categorical_data = pd.DataFrame(non_numeric)

  # Encode the data
  encoded_data = one_hot_encode(categorical_data)

  # Returns the final data
  return encoded_data

In [340]:
# Uses one hot encoding to encode the categorical
# variables
def one_hot_encode(categorical):

  # Creates the encoding instance
  converter = OneHotEncoder()

  # Encoding categorical values
  finished_conversion = (converter.fit_transform(categorical)).toarray()

  # Returns the encoded data and gets the feature names
  # for each category
  return pd.DataFrame(finished_conversion, columns = converter.get_feature_names_out(categorical.columns))

In [341]:
# Gets the correlation for your data for a specific target
def correlation(data, target):
  bad_correlation = []

  # Calculates the correlation matrix
  correlation_matrix = data.corr()

  # Gets the correlation values for the target variable
  # abs is used to make sure negative is not a factor
  target_correlation = correlation_matrix[target]

  # Sorts the correlation values in descending order
  sorted_correlation = target_correlation.sort_values(ascending = False)

  for col in sorted_correlation.index:
    if col != 'overall' and abs(sorted_correlation[col]) < 0.5:
      bad_correlation.append(col)

  final_data = data.drop(columns = bad_correlation, axis = 1)
  final_data.drop('overall', axis = 1)

  return final_data

In [342]:
def data_processing(data):

  # Removes the columns that are not needed
  data = remove_extra(data)

  # First level of cleaning clears NA values
  cleaned = data_cleaning(data)

  # Separates the cleaned numeric data
  numeric = cleaned.select_dtypes(include = np.number)

  # Separates the cleaned non-numeric data
  non_numeric = cleaned.select_dtypes(include = ['object'])

  # One-hot encodes the low cardinality columns
  encoded_data = categorical_cleaner(non_numeric)

  # Concatenates both numeric and non_numeric
  # dataframes
  final_numeric = pd.concat([numeric, encoded_data], axis = 1)

  # Get's the data with the highest 20 correlations
  corr_values = correlation(final_numeric, 'overall')

  # Returns the final cleaned data
  return corr_values

In [343]:
def scaling(data):
  # Creates an instance of StandardScalar
  scalar = StandardScaler()

  # Scales the data for reducing variability
  X_scaled = scalar.fit_transform(data)

  # Returns the scaled data
  return X_scaled

In [344]:
legacy_data_processed = data_processing(legacy_data)
legacy_data_processed

Unnamed: 0,overall,potential,value_eur,wage_eur,passing,dribbling,movement_reactions,mentality_composure
0,93,95,100500000.0,550000.0,86.000000,96.000000,94,57.816892
1,92,92,79000000.0,375000.0,81.000000,91.000000,90,57.816892
2,90,90,54500000.0,275000.0,83.000000,92.000000,89,57.816892
3,90,90,52500000.0,275000.0,81.000000,86.000000,85,57.816892
4,90,90,63500000.0,300000.0,56.938175,62.081872,89,57.816892
...,...,...,...,...,...,...,...,...
161578,46,61,110000.0,700.0,40.000000,47.000000,39,40.000000
161579,46,58,110000.0,750.0,29.000000,34.000000,42,35.000000
161580,46,58,110000.0,500.0,43.000000,46.000000,50,35.000000
161581,46,70,150000.0,500.0,36.000000,46.000000,45,43.000000


The features selected are: potential, value_eur, wage_eur, passing, dribbling, movement_reactions, mentality_composure

# Training



In [345]:
X = legacy_data_processed.drop('overall', axis = 1)
y = legacy_data_processed['overall']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [346]:
def random_forest_regressor(X_train, y_train, X_test, y_test):
  # Initializes the RandomForestRegressor
  regressor = RandomForestRegressor()

  # Trains the model
  regressor.fit(X_train, y_train)

  # Make predictions
  y_pred = regressor.predict(X_test)

  # Calculates the mean squared error
  r2score = r2_score(y_pred, y_test)
  mae = mean_absolute_error(y_pred, y_test)
  mse = mean_squared_error(y_pred, y_test)
  rmse = np.sqrt(mse)

  # Returns the mean squared error
  print(f""" Random Forest Regressor Results
    Mean Absolute Error={mae}
    Mean Squared Error={mse}
    Root Mean Squared Error={rmse}
    R2 score={r2score}
  """)

  return regressor

In [347]:
def xgboost(X_train, X_test, y_train, y_test):

  # Initializes the xgboost
  regressor = XGBRegressor()

  # Trains the model
  regressor.fit(X_train, y_train)

  # Make predictions
  y_pred = regressor.predict(X_test)

  # Calculates the mean squared error
  r2score = r2_score(y_pred, y_test)
  mae = mean_absolute_error(y_pred, y_test)
  mse = mean_squared_error(y_pred, y_test)
  rmse = np.sqrt(mse)

  # Returns the mean squared error
  print(f""" XGBoost Regressor Results
    Mean Absolute Error={mae}
    Mean Squared Error={mse}
    Root Mean Squared Error={rmse}
    R2 score={r2score}
  """)

  return regressor

In [348]:
def decision_tree_regressor(X_train, y_train, X_test, y_test):
  # Initializes the DecisionTreeRegressor
  regressor = DecisionTreeRegressor()

  # Trains the model
  regressor.fit(X_train, y_train)

  # Make predictions
  y_pred = regressor.predict(X_test)

  # Calculates the mean squared error
  r2score = r2_score(y_pred, y_test)
  mae = mean_absolute_error(y_pred, y_test)
  mse = mean_squared_error(y_pred, y_test)
  rmse = np.sqrt(mse)

  # Returns the mean squared error
  print(f""" Decision Tree Regressor Results
    Mean Absolute Error={mae}
    Mean Squared Error={mse}
    Root Mean Squared Error={rmse}
    R2 score={r2score}
  """)

  return regressor

In [349]:
def cross_validation(regressor, X_train, y_train, CV):

  # Initializes K-fold
  kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

  # Gets Cross validation scores for RandomFores
  cv_scores_rf = cross_val_score(regressor, X_train, y_train, cv = CV, scoring = 'r2')

  # Displays the r2 scores from the cross validation
  print(f'{type(regressor).__name__} Cross-Validation r2: {cv_scores_rf}')

  # Shows the mean for the values found
  return cv_scores_rf.mean()

In [350]:
def grid_search(regressor, X_train, y_train, CV):
  param_grid_ = {}

  if regressor.__class__.__name__ == 'RandomForestRegressor':
    param_grid_ = {
        'n_estimators': [200],
        'max_depth': [10, 20],
        'min_samples_split': [5],
        'min_samples_leaf': [1, 2]
  }
  elif regressor.__class__.__name__ == 'XGBRegressor':
    param_grid_ = {
        'n_estimators': [100, 200],
        'max_depth': [2, 3, 7],
        'learning_rate': [0.05, 0.2],
        'max_depth': [3, 7],
        'subsample': [0.8, 1.0]
  }
  elif regressor.__class__.__name__ == 'DecisionTreeRegressor':
    param_grid_ = {
        'max_depth': [10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
  }

  # Uses grid search for each regressor
  grid_search_run = GridSearchCV(estimator = regressor, param_grid = param_grid_, cv = CV, scoring = 'r2', n_jobs=-1)

  # Trains with the Grid search
  grid_search_run.fit(X_train, y_train)

  # Prints the best parameters and best score
  print(f'{type(regressor).__name__} Best Parameters: {grid_search_run.best_params_}')
  print(f'{type(regressor).__name__} Best Score: {grid_search_run.best_score_}')

  return grid_search_run.best_estimator_

In [351]:
def k_fold_cross_validation(regressors, X_train, y_train):
  # Initializes K-fold
  CV = KFold(n_splits = 5, shuffle = True, random_state = 42)

  values = []

  # Loops through each regressor
  for regressor in regressors:

    # Uses cross validation for each regressor
    value = cross_validation(regressor, X_train, y_train, CV)

    values.append(value)

  return values

In [352]:
def hyperparameter_tuning(regressor, X_train, y_train):

  # Initializes K-fold
  CV = KFold(n_splits = 5, shuffle = True, random_state = 42)

  # Uses grid search for each regressor
  model = grid_search(regressor, X_train, y_train, CV)

  return model

In [353]:
def pipe(X_train, X_test, y_train, y_test):

  # Runs the models to check their scores
  rf = random_forest_regressor(X_train, y_train, X_test, y_test)
  xg = xgboost(X_train, X_test, y_train, y_test)
  dt = decision_tree_regressor(X_train, y_train, X_test, y_test)

  regressors = [rf, xg, dt]

  # # Does Cross Evaluation
  values = k_fold_cross_validation(regressors, X_train, y_train)

  maximum = max(values)

  for index in range(len(regressors)):
    if values[index] == maximum:
      best_model = hyperparameter_tuning(regressors[index], X_train, y_train)
      break

  return best_model

In [None]:
overall_model = pipe(X_train, X_test, y_train, y_test)

 Random Forest Regressor Results
    Mean Absolute Error=0.9505149062781345
    Mean Squared Error=2.0367594688659305
    Root Mean Squared Error=1.4271508220457747
    R2 score=0.9571748656449258
  
 XGBoost Regressor Results
    Mean Absolute Error=1.1012176001142384
    Mean Squared Error=2.2897223157866042
    Root Mean Squared Error=1.513182842813982
    R2 score=0.9517935183338835
  
 Decision Tree Regressor Results
    Mean Absolute Error=1.162375839341523
    Mean Squared Error=4.0079602067023545
    Root Mean Squared Error=2.0019890625830987
    R2 score=0.9196051200161753
  


In [None]:
important_features = ['potential', 'value_eur', 'wage_eur', 'passing', 'dribbling', 'movement_reactions', 'mentality_composure']

In [None]:
def model_evaluation(model, X_test, Y_test):

  prediction = model.predict(X_test)

  # Calculates the mean squared error
  r2score = r2_score(Ytest, prediction)
  mae = mean_absolute_error(prediction, Y_test)
  mse = mean_squared_error(prediction, Y_test)
  rmse = np.sqrt(mse)

  # Imports the model
  with open('/content/drive/My Drive/Regression_Model.pkl', 'wb') as model_file:
    pkl.dump(model, model_file)

  # Prints the metrics
  print(f'R2 Score: {r2score}')
  print(f'Mean Absolute Error: {mae}')
  print(f'Mean Squared Error: {mse}')
  print(f'Root Mean Squared Error: {rmse}')


In [None]:
Xtest = players_22[important_features]
Xtest = data_cleaning(Xtest)
Xtest = scaling(Xtest)
Ytest = players_22['overall']

In [None]:
model_evaluation(overall_model, Xtest, Ytest)