In [8]:
import pandas as pd
import numpy as np
import pickle as pkl
import joblib as j

In [9]:
from category_encoders import BinaryEncoder

In [10]:
from sklearn.experimental import enable_iterative_imputer

In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [12]:
from sklearn.feature_extraction import FeatureHasher 

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error 

In [15]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
from xgboost import XGBRegressor

In [18]:
# Load the raw datasets.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave a column holding a mix
# of numbers and strings (pandas reports this as a DtypeWarning), which would
# later be treated as distinct categories by the categorical encoder.
mlegacy = pd.read_csv('male_players (legacy).csv', low_memory=False)
pl22 = pd.read_csv('players_22-1.csv', low_memory=False)

  mlegacy = pd.read_csv('male_players (legacy).csv')
  pl22 = pd.read_csv('players_22-1.csv')


In [19]:
def data_preparation(df, null_threshold=0.30, n_features=15, target='overall'):
    """Turn a raw players table into a scaled feature matrix and a target vector.

    Pipeline:
      1. Drop every column whose share of missing values exceeds ``null_threshold``.
      2. Impute numeric columns with ``IterativeImputer`` (values are rounded to
         whole numbers afterwards) and object columns with their most frequent value.
      3. Binary-encode the object columns.
      4. Keep the ``n_features`` columns with the highest absolute Pearson
         correlation to ``target``.
      5. Standardize the selected columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table. It is NOT modified; all work happens on derived frames.
    null_threshold : float, default 0.30
        Maximum tolerated fraction of nulls per column.
    n_features : int, default 15
        Number of predictor columns to keep.
    target : str, default 'overall'
        Name of the numeric column to predict.

    Returns
    -------
    x : pandas.DataFrame
        Standardized predictors (fresh 0..n-1 index), without ``target``.
    y : pandas.Series
        Imputed and rounded target values, aligned with ``x`` by position.

    Raises
    ------
    KeyError
        If ``target`` is not among the numeric columns that survive step 1.

    Notes
    -----
    The imputer (which also sees the target column) and the scaler are fitted on
    the full table, i.e. before any train/test split performed by the caller, so
    hold-out scores computed downstream are optimistic.
    """
    # Non in-place drop: the caller's DataFrame keeps all of its columns.
    df = df.dropna(axis=1, thresh=(1 - null_threshold) * len(df))

    # Split data into numeric and non-numeric columns.
    numeric = df.select_dtypes(include=np.number)
    non_numeric = df.select_dtypes(include=['object'])

    # Fail early (before the expensive imputation) with an explicit message.
    if target not in numeric.columns:
        raise KeyError(
            f"target column {target!r} is missing, non-numeric, or was dropped "
            f"for exceeding the null threshold"
        )

    # Impute numeric data; rounding keeps the imputed values on an integer grid.
    imp = IterativeImputer(max_iter=10, random_state=0)
    numeric_imputed = pd.DataFrame(
        np.round(imp.fit_transform(numeric)), columns=numeric.columns
    )

    if non_numeric.shape[1] > 0:
        # Impute non-numeric data with the mode, then binary-encode it.
        imp2 = SimpleImputer(strategy='most_frequent')
        non_numeric_imputed = pd.DataFrame(
            imp2.fit_transform(non_numeric), columns=non_numeric.columns
        )
        non_num_encoded = BinaryEncoder().fit_transform(non_numeric_imputed)
    else:
        # Nothing to impute/encode; keep an empty, index-aligned placeholder.
        non_num_encoded = pd.DataFrame(index=numeric_imputed.index)

    # Dependent and independent variables.
    y = numeric_imputed[target]
    x = pd.concat([numeric_imputed, non_num_encoded], axis=1)

    # Rank all columns by |corr| with the target, remove the target itself from
    # the ranking, then keep the strongest n_features predictors.
    ranked = x.corr()[target].abs().sort_values(ascending=False)
    selected_features = ranked.drop(target).index[:n_features]
    x = x[selected_features]

    scaler = StandardScaler()
    x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

    return x, y

In [None]:
# Build the feature matrix and target from the legacy players table.
x, y = data_preparation(df=mlegacy)

In [None]:
# Hold out 30% of the legacy data for evaluation (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=7,
)

In [None]:
# Baseline random forest: 5-fold CV on the training split, then hold-out evaluation.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

cv_scores = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='r2')
print(f'Cross-validation R^2 scores: {cv_scores}')

rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
# RMSE is the square root of the mean *squared* error, not of the MAE.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [None]:
# Baseline gradient boosting: 5-fold CV on the training split, then hold-out evaluation.
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=7)

cv_scores = cross_val_score(gbr_model, x_train, y_train, cv=5, scoring='r2')
print(f'Cross-validation R^2 scores: {cv_scores}')

gbr_model.fit(x_train, y_train)
y_pred = gbr_model.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
# RMSE is the square root of the mean *squared* error, not of the MAE.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [None]:
# Baseline XGBoost: 5-fold CV on the training split, then hold-out evaluation.
xgb_model = XGBRegressor(n_estimators=100, random_state=7)

cv_scores = cross_val_score(xgb_model, x_train, y_train, cv=5, scoring='r2')
print(f'Cross-validation R^2 scores: {cv_scores}')

xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
# RMSE is the square root of the mean *squared* error, not of the MAE.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [None]:
# Random Forest Regressor: exhaustive search over a small hyper-parameter grid,
# scored by negative MSE with 3-fold cross-validation on the training split.
param_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5],
)

grid_searchr = GridSearchCV(
    estimator=rf_model,
    param_grid=param_rf,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_searchr.fit(x_train, y_train)
print('Random Forest best parameters: ', grid_searchr.best_params_)

# Estimator refitted on the whole training split with the winning parameters.
fine_tuned_rf = grid_searchr.best_estimator_

In [None]:
# Gradient Boosting Regressor: exhaustive search over a small hyper-parameter grid,
# scored by negative MSE with 3-fold cross-validation on the training split.
param_gbr = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5],
)

grid_searchg = GridSearchCV(
    estimator=gbr_model,
    param_grid=param_gbr,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_searchg.fit(x_train, y_train)
print('Gradient Boosting best parameters: ', grid_searchg.best_params_)

# Estimator refitted on the whole training split with the winning parameters.
fine_tuned_gbr = grid_searchg.best_estimator_

In [None]:
# XGBoost regressor: exhaustive search over a small hyper-parameter grid,
# scored by negative MSE with 3-fold cross-validation on the training split.
param_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5],
)

grid_searchx = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_xgb,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_searchx.fit(x_train, y_train)
print("XGBoost best parameters: ", grid_searchx.best_params_)

# Estimator refitted on the whole training split with the winning parameters.
fine_tuned_xgb = grid_searchx.best_estimator_

In [None]:
# Prepare the new (players_22) data with the same preparation routine.
x2, y2 = data_preparation(df=pl22)

In [None]:
# Break the new data up for training and testing (30% held out, fixed seed).
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x2,
    y2,
    test_size=0.3,
    random_state=20,
)

In [None]:
# Refit the tuned random forest on the new training split and score it on the
# matching hold-out split.
fine_tuned_rf.fit(x_train2, y_train2)
y_pred2 = fine_tuned_rf.predict(x_test2)

print(f'MAE: {mean_absolute_error(y_test2, y_pred2)}')
# RMSE is the square root of the mean *squared* error, not of the MAE.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test2, y_pred2))}')

In [None]:
# Refit the tuned gradient boosting model on the new training split and score it
# on the matching hold-out split.
fine_tuned_gbr.fit(x_train2, y_train2)
y_pred2 = fine_tuned_gbr.predict(x_test2)

print(f'MAE: {mean_absolute_error(y_test2, y_pred2)}')
# RMSE is the square root of the mean *squared* error, not of the MAE.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test2, y_pred2))}')

In [None]:
# Refit the tuned XGBoost model on the new training split and score it on the
# matching hold-out split.
fine_tuned_xgb.fit(x_train2, y_train2)
y_pred2 = fine_tuned_xgb.predict(x_test2)

print(f'MAE: {mean_absolute_error(y_test2, y_pred2)}')
# RMSE is the square root of the mean *squared* error, not of the MAE.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test2, y_pred2))}')

In [None]:
# Persist the tuned random forest with joblib.
# NOTE(review): only the estimator is saved; the fitted scaler and the list of
# selected feature columns live inside data_preparation and are not persisted.
j.dump(value=fine_tuned_rf, filename='random_forest_model.pkl')