In [5]:
# pip install xgboost
# pip install hyperopt
#https://hyperopt.github.io/hyperopt/?source=post_page
# pip install category_encoders
# pip install scikit-optimize
# pip install dtreeviz


**Install Libraries**

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import shap

from xgboost import plot_importance
from xgboost import XGBRegressor, plot_importance, plot_tree, plotting


import dtreeviz
import graphviz



from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder



# #for hyperparameter tuning
# import hyperopt
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.preprocessing import StandardScaler




ModuleNotFoundError: No module named 'numba.core'

**Load dataset (football_cleaned_supervised.csv)**

In [None]:
def load_dataset(data):
    df = pd.read_csv(data)
   
    return df


In [None]:
df_football= load_dataset('cleaned_files/football_clean_supervised.csv')

In [None]:
df_football.head(5)

In [None]:
#save athlete names for later indexing
athlete_names = df_football['NAME']
# athlete_names

#remove Unnamed:0 and Sport column- not necessary 
def drop_col(df):
    df=df.drop(columns=['Unnamed: 0', 'NAME', 'STARCOLL', 'institution_name_short','SPORT'])
    return df

In [None]:
df_football = drop_col(df_football)



In [None]:
# # df_football.isna().sum()
df_football.columns
# df_football.info()

**Extract target and features. Perform Train-Test Split**

In [None]:

# Extract target and features
Target = 'NILVAL_LONG_USD'
Predictors = ['GRADE', 'AGE', 'SKILL', 'NUMOFF', 'POS', 'HEIGHT_IN', 'WEIGHT_LBS',
       'COLLDIST_MI', 'INSTA_LONG', 'TWIT_LONG', 'TIK_LONG',
       'TOT_FOL', 'RECRUIT_YEAR', 'EXP_MONTHS', 'EXP_YEARS',
       'ClassificationCode', 'REV_MEN', 'EXP_MEN']

X = df_football.drop('NILVAL_LONG_USD', axis=1).values
y =  df_football[['NILVAL_LONG_USD']].values
# X.head(3)

In [None]:
#split the data
#Shuffle data given concern that dataset has athletes ordered by SKILL (but also look at skew later based on model performance)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True)
print(X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

**XGBoostRegressor Base Model Decision Trees as Base Learners- using all features- using scikit learn**

In [None]:
#using default parameters, boosting rounds = 5  objective =reg:squarederror  XGBoost default is gbtree
xg_reg2 = XGBRegressor(n_estimators=5, random_state= 42)
xg_reg2.fit(X_train, y_train)

In [None]:
# Predict the labels of the test set: preds
preds = xg_reg2.predict(X_test)

# compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

 
#Measuring Goodness of fit in Training data
from sklearn import metrics
print('Train R2 Value:',metrics.r2_score(y_train, xg_reg2.predict(X_train)))
 
#Measuring accuracy on Testing Data
print('Accuracy Test',100- (np.mean(np.abs((y_test - preds) / y_test)) * 100))



#Plotting the feature importance for Top 10 most important columns
%matplotlib inline
feature_importances = pd.Series(xg_reg2.feature_importances_, index=Predictors)
feature_importances.nlargest(10).plot(kind='barh')
plt.xlabel ("Feature Importance")
plt.title("Top 10 Most Important Features - XGBoost Regressor Model (GBtree)")
 
#Printing some sample values of prediction
TestingDataResults=pd.DataFrame(data=X_test, columns=Predictors)
TestingDataResults[Target]=y_test
TestingDataResults[('Predicted'+Target)]=preds
TestingDataResults.head()

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.scatter(y_test, preds)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.show()


In [10]:
metrics.classification_report(y_test, pred)

NameError: name 'metrics' is not defined

In [None]:
# https://github.com/parrt/dtreeviz/blob/master/notebooks/dtreeviz_xgboost_visualisations.ipynb

#initialize dtreevizmodel (adaptor)

viz_rmodel = dtreeviz.model(model=xg_reg2, tree_index=1, 
                            X_train=df_football[Predictors],
                            y_train=df_football[Target], 
                            feature_names=Predictors, 
                            target_name=Target)

In [None]:
#Tree structure visualizations

# viz_rmodel.view()
viz_rmodel.view(orientation="LR")

In [None]:
#Prediction path explanations
x = df_football[Predictors].iloc[10]
x
viz_rmodel.view(x=x)

In [None]:
viz_rmodel.view(show_just_path=True, x = x)

In [None]:
print(viz_rmodel.explain_prediction_path(x))

In [None]:
# viz_rmodel.plot_importance(x, figsize=(3.5,2))

In [None]:
#leaf ratios
viz_rmodel.leaf_sizes()

In [None]:
viz_rmodel.rtree_leaf_distributions()

In [None]:
viz_rmodel.node_stats(node_id=4)

**XGBoost with linear learner base model (GBlinear)**- Have to use XGBoot non scikit learn functions to build model

In [None]:
DM_train = xgb.DMatrix(data=X_train, label=y_train, enable_categorical= True)
DM_test = xgb.DMatrix(data=X_test, label=y_test, enable_categorical = True)


# Create the parameter dictionary: params
params = {"booster":"gblinear", "objective":"reg:squarederror"}

# Define evaluation data
eval_data = [(DM_train, 'train'), (DM_test, 'test')]


# Train the model: xg_reg
xg_gbl = xgb.train(params=params, dtrain=DM_train, evals =eval_data, num_boost_round=5)


# Predict the labels of the test set: preds
preds = xg_gbl.predict(DM_test)

# Calculate the error RMSE
error = xg_gbl.eval(DM_test)
print('Evaluation Error:', error)

# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
# compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE %f:" % (rmse))
#Measuring Goodness of fit in Training data
from sklearn import metrics
print('Train R2 Value:',metrics.r2_score(y_train, xg_gbl.predict(DM_train)))
#Measuring accuracy on Testing Data
print('Accuracy Test',100- (np.mean(np.abs((y_test - preds) / y_test)) * 100))


# # Plot the feature importance
from matplotlib import pyplot
# plt.figure(figsize=(10, 20))
plot_importance(xg_gbl, max_num_features = 15)
pyplot.show()



import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.scatter(y_test, preds)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.show()



In [None]:
y_test_trans = y_test.reshape(-1)
residuals = y_test_trans - preds
residuals
# preds.shape
# # y_test.shape

plt.figure(figsize=(10, 5))
plt.hist(residuals, bins= 20)
plt.xlabel('Residual')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.show()



In [None]:
nil_dmatrix = xgb.DMatrix(data=X, label=y)

# Perform cross-valdiation: cv_results
cv_results = xgb.cv(dtrain=nil_dmatrix, params=params, nfold=4,
                    num_boost_round=100, metrics='rmse', as_pandas=True, seed=42)

# Print cv_results
print(cv_results)

# Extract and print final boosting round metric
print((cv_results['test-rmse-mean']).tail(1))



In [None]:
nil_dmatrix = xgb.DMatrix(data=X, label=y)

# Perform cross-valdiation: cv_results
cv_results = xgb.cv(dtrain=nil_dmatrix, params=params, nfold=4,
                    num_boost_round=100, metrics='mae', as_pandas=True, seed=42)

# Print cv_results
print(cv_results)

# Extract and print final boosting round metric
print((cv_results['test-mae-mean']).tail(1))



In [None]:
# https://github.com/parrt/dtreeviz/blob/master/notebooks/dtreeviz_xgboost_visualisations.ipynb

#initialize dtreevizmodel (adaptor)

viz_rmodel = dtreeviz.model(model=xg_reg2, tree_index=0, 
                            X_train=X, 
                            y_train=y, 
                            feature_names=Predictors, 
                            target_name=Target)

**Finetune XGBoost Model**

In [None]:
estimators = [
    ('encoder', TargetEncoder()),
    ('clf', XGBRegressor(random_state=8)) # can customize objective function with the objective parameter
]
pipe = Pipeline(steps=estimators)
pipe

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Define the hyperparameter distributions

hyperparameter_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth': [2, 3, 5, 10, 15],
    'learning_rate': [0.05, 0.1, 0.15, 0.20],
    'min_child_weight': [1, 2, 3, 4]
    }


# Set up the random search with 5-fold cross validation
random_cv = RandomizedSearchCV(estimator=XGBRegressor(),
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'r2_score',n_jobs = 5,
            verbose = 5, 
            return_train_score = True,
            
            random_state=42)

# Create the XGBoost model object
xgb_model = xgb.XGBRegressor()



# Create the RandomizedSearchCV object
random_cv.fit(X_train,y_train)

random_cv.best_estimator_

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_cv.best_params_)
print("Best score: ", random_cv.best_score_)

Dataset has 3 categorical featurs (NAME, STARCOLL, institution_name_short). XGBoost has the ability to internally deal with categoricals.Enable this feature by castingthe categorical columns into Pandas category data type (by default, they are treated as text columns):

In [None]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [None]:
X.dtypes

**Split data into test and train (0.20 test)**

In [None]:
# Split the data
#Shuffle data given concern that dataset has athletes ordered by SKILL (but also look at skew later based on model performance)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True,random_state=42)


XGBoost comes with its own class for storing datasets called DMatrix. It is a highly optimized class for memory and speed. That's why converting datasets into this format is a requirement for the native XGBoost API. Native API of XGBoost contains some excellent features that Scikit-Learn API doesn’t support

In [None]:

# Create regression matrices
#class accepts both the training features and the labels- enable_categorical = True)
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

Choose a value for the objective parameter- RMSE, minimizes the square root of the squared sum of the differences between actual and predicted values. Objective functions and specified hyperparameters specified in params dictionary.

In [None]:
#num_boost_round =  number of boosting rounds. Hyperparameter to be tuned. Initally set to 100.
#training dataset = dtrain_reg
#function trains the XGBoost regression model with the specified hyperparameters and returns the trained model.
#tree booster always outperforms the linear booster (which israrely used)


# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "hist"}

num_boost_round = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=num_boost_round
   
)

**Evaluation** During boosting rounds, the model object has learned all the patterns of the training set. Perform testing.

In [None]:
preds = model.predict(dtest_reg)


In [None]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE of the base model: {mae:.3f}")




**Using Validation Sets during Training** https://www.datacamp.com/tutorial/xgboost-in-python
Use evaluation arrays that allow us to see model performance as it gets improved incrementally across boosting rounds.

In [None]:
#set parameters again
params = {"objective": "reg:squarederror", "tree_method": "hist"}
num_boost_round = 100

#create list of two tupleshat each contain two elements. 
# #The first element is the array for the model to evaluate.
#The second is the array’s name.

evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

#Pass array to evals parameter of xgb.train to see model boosting performance after each round

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=num_boost_round,
   evals=evals,
   verbose_eval=10 # Every ten rounds   #forces XGBoost to print performance updates every vebose_eval rounds
)

At 100 boosting rounds validation-rsme reached at 60, although training continues to decrease
 

In [None]:
#Trying 5000 rounds with verbosity 250

params = {"objective": "reg:squarederror", "tree_method": "hist"}
num_boost_round = 5000

evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=num_boost_round,
   evals=evals,
   verbose_eval=250
)



**Using Early Stopping** Forces XGBoost to watch the validation loss, and if it stops improving for a specified number of rounds, it automatically stops training.

In [None]:
num_boost_round = 10000
params = {"objective": "reg:squarederror", "tree_method": "hist"}


evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=num_boost_round,
   evals=evals,
   verbose_eval=50,
   early_stopping_rounds=50  #Activiate early stopping
)




Training stopped after round 61 <br>
**Perform K-Fold Cross-Validation**

In [None]:
params = {"objective": "reg:squarederror", "tree_method": "hist"}
num_boost_round = 1000

results = xgb.cv(
   params, 
   dtrain_reg,
   num_boost_round=num_boost_round,
   nfold=5,
   early_stopping_rounds=20
)

#df containing each folds results
results.head(10)



In [None]:
#find the best score by taking the minimum of the test-rmse-mean column
best_rmse = results['test-rmse-mean'].min()

best_rmse
print(f"Best RMSE K-Fold CV: {best_rmse:.3f}")

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 5))
# plt.scatter(y_test, predictions)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
# plt.xlabel('True Values')
# plt.ylabel('Predicted Values')
# plt.title('True vs Predicted Values')
# plt.show()

**Feature Importance** Which features are most influential in making NIL prediction

In [None]:

with plt.style.context("ggplot"):
    fig = plt.figure(figsize=(9,6))
    ax = fig.add_subplot(111)
    xgb.plotting.plot_importance(model, ax=ax, height=0.6, importance_type="weight")

## Hyperparameter tuning**

In [None]:
# # Define hyperparameters
# params = {"objective": "reg:squarederror", "tree_method": "hist"}

#started with baseline parameters

params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective': 'reg:squarederror',
    'tree_method': 'hist'
}

num_boost_round = 999
# params['eval_metric'] = "mae"

# evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
    params = params,
    dtrain =dtrain_reg,
    num_boost_round=num_boost_round,
    evals=[(dtrain_reg, "train"),(dtest_reg, "Test")],
    early_stopping_rounds=10
)

preds = model.predict(dtest_reg)
y_true = y_test # True values

print("Best Test MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))






In [None]:
#cross validation with current parameters
# params = {"objective": "reg:squarederror", "tree_method": "hist"}

cv_results = xgb.cv(
    params,
    dtrain_reg,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae', 'rmse'},
    early_stopping_rounds=10
)
cv_results

In [None]:
# best_mae = cv_results['test-mae-mean'].min()
# best_mae

best_rmse = cv_results['test-rmse-mean'].min()
best_rmse
print(f"Best RMSE CV: {best_rmse:.3f}")
print(f"Best MAE CV: {best_mae:.3f}")

In [None]:

with plt.style.context("ggplot"):
    fig = plt.figure(figsize=(9,6))
    ax = fig.add_subplot(111)
    xgb.plotting.plot_importance(model, ax=ax, height=0.6, importance_type="weight")

In [None]:
#tuning max_depth and min_child_weight
'''  max_depth = maximum number of nodes allowed from the root to the farthest leaf of a tree. 
                Deeper trees can model more complex relationships by adding more nodes, 
                but as we go deeper,splits become less relevant and are sometimes only due to noise,
                causing the model to overfit. 
                min_child_weight = minimum weight (or number of samples if all samples have a weight of 1) 
                required in order to create a new node in the tree.
                A smaller min_child_weight allows the algorithm to create children that 
                correspond to fewer samples, thus allowing for more complex trees, 
                but again, more likely to overfit. '''

gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae', 'rmse'},
        early_stopping_rounds=10
    )
     # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
#update params
params['max_depth'] = 10
params['min_child_weight'] = 5

In [None]:
#tuning subsample and colsample_bytree
'''   Instead of using the whole training set every time, we can build a 
    tree on slightly different data at each step, 
    which makes it less likely to overfit to a single sample or feature.

subsample =  the fraction of observations (the rows) to subsample at each step. 
By default it is set to 1 meaning that we use all rows.
colsample_bytree = the fraction of features (the columns) to use. 
By default it is set to 1 meaning that we will use all features    '''

gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain_reg,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
        
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
#update params
params['subsample'] = 0.8
params['colsample_bytree'] = 1.0

In [None]:
#tuning  eta
import time

'''    eta = controls the learning rate. corresponds to the shrinkage of 
            the weights associated to features after each round/ defines the amount of "correction" 
            at each step
            lower eta makes model more robust to overfitting.
            Usually, the lower the learning rate, the best. 
            With lower eta,  need more boosting rounds, which takes more time to train, 
            sometimes for only marginal improvements


'''
time

%time 
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # Update parameters
    params['eta'] = eta
    # Run and time CV
    %time 
    cv_results = xgb.cv(
            params,
            dtrain_reg,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
          )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

In [None]:
params['eta'] = .005

In [None]:
#final params dictionary
params
{'colsample_bytree': 1.0,
 'eta': 0.005,
'eval_metric': {'mae','rmse'},
'max_depth': 10,
'min_child_weight': 5,
'objective': 'reg:tree',
'subsample': 0.8}

**Train model with new params**

In [None]:
#prior
# num_boost_round = 100
# model = xgb.train(
#    params=params,
#    dtrain=dtrain_reg,
#    num_boost_round=num_boost_round,
# ) 



model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=num_boost_round,
    evals=[(dtest_reg, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))


In [None]:
# Best rounds known, take out early stopping
params
{'colsample_bytree': 1.0,
 'eta': 0.005,
'eval_metric': {'mae','rmse'},
'max_depth': 10,
'min_child_weight': 5,
'objective': 'reg:tree',
'subsample': 0.8}

num_boost_round = model.best_iteration + 1

best_model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=num_boost_round,
    evals=[(dtest_reg, "Test")]
)

predictions = best_model.predict(dtest_reg)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.scatter(y_test, predictions)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.show()

In [None]:
cv_results = xgb.cv(
    params,
    dtrain_reg,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae', 'rmse'},
  
)

#df containing each folds results
cv_results.head(10)

In [None]:
# best_mae = cv_results['test-mae-mean'].min()
# best_mae

best_rmse = cv_results['test-rmse-mean'].min()
best_rmse
print(f"Best RMSE CV: {best_rmse:.3f}")
print(f"Best MAE CV: {best_mae:.3f}")

In [None]:
mean_absolute_error(best_model.predict(dtest_reg), y_test)


NOT getting same MAE as in last round (48733)

**Hyperparameter Tuning with GridSearchCV**


In [None]:
from sklearn.model_selection import GridSearchCV
%time

params = {
    'max_depth' : [8,9,10, 11, 12],
    'min_child_weight': [5,6,7,8],
    'subsample':[0.6,0.7, 0.8, 0.9, 1.0],
    'n_estimators': [50,100],
    'max_depth': [None, 3, 5, 7, 9],
    'eta': [.3, .2, .1, .05, .01, .005]
    }
grid_search = GridSearchCV(xgb.XGBRegressor(), params, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Test  R2 Score : %.2f"%grid_search.score(X_test, y_test))
print("Train R2 Score : %.2f"%grid_search.score(X_train, y_train))

print("Best Params : ", grid_search.best_params_)
# print("Feature Importances : ")
# # pd.DataFrame([grid_search.best_estimator_.feature_importances_], columns=df_football.feature_names)

In [None]:
grid_search_results = pd.DataFrame(grid_search.cv_results_)
print("Grid Search Size : ", grid_search_results.shape)
grid_search_results.head()

## Using select features of dataset

In [None]:
columns = ['GRADE', 'AGE', 'SKILL',  'NUMOFF', 'POS',
       'HEIGHT_IN', 'WEIGHT_LBS', 'COLLDIST_MI', 'NILVAL_LONG_USD',
       'INSTA_LONG', 'TWIT_LONG', 'TIK_LONG', 'TOT_FOL', 
       'EXP_MONTHS',  
       'ClassificationCode', 'REV_MEN', 'EXP_MEN']
df_fb_select =df_football[columns].copy()

# df_fb_select.shape  #(1263,17)

In [None]:
# Extract feature and target arrays
X, y = df_fb_select.drop('NILVAL_LONG_USD', axis=1), df_fb_select[['NILVAL_LONG_USD']]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create regression matrices
#class accepts both the training features and the labels- enable_categorical = True)
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Define hyperparameters
# params = {"objective": "reg:squarederror", "tree_method": "hist"}
params
{'colsample_bytree': 1.0,
 'eta': 0.005,
'eval_metric': {'mae','rmse'},
'max_depth': 10,
'min_child_weight': 5,
'objective': 'reg:tree',
'subsample': 0.8}

num_boost_round = 100

#train model 
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=num_boost_round,
   # evals=[(dtrain_reg, "train"), (dtest_reg, "test")]
   
)
#make predictions
preds = model.predict(dtest_reg)


#Evaluation
print("Train RMSE : ",model.eval(dtrain_reg))
print("Test  RMSE : ",model.eval(dtest_reg))
print("Train  R2 Score : %.2f"%r2_score(y_train, model.predict(dtrain_reg)))
print("Test R2 Score : %.2f"%r2_score(y_test, model.predict(dtest_reg)))


In [None]:
# pd.DataFrame({ "Actuals":y_test[:10], "Prediction":model.predict(dtest_reg)[:10]})

In [None]:
with plt.style.context("ggplot"):
    fig = plt.figure(figsize=(9,6))
    ax = fig.add_subplot(111)
    xgb.plotting.plot_importance(model, ax=ax, height=0.6, importance_type="weight")