## XGBoost Hyperparameter Tuning
A useful tutorial on tuning XGBoost parameters (with Python code):
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
# Install xgboost in notebook instance.
#### Command to install xgboost
#!pip install xgboost==0.90

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import train_test_split

#from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Performing grid search

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the diabetes dataset from the repository's Data directory (path is relative to this notebook).
data = pd.read_csv(filepath_or_buffer="../Data/Diabetes.csv")

In [None]:
# Show pandas' summary statistics for the loaded frame as the cell output.
data.describe()

In [None]:
# Print the frame's column listing (dtypes and non-null counts) to spot missing data.
data.info()

In [None]:
# Keep only the rows in which none of the columns holds a 0, ignoring the
# first and the last column; then renumber the index from 0.
data = (
    data
    .loc[lambda frame: ~frame.iloc[:, 1:-1].eq(0).any(axis=1)]
    .reset_index(drop=True)
)

### Dealing with Missing Values

In [None]:
# Count missing values per column (isna() is the pandas alias of isnull()).
# For a single total of affected columns use: data.isna().any().sum()
print(data.isna().sum())

In [None]:
# Remove the 'Insulin' feature and renumber the index.
# Rebinding `data` (instead of mutating in place) together with errors='ignore'
# keeps this cell safe to re-run: the original in-place drop raised KeyError on
# a second execution because the column was already gone.
data = data.drop(columns=['Insulin'], errors='ignore').reset_index(drop=True)

In [None]:
# Impute missing values column-by-column with that column's median.
# Alternative (mean imputation):
#   data = data.fillna(data.mean())
data = data.fillna(data.median())

# Alternative (discard every row that has a missing value):
#   data = data.dropna().reset_index(drop=True)

### Split Data

In [None]:
X = data.iloc[:, :-1]  # Features: all columns except the last
# Target: last column as a 1-D NumPy array. `.to_numpy()` replaces
# `Series.ravel()`, which is deprecated in recent pandas releases.
y = data.iloc[:, -1].to_numpy()
# Hold out 33% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Useful Function
* This function will help us create XGBoost models and perform cross-validation. 
* The best part is that you can take this function as it is and use it later for your own models.

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             target='Disbursed'):
    """Fit an XGBoost scikit-learn style classifier and print a training report.

    Parameters
    ----------
    alg : estimator
        Object exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict``, ``predict_proba`` and ``get_booster``
        (e.g. ``xgb.XGBClassifier``). It is modified in place.
    dtrain : pandas.DataFrame
        Training frame containing the predictor columns and the label column.
    predictors : list
        Column labels of ``dtrain`` used as features.
    useTrainCV : bool, default True
        When true, run ``xgb.cv`` first and set ``alg``'s ``n_estimators`` to
        the number of rows in the returned CV result.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.
    target : str, default 'Disbursed'
        Label column in ``dtrain``. The original code mixed an undefined global
        ``target`` with the hard-coded name 'Disbursed'; both are unified here,
        with the hard-coded name kept as the default.

    Returns
    -------
    None
        Prints accuracy and AUC on the training data and draws a bar chart of
        feature importances on the current matplotlib figure.
    """
    # Imported locally because the notebook's import cell has the
    # `sklearn.metrics` import commented out, so `metrics` was undefined here.
    from sklearn.metrics import accuracy_score, roc_auc_score

    features = dtrain[predictors]
    labels = dtrain[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
            # `show_progress` is not an accepted argument of xgb.cv;
            # `verbose_eval=False` is the supported way to silence per-round logs.
            verbose_eval=False,
        )
        # One row per boosting round kept by CV -> use that as the tree count.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. No eval_set is supplied, so an eval_metric
    # passed to fit() would have nothing to evaluate; it is omitted because
    # recent xgboost releases reject that keyword on fit().
    alg.fit(features, labels)

    # Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(labels, dtrain_predprob))

    # `get_booster()` is the accessor for the underlying Booster; the old
    # `booster()` call fails on current sklearn wrappers.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

### Tune max_depth and min_child_weight

In [None]:
%%time
# Grid of candidate values for the two tree-complexity parameters being tuned.
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [4, 5, 6],
}

# 5-fold cross-validated grid search scored by ROC AUC.
# The former `iid=False` argument is dropped: it was deprecated in
# scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError); since
# 0.22 the default scoring already matches the old `iid=False` behavior.
gsearch2 = GridSearchCV(
    estimator=xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    return_train_score=True,
)

gsearch2.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated ROC AUC.
print(gsearch2.best_params_)
print(gsearch2.best_score_)
#print(gsearch2.cv_results_)

In [None]:
# NOTE(review): exploratory listing of the attribute names on the fitted search
# object; looks like leftover inspection - consider removing before sharing.
dir(gsearch2)