### 1) XGBoost - Basic Example

In [None]:
from xgboost import XGBClassifier

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [3]:
data = load_iris()

X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=.2)

bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')

In [4]:
bst.fit(X_train, y_train)

In [5]:
preds = bst.predict(X_test)

In [6]:
preds

array([2, 2, 2, 1, 1, 2, 2, 2, 0, 1, 1, 0, 0, 1, 1, 2, 2, 2, 0, 1, 1, 1,
       2, 2, 0, 2, 0, 0, 0, 2])

### 2) XGBoost - Hyperparameter Search

In [18]:
import numpy as np
import xgboost as xgb

from sklearn.metrics import accuracy_score

In [19]:
train = xgb.DMatrix('xgboost_example_data/train.csv?format=csv&label_column=0')
test = xgb.DMatrix('xgboost_example_data/test.csv?format=csv&label_column=0')

In [22]:
# ROUND_RANGE = range(40, 100)
# DEPTH_RANGE = range(2, 30)

ROUND_RANGE = range(40, 42)
DEPTH_RANGE = range(2, 20)

optimal_accuracy = 0

for num_round in ROUND_RANGE:
    for depth in DEPTH_RANGE:
        
        print('num_rounds = ' + str(num_round))
        print('depth      = ' + str(depth))

        param = {'objective': 'binary:logistic',
                 'eval_metric': 'logloss',
                 'max_depth': depth,
                 'eta': 0.5
                }

        bst = xgb.train(param, train, num_round)
        y_pred = bst.predict(test)

        accuracy = accuracy_score(test.get_label(), np.around(y_pred))
        if accuracy > optimal_accuracy:
            print('new optimal accuracy = ' + str(accuracy))
            optimal_accuracy = accuracy
            optimal_num_round = num_round
            optimal_max_depth = depth

print('Optimal accuracy     = ' + str(optimal_accuracy))
print('Optimal num_round    = ' + str(optimal_num_round))
print('Optimal max_depth    = ' + str(optimal_max_depth))

num_rounds = 40
depth      = 2
new optimal accuracy = 0.9021842355175689
num_rounds = 40
depth      = 3
new optimal accuracy = 0.9240265906932573
num_rounds = 40
depth      = 4
new optimal accuracy = 0.9354226020892688
num_rounds = 40
depth      = 5
new optimal accuracy = 0.9373219373219374
num_rounds = 40
depth      = 6
num_rounds = 40
depth      = 7
num_rounds = 40
depth      = 8
num_rounds = 40
depth      = 9
num_rounds = 40
depth      = 10
num_rounds = 40
depth      = 11
new optimal accuracy = 0.9430199430199431
num_rounds = 40
depth      = 12
num_rounds = 40
depth      = 13
num_rounds = 40
depth      = 14
num_rounds = 40
depth      = 15
num_rounds = 40
depth      = 16
num_rounds = 40
depth      = 17
num_rounds = 40
depth      = 18
num_rounds = 40
depth      = 19
num_rounds = 41
depth      = 2
num_rounds = 41
depth      = 3
num_rounds = 41
depth      = 4
num_rounds = 41
depth      = 5
num_rounds = 41
depth      = 6
num_rounds = 41
depth      = 7
num_rounds = 41
depth      = 8
num_r

### 3) XGBoost - Regressor

In [27]:
import xgboost as xgb

from sklearn.metrics import r2_score


# splitter = Splitter(dataset, batch_size=100, dataloader_workers_count=8)

# data_matrix = xgb.DMatrix(data=dataset.fields, 
#                           label=dataset.targets)

In [28]:
regressor = xgb.XGBRegressor(eval_metric=r2_score, max_depth=3)

In [8]:
regressor.fit(X=dataset.fields,
              y=dataset.targets);

In [9]:
score = regressor.score(dataset.fields, dataset.targets)  

score

0.30075567170428397

#### Cross-validation

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

scores = cross_val_score(regressor, 
                         dataset.fields,
                         dataset.targets,
                         cv=10)

scores

In [11]:
kfold = KFold(n_splits=10, shuffle=True)

scores = cross_val_score(regressor, 
                         dataset.fields,
                         dataset.targets,
                         cv=kfold)

scores

array([0.26930294, 0.28458036, 0.29093956, 0.25948241, 0.28864461,
       0.25336515, 0.25955703, 0.2834751 , 0.25440335, 0.30816035])

In [12]:
y_pred = regressor.predict(dataset.fields)

r2_score(dataset.targets, y_pred)

0.30075567170428397

#### XGBoost Hyperparameter Tuning Loop

In [11]:
from sklearn.model_selection import train_test_split

In [20]:
MIN_MAX_DEPTH = 7
MAX_MAX_DEPTH = 8
MAX_DEPTH_STEP = 1

MIN_ESTIMATORS_COUNT = 26
MAX_ESTIMATORS_COUNT = 27
MAX_ESTIMATORS_STEP = 1

estimator_counts = list(range(MIN_ESTIMATORS_COUNT, MAX_ESTIMATORS_COUNT, MAX_ESTIMATORS_STEP))
max_depths = list(range(MIN_MAX_DEPTH, MAX_MAX_DEPTH, MAX_DEPTH_STEP))

train_fields, test_fields, train_targets, test_targets = train_test_split(dataset.fields,
                                                                          dataset.targets,
                                                                          test_size=.1)

score_rows = []

for estimators_count in estimator_counts:
    
    score_row = []
    
    print("estimators_count:", estimators_count)
    
    for max_depth in max_depths:
        
        print("max_depth:", max_depth)
                

        regressor = xgb.XGBRegressor(eval_metric=r2_score, 
                                     max_depth=max_depth,
                                     n_estimators=estimators_count)
        
        regressor.fit(X=train_fields,
                      y=train_targets)
        
        score = regressor.score(test_fields, test_targets)
        
        score_row.append(score)
        
        print("score:", score)
    
    score_rows.append(score_row)
        

estimators_count: 26
max_depth: 7
score: 0.322709083935642


In [21]:
scores = pd.DataFrame.from_records(score_rows, 
                                   index=estimator_counts, 
                                   columns=max_depths)

scores

Unnamed: 0,7
26,0.322709


In [None]:
# scores.to_csv('xgboost_scores.csv')

In [None]:
# scores = pd.read_csv('xgboost_scores.csv', index_col=0)

In [None]:
scores

In [None]:
plt.style.use('dark_background')

plt.xlabel('Maximum Depth')
plt.ylabel('Estimators Count')
plt.yticks(estimator_counts)

plt.imshow(scores);