[**XGBoost**](https://xgboost.readthedocs.io/en/latest/) is a versatile tool for building classifiers!

In [1]:
# Environment preparation: set process-level environment variables and make the
# project-local `util` module importable before any other cell runs.
from pathlib import Path
import os
import sys
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that can
# happen when several native libraries each load their own OpenMP — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Point TENSORBOARD_BINARY at a `tensorboard` path in the same directory as the running interpreter.
os.environ['TENSORBOARD_BINARY'] = str(Path(sys.executable).parent / "tensorboard")
try:
    import util
except ImportError:
    # Fallback: `util` was not importable, so append the project checkout to sys.path and retry.
    # Modify this according to your local env
    _src_root = Path.home() / "imageqc-master"
    sys.path.append(str(_src_root))
    import util

In [2]:
# Plotting / numerical libraries plus the stdlib datetime module
import matplotlib.pyplot as plt
import numpy; np = numpy  # Personal preference; binds both `numpy` and `np` to the same module.
import datetime

# Seed NumPy's global random state so NumPy-based randomness is repeatable across runs.
np.random.seed(1)

In [3]:
# Initialize global variables
# NOTE(review): HOME_DIR, TRAIN_DIR, VALIDATION_DIR, TEST_DIR and EXTENSION are not defined
# anywhere in this notebook, so they are presumably injected into the notebook namespace by
# util.inject_config() — confirm in util. The prints below echo them as a sanity check.
util.inject_config()
print(f"Project directory is {HOME_DIR}") 
print(f"Training set is in {TRAIN_DIR}") 
print(f"Validation(dev) set is in {VALIDATION_DIR}")
print(f"Evaluation(test) set is in {TEST_DIR}") 
print(f"Input image format is {EXTENSION}")

Project directory is /Users/jytang/imageqc-master
Training set is in /Users/jytang/imageqc-master/train_images
Validation(dev) set is in /Users/jytang/imageqc-master/validation_images
Evaluation(test) set is in /Users/jytang/imageqc-master/test_images
Input image format is .png


### Load the training set and its labels

In [4]:
# Read the "train" split (images and labels) through the project helper.
# NOTE(review): onehot=False presumably yields plain class labels rather than one-hot
# vectors — confirm in util.read_image.
x_train, y_train = util.read_image("train", EXTENSION, onehot=False)

Reuse cached array train_cache.npz


### Load dev set (validation images) and labels

In [5]:
# Read the "validation" (dev) split (images and labels) through the project helper.
# NOTE(review): onehot=False presumably yields plain class labels rather than one-hot
# vectors — confirm in util.read_image.
x_validation, y_validation = util.read_image("validation", EXTENSION, onehot=False)

Reuse cached array validation_cache.npz


### Load test set (evaluation images) and labels

In [6]:
# Read the "test" (evaluation) split (images and labels) through the project helper.
# NOTE(review): onehot=False presumably yields plain class labels rather than one-hot
# vectors — confirm in util.read_image.
x_test, y_test = util.read_image("test", EXTENSION, onehot=False)

Reuse cached array test_cache.npz


### Create a simple binary classifier with gradient-boosted trees

In [7]:
import xgboost as xgb

In [15]:
# A naive way to shrink the feature vector, simply for faster training.
# NOTE: this keeps every `stride`-th value of the *flattened* sample. With the default
# stride of 64 that retains as many values as a (w/8, h/8) image would have, but it is
# not a true 2-D spatial downsample.
def dataset_to_feature_array(dataset, stride=64):
    """Flatten each sample and keep every `stride`-th value as a feature.

    Parameters
    ----------
    dataset : numpy.ndarray
        Array whose first axis indexes samples; all remaining axes are
        flattened into a single feature axis.
    stride : int, optional
        Step between retained values along the flattened feature axis.
        Defaults to 64, which reproduces the original behaviour.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per sample. For a positive `stride` each row
        has ceil(n_values / stride) columns.
    """
    flat = dataset.reshape((dataset.shape[0], -1))
    return flat[:, ::stride]

In [16]:
# Wrap the subsampled train / validation / test features together with their labels
# in xgb.DMatrix objects, which are what xgb.train and the trained model's predict() receive below.
x_train_boost = xgb.DMatrix(dataset_to_feature_array(x_train), label=y_train)
x_validation_boost = xgb.DMatrix(dataset_to_feature_array(x_validation), label=y_validation)
x_test_boost = xgb.DMatrix(dataset_to_feature_array(x_test), label=y_test)

In [26]:
# Many of these parameters sacrifice accuracy so that training fits in this session.
# With a GPU machine and more time they could be relaxed (e.g. smaller eta, more rounds).
param = {'max_depth': 3,
         'eta': 1,
         'verbosity': 2,  # 2 = info-level logging; replaces the deprecated 'silent' flag
         'objective': 'binary:logistic',  # predict() then returns probabilities
         'subsample': 0.5,
         'colsample_bytree': 0.5,
         'eval_metric': 'logloss'
        }
# In the native API the number of trees is set by num_boost_round; there is no
# 'n_estimators' booster parameter (that name belongs to the scikit-learn wrapper),
# so the misspelled 'n_estimatores' entry had no effect and has been dropped.
clf = xgb.train(param, x_train_boost, num_boost_round=30)

[19:15:35] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[19:15:36] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[19:15:37] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[19:15:37] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[19:15:38] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[19:15:39] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[19:15:40] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[19:15:40] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[19:15:41] INFO: src/tree/updater_prune.cc:74: tree pruning end,

In [None]:
# xgb.train() above already returned a trained Booster, and Booster has no fit() method
# (fit() belongs to the scikit-learn wrapper, e.g. xgb.XGBClassifier), so the former
# `clf.fit(...)` call here would raise AttributeError and break Restart & Run All.
# Report the configured eval_metric on the dev set instead.
print(clf.eval(x_validation_boost, name="validation"))

### Evaluation

In [27]:
# Turn the predicted probabilities into hard 0/1 labels using a 0.5 cut-off.
result = (clf.predict(x_test_boost) > 0.5).astype(int)
print(result)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [28]:
# `result` already holds hard 0/1 predictions, so compare it with the labels directly;
# the former extra thresholding at 0.4 was a no-op and inconsistent with the 0.5 cut-off.
# The score is the fraction of test samples predicted correctly, in percent.
accuracy = (result == y_test).sum() / len(result)
print(f"Score = {accuracy * 100:.2f}%")

Score = 99.41%
