In [13]:
import pandas as pd
from IPython.display import clear_output

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

dataset = load_iris()
n_classes = len(dataset.target_names)

dataset_X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
dataset_y = pd.Series(dataset.target)

train_X, test_X, train_y, test_y = train_test_split(dataset_X, dataset_y, test_size=0.3)
[i.shape for i in [train_X, test_X, train_y, test_y]]

[(105, 4), (45, 4), (105,), (45,)]

In [2]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.estimator import BoostedTreesClassifier

In [3]:
NUMERIC_COLUMNS = dataset.feature_names
feature_columns = []
# for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
#     vocabulary = dftrain[feature_name].unique()
#     feature_columns.append(one_hot_cat_column(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name,
                                           dtype=tf.float32))

In [65]:
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(train_y)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle thru dataset as many times as need (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In memory training doesn't use batching.
        dataset = dataset.batch(NUM_EXAMPLES)
        return dataset
    return input_fn

# Training and evaluation input functions.
# train_input_fn = make_input_fn(train_X, keras.utils.to_categorical(train_y))
train_input_fn = make_input_fn(train_X, train_y.astype("int"), n_epochs=500)
eval_input_fn = make_input_fn(test_X, test_y, shuffle=False, n_epochs=1)

In [71]:
btc = BoostedTreesClassifier(feature_columns=feature_columns, n_batches_per_layer=1,
                             n_classes=3, max_depth=2,
                             n_trees=1300)
clear_output()

In [72]:
btc.train(train_input_fn, max_steps=1000)
clear_output()

# Eval.
result = btc.evaluate(eval_input_fn, steps=1000)
clear_output()
print(pd.Series(result))

accuracy          0.955556
average_loss      0.903528
loss              0.903528
global_step     499.000000
dtype: float64


In [73]:
pred_test_y = pd.DataFrame(btc.predict(eval_input_fn))["class_ids"].apply(lambda x:x[0])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp9as89bfe/model.ckpt-499
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# evaluation

In [74]:
from sklearn.metrics.classification import confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef

In [75]:
cr = classification_report(test_y, pred_test_y)
print(cr)
matthews_corrcoef(test_y, pred_test_y)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      0.85      0.92        13
           2       0.88      1.00      0.93        14

    accuracy                           0.96        45
   macro avg       0.96      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45



0.9353478638403593

In [76]:
cm = confusion_matrix(test_y, pred_test_y)
cm

array([[18,  0,  0],
       [ 0, 11,  2],
       [ 0,  0, 14]])