# Binary classification benchmark

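This notebook benchmarks binary classification models from several libraries (river, PyTorch, scikit-learn, scikit-garden and scikit-multiflow), comparing their accuracy, fit time and predict time. The dataset used is the [Higgs dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS).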

In [1]:
%load_ext watermark
%watermark --python --machine --packages river,keras,skgarden,sklearn,tensorflow,torch --datename

Using TensorFlow backend.


Fri Jan 03 2020 

CPython 3.7.4
IPython 7.4.0

river 0.4.4
keras 2.2.4
skgarden 0.1.2
sklearn 0.22
tensorflow 2.0.0
torch 1.3.1

compiler   : GCC 7.3.0
system     : Linux
release    : 5.2.10-arch1-1-ARCH
machine    : x86_64
processor  : 
CPU cores  : 8
interpreter: 64bit


In [6]:
from river import *
from keras import layers
from keras import models
from keras import optimizers
import numpy as np
import skgarden
from sklearn import linear_model as sk_linear_model
from skmultiflow import trees
import torch

%run utils.py
%run wrappers.py

In [None]:
# Number of input features: the 28 columns listed in `names` below, i.e. every column except the `is_signal` target.
n_features = 28

class PyTorchNet(torch.nn.Module):
    """Single-layer logistic regression network.

    A linear layer maps ``n_features`` inputs to one output, which a sigmoid
    then squashes into (0, 1). Weight and bias both start at zero, so an
    untrained network outputs 0.5 for every input.
    """

    def __init__(self, n_features):
        """Build the linear layer and sigmoid, then zero the layer's parameters."""
        super().__init__()
        self.linear = torch.nn.Linear(n_features, 1)
        self.sigmoid = torch.nn.Sigmoid()
        # Replace the layer's default initialisation with zeros (weight first, then bias).
        for parameter in (self.linear.weight, self.linear.bias):
            torch.nn.init.zeros_(parameter)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        return self.sigmoid(logits)
    
# Instantiate the PyTorch network; the benchmark's 'PyTorch (CPU)' entry wraps this instance
# and its optimizer is built from this instance's parameters.
torch_model = PyTorchNet(n_features=n_features)

# Keras
# Disabled Keras counterpart of the network above (one sigmoid unit, zero-initialised kernel and bias).
# It is kept for reference together with the commented-out Keras entry in the benchmark's model list.
#inputs = layers.Input(shape=(n_features,))
#predictions = layers.Dense(1, activation='sigmoid', kernel_initializer='zeros', bias_initializer='zeros')(inputs)
#keras_model = models.Model(inputs=inputs, outputs=predictions)
#keras_model.compile(optimizer=optimizers.SGD(), loss='binary_crossentropy')

# Column names handed to the CSV reader as `fieldnames`: the target first, then the 28 features.
# The order is significant because the names are matched to the file's columns by position.
names = [
    # target
    'is_signal',
    # lepton
    'lepton pT', 'lepton eta','lepton phi',
    # missing energy
    'missing energy magnitude', 'missing energy phi',
    # jets 1 to 4
    'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag',
    'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag',
    'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag',
    'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag',
    # m_* columns
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

# Every column is parsed as a float ...
converters = dict.fromkeys(names, float)
# ... except the target, whose numeric text is parsed as a float and then turned into a bool.
converters['is_signal'] = lambda raw: bool(float(raw))


import itertools

# Number of rows to benchmark on.
n = 100_000  # 11_000_000 (larger alternative; presumably the full dataset, confirm)


def get_X_y():
    """Return a fresh iterator over the first ``n`` rows of the gzipped HIGGS CSV.

    The CSV is read with ``stream.iter_csv`` using the column names and
    converters defined above, with ``is_signal`` as the target, and the
    result is truncated to ``n`` items with ``itertools.islice``. ``n`` is
    looked up at call time, so changing it affects later calls.
    """
    rows = stream.iter_csv(
        '../HIGGS.csv.gz',
        fieldnames=names,
        target_name='is_signal',
        converters=converters,
    )
    return itertools.islice(rows, n)

# Models to benchmark, as (library, model name, model instance) triples.
# Non-river models are wrapped in adapter classes (PyTorchBinaryClassifier, ScikitLearnClassifier);
# presumably these come from `%run wrappers.py`, confirm.
contenders = [
    # river
    ('river', 'LogisticRegression', linear_model.LogisticRegression()),
    ('river', 'PAClassifier', linear_model.PAClassifier()),
    ('river', 'KNeighborsClassifier', neighbors.KNeighborsClassifier()),
    ('river', 'DecisionTreeClassifier', tree.DecisionTreeClassifier()),
    ('river', 'RandomForestClassifier', tree.RandomForestClassifier()),
    # Keras (disabled)
    # ('Keras on Tensorflow (CPU)', 'Dense', KerasBinaryClassifier(keras_model)),
    # PyTorch: the optimizer is built from the parameters of the same network it trains
    (
        'PyTorch (CPU)',
        'Linear',
        PyTorchBinaryClassifier(
            network=torch_model,
            loss_fn=torch.nn.BCELoss(),
            optimizer=torch.optim.SGD(torch_model.parameters(), lr=0.01),
        ),
    ),
    # scikit-learn
    (
        'scikit-learn',
        'SGDClassifier',
        ScikitLearnClassifier(
            sk_linear_model.SGDClassifier(loss='log'),
            classes=[False, True],
        ),
    ),
    (
        'scikit-learn',
        'PassiveAggressiveClassifier',
        ScikitLearnClassifier(
            sk_linear_model.PassiveAggressiveClassifier(),
            classes=[False, True],
        ),
    ),
    # scikit-garden: classes are passed as a NumPy array here, unlike the plain lists used elsewhere
    (
        'scikit-garden',
        'MondrianTreeClassifier',
        ScikitLearnClassifier(
            skgarden.MondrianTreeClassifier(random_state=42),
            classes=np.array([False, True]),
        ),
    ),
    (
        'scikit-garden',
        'MondrianForestClassifier',
        ScikitLearnClassifier(
            skgarden.MondrianForestClassifier(random_state=42),
            classes=np.array([False, True]),
        ),
    ),
    # scikit-multiflow
    (
        'scikit-multiflow',
        'HoeffdingTree',
        ScikitLearnClassifier(
            trees.HoeffdingTree(leaf_prediction='mc'),
            classes=[False, True],
        ),
    ),
]

# Run every contender on the first `n` rows, with StandardScaler as the preprocessing factory
# and Accuracy as the metric factory.
results = benchmark(
    get_X_y=get_X_y,
    n=n,
    get_pp=preprocessing.StandardScaler,
    models=contenders,
    get_metric=metrics.Accuracy,
)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))

In [None]:
# NOTE(review): unexecuted duplicate of the `results` display cell that follows; one of the two can be removed.
results

In [8]:
# Display the benchmark summary table.
# NOTE(review): the saved output includes a 'Keras on Tensorflow (CPU)' row although the Keras entry
# is commented out in the benchmark cell, so this output looks stale. Re-run the notebook to refresh it.
results

Unnamed: 0,Library,Model,Accuracy,Fit time,Average fit time,Predict time,Average predict time
0,river,LogisticRegression,0.6181,"2s, 653ms, 445μs, 18ns","26μs, 534ns","1s, 146ms, 206μs, 360ns","11μs, 462ns"
1,river,PAClassifier,0.55009,"3s, 850ms, 575μs, 326ns","38μs, 506ns","2s, 364ms, 993μs, 939ns","23μs, 650ns"
2,river,KNeighborsClassifier,0.5305,"285ms, 405μs, 586ns","2μs, 854ns","1m, 51s, 872ms, 59μs, 362ns","1ms, 118μs, 721ns"
3,river,DecisionTreeClassifier,0.64663,"38s, 330ms, 11μs, 367ns","383μs, 300ns","1s, 601ms, 316μs, 845ns","16μs, 13ns"
4,river,RandomForestClassifier,0.65648,"6m, 30s, 695ms, 473μs, 639ns","3ms, 906μs, 955ns","20s, 755ms, 7μs, 824ns","207μs, 550ns"
5,Keras on Tensorflow (CPU),Dense,0.6184,"44s, 266ms, 650μs, 369ns","442μs, 667ns","47s, 689ms, 863μs, 668ns","476μs, 899ns"
6,PyTorch (CPU),Linear,0.6184,"21s, 188ms, 666μs, 303ns","211μs, 887ns","12s, 717ms, 581μs, 991ns","127μs, 176ns"
7,scikit-learn,SGDClassifier,0.56161,"26s, 520ms, 629μs, 619ns","265μs, 206ns","7s, 285ms, 170μs, 482ns","72μs, 852ns"
8,scikit-learn,PassiveAggressiveClassifier,0.55009,"29s, 77ms, 978μs, 654ns","290μs, 780ns","8s, 305ms, 4μs, 333ns","83μs, 50ns"
9,scikit-garden,MondrianTreeClassifier,0.53875,"1m, 3s, 318ms, 947μs, 905ns","633μs, 189ns","15s, 137ms, 836μs, 527ns","151μs, 378ns"
