# Logistic Regression benchmark


#### Comparison of scikit-learn, H2O and Vowpal Wabbit performing Lasso (L1-regularized) logistic regression on sparse data: 1 million rows and 500 features.

In [4]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import random
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


# Create simulated data

In [5]:
# Seed NumPy's global random generator so repeated runs draw the same data.
np.random.seed(4)
# Problem size: number of rows (samples) and number of columns (features).
n = 1000000
features = 500
# Sparse n x features matrix in which roughly 10% of the entries are non-zero
# (density=0.1), stored in compressed-sparse-row layout.
X = random(n, features, density=0.1, format='csr') # VW expects csr format
# Last expression of the cell: display the matrix shape as a sanity check.
X.shape

(1000000, 500)

In [6]:
# Binary class labels drawn uniformly at random from {-1, 1}, n of them.
# The labels are generated without reference to the features, so there is no
# signal to learn and an accuracy close to 0.5 is the expected outcome.
y = np.random.choice([-1,1], n)
# y is deliberately left one-dimensional; display its shape, (n,), to confirm.
y.shape

(1000000,)

# SKLearn Logistic regression

In [5]:
# Logistic regression with an L1 (Lasso-style) penalty, solved with liblinear.
# All other hyper-parameters are left at the library defaults.
lr_model = LogisticRegression(penalty='l1', solver='liblinear')
# %time reports the CPU and wall-clock time of this single fit on the full data.
%time lr_model.fit(X, y)

CPU times: user 1min 31s, sys: 412 ms, total: 1min 32s
Wall time: 1min 33s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# Score the fitted model on the same data it was trained on (no held-out set).
%time lr_model.score(X,y)

CPU times: user 180 ms, sys: 4 ms, total: 184 ms
Wall time: 188 ms


0.508276

# H2O Lasso regression

In [4]:
# Connect to an H2O server on localhost, starting a local one if none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_151"; Java(TM) SE Runtime Environment (build 1.8.0_151-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.151-b12, mixed mode)
  Starting server from /home/vagrant/.local/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpuyng2oef
  JVM stdout: /tmp/tmpuyng2oef/h2o_vagrant_started_from_python.out
  JVM stderr: /tmp/tmpuyng2oef/h2o_vagrant_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster version:,3.16.0.3
H2O cluster version age:,2 days
H2O cluster name:,H2O_from_python_vagrant_s8yp0d
H2O cluster total nodes:,1
H2O cluster free memory:,1.884 Gb
H2O cluster total cores:,1
H2O cluster allowed cores:,1
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [None]:
# Convert the SciPy sparse feature matrix into an H2OFrame held by the H2O server.
X_h2o = h2o.H2OFrame(X)

In [None]:
# Use every column of the feature frame as a predictor.
predictors = X_h2o.columns

In [None]:
# Convert the labels into their own H2OFrame, with the column named 'y'.
y_h2o = h2o.H2OFrame(y, column_names=['y'])

In [None]:
# Column-bind the features and the response into a single training frame.
h20_frame = X_h2o.cbind(y_h2o)

In [None]:
# Binomial-family GLM, i.e. logistic regression.
# alpha=1 selects a pure L1 (Lasso) penalty; L_BFGS is the requested solver.
glm_model = H2OGeneralizedLinearEstimator(family = 'binomial', alpha=1, solver='L_BFGS')


In [19]:
# Fit the GLM on the combined frame, using 'y' as the response column.
# The work is done by the H2O server process, so the CPU figures printed by
# %time cover only the Python client; wall time is the number to compare.
# NOTE(review): 'y' is uploaded as numeric -1/1; H2O's binomial family may
# expect a categorical response (e.g. via asfactor()) -- confirm.
%time glm_model.train(x=predictors, y='y', training_frame=h20_frame)


glm Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 88 ms, sys: 25.8 ms, total: 114 ms
Wall time: 5.03 s


In [20]:
# Accuracy as reported by H2O for the trained model; the result appears to be
# a list of [threshold, accuracy] pairs -- confirm against the H2O docs.
%time glm_model.accuracy()

CPU times: user 343 µs, sys: 38 µs, total: 381 µs
Wall time: 397 µs


[[0.4999788574228196, 0.507154]]

# Vowpal Wabbit Lasso regression

In [7]:
from vowpalwabbit.sklearn_vw import VWClassifier


In [10]:
# Vowpal Wabbit classifier via its scikit-learn wrapper: logistic loss with an
# L1 regularisation strength of 0.001.
model = VWClassifier(loss_function='logistic', l1=.001)
# %time reports the CPU and wall-clock time of this single fit on the full data.
%time model.fit(X, y)

CPU times: user 2min 57s, sys: 432 ms, total: 2min 58s
Wall time: 2min 59s


{'l1': 0.001, 'quiet': True, 'loss_function': 'logistic'}

In [9]:
# Score the fitted model on the same data it was trained on (no held-out set).
%time model.score(X,y)

CPU times: user 3min 18s, sys: 384 ms, total: 3min 18s
Wall time: 3min 19s


0.500199