# Logistic Regression benchmark


#### Comparison of Sklearn, H2O, Vowpal Wabbit and XGBoost performing Lasso (L1-regularized) logistic regression on sparse data: 1 million rows and 500 features.

In [1]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import random
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


# Create simulated data

In [2]:
# Fix the seed of NumPy's global RNG so the simulated data set is reproducible
np.random.seed(4)

# Problem size: one million rows by 500 columns
n, features = 1000000, 500

# Random sparse design matrix in which 10% of the entries are stored
X = random(
    n,
    features,
    density=0.1,
    format='csr',  # VW expects csr format
)
X.shape

(1000000, 500)

In [11]:
# Binary target: one label per row of X, drawn at random from {0, 1}.
# The labels are independent of X, so every model below scores ~50% accuracy;
# the notebook is about fit/score timings, not predictive quality.
# y is kept 1-D, shape (n,): ndarray.reshape returns a new array, so an
# un-assigned reshape has no effect, and the estimators take a 1-D target.
y = np.random.choice([0, 1], n)
y.shape

(1000000,)

# SKLearn Logistic regression

In [5]:
lr_model = LogisticRegression(penalty='l1', solver='liblinear')
%time lr_model.fit(X, y)

CPU times: user 1min 31s, sys: 412 ms, total: 1min 32s
Wall time: 1min 33s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# Time scoring on the training data itself (the notebook has no hold-out split)
%time lr_model.score(X,y)

CPU times: user 180 ms, sys: 4 ms, total: 184 ms
Wall time: 188 ms


0.508276

# H2O Lasso regression

In [4]:
# Connect to a local H2O server, starting one if none is already running
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_151"; Java(TM) SE Runtime Environment (build 1.8.0_151-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.151-b12, mixed mode)
  Starting server from /Users/madhan/anaconda/envs/fastaiV2/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/1y/lhg_qvyx5lvb_6x50gczc6mr0000gn/T/tmp2c0bn9ox
  JVM stdout: /var/folders/1y/lhg_qvyx5lvb_6x50gczc6mr0000gn/T/tmp2c0bn9ox/h2o_madhan_started_from_python.out
  JVM stderr: /var/folders/1y/lhg_qvyx5lvb_6x50gczc6mr0000gn/T/tmp2c0bn9ox/h2o_madhan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster version:,3.16.0.3
H2O cluster version age:,3 days
H2O cluster name:,H2O_from_python_madhan_sjnw8f
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [5]:
# Upload the sparse feature matrix to the H2O cluster as an H2OFrame
X_h2o = h2o.H2OFrame(X)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
# Every column of the feature frame is used as a predictor
predictors = X_h2o.columns

In [12]:
# Upload the labels as a one-column H2OFrame whose column is named 'y'
y_h2o = h2o.H2OFrame(y, column_names=['y'])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [13]:
# Column-bind features and target into the single frame used for training.
# NOTE(review): the name is spelled with a zero ("h20"), unlike the other h2o
# names in this notebook; renaming it requires updating the training cell too.
h20_frame = X_h2o.cbind(y_h2o)

In [14]:
# Binomial-family GLM (logistic regression) fitted with the L_BFGS solver
glm_model = H2OGeneralizedLinearEstimator(
    family='binomial',
    alpha=1,  # alpha=1 for Lasso
    solver='L_BFGS',
)


In [15]:
# Fit on the combined frame, using column 'y' as the response.
# Compare the wall time with the other libraries, not the CPU time: %time only
# counts CPU used by this Python process (presumably the fit itself runs in the
# H2O server process — TODO confirm).
%time glm_model.train(x=predictors, y='y', training_frame=h20_frame)


glm Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 98.5 ms, sys: 23.6 ms, total: 122 ms
Wall time: 5.96 s


In [16]:
# NOTE(review): the result looks like a [[threshold, accuracy]] pair read from
# metrics stored at training time, not a fresh prediction pass like the
# `.score(X, y)` calls in the other sections — so this timing is probably not
# comparable with theirs. TODO confirm against the H2O documentation.
%time glm_model.accuracy()

CPU times: user 213 µs, sys: 7 µs, total: 220 µs
Wall time: 224 µs


[[0.5006760614260547, 0.506871]]

# Vowpal Wabbit Lasso regression

In [11]:
from vowpalwabbit.sklearn_vw import VWClassifier


In [10]:
model = VWClassifier(loss_function='logistic', l1=.001)
%time model.fit(X, y)

CPU times: user 2min 57s, sys: 432 ms, total: 2min 58s
Wall time: 2min 59s


{'l1': 0.001, 'quiet': True, 'loss_function': 'logistic'}

In [11]:
# Time scoring on the training data; in the recorded run this took longer
# than the fit itself
%time model.score(X,y)

CPU times: user 3min 25s, sys: 460 ms, total: 3min 25s
Wall time: 3min 26s


0.500199

# XGBoost Lasso regression

In [8]:
from xgboost import XGBClassifier


In [9]:
model = XGBClassifier(booster='gblinear', reg_alpha=0.001)

%time model.fit(X, y)

CPU times: user 2min 21s, sys: 716 ms, total: 2min 22s
Wall time: 2min 22s


XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.001, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [10]:
# Time scoring of the fitted XGBoost model on the training data
%time model.score(X,y)

  if diff:


CPU times: user 12.9 s, sys: 536 ms, total: 13.4 s
Wall time: 13.5 s


0.508436