In [2]:
%load_ext cython

In [10]:
from creme import datasets
import itertools

def chunk_input_stream(input_stream, chunk_size):
    while True:
        chunk = list(itertools.islice(input_stream, chunk_size))
        if chunk:
            yield chunk
        else:
            return

def chunks(iterable, size=10):
    for first in iterable:
        yield itertools.chain([first], itertools.islice(iterable, size - 1))

for y in chunks(datasets.CreditCard().take(10), 3):
    print(y)


<itertools.chain object at 0xa1c894f90>
<itertools.chain object at 0xa1c87c350>
<itertools.chain object at 0xa1c87c850>
<itertools.chain object at 0xa1c1457d0>
<itertools.chain object at 0xa1c145c90>
<itertools.chain object at 0xa1c082b10>
<itertools.chain object at 0xa1c082350>
<itertools.chain object at 0xa1c06f750>
<itertools.chain object at 0xa1c894190>
<itertools.chain object at 0xa1c894090>


In [12]:
def dot(x: dict, y: dict):
    if len(x) < len(y):
        return sum(xi * y[i] for i, xi in x.items() if i in y)
    return sum(x[i] * yi for i, yi in y.items() if i in x)

In [4]:
%%cython --annotate

from cython cimport boundscheck, wraparound


@boundscheck(False)
@wraparound(False)
cpdef double cdot(dict x, dict y):
    
    cdef:
        int i
        double total
        double xi
        double yi
        list xx
        list yy
        int n
        
    xx = list(x.values())
    yy = list(y.values())
    
    if len(x) < len(y):
        
        n = len(xx)
        
        for i in range(n):
            xi = x[i]
            yi = y[i]
            total += xi * yi
            
    else:
        
        n = len(yy)
        
        for i in range(n):
            xi = x[i]
            yi = y[i]
            total += xi * yi
        
    return total

In [17]:
import random

x = {i: random.random() for i in range(0, 100)}
y = {i: random.random() for i in range(0, 100)}

#%timeit dot(x, y)

In [6]:
%timeit cdot(x, y)

5.72 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [17]:
%timeit cdot(x, y)

3.96 µs ± 41.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [15]:
%%prun

from creme import datasets
from creme import linear_model
from creme import metrics
from creme import model_selection
from creme import preprocessing

X_y = datasets.CreditCard()

model = (
    preprocessing.StandardScaler() |
    linear_model.LogisticRegression(intercept_lr=.1)
)
metric = metrics.LogLoss()

model_selection.progressive_val_score(X_y, model, metric)

 

         143294845 function calls (143294843 primitive calls) in 79.844 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   569614   16.713    0.000   23.033    0.000 scale.py:92(<dictcomp>)
 17657972    5.023    0.000    7.025    0.000 math.py:222(<genexpr>)
  8544210    4.475    0.000    6.013    0.000 base.py:18(learning_rate)
   284808    4.264    0.000    9.780    0.000 stream.py:185(iter_csv)
   284807    3.569    0.000    4.471    0.000 glm.py:72(<dictcomp>)
   284807    3.506    0.000    9.569    0.000 sgd.py:45(_update_after_pred)
 26202184    3.050    0.000    3.050    0.000 {method 'get' of 'dict' objects}
 17088420    3.030    0.000    3.030    0.000 scale.py:15(safe_div)
  1139234    3.016    0.000    3.123    0.000 {built-in method builtins.next}
   284807    2.658    0.000    3.767    0.000 scale.py:84(fit_one)
   569614    2.550    0.000    9.574    0.000 {built-in method builtins.sum}
   284808    2.097    0

In [22]:
import collections
from creme import base
from creme import stats

def safe_div(a, b):
    if b == 0:
        return a
    return a / b

class StandardScaler(base.Transformer):

    def __init__(self):
        self.variances = collections.defaultdict(stats.Var)

    def fit_one(self, x, y=None):

        for i, xi in x.items():
            self.variances[i].update(xi)

        return self

    def transform_one(self, x):
        return {
            i: safe_div(xi - self.variances[i].mean.get(), self.variances[i].get() ** .5)
            for i, xi in x.items()
        }
    
scaler = StandardScaler()
scaler.fit_one(x)
%timeit scaler.transform_one(x)

52.6 µs ± 241 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [24]:
class StandardScaler(base.Transformer):

    def __init__(self):
        self.variances = collections.defaultdict(stats.Var)
        self._tmp = {}

    def fit_one(self, x, y=None):

        for i, xi in x.items():
            self.variances[i].update(xi)

        return self

    def transform_one(self, x):
        
        for i, xi in x.items():
            self._tmp[i] = safe_div(xi - self.variances[i].mean.get(), self.variances[i].get() ** .5)
        
        return self._tmp
    
scaler = StandardScaler()
scaler.fit_one(x)
%timeit scaler.transform_one(x)

55.2 µs ± 1.56 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
%%cython --annotate

cimport cython

import collections
from creme import base
from creme import stats

@cython.cdivision(True)
cdef float divide(float a, float b):
    if b == 0:
        return a
    return a / b


cdef float scale(float x, float mean, float var):
    return divide(x - mean, var ** .5)


class StandardScaler(base.Transformer):

    def __init__(self):
        self.variances = collections.defaultdict(stats.Var)
        self._tmp = {}

    def fit_one(self, x, y=None):

        for i, xi in x.items():
            self.variances[i].update(xi)

        return self

    def transform_one(self, x):
        
        for i, xi in x.items():
            self._tmp[i] = scale(xi, self.variances[i].mean.get(), self.variances[i].get())
        
        return self._tmp

In [18]:
scaler = StandardScaler()
scaler.fit_one(x)
%timeit scaler.transform_one(x)

28.5 µs ± 561 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
