In [1]:
def minDist(clusters, item):
    """Find the cluster whose center lies nearest to ``item``.

    Args:
        clusters: iterable of objects exposing a ``center`` attribute that
            supports ``center - item``, ``** 2`` and iteration.
        item: the point compared against every cluster center.

    Returns:
        A ``(distance, cluster)`` tuple for the closest cluster. When several
        clusters are equally close, the one that comes first in ``clusters``
        is returned.

    Raises:
        ValueError: if ``clusters`` is empty (raised by ``min``).
    """
    def distanceTo(cluster):
        # Euclidean distance: square root of the summed squared differences.
        offset = cluster.center - item
        return sum(offset ** 2) ** (1/2)

    scored = ((distanceTo(cluster), cluster) for cluster in clusters)
    nearest = min(scored, key=lambda pair: pair[0])
    return nearest

In [2]:
import numpy as np
from minas.map_minas_support import *
# Seed numpy's global random generator before building the data set.
np.random.seed(300)

# Build the list of meta-classes (label, center and standard deviation).
classes = [mkClass(index) for index in range(1000)]

# Derive <minas.Cluster> objects from the meta-classes.
clusters = sampleClusters(classes)

# Derive a stream of <minas.Example> objects from the meta-classes
# (consumed with next() by the benchmark cells).
inputStream = loopExamplesIter(classes)

In [3]:
# Local baseline: pull examples from the stream and classify them with
# minDist until one second of wall-clock time has passed.
counter = 0
init = time.time()
while True:
    if time.time() - init >= 1.0:
        break
    counter += 1
    example = next(inputStream)
    minDist(clusters, example.item)
elapsed = time.time() - init
# Report the measured wall time, the item count and the items-per-second rate.
print(f'minasOnline testSamples {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

minasOnline testSamples 1.0039143562316895 seconds, consumed 152 items, 151 i/s


In [4]:
# Freeze up to 200 (index, example) pairs from the stream so the benchmark
# cells that iterate `examples` all process the same input.
examples = [pair for pair in zip(range(200), inputStream)]

In [18]:
%%timeit
counter = 0
results = []
init = time.time()
for i, example in examples:
    counter += 1
    result = minDist(clusters, example.item)
    results.append(result)
elapsed = time.time() - init
len(results)
print(f'minasOnline testSamples {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

minasOnline testSamples 1.5145623683929443 seconds, consumed 200 items, 132 i/s
minasOnline testSamples 1.4813456535339355 seconds, consumed 200 items, 135 i/s
minasOnline testSamples 1.5256237983703613 seconds, consumed 200 items, 131 i/s
minasOnline testSamples 1.4844377040863037 seconds, consumed 200 items, 134 i/s
minasOnline testSamples 1.457089900970459 seconds, consumed 200 items, 137 i/s
minasOnline testSamples 1.4590590000152588 seconds, consumed 200 items, 137 i/s
minasOnline testSamples 1.5104143619537354 seconds, consumed 200 items, 132 i/s
minasOnline testSamples 1.5397577285766602 seconds, consumed 200 items, 129 i/s
1.49 s ± 29.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
import dask.distributed
# Connect to the Dask scheduler. The address can be overridden with the
# DASK_SCHEDULER_ADDRESS environment variable; when it is unset or empty the
# original address is used.
import os
schedulerAddress = os.environ.get('DASK_SCHEDULER_ADDRESS') or 'tcp://192.168.15.14:8786'
cli = dask.distributed.Client(schedulerAddress)
cli

0,1
Client  Scheduler: tcp://192.168.15.14:8786  Dashboard: http://192.168.15.14:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 12.46 GB


In [9]:
import dask
import dask.bag as db

@dask.delayed
def minDistDelayed(clusters, item):
    """Nearest-center lookup over ``(key, center)`` pairs, wrapped in dask.delayed.

    Args:
        clusters: iterable of indexable pairs where index 0 is the cluster
            key and index 1 is the cluster center.
        item: the point compared against every center.

    Returns:
        A ``(distance, key)`` tuple for the closest center; on ties the pair
        that comes first in ``clusters`` wins.
    """
    def scoreEntry(entry):
        # Euclidean distance between this entry's center and the item.
        offset = entry[1] - item
        return sum(offset ** 2) ** (1/2), entry[0]

    candidates = (scoreEntry(entry) for entry in clusters)
    return min(candidates, key=lambda pair: pair[0])
# init = time.time()
# counter = 0
# simpleClusters = [(id(cl), cl.center) for cl in clusters]
# simpleClusters = db.from_sequence(simpleClusters)
# cli.persist(simpleClusters)
# simpleClusters = cli.scatter(simpleClusters)
# results = []
# while time.time() - init < 1.0:
#     counter += 1
#     example = next(inputStream)
#     results.append(minDistDelayed(simpleClusters, example.item))
# dask.compute(results)
# elapsed = time.time() - init
# print(f'minasOnline testSamples {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

In [17]:
%%timeit
counter = 0
simpleClusters = [(id(cl), cl.center) for cl in clusters]
simpleClusters = db.from_sequence(simpleClusters)
cli.persist(simpleClusters)
simpleClusters = cli.scatter(simpleClusters)
results = []
init = time.time()
for i, example in examples:
    counter += 1
    result = minDistDelayed(simpleClusters, example.item)
    results.append(result)
elapsed = time.time() - init
print(f'loop {elapsed} seconds')
init = time.time()
dask.compute(results)
elapsed = time.time() - init
print(f'minasOnline testSamples {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

loop 0.009139537811279297 seconds
minasOnline testSamples 5.950003147125244 seconds, consumed 200 items, 33 i/s
loop 0.010687589645385742 seconds
minasOnline testSamples 5.447688341140747 seconds, consumed 200 items, 36 i/s
loop 0.008136987686157227 seconds
minasOnline testSamples 6.795172452926636 seconds, consumed 200 items, 29 i/s
loop 0.017217397689819336 seconds
minasOnline testSamples 8.889552593231201 seconds, consumed 200 items, 22 i/s
loop 0.013754844665527344 seconds
minasOnline testSamples 7.059620141983032 seconds, consumed 200 items, 28 i/s
loop 0.008990287780761719 seconds
minasOnline testSamples 8.9894859790802 seconds, consumed 200 items, 22 i/s
loop 0.017444133758544922 seconds
minasOnline testSamples 10.18173623085022 seconds, consumed 200 items, 19 i/s
loop 0.016421794891357422 seconds
minasOnline testSamples 9.675263166427612 seconds, consumed 200 items, 20 i/s
8.33 s ± 1.64 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
import dask
import dask.bag as db

def minDistSimple(clusters, item):
    """Nearest-center lookup over plain ``(key, center)`` pairs.

    Args:
        clusters: iterable of indexable pairs where index 0 is the cluster
            key and index 1 is the cluster center.
        item: the point compared against every center.

    Returns:
        A ``(distance, key)`` tuple for the closest center; on ties the pair
        that comes first in ``clusters`` wins.

    Raises:
        ValueError: if ``clusters`` is empty (raised by ``min``).
    """
    def byDistance(pair):
        return pair[0]

    distances = [
        (sum((entry[1] - item) ** 2) ** (1/2), entry[0])
        for entry in clusters
    ]
    return min(distances, key=byDistance)

In [None]:
# Futures benchmark (submission half): for about one second, submit one
# minDistSimple task per stream example. `init`, `counter` and `futures`
# are left in the namespace for the cell that gathers the results.
init = time.time()
counter = 0
localClusters = [(id(cl), cl.center) for cl in clusters]
# Client.scatter turns a list into one future PER ELEMENT, so the cluster
# list is wrapped in an outer list to get a single future for the whole
# list. (Previously a dask Bag was scattered, which shipped the Bag object
# itself, and the result of cli.persist was discarded.)
[simpleClusters] = cli.scatter([localClusters], broadcast=True)
futures = []
while time.time() - init < 1.0:
    counter += 1
    example = next(inputStream)
    future = cli.submit(minDistSimple, simpleClusters, example.item)
    futures.append(future)

In [None]:
# Collect the results of the submitted tasks. This cell relies on `futures`,
# `init` and `counter` already existing in the namespace.
results = cli.gather(futures)
# Elapsed time is measured from `init`, so it covers everything since that
# timestamp was taken, not just the gather call.
elapsed = time.time() - init
print(f'minasOnline testSamples {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

In [None]:
%%timeit
import pandas as pd
from sklearn.datasets import fetch_covtype
covtype = fetch_covtype()
total = len(covtype.data)

zipToMap = lambda x: {'item': x[0], 'label': str(x[1])}
onePercent = int(total*0.01)
baseMap = map(zipToMap, zip(covtype.data[:onePercent], covtype.target[:onePercent]))
onPercentDataFrame = pd.DataFrame(baseMap)

clusters = minasOffline(onPercentDataFrame)
print(len(clusters))

In [16]:
# Futures benchmark over the frozen `examples` batch: submit one
# minDistSimple task per example, then gather all results.
counter = 0
localClusters = [(id(cl), cl.center) for cl in clusters]
# Client.scatter turns a list into one future PER ELEMENT, so the cluster
# list is wrapped in an outer list to get a single future for the whole
# list. (Previously a dask Bag was scattered, which shipped the Bag object
# itself, and the result of cli.persist was discarded.)
[simpleClusters] = cli.scatter([localClusters], broadcast=True)
futures = []
init = time.time()
for i, example in examples:
    counter += 1
    future = cli.submit(minDistSimple, simpleClusters, example.item)
    futures.append(future)
elapsed = time.time() - init
# Time spent only submitting the tasks.
print(f'loop submit {elapsed} seconds')
init = time.time()
results = cli.gather(futures)
elapsed = time.time() - init
# Time spent waiting for the results after submission finished.
print(f'minasOnline submit {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

In [20]:
def clusterDistance(cl, item):
    """Return ``(distance, key)`` for one ``(key, center)`` pair against ``item``."""
    return sum((cl[1] - item) ** 2) ** (1/2), cl[0]


def closestDistance(dists):
    """Return the ``(distance, key)`` pair with the smallest distance."""
    return min(dists, key=lambda x: x[0])


# Fine-grained futures benchmark: for every example, submit ONE task per
# cluster (cli.map) plus one reduction task that picks the minimum.
counter = 0
localClusters = [(id(cl), cl.center) for cl in clusters]
futures = []
init = time.time()
for i, example in examples:
    counter += 1
    item = example.item
    # Extra keyword arguments of Client.map are forwarded to the function,
    # so every per-cluster task receives the current item explicitly.
    dists = cli.map(clusterDistance, localClusters, item=item)
    # `key` is a reserved keyword of Client.submit (the task identifier), so
    # the key function for min lives inside closestDistance rather than
    # being passed as submit(..., key=...).
    future = cli.submit(closestDistance, dists)
    futures.append(future)
elapsed = time.time() - init
# Time spent only submitting the tasks.
print(f'loop submit {elapsed} seconds')
init = time.time()
results = cli.gather(futures)
elapsed = time.time() - init
# Time spent waiting for the results after submission finished.
print(f'minasOnline submit {elapsed} seconds, consumed {counter} items, {int(counter / elapsed)} i/s')

loop submit 68.95565295219421 seconds
minasOnline submit 89.48315143585205 seconds, consumed 200 items, 2 i/s


In [None]:
from dask import delayed
@delayed
def sub(a, b):
    """Delayed subtraction: ``a - b``."""
    difference = a - b
    return difference
@delayed
def sqr(a, b):
    """Delayed exponentiation: ``a`` raised to the power ``b``."""
    return pow(a, b)
@delayed
def summ(a, b):
    """Delayed addition: ``a + b``."""
    total = a + b
    return total
@delayed
def extractI(x, i):
    """Delayed indexing: the element of ``x`` at position/key ``i``."""
    element = x[i]
    return element
#
# Build a fully fine-grained dask.delayed graph of the nearest-cluster
# lookup: one delayed task per (cluster, dimension) subtraction and square,
# a delayed sum per cluster and a delayed min per example.
# `result` is rebound on every iteration, so after the loop it holds the
# graph of the LAST example only.
# The graph grows with clusters x dimensions for every example, so this
# cell is expensive to run.
localClusters = [(id(cl), cl.center) for cl in clusters]
dimentions = len(clusters[0].center)
# Scattering a list yields one future per (key, center) pair.
scatterClusters = cli.scatter(localClusters)
result = None
for i, example in examples:
    scatterItem = cli.scatter(example.item)
    dists = []
    for cl in scatterClusters:
        c = []
        # Per-dimension squared differences. The index is named `j` so it
        # does not clobber the outer example index `i`.
        for j in range(dimentions):
            ci = extractI(extractI(cl, 1), j)
            xi = extractI(scatterItem, j)
            a = sub(ci, xi)
            b = sqr(a, 2)
            c.append(b)
        s = delayed(sum)(c)
        # (distance, cluster key) pair, both still lazy.
        d = (s ** (1/2), extractI(cl, 0))
        dists.append(d)
    result = delayed(min)(dists, key=lambda x: x[0])

In [None]:
# Draw the task graph behind `result` (relies on `result` having been built
# by an earlier cell).
result.visualize()

In [None]:
# Execute the delayed graph held in `result` and keep the computed value.
# (The original referenced `rs`, a name that is never defined.)
# Re-running this cell requires rebuilding `result` first, because the name
# is rebound to the computed value here.
result = result.compute()