In [1]:
import numpy as np
import pandas as pd
def mkClass(label): return dict(label=label, mu=np.random.random_sample((2,)) * 100, sigma=np.random.random_sample((2,)) * 100)

In [13]:
import dataclasses
import typing
import time

Vector = typing.Union[list, np.ndarray, typing.Any, None]

@dataclasses.dataclass(eq=False)
class Example():
    item: Vector = dataclasses.field(repr=False)
    label: typing.Union[str, None] = None
    timestamp: int = time.time_ns()
    tries: int = 0

@dataclasses.dataclass
class Cluster():
    center: Vector = dataclasses.field(repr=False)
    latest: int = 0
    label: typing.Union[str, None] = None
    n: int = 0
    maxDist: float = 0.0
    temp_examples=[]
    timestamp: int = time.time_ns()
    def __eq__(self, other):
        return self.timestamp == other.timestamp
    def __hash__(self):
        return self.timestamp

def nextExample(klass): return Example(label=klass['label'], item=np.random.normal(klass['mu'], klass['sigma']))

In [14]:
classes = list(map(mkClass, ['zero', 'one', 'duo', 'tri']))
list(map(nextExample, classes))
def nextRandExample(classes=classes): return nextExample(np.random.choice(classes) )
def randExamplesIter(classes=classes):
    while True:
        yield nextRandExample(classes=classes)
def loopExamplesIter(classes=classes):
    i = 0
    while True:
        msg = yield nextExample(classes[i])
        if not msg is None:
            classes = msg
        i = (i + 1) % len(classes)

In [15]:
def distKlass(ex, classes=classes): return map(lambda cl: (sum((cl['mu'] - ex.item) ** 2) ** 1/2, cl), classes)
def minDistKlass(ex, classes=classes): return min(distKlass(ex, classes), key=lambda x: x[0])
minDistKlass(nextExample(classes[0]))

(2786.5029823676064,
 {'label': 'zero',
  'mu': array([28.94058831, 16.26390948]),
  'sigma': array([80.51656701, 13.87988037])})

In [16]:
clusters = [ Cluster(center=cl['mu'], label=cl['label'], n=0, maxDist=sum(cl['sigma']), latest=0) for cl in classes ]
unkBuff = []
outStream = []
inStream = zip(range(10), loopExamplesIter())

def dist(ex, clusters=clusters): return map(lambda cl: (sum((cl.center - ex.item) ** 2) ** 1/2, cl), clusters)
def minDist(ex, clusters=clusters): return min(dist(ex, clusters), key=lambda x: x[0])
for i, example in inStream:
    d, cl = minDist(example)
    if d / cl.maxDist <= 1.1:
        outStream.append(f'[CLASSIFIED] {i}: {cl.label}')
        cl.maxDist = max(cl.maxDist, d)
    else:
        outStream.append(f'[UNKNOWN] {i}: {example.item}')
        unkBuff.append(example)
outStream

['[UNKNOWN] 0: [64.64031652  9.92274428]',
 '[UNKNOWN] 1: [42.67465003 -9.59614246]',
 '[UNKNOWN] 2: [172.48592745 122.7659806 ]',
 '[UNKNOWN] 3: [126.04851997  59.10808376]',
 '[UNKNOWN] 4: [  7.36125681 -12.16426458]',
 '[UNKNOWN] 5: [39.54425779 -6.45368559]',
 '[UNKNOWN] 6: [-61.84162789 141.67665956]',
 '[UNKNOWN] 7: [130.33388127  47.73606348]',
 '[UNKNOWN] 8: [20.07312355 28.85400113]',
 '[UNKNOWN] 9: [59.51815929 54.98695464]']

In [17]:
from sklearn.cluster import KMeans
import time

In [42]:
def minasOnline(exampleSource, inClusters=[]):
    unknownBuffer = []
    clusters=[cl for cl in inClusters]
    sleepClusters = []
    counter = 0
    noveltyIndex = 0
    sentinel = object()
    while True:
        example = next(exampleSource, sentinel)
        if example is sentinel:
            yield 'done'
            return
        example = Example(item=example.item)
        counter += 1
        example.timestamp = time.time_ns()
        example.n = counter
        dists = map(lambda cl: (sum((cl.center - example.item) ** 2) ** 1/2, cl), clusters)
        d, cl = min(dists, key=lambda x: x[0])
        if d / cl.maxDist <= 1.1:
            cl.maxDist = max(cl.maxDist, d)
            cl.latest = counter
            cl.n += 1
            yield f"[CLASSIFIED] {example.n}: {cl.label}"
        else:
            unknownBuffer.append(example)
            yield f"[UNKNOWN] {example.n}: {example.item}"
            if len(unknownBuffer) > 100:
                if len(sleepClusters) > 0:
                    yield f'[recurenceDetection] unk={len(unknownBuffer)}, sleep={len(sleepClusters)}'
                    # recurenceDetection
                    for sleepExample in unknownBuffer:
                        sleepDists = list(map(lambda cl: (sum((cl.center - sleepExample.item) ** 2) ** 1/2, cl), sleepClusters))
                        if len(sleepDists) == 0: continue
                        d, cl = min(sleepDists, key=lambda x: x[0])
                        if d / cl.maxDist <= 1.1:
                            cl.maxDist = max(cl.maxDist, d)
                            cl.latest = counter
                            unknownBuffer.remove(sleepExample)
                            yield f"[CLASSIFIED] {sleepExample.n}: {cl.label}"
                            if cl in sleepClusters:
                                clusters.append(cl)
                                sleepClusters.remove(cl)
                                yield f"[Recurence] {cl.label}"
                if len(unknownBuffer) % 100 == 0:
                    yield '[noveltyDetection]'
                    # noveltyDetection
                    df = pd.DataFrame([ex.item for ex in unknownBuffer])
                    n_clusters = min(100, len(unknownBuffer) // ( 3 * 20))
                    kmeans = KMeans(n_clusters=n_clusters)
                    kmeans.fit(df)
                    newClusters = [Cluster(center=centroid, label=None, n=0, maxDist=0, latest=0) for centroid in kmeans.cluster_centers_]
                    temp_examples = {cl: [] for cl in newClusters}
                    for sleepExample in unknownBuffer:
                        dists = map(lambda cl: (sum((cl.center - sleepExample.item) ** 2) ** 1/2, cl), newClusters)
                        d, cl = min(dists, key=lambda x: x[0])
                        cl.maxDist = max(cl.maxDist, d)
                        cl.latest = counter
                        cl.n += 1
                        temp_examples[cl].append((sleepExample, d))
                    for ncl in newClusters:
                        if ncl.n < 2: continue
                        distances = [ d for ex, d in temp_examples[ncl] ]
                        if len(distances) == 0: continue
                        distsCl2Cl = map(lambda cl: (sum((cl.center - ncl.center) ** 2) ** 1/2, cl), clusters + sleepClusters)
                        distCl2Cl, nearCl2Cl = min(distsCl2Cl, key=lambda x: x[0])
                        #
                        mean = sum(distances) / len(distances)
                        devianceSqrSum = sum([(d - mean) **2 for d in distances])
                        var = devianceSqrSum / len(distances)
                        stdDevDistance = var **0.5
                        silhouetteFn = lambda a, b: (b - a) / max([a, b])
                        silhouette = silhouetteFn(stdDevDistance, distCl2Cl)
                        if silhouette < 0: continue
                        if distCl2Cl / nearCl2Cl.maxDist < 10:
                            yield f'Extention {nearCl2Cl.label}'
                            ncl.label = nearCl2Cl.label
                        else:
                            label = 'Novelty {}'.format(noveltyIndex)
                            ncl.label = label
                            yield label
                            noveltyIndex += 1
                        clusters.append(ncl)
                        for ex, d in temp_examples[ncl]:
                            yield f"[CLASSIFIED] {ex.n}: {ncl.label}"
                            unknownBuffer.remove(ex)
        if counter % 100 == 0:
            yield '[cleanup]'
            for ex in unknownBuffer:
                if counter - ex.n < 2000:
                    unknownBuffer.remove(ex)
            for cl in clusters:
                if counter - cl.latest < 200:
                    sleepClusters.append(cl)
                    clusters.remove(cl)
            if len(clusters) == 0:
                yield f'[fallback] {len(sleepClusters)} => clusters'
                # fallback 
                clusters.extend(sleepClusters)
                sleepClusters.clear()
            #
        #
    #
#


In [43]:
# import os
# import sys
# module_path = os.path.abspath(os.path.join('..'))
# if module_path not in sys.path:
#     sys.path.append(module_path)

# from minas.map_minas import minasOnline

clusters = [ Cluster(center=cl['mu'], label=cl['label'], n=0, maxDist=sum(cl['sigma']), latest=0) for cl in classes ]
classes = list(map(mkClass, ['zero', 'one', 'duo', 'tri']))
    
inputStream = loopExamplesIter()
known = 0
unknown = 0
cleanup = 0
fallback = 0
recurenceDetection = 0
noveltyDetection = 0
for kl in range(10):
    i = 3000
    for o in minasOnline(inputStream, clusters):
        if '[CLASSIFIED]' in o:
            known += 1
        elif '[UNKNOWN]' in o:
            unknown += 1
        elif '[cleanup]' in o:
            cleanup += 1
        elif '[fallback]' in o:
            fallback += 1
        elif '[recurenceDetection]' in o:
            recurenceDetection += 1
        elif '[noveltyDetection]' in o:
            noveltyDetection += 1
        else: 
            print(o)
        i -= 1
        if i == 0: break
    else:
        print('Stream Done')
    newClass = mkClass(f'New {kl}')
    print(newClass)
    classes.append(newClass)
    inputStream.send(classes)
print(f'known={known}, unknown={unknown}, cleanup={cleanup}, fallback={fallback}, recurenceDetection={recurenceDetection}, noveltyDetection={noveltyDetection}')
# for i, o in zip(range(100), outStream):
#     print(o)

[Recurence] zero
[Recurence] New 0
[Recurence] New 0
[Recurence] New 2
[Recurence] New 2
[Recurence] New 2
[Recurence] New 3
[Recurence] New 5
[Recurence] New 5
[Recurence] New 7
[Recurence] New 9
[Recurence] New 9
[Recurence] New 9
[Recurence] New 5
[Recurence] New 5
[Recurence] New 2
[Recurence] New 7
[Recurence] New 9
[Recurence] New 5
[Recurence] New 2
[Recurence] New 7
[Recurence] New 9
[Recurence] New 9
[Recurence] New 2
[Recurence] New 2
[Recurence] New 5
[Recurence] New 5
[Recurence] New 2
[Recurence] New 2
[Recurence] New 2
[Recurence] New 7
[Recurence] New 9
[Recurence] New 5
[Recurence] New 2
[Recurence] New 5
[Recurence] New 9
[Recurence] New 9
[Recurence] New 5
[Recurence] New 5
[Recurence] New 2
[Recurence] New 9
[Recurence] New 2
[Recurence] New 2
[Recurence] New 5
[Recurence] New 9
[Recurence] New 5
[Recurence] New 2
[Recurence] New 5
[Recurence] New 2
[Recurence] New 9
{'label': 'New 0', 'mu': array([95.62776888, 49.53721567]), 'sigma': array([ 9.26680914, 25.88768122]