In [3]:
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_covtype

In [6]:
help(fetch_covtype)
dataset = fetch_covtype()
print(type(dataset))
# Summarise the bunch instead of printing it whole: per the help text above the
# data matrix is 581012 x 54, so a full dump only floods the cell output.
print(sorted(dataset.keys()))
print(dataset.data.shape, dataset.target.shape)

Help on function fetch_covtype in module sklearn.datasets.covtype:

fetch_covtype(data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False)
    Load the covertype dataset (classification).
    
    Download it if necessary.
    
    Classes                        7
    Samples total             581012
    Dimensionality                54
    Features                     int
    
    Read more in the :ref:`User Guide <covtype_dataset>`.
    
    Parameters
    ----------
    data_home : string, optional
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    
    download_if_missing : boolean, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.
    
    random_state : int, RandomState instance or None (default)
        Determines random number genera

In [43]:
import scipy, time, os, yaml
from timed import timed

class Example:
    """One stream item together with its label and bookkeeping metadata.

    Attributes (fixed by ``__slots__``):
        label     -- class label, or None while the example is unclassified.
        item      -- the payload; ``len(example)`` is ``len(item)``.
        timestamp -- creation time in nanoseconds (``time.time_ns()``).
        tries     -- counter starting at 0 (incremented by callers).

    Parameters:
        item      -- the payload to wrap.
        label     -- optional label, defaults to None.
        timestamp -- optional explicit timestamp; when None the current time is used.
        tries     -- optional initial counter value, defaults to 0.

    ``timestamp`` and ``tries`` are accepted so that ``Example(**e.asDict())``
    rebuilds an equivalent example; with the previous two-argument constructor
    that call raised ``TypeError`` on the extra keys.
    """
    __slots__ = ['label', 'item', 'timestamp', 'tries']
    def __init__(self, item, label=None, timestamp=None, tries=0):
        self.label = label
        self.item = item
        # Only stamp with "now" when the caller did not supply a timestamp
        self.timestamp = time.time_ns() if timestamp is None else timestamp
        self.tries = tries
    def asDict(self):
        """Return the four attributes as a plain dict (keys match ``__init__`` parameters)."""
        return {'label': self.label, 'item': self.item, 'timestamp': self.timestamp, 'tries': self.tries, }
    def __repr__(self):
        return 'Example({!r})'.format(self.asDict())
    def __str__(self):
        return repr(self)
    def __len__(self):
        # NOTE: an example wrapping an empty item is falsy because of this
        return len(self.item)
# Smoke test: build an Example around an empty item, show its repr and type,
# then drop the temporary name so it does not linger in the kernel namespace.
sampleExample = Example(item=[])
print(sampleExample, type(sampleExample))
del sampleExample
class Cluster:
    """Labelled cluster summarised by its center, member count and farthest member.

    Attributes (fixed by ``__slots__``):
        label         -- label given to the cluster.
        center        -- center vector used by ``dist``.
        n             -- number of examples added so far.
        maxDistance   -- largest distance from ``center`` to any added example.
        lastExapleTMS -- largest ``timestamp`` among the added examples.

    Parameters:
        label, center -- as above.
        n, maxDistance, lastExapleTMS -- optional initial statistics; they default
            to an empty cluster and exist so that ``Cluster(**c.asDict())`` works.
    """
    __slots__ = [ 'label', 'center', 'n', 'lastExapleTMS', 'maxDistance', ]
    def __init__(self, label, center, n=0, maxDistance=0.0, lastExapleTMS=0):
        self.label = label
        self.center = center
        self.n = n
        # asDict() renders maxDistance as a formatted string, so coerce it back
        # to a number (only the two decimals kept by asDict survive a round trip).
        self.maxDistance = float(maxDistance)
        self.lastExapleTMS = lastExapleTMS
    def asDict(self):
        """Return the cluster as a plain dict; ``maxDistance`` is formatted with two decimals."""
        return {'label': self.label, 'center': self.center, 'n': self.n, 
                'maxDistance': '{:2.2f}'.format(self.maxDistance), 'lastExapleTMS': self.lastExapleTMS,}
    def __repr__(self):
        return 'Cluster({!r})'.format(self.asDict())
    def radius(self):
        """Return the cluster radius, i.e. the largest member distance seen so far."""
        return self.maxDistance
    def dist(self, vec):
        """Return the Euclidean distance between ``center`` and ``vec``."""
        # Import the submodule explicitly: a bare ``import scipy`` does not
        # guarantee that ``scipy.spatial`` has been loaded.
        from scipy.spatial import distance
        return distance.euclidean(self.center, vec)
    def addExample(self, example):
        """Account for ``example`` in the cluster statistics and return ``self``.

        ``example`` must expose ``item`` (a vector) and ``timestamp``.
        """
        self.n += 1
        self.lastExapleTMS = max(example.timestamp, self.lastExapleTMS)
        self.maxDistance = max(self.dist(example.item), self.maxDistance)
        return self
    def __add__(self, other):
        """``cluster + example`` adds the example in place and returns the cluster.

        The previous body read the undefined name ``example`` (NameError) and
        returned None; it now delegates to ``addExample``.
        """
        if isinstance(other, Example):
            return self.addExample(other)
        return NotImplemented
# Smoke test: an empty-centered Cluster should print its dict-style repr;
# the temporary name is deleted afterwards to keep the namespace clean.
sampleCluster = Cluster(center=[], label='label')
print(sampleCluster, type(sampleCluster))
del sampleCluster
class MinasConsts:
    """Bundle of tunable constants read by ``Minas`` through the global ``CONSTS``.

    As used elsewhere in this file:
        k                 -- upper bound on the number of KMeans clusters.
        radiusFactor      -- multiplier applied to a cluster radius when classifying.
        noveltyThr        -- distance at or below which a new cluster is an extension.
        windowTimeSize    -- not read by any code in this file.
        ndProcedureThr    -- unknown-buffer size that triggers novelty detection.
        representationThr -- member count a cluster must exceed to be kept.
    """
    __slots__ = ['k', 'radiusFactor', 'noveltyThr', 'windowTimeSize', 'ndProcedureThr', 'representationThr', ]
    def __init__(self):
        # (attribute, default) pairs, applied in declaration order
        defaults = (
            ('k', 100),
            ('radiusFactor', 1.1),
            ('noveltyThr', 100),
            ('windowTimeSize', 100),
            ('ndProcedureThr', 2000),
            ('representationThr', 3),
        )
        for attribute, value in defaults:
            setattr(self, attribute, value)
# Shared instance consulted by the model code
CONSTS = MinasConsts()
class Minas:
    """Stream classifier with novelty detection, following the MINAS pseudo-code.

    State (fixed by ``__slots__``):
        exampleCount      -- items seen by ``onlineProcessExample``.
        knownCount        -- examples that received a label from a cluster.
        noveltyIndex      -- counter used to name ``'Novelty N'`` labels.
        lastExapleTMS     -- timestamp of the last example processed online.
        lastCleaningCycle -- ``time.time_ns()`` taken by the last ``cleanupCycle``.
        clusters          -- active clusters used for classification.
        sleepClusters     -- clusters moved out of the active model.
        unknownBuffer     -- examples no active cluster could classify.

    Depends on names from the surrounding notebook: ``CONSTS``, ``Cluster``,
    ``Example``, ``KMeans``, ``yaml``, ``os``, ``time`` and the ``timed``
    decorator (its definition is not in this file; presumably it reports the
    call duration and passes the return value through -- verify).
    """
    __slots__ = ['exampleCount', 'knownCount', 'noveltyIndex',
                 'lastExapleTMS', 'lastCleaningCycle', 
                 'clusters', 'sleepClusters', 'unknownBuffer', ]
    def __init__(self):
        self.exampleCount = 0
        self.knownCount = 0
        self.noveltyIndex = 0
        self.lastExapleTMS = 0
        self.lastCleaningCycle = 0
        self.clusters = []
        self.sleepClusters = []
        self.unknownBuffer = []
    def asDict(self):
        """Return a plain-dict snapshot; nested clusters/examples go through their own ``asDict()``."""
        # The method must be *called*: collecting ``x.asDict`` stored bound methods
        asDictMap = lambda l: [x.asDict() for x in l]
        return {
            'exampleCount': self.exampleCount, 'knownCount': self.knownCount, 'diff': self.exampleCount - self.knownCount,
            'noveltyIndex': self.noveltyIndex,
            'lastExapleTMS': self.lastExapleTMS, 'lastCleaningCycle': self.lastCleaningCycle,
            'clusters': asDictMap(self.clusters), 'sleepClusters': asDictMap(self.sleepClusters),
            'unknownBuffer': asDictMap(self.unknownBuffer),}
    def __repr__(self):
        return 'Minas({!r})'.format(self.asDict())
    def storeToFile(self, filename: str):
        """Dump ``asDict()`` as YAML into ``filename`` (creating its directory) and return ``self``."""
        directory = os.path.dirname(filename)
        if len(directory) > 0 and not os.path.exists(directory):
            os.makedirs(directory)
        with open(filename, 'w') as f:
            f.write(yaml.dump(self.asDict()))
        return self
    def restoreFromFile(self, filename: str):
        """Load state written by ``storeToFile`` and return ``self``.

        Keys missing from the file leave the current attribute values untouched.
        """
        with open(filename, 'r') as f:
            # An empty YAML document loads as None; treat it as "nothing to restore"
            dic = yaml.load(f, Loader=yaml.SafeLoader) or {}
        self.exampleCount = dic.get('exampleCount', self.exampleCount)
        self.knownCount = dic.get('knownCount', self.knownCount)
        self.noveltyIndex = dic.get('noveltyIndex', self.noveltyIndex)
        self.lastExapleTMS = dic.get('lastExapleTMS', self.lastExapleTMS)
        self.lastCleaningCycle = dic.get('lastCleaningCycle', self.lastCleaningCycle)
        if 'clusters' in dic:
            self.clusters = [self._clusterFromDict(cl) for cl in dic['clusters']]
        if 'sleepClusters' in dic:
            self.sleepClusters = [self._clusterFromDict(cl) for cl in dic['sleepClusters']]
        if 'unknownBuffer' in dic:
            self.unknownBuffer = [self._exampleFromDict(ex) for ex in dic['unknownBuffer']]
        return self
    @staticmethod
    def _clusterFromDict(dic):
        """Rebuild a ``Cluster`` from its ``asDict()`` form."""
        # Only label/center go through the constructor; the statistics are set
        # afterwards because ``Cluster(**dic)`` rejects the extra asDict() keys.
        cluster = Cluster(label=dic.get('label'), center=dic.get('center'))
        cluster.n = dic.get('n', cluster.n)
        # Cluster.asDict() stores maxDistance as a formatted string
        cluster.maxDistance = float(dic.get('maxDistance', cluster.maxDistance))
        cluster.lastExapleTMS = dic.get('lastExapleTMS', cluster.lastExapleTMS)
        return cluster
    @staticmethod
    def _exampleFromDict(dic):
        """Rebuild an ``Example`` from its ``asDict()`` form."""
        example = Example(item=dic.get('item'), label=dic.get('label'))
        example.timestamp = dic.get('timestamp', example.timestamp)
        example.tries = dic.get('tries', example.tries)
        return example
    @staticmethod
    def _itemsOf(examples):
        """Normalise clustering input into a list of feature vectors.

        Accepts ``Example`` objects (their ``item`` is used), raw vectors, or a
        DataFrame-like object (anything exposing ``columns`` and ``values``).
        """
        if hasattr(examples, 'columns') and hasattr(examples, 'values'):
            # NOTE(review): assumes every column except 'label' is a numeric
            # feature -- confirm against the frame handed to ``offline``.
            frame = examples.drop('label', axis=1) if 'label' in examples.columns else examples
            return list(frame.values)
        return [ex.item if isinstance(ex, Example) else ex for ex in examples]
    @staticmethod
    def _closestCluster(item, clusters):
        """Return ``(cluster, distance)`` of the cluster nearest to ``item``.

        Returns ``(None, inf)`` when ``clusters`` is empty.
        NOTE(review): replaces calls to a module-level ``closestCluster`` that is
        not defined in the visible cells; the contract is inferred from its call sites.
        """
        nearest, nearestDist = None, float('inf')
        for cl in clusters:
            d = cl.dist(item)
            if d < nearestDist:
                nearest, nearestDist = cl, d
        return nearest, nearestDist
    @staticmethod
    def _silhouette(a, b):
        """Simplified silhouette ``(b - a) / max(a, b)`` guarded against 0 and inf.

        ``a`` is the spread inside the candidate cluster, ``b`` the distance to the
        nearest known cluster. With no known cluster (``b`` is inf) the limit 1.0
        is returned; when both are 0 the coefficient is defined as 0.0.
        """
        if b == float('inf'):
            return 1.0
        denominator = max(a, b)
        if denominator == 0:
            return 0.0
        return (b - a) / denominator
    #
    #
    @timed
    def clustering(self, examples, label=None):
        """Run KMeans over ``examples`` and return one empty ``Cluster`` per centroid.

        ``examples`` may hold ``Example`` objects, raw vectors or a DataFrame (see
        ``_itemsOf``). Returns ``[]`` for empty input. The returned clusters carry
        ``label`` but no members yet (``n == 0``).
        """
        # The notebook never imports joblib at top level, so bring it into scope here
        import joblib
        items = self._itemsOf(examples)
        if not items:
            return []
        wanted = min(CONSTS.k, int(len(items) / (3 * CONSTS.representationThr)))
        # Small inputs would otherwise request 0 clusters, which KMeans rejects
        kmeans = KMeans( n_clusters = max(1, wanted) )
        # NOTE(review): the 'dask' backend needs a dask.distributed Client to
        # exist -- confirm one is created in another cell.
        with joblib.parallel_backend('dask'):
            kmeans.fit(items)
        return [Cluster(center=centroid, label=label) for centroid in kmeans.cluster_centers_]
    def trainGroup(self, label, group):
        """Cluster one labelled group and return the clusters with more than ``representationThr`` members."""
        items = self._itemsOf(group)
        clusters = self.clustering(items, label)
        for item in items:
            nearCl, dist = self._closestCluster(item, clusters)
            # Cluster statistics are updated from Example objects, so wrap the raw row
            nearCl.addExample(Example(item=item, label=label))
        return [cluster for cluster in clusters if cluster.n > CONSTS.representationThr]
    def offline(self, examplesDf):
        """Train the initial model from a frame with a ``'label'`` column; returns ``self``."""
        for label, group in examplesDf.groupby('label'):
            # extend, not append: trainGroup returns a list of clusters
            self.clusters.extend(self.trainGroup(label, group))
        return self
    def classify(self, example, clusters=None):
        """Try to place ``example`` in its nearest cluster.

        Increments ``example.tries``. ``clusters`` defaults to active plus sleeping
        clusters. Returns ``(isClassified, cluster, dist, example)``; with no
        clusters at all the result is ``(False, None, inf, example)``.
        """
        example.tries += 1
        if clusters is None:
            clusters = self.clusters + self.sleepClusters
        cluster, dist = self._closestCluster(example.item, clusters)
        isClassified = cluster is not None and dist <= (CONSTS.radiusFactor * cluster.radius())
        return isClassified, cluster, dist, example
    def online(self, stream):
        """Process every item of ``stream`` until it ends or yields None; returns ``self``."""
        for example in stream:
            if example is None:
                break
            # Must run inside the loop: dedented, it only saw the last item
            self.onlineProcessExample(example)
        return self
    @timed
    def onlineProcessExample(self, item):
        """Classify one raw item against the active clusters.

        Unclassified examples go to ``unknownBuffer``; once the buffer holds more
        than ``ndProcedureThr`` examples the wake-up, novelty-detection and
        clean-up steps run. Returns ``(example, isClassified, cluster, dist)``.
        """
        self.exampleCount += 1
        example = Example(item=item)
        self.lastExapleTMS = example.timestamp
        isClassified, cluster, dist, example = self.classify(example, self.clusters)
        if isClassified:
            example.label = cluster.label
            cluster.addExample(example)
            self.knownCount += 1
        else:
            self.unknownBuffer.append(example)
        #
        if len(self.unknownBuffer) > CONSTS.ndProcedureThr:
            print('bufferFull')
            self.wakeupWithUnkownBuffer()
            self.noveltyDetection()
            self.cleanupCycle()
        return example, isClassified, cluster, dist
    @timed
    def wakeupWithUnkownBuffer(self):
        """Re-try buffered examples against sleeping clusters, waking the ones that match."""
        # Build the surviving buffer separately instead of removing while iterating
        stillUnknown = []
        for sleepExample in self.unknownBuffer:
            isClassified, cluster, dist, _ = self.classify(sleepExample, self.sleepClusters)
            if not isClassified:
                stillUnknown.append(sleepExample)
                continue
            sleepExample.label = cluster.label
            cluster.addExample(sleepExample)
            # wakeup
            print('wakeup')
            self.clusters.append(cluster)
            self.sleepClusters.remove(cluster)
            # 'counter' is not a slot; knownCount is what onlineProcessExample
            # bumps for a classified example
            self.knownCount += 1
        self.unknownBuffer = stillUnknown
    @timed
    def cleanupCycle(self):
        """Put idle clusters to sleep and prune the unknown buffer."""
        # Model <- move-sleepMem(Model, SleepMem, CurrentTime, windowSize)
        ogLen = len(self.clusters)
        newClusters = []
        for cl in self.clusters:
            if cl.lastExapleTMS < self.lastCleaningCycle:
                self.sleepClusters.append(cl)
            else:
                newClusters.append(cl)
        self.clusters = newClusters
        self.lastCleaningCycle = time.time_ns()
        print(f'put to sleep {ogLen - len(newClusters)} clusters')
        # ShortMem <- remove-oldExamples(ShortMem, windowsize)
        ogLen = len(self.unknownBuffer)
        # NOTE(review): this KEEPS examples tried three or more times and drops
        # the rest, which looks inverted for a "remove old examples" step.
        # Behaviour left unchanged -- confirm the intent.
        self.unknownBuffer = [ex for ex in self.unknownBuffer if ex.tries >= 3]
        print(f'removed {ogLen - len(self.unknownBuffer)} examples')
    @timed
    def noveltyDetection(self):
        """Cluster the unknown buffer and promote valid clusters into the model.

        A candidate is valid when it is representative (more than
        ``representationThr`` members) and cohesive (silhouette > 0). A valid
        candidate whose nearest known cluster lies within ``noveltyThr`` takes that
        cluster's label (extension); otherwise it gets a fresh ``'Novelty N'`` label.
        """
        candidates = self.clustering(self.unknownBuffer)
        # Give every candidate its members first; otherwise n and radius stay at
        # zero and no candidate can ever be representative.
        for ex in self.unknownBuffer:
            owner, _ = self._closestCluster(ex.item, candidates)
            if owner is not None:
                owner.addExample(ex)
        for cluster in candidates:
            # ---------------------------------------------------------------------------------------------------
            isRepresentative = cluster.n > CONSTS.representationThr
            # 
            near, dist = self._closestCluster(cluster.center, self.clusters + self.sleepClusters)
            reach = CONSTS.radiusFactor * cluster.radius()
            distances = []
            for ex in self.unknownBuffer:
                d = cluster.dist(ex.item)
                if d <= reach:
                    distances.append(d)
            if not distances:
                # Nothing within reach: the spread below would divide by zero
                continue
            mean = sum(distances) / len(distances)
            devianceSqrSum = sum((d - mean) **2 for d in distances)
            var = devianceSqrSum / len(distances)
            stdDevDistance = var **0.5
            # MINAS defines a = spread inside the candidate and b = distance to the
            # nearest known cluster; the call used to pass them the other way round.
            isCohesive = self._silhouette(stdDevDistance, dist) > 0
            validationCriterion = isRepresentative and isCohesive
            # ---------------------------------------------------------------------------------------------------
            if not validationCriterion:
                continue
            if dist <= CONSTS.noveltyThr:
                print('Extention {}'.format(near.label))
                cluster.label = near.label
            else:
                self.noveltyIndex += 1
                label = 'Novelty {}'.format(self.noveltyIndex)
                print(label)
                cluster.label = label
            self.clusters.append(cluster)
        
# Smoke test: a fresh model must survive a YAML store/restore round trip.
# Both calls return the model itself, so they can be chained.
freshModel = Minas()
freshModel.storeToFile('t.yaml').restoreFromFile('t.yaml')
print(freshModel, type(freshModel))
del freshModel


Example({'label': None, 'item': [], 'timestamp': 1557118471604864000, 'tries': 0}) <class '__main__.Example'>
Cluster({'label': 'label', 'center': [], 'n': 0, 'maxDistance': '0.00', 'lastExapleTMS': 0}) <class '__main__.Cluster'>
Minas({'exampleCount': 0, 'knownCount': 0, 'diff': 0, 'noveltyIndex': 0, 'lastExapleTMS': 0, 'lastCleaningCycle': 0, 'clusters': [], 'sleepClusters': [], 'unknownBuffer': []}) <class '__main__.Minas'>


In [38]:
from dask.datagr
dataset

Help on built-in function min in module builtins:

min(...)
    min(iterable, *[, default=obj, key=func]) -> value
    min(arg1, arg2, *args, *[, key=func]) -> value
    
    With a single iterable argument, return its smallest item. The
    default keyword-only argument specifies an object to return if
    the provided iterable is empty.
    With two or more arguments, return the smallest argument.

