In [74]:
import random
import numpy as np
from math import sqrt
from sklearn.preprocessing import normalize as sknormalize

class Ondisk_KMeans:
    """k-means over a sparse matrix that is streamed from a Matrix Market file.

    Every pass re-reads the file one row (document) at a time, assigns the row
    to its closest centroid, adds the row to that centroid's accumulator and,
    once the pass is over, L2-normalizes the accumulated centroids with
    sklearn's ``normalize``. Only the centroids and the label list are kept in
    memory.

    Parameters
    ----------
    n_clusters : int
        Number of centroids.
    max_iter : int
        Number of passes over the file; every pass is always executed.
    tol : float
        Stored in ``self.tol``; values outside ``[0, 1)`` fall back to 0.001.
        NOTE(review): nothing in this class reads ``self.tol`` yet, so there is
        no early stopping.
    metric : str
        ``'euclidean'`` selects Euclidean distance; any other value selects
        cosine distance.
    verbose : bool
        Print progress lines while scanning and iterating.

    Attributes
    ----------
    labels : list of int or None
        Cluster index per row of the matrix after ``fit_predict``; rows that
        never show up in the file keep the placeholder ``-1``.
    c0 : numpy.ndarray
        Current centroids, shape ``(n_clusters, n_words)``.
    """

    def __init__(self, n_clusters, max_iter=20, tol=0.001, metric='cosine', verbose=True):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol if 0 <= tol < 1 else 0.001
        self.verbose = verbose
        self.metric = metric
        self._distance = self._euclidean if metric == 'euclidean' else self._cosine

        self.labels = None

    def _euclidean(self, query, centroid):
        """Euclidean distance between a sparse row and a dense centroid.

        query: {term: weight}
        centroid: sequence of float, indexable by term

        Uses ||q - c||^2 = ||c||^2 + sum_j (q_j^2 - 2 * q_j * c_j) so that only
        the non-zero terms of ``query`` have to be visited.
        """
        centroid = np.asarray(centroid, dtype=float)
        squared = float(np.dot(centroid, centroid))
        for j, v in query.items():
            squared += v * (v - 2.0 * centroid[j])
        # Rounding can push a true zero slightly negative.
        return sqrt(max(squared, 0.0))

    def _cosine(self, query, centroid):
        """Cosine distance (1 - dot product) between a sparse row and a centroid.

        query: {term: weight}
        centroid: sequence of float, indexable by term

        The dot product is only a cosine when both vectors have unit length;
        no normalization happens here.
        """
        # Generator, not a set: equal products must each be counted.
        sim = sum(v * centroid[j] for j, v in query.items())
        return 1 - sim

    def _most_similar(self, j_dict):
        """Return the index of the centroid in ``self.c0`` closest to ``j_dict``.

        Ties go to the lowest centroid index.
        """
        cdist = [self._distance(j_dict, centroid) for centroid in self.c0]
        return min(range(len(cdist)), key=cdist.__getitem__)

    def fit_predict(self, mm_fname):
        """Cluster the rows of the Matrix Market file ``mm_fname``.

        Returns
        -------
        list of int
            The same list as ``self.labels``: one cluster index per matrix row,
            ``-1`` for rows that have no entry in the file. ``None`` when
            ``max_iter`` is 0 because no pass was run.
        """
        mm = MatrixMarket(mm_fname)
        self._initialize(mm)

        # Row-grouped iteration: each item is (row index, {term: weight}).
        mm.iter_line = False
        for n_iter in range(1, self.max_iter+1):
            c1 = np.zeros((self.n_clusters, self.n_words))
            self.labels = [-1] * self.n_docs
            for n_nonempty_doc, (i, j_dict) in enumerate(mm):
                best_c = self._most_similar(j_dict)
                self.labels[i] = best_c
                for j, v in j_dict.items():
                    c1[best_c,j] += v
                if self.verbose and n_nonempty_doc % 100 == 99:
                    print('\r  - iter = {} / {}, {} %'.format(n_iter, self.max_iter, '%.2f'%(100*n_nonempty_doc/self._n_nonempty_docs)), flush=True, end='')
            # Centroids are the L2-normalized sums of their members.
            self.c0 = sknormalize(c1)
            if self.verbose:
                print('\rIteration = {} was done{}'.format(n_iter, ' '*40), flush=True)
        return self.labels

    def _initialize(self, mm):
        """Read the matrix dimensions and seed the centroids from random rows.

        ``mm`` must expose ``n_rows``, ``n_cols``, ``n_elements``, an
        ``iter_line`` switch and iteration in both modes (entry triples when
        ``iter_line`` is true, ``(row, dict)`` pairs otherwise). On return
        ``mm.iter_line`` is ``False``.

        Raises
        ------
        ValueError
            If the file holds fewer non-empty rows than ``n_clusters``.
        """
        self.n_docs, self.n_words, self.n_elements = mm.n_rows, mm.n_cols, mm.n_elements

        # Scanning document idx: one pass over the raw entries.
        mm.iter_line = True
        docs_set = set()
        for n_row, (i, _, _) in enumerate(mm):
            if self.verbose and n_row % 5000 == 4999:
                print('\r  - scanning document idx {} %'.format('%.2f'%(100*n_row/self.n_elements)), flush=True, end='')
            docs_set.add(i)
        self._n_nonempty_docs = len(docs_set)
        self.c0 = np.zeros((self.n_clusters, self.n_words))

        # Select seeds
        if self.verbose:
            print('\r  - initialize centroids', flush=True, end='')
        if self.n_clusters > len(docs_set):
            raise ValueError('n_clusters={} exceeds the number of non-empty documents ({})'.format(
                self.n_clusters, len(docs_set)))
        # random.sample needs a sequence (sets are rejected on Python >= 3.11);
        # the result is kept as a set for O(1) membership tests below.
        seeds = set(random.sample(sorted(docs_set), self.n_clusters))
        mm.iter_line = False
        _c0_index = 0
        for i, j_dict in mm:
            if i in seeds:
                for j, v in j_dict.items():
                    self.c0[_c0_index,j] = v
                _c0_index += 1
            if _c0_index == (self.n_clusters):
                break
        if self.verbose:
            print('\rInitialization was done{}'.format(' '*40), flush=True)
        
class MatrixMarket:
    """Re-iterable reader for a coordinate-format Matrix Market text file.

    The file is reopened on every iteration, so the matrix never has to fit in
    memory. Indices read from the file are shifted down by one before being
    yielded.

    Parameters
    ----------
    mm_fname : str
        Path of the file (read as UTF-8).
    iter_line : bool
        ``True``: iterate entry by entry, yielding ``(row, col, value)``.
        ``False``: iterate row by row, yielding ``(row, {col: value})``.
        The attribute may be flipped between iterations.
    do_normalize : bool
        In row mode, scale every row dict to unit L2 length before yielding.

    Attributes
    ----------
    n_rows, n_cols, n_elements : int
        The three integers of the size line in the header.

    Notes
    -----
    Row mode groups *consecutive* entries that share a row index; a file whose
    entries are not sorted by row yields the same row more than once.
    """

    def __init__(self, mm_fname, iter_line=True, do_normalize=True):
        self.fname = mm_fname
        self.iter_line = iter_line
        self.do_normalize = do_normalize
        with open(mm_fname, encoding='utf-8') as f:
            self.n_rows, self.n_cols, self.n_elements = self._read_size_line(f)

    @staticmethod
    def _read_size_line(f):
        """Consume the header of ``f`` and return the size line as a tuple of ints.

        Lines starting with ``%`` (banner and comments) and blank lines are
        skipped, so any number of comment lines is accepted.
        """
        for line in f:
            stripped = line.strip()
            if stripped and not stripped.startswith('%'):
                return tuple(int(v) for v in stripped.split())
        raise ValueError('no size line found in Matrix Market header')

    def _iter_entries(self, f):
        """Yield ``(row, col, value)`` for each data line, skipping blank lines."""
        for row in f:
            if not row.strip():
                continue
            self._n_iter += 1
            i, j, v = row.split()
            yield int(i)-1, int(j)-1, float(v)

    def _emit(self, i, d):
        """Build the ``(row, dict)`` pair for row mode, normalizing if requested."""
        return (i, self._normalize(d)) if self.do_normalize else (i, d)

    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            self._read_size_line(f)
            self._n_iter = 0
            if self.iter_line:
                for entry in self._iter_entries(f):
                    yield entry
            else:
                i_prev = -1
                d = {}
                for i, j, v in self._iter_entries(f):
                    # A new row index closes the row collected so far.
                    if i != i_prev:
                        if d:
                            yield self._emit(i_prev, d)
                        d = {}
                    i_prev = i
                    d[j] = v
                if d:
                    yield self._emit(i_prev, d)

    def _normalize(self, d):
        """Return a copy of ``d`` scaled to unit L2 norm.

        Signs are preserved. A row whose values are all zero has no direction
        and is returned as an unscaled copy instead of dividing by zero.
        """
        norm = sqrt(sum(v * v for v in d.values()))
        if norm == 0:
            return dict(d)
        return {k: v / norm for k, v in d.items()}

    def __str__(self):
        return 'MatrixMarket file (n_rows={}, n_cols={}, n_elements={})'.format(self.n_rows, self.n_cols, self.n_elements)

In [75]:
# Cluster the sample matrix into 10 groups with 5 passes over the file.
# NOTE(review): absolute, machine-specific path; the cell only runs where
# this file exists.
mm_fname = '/mnt/sdb2/carblog_sample/models/base_c10.mtx'
kmeans = Ondisk_KMeans(n_clusters=10, max_iter=5)
# The per-document cluster assignments are kept on `kmeans.labels`.
labels = kmeans.fit_predict(mm_fname)

Initialization was done                                        
Iteration = 1 was done                                        
Iteration = 2 was done                                        
Iteration = 3 was done                                        
Iteration = 4 was done                                        
Iteration = 5 was done                                        


In [76]:
from collections import Counter
# Cluster sizes. The key -1 is the placeholder every label starts with, so it
# counts the rows that were never yielded while iterating the file.
Counter(kmeans.labels)

Counter({-1: 6,
         0: 51,
         1: 302,
         2: 172,
         3: 68,
         4: 230,
         5: 54,
         6: 274,
         7: 104,
         8: 168,
         9: 71})

In [77]:
# Open the same file directly; constructing the reader only parses the header.
mm = MatrixMarket('/mnt/sdb2/carblog_sample/models/base_c10.mtx')
# __str__ reports the three integers read from the size line.
print(mm)

MatrixMarket file (n_rows=1500, n_cols=5555, n_elements=282538)


In [80]:
# Entry-by-entry mode: each item is a (row, col, value) triple, with both
# indices shifted down by one relative to the file.
mm.iter_line = True
# Show only the first three entries.
for n_row, line in enumerate(mm):
    if n_row > 2: break
    print(line)

(0, 26, 1.0)
(0, 34, 1.0)
(0, 38, 4.0)
