In [2]:
import csv
from sklearn import cluster
import numpy
import numpy as np
from scipy.sparse import dok_matrix
from itertools import groupby
import multiprocessing
import os
import utils
import random
import bz2

In [3]:
# The maximum distance between two samples for them to be considered as in the same neighborhood.
eps=0.05
compressed = False
#data_dir = "/data/Dropbox/python-scrapy-data-mining/" # utils.data_dir
data_dir = "/data/python-scrapy-data-mining/" # utils.data_dir
categories = utils.get_categories()

In [4]:
def get_clusters(category, eps=eps):
    """
    Runs clustering and returns labeled non-distinct product entires
    grouped by lables in a dictionary.
    """
    global data_dir, compressed
    file_name =  os.path.join(data_dir, "sims_" + category.replace('/', '_') + (".csv.bz2" if compressed else ".csv"))
    if not os.path.exists(file_name):
        return []

    with (bz2.BZ2File(file_name) if compressed else open(file_name)) as raw:
        values = dict(( ((int(p1), int(p2)), float(sim)) for p1, p2, sim in csv.reader(raw) ))

    prod_ids = sorted(list(set((p1 for (p1, _) in values.keys()))))
    #print "N:", len(prod_ids)

    def get_item(i, j):
        """
        Reindex/map similarity matrix from 0..(N-1) to [id_0...id_(N-1)]
        """
        i, j = prod_ids[i], prod_ids[j]
        if i > j:
            sim = values.get((j, i), 0)
        elif i == j:
            return 0.0
        else:
            sim = values.get((i, j), 0)

        return 1-sim # (max_sim - sim) / max_sim

    arr = numpy.ndarray((len(prod_ids), len(prod_ids)))
    for i in xrange(len(prod_ids)):
        for j in xrange(len(prod_ids)):
            arr[i, j] = get_item(i, j)    

    # create and fit the model:
    db = cluster.DBSCAN(metric="precomputed", eps=eps).fit(arr)
    return [
        (label, [p for _, p in prodi]) for label, prodi in groupby(
                ((l, prod_ids[i]) for i, l in sorted(enumerate(db.labels_), key=lambda e: -e[1]) if l != -1),
                key=lambda e: e[0]
        )
    ]

In [6]:
%%time
get_clusters('fashion/wanita/tas-wanita')

CPU times: user 17.5 s, sys: 860 ms, total: 18.3 s
Wall time: 18.9 s


[(8, [15398, 15455, 65511, 160463, 160474]),
 (7,
  [159323,
   159344,
   159365,
   159367,
   159368,
   159370,
   159397,
   159399,
   159402,
   159429,
   159471,
   159573,
   159657,
   159670,
   159765,
   159811]),
 (6,
  [60157,
   60159,
   60527,
   60528,
   159317,
   159343,
   159345,
   159359,
   159369,
   159371,
   159372,
   159373,
   159374,
   159375,
   159376,
   159377,
   159378,
   159379,
   159380,
   159381,
   159382,
   159383,
   159384,
   159385,
   159386,
   159387,
   159388,
   159389,
   159391,
   159394,
   159401,
   159406,
   159407,
   159408,
   159411,
   159413,
   159414,
   159415,
   159416,
   159417,
   159421,
   159422,
   159424,
   159425,
   159426,
   159432,
   159433,
   159434,
   159453,
   159455,
   159457,
   159458,
   159460,
   159461,
   159464,
   159465,
   159468,
   159469,
   159470,
   159472,
   159474,
   159475,
   159477,
   159479,
   159480,
   159481,
   159483,
   159484,
   159485,
   159486,
 

In [7]:
def category_clusters(categories):
    return [(c, get_clusters(c)) for c in categories]


In [8]:
%%time
random.shuffle(categories)
pool = multiprocessing.Pool(processes=8)
clusters = pool.map(category_clusters, utils.chunks(categories, 8))

CPU times: user 176 ms, sys: 92 ms, total: 268 ms
Wall time: 2min 39s


In [17]:
clusters = filter(lambda c: c[1], sum(clusters, []))

In [18]:
for c, labels in clusters:
    for l, products in labels:
        print "\"{0}\", {1}: {2}".format(c, l, ','.join(map(str, products)))

"rumah-tangga/furniture-interior", 0: 1633,1734,19503,90819,190009
"hobi/mainan/action-figure", 0: 58187,58191,58192,58193,192643
"elektronik/speaker", 1: 31484,61642,117366,117370,117457
"elektronik/speaker", 0: 69517,69521,69529,69530,69541,69702,84517,84520,84522,123646,123652,123674,123678
"handphone/kabel-data", 2: 211048,215954,215955,215959,215966,215976
"handphone/kabel-data", 1: 3568,58881,111657,212637,235300
"handphone/kabel-data", 0: 1090,3697,16316,163500,179300,212638
"hobi/olahraga/roller-skate", 0: 30962,30978,198627,198628,198629,198664
"fashion/wanita/perhiasan-aksesoris", 0: 130910,245343,245345,245350,245351
"handphone/tongsis", 6: 81409,91306,93543,117742,123506,187160,210141
"handphone/tongsis", 5: 65552,81361,121836,121839,126166
"handphone/tongsis", 4: 53797,57367,74462,78578,79661,115888,243650
"handphone/tongsis", 3: 49590,91308,91314,122285,212579
"handphone/tongsis", 2: 17277,42011,155904,204165,233657
"handphone/tongsis", 1: 16261,21357,52931,65554,68982,12