In [1]:
import dill
import math
from collections import defaultdict
import csv
import itertools
import random
import multiprocessing
import csv

In [2]:
# Load the pre-computed product vectors.
# NOTE(review): dill/pickle deserialization can execute arbitrary code -- only load files from a trusted source.
# Pickle data is binary, so the file is opened in 'rb' mode; the context manager closes the handle.
with open("vectors.pkl", 'rb') as vectors_file:
    vectors = dill.load(vectors_file)

In [3]:
%%time

# Map product id -> category path extracted from the product URL.
# Missing ids yield None (and, being a defaultdict, get inserted on lookup).
product_category = defaultdict(lambda: None)
with open('/data/corpus.csv') as raw_file:
    for row in csv.reader(raw_file):
        # Each row must have exactly three columns; the middle one is unused here.
        prod_id, _, url = row
        parts = url.split('/')
        # Keep only URLs whose fourth '/'-separated segment is 'p';
        # the category is everything between that segment and the last one.
        if parts[3] == 'p':
            product_category[int(prod_id)] = '/'.join(parts[4:-1])

CPU times: user 876 ms, sys: 36 ms, total: 912 ms
Wall time: 1.63 s


In [4]:
categories = sorted(list(set(product_category.values())))
print "Total products:", len(product_category)
print "Total categories:", len(categories)

Total products: 228562
Total categories: 656


In [5]:
# Group product ids by category, producing a list of (category, [product ids])
# ordered by category. Ids inside a category keep the iteration order of
# product_category.items().
grouped_ids = defaultdict(list)
for prod, cat in product_category.items():
    grouped_ids[cat].append(prod)
category_product = sorted(grouped_ids.items(), key=lambda entry: entry[0])

In [6]:
# (category, number of products) pairs, least populated category first.
# Expects category_product to still be the list of (category, ids) pairs.
cat_counts = sorted(
    ((name, len(members)) for (name, members) in category_product),
    key=lambda pair: pair[1],
)

In [7]:
# Show the ten least and the ten most populated categories.
cat_counts[:10] + cat_counts[-10:]

[('food/bumbu', 1),
 ('hobi/olahraga', 1),
 ('onderdil-mobil', 1),
 ('onderdil-mobil/eksterior-mobil', 1),
 ('onderdil-mobil/exhaust-system-mobil/header', 1),
 ('onderdil-mobil/interior-mobil', 1),
 ('onderdil-mobil/produk-perawatan-mobil', 1),
 ('onderdil-motor/sparepart-motor/dinamo', 1),
 ('perlengkapan-bayi/makanan-711/jus', 1),
 ('sepeda/brake/cantilever', 1),
 ('hobi/mainan/diecast', 3395),
 ('fashion/pria/sepatu-169', 3494),
 ('rumah-tangga/home-stuff', 3976),
 ('fashion/wanita/tas-wanita', 4211),
 ('personal-care/produk-kesehatan', 4672),
 ('rumah-tangga/dapur', 4803),
 ('elektronik/lain-lain-208', 5011),
 ('fashion/pria/jam-tangan-171', 6208),
 ('handphone/aksesoris-handphone', 6827),
 ('handphone/case-cover', 9072)]

In [8]:
# Turn the list of (category, product ids) pairs into a category -> ids lookup.
# The name is rebound from a list to a dict here: cells that iterate
# category_product as pairs must run before this one.
category_product = dict(category_product)

In [9]:
def category_products(category):
    """
    Look up the product ids stored for `category` in the module-level
    `category_product` mapping and return them.

    An unknown category propagates the lookup error of that mapping.
    """
    members = category_product[category]
    return members

In [10]:
#category_product["hobi/mainan/diecast"]
#vectors[1]

In [11]:
%%time
# Index the loaded vectors by their first field; each value keeps the next two fields.
# NOTE(review): presumably entry[0] is the product id, entry[1] the term-weight
# mapping and entry[2] its norm -- confirm against whatever produced vectors.pkl.
products = dict((entry[0], (entry[1], entry[2])) for entry in vectors)

CPU times: user 712 ms, sys: 40 ms, total: 752 ms
Wall time: 733 ms


In [12]:
# Drop the reference to the raw list so its memory can be reclaimed.
del vectors

In [13]:
def norm(vect):
    """
    Euclidean norm of a sparse vector given as a mapping of numeric values.

    If `vect` is exactly a tuple, its first element is taken as the mapping.
    Returns a float (0.0 for an empty mapping).
    """
    weights = vect[0] if type(vect) == tuple else vect
    squared_total = 0
    for value in weights.values():
        squared_total += value * value
    return math.sqrt(squared_total)

def distance(v1, v2):
    """
    Cosine similarity between two sparse vectors.

    Despite the name, a larger value means the two vectors are MORE alike.

    Parameters
    ----------
    v1, v2 : sequences whose first two items are (weights, norm), where
        `weights` maps a key to a numeric weight and `norm` is presumably the
        precomputed Euclidean norm of those weights (verify against the
        producer of the vectors). Any further items are ignored.

    Returns
    -------
    float
        dot(v1, v2) / (norm1 * norm2), or 0.0 when either norm is zero.
    """
    weights1, norm1 = v1[:2]
    weights2, norm2 = v2[:2]
    # A zero norm cannot be normalised; treat it as sharing nothing
    # with any other vector instead of dividing by zero.
    if not norm1 or not norm2:
        return 0.0
    # Only keys present in both vectors contribute to the dot product.
    shared_keys = set(weights1).intersection(weights2)
    dot = sum((weights1[key] * weights2[key] for key in shared_keys), 0.0)
    # Normalise by BOTH norms so that the result is symmetric in its arguments.
    return dot / (norm1 * norm2)

def chunks(l, num):
    """
    Split list `l` into consecutive slices for running in parallel.

    Every slice holds `len(l) // num + 1` items, except possibly the last one
    which may be shorter. This yields AT MOST `num` slices (fewer when `l` is
    short or its length is a multiple of `num`), and an empty list for an
    empty `l`.
    """
    # Floor division keeps the slice size an integer on Python 2 and Python 3 alike.
    size = len(l) // num + 1
    return [l[start:start + size] for start in range(0, len(l), size)]

In [14]:
def save_sims(category, sims):
    """
    Write the rows in `sims` to the CSV file of `category`.

    The file name is derived from the category with every '/' replaced by '_';
    an existing file is overwritten.
    """
    target = "/data/python-scrapy-data-mining/sims_{0}.csv".format(category.replace('/','_'))
    with open(target, "w") as out:
        csv.writer(out).writerows(sims)

def get_sim(categories):
    """
    Compute the pairwise scores of the products inside each given category
    and store them in one CSV file per category via `save_sims`.

    Reads the module-level `category_product` (category -> product ids) and
    `products` (product id -> vector data) mappings. For every unordered pair
    of distinct product ids of a category, the value returned by `distance`
    is kept when it exceeds 0.01 and stored, rounded to 7 decimals, as a row
    (i, j, score) with i < j. Categories with fewer than two products are
    skipped and produce no file.

    A pair whose score cannot be computed (e.g. a product id missing from
    `products`) is reported on stdout and skipped.
    """
    for cat in categories:
        # Sorted, de-duplicated ids: combinations() then yields each unordered
        # pair exactly once, always as (smaller id, larger id). Filtering an
        # unsorted list on "pid > i" while skipping its last element would
        # silently drop every pair whose smaller id happens to be stored last.
        prod_ids = sorted(set(category_product[cat]))
        # ignore categories with a single product:
        if len(prod_ids) < 2:
            continue
        sim = []
        for i, j in itertools.combinations(prod_ids, 2):
            try:
                dist = distance(products[i], products[j])
                if dist > 0.01:
                    sim.append((i, j, round(dist, 7)))
            except Exception as e:
                # Best effort: report the failing pair and carry on with the rest.
                print(e)
                print("*** {0} {1}".format(i, j))
        # Store in a CSV file and free the memory:
        save_sims(cat, sim)
        del sim


In [15]:
%%time

# Shuffle in place before chunking -- presumably to spread the large categories
# evenly across the workers (TODO confirm intent). No seed is set, so the
# assignment of categories to workers differs between runs.
random.shuffle(categories)
pool = multiprocessing.Pool(processes=8)
try:
    # One slice of categories per task; get_sim stores its results in CSV files,
    # so the list returned by map() carries no information and is discarded.
    pool.map(get_sim, chunks(categories, 8))
finally:
    # Always release the worker processes, even when a task raised.
    pool.close()
    pool.join()

CPU times: user 508 ms, sys: 288 ms, total: 796 ms
Wall time: 8min 1s
