In [5]:
import io, os
import pandas as pd
import numpy as np
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from skimage.data import imread   # or, whatever image library you prefer
from skimage.io import imsave
import multiprocessing as mp      # will come in handy due to the size of the data

In [16]:
%%time
# Number of worker processes in the pool. The same value is also used as the
# queue's maximum size and as the number of stop sentinels sent at the end.
NCORE =  2

# Manager-backed dict proxy rather than a plain dict: the workers that fill it
# run in separate processes, and their writes must be readable here afterwards.
prod_to_category = mp.Manager().dict()

def process(q, iolock):
    """Worker loop: take product records from ``q`` and write their images to disk.

    Runs until a ``None`` sentinel is taken from the queue. For every record
    it stores ``_id -> category_id`` in the module-level ``prod_to_category``
    mapping, then decodes each entry of ``d['imgs']`` and saves it as
    ``../<category_id>/<product_id>.jpg``, creating the category directory
    when needed.

    Parameters
    ----------
    q : queue-like
        Object whose ``get()`` returns either a record (a mapping with the
        keys ``'_id'``, ``'category_id'`` and ``'imgs'``) or ``None`` to stop.
    iolock : lock-like
        Not used by this function; kept so the signature still matches the
        ``initargs`` tuple handed to the pool.

    Returns
    -------
    None
    """
    while True:
        d = q.get()
        if d is None:
            # Stop sentinel: this worker is done.
            break

        product_id = d['_id']
        category_id = d['category_id']
        prod_to_category[product_id] = category_id

        # Both paths depend only on the record, so build them once per record.
        category_dir = '../' + str(category_id)
        file_to_save = category_dir + '/' + str(product_id) + '.jpg'

        # NOTE(review): every image of a product is written to the same path,
        # so only the last one survives -- confirm whether that is intended.
        for pic in d['imgs']:
            picture = imread(io.BytesIO(pic['picture']))
            # exist_ok=True instead of an exists()-then-makedirs() pair:
            # several workers run this loop concurrently, and with the
            # two-step check a worker that loses the race to create the
            # directory would crash with FileExistsError.
            os.makedirs(category_dir, exist_ok=True)
            imsave(fname=file_to_save, arr=picture)
    
# Bounded queue: the producer below blocks in put() whenever NCORE records are
# already waiting, so the whole file is never held in memory at once.
q = mp.Queue(maxsize=NCORE)
iolock = mp.Lock()
# Each pool worker runs `process` as its initializer, i.e. it immediately
# starts consuming records from `q` and keeps doing so until it gets a None.
pool = mp.Pool(NCORE, initializer=process, initargs=(q, iolock))

# process the file
try:
    # `with` closes the BSON file even if decoding fails part-way through.
    with open('../train_example.bson', 'rb') as bson_file:
        data = bson.decode_file_iter(bson_file)
        for c, d in enumerate(data):
            q.put(d)  # blocks until q below its max size
finally:
    # tell workers we're done -- one sentinel per worker. Done in `finally`
    # so that a failure while reading does not leave the workers blocked
    # forever on q.get().
    for _ in range(NCORE):
        q.put(None)
    pool.close()
    pool.join()

# Copy the manager proxy into a normal dictionary, then turn it into a
# one-column frame indexed by product id. Chained, non-mutating calls are used
# instead of `inplace=True`.
prod_to_category = (
    pd.DataFrame.from_dict(dict(prod_to_category), orient='index')
    .rename(columns={0: 'category_id'})
    .rename_axis('_id')
)

CPU times: user 16 ms, sys: 24 ms, total: 40 ms
Wall time: 344 ms


In [15]:
# Preview the first rows of the product-id -> category-id table built above.
prod_to_category.head()

Unnamed: 0_level_0,category_id
_id,Unnamed: 1_level_1
0,1000010653
1,1000010653
2,1000004079
3,1000004141
4,1000015539
