In [84]:
import glob
import json
import os
from urllib.request import urlretrieve
from multiprocessing import Pool
from collections import defaultdict, namedtuple
from pprint import pprint
import tqdm

In [104]:
SUPPORTED_TYPES = ['photo', 'audio']
DUMP_DIR = 'dumps/nikkorobk/'
p = Pool(processes=8)


In [105]:
def photo_url_extractor(photo_at):
    return(photo_at['photo']['sizes'][-1]['url'])
def audio_url_extractor(audio_at):
    return(audio_at['audio']['url'].split('?')[0])


In [106]:
def extract_attachment_url(at):
    att = at['type']
    if att == 'photo':
        return photo_url_extractor(at)
    if att == 'audio':
        return audio_url_extractor(at)
    return ''

In [107]:
DTask = namedtuple('DTask',['url', 'type', 'user_id'])

def get_download_tasks(dump_dir):
    tasks = []
    files_list=glob.glob(os.path.join(dump_dir, '*.json'))
    for file in tqdm.tqdm_notebook(files_list):
        user_urls = defaultdict(list)
        user_id = os.path.splitext(os.path.basename(file))[0]
        conv = json.load(open(file))
        for msg in conv['items']:
            for at in msg['attachments']:
                att = at['type']
                if att in SUPPORTED_TYPES:
                    url = extract_attachment_url(at)
                    if url:
                        tasks.append(DTask(url, att, user_id))
    return tasks


In [108]:
def process_task(task):
    path_to_save =  os.path.join(DUMP_DIR, 'attachments', task.user_id, task.type)
    filename = task.url.split('/')[-1]
    full_name = os.path.join(path_to_save, filename)
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    if not os.path.exists(full_name):
        urlretrieve(task.url, full_name)

In [111]:
tasks = get_download_tasks(DUMP_DIR)

HBox(children=(IntProgress(value=0, max=808), HTML(value='')))




In [110]:
def process_tasks(tasks):
    with tqdm.tqdm_notebook(total = len(dt)) as td:
        for r in p.imap_unordered(process_task, tasks):
            td.update()


In [None]:
process_tasks(tasks)

HBox(children=(IntProgress(value=0, max=9585), HTML(value='')))

In [97]:
pprint(dt)

[DTask(url='https://pp.userapi.com/c639431/v639431966/22384/6jp7L-DEMqE.jpg', type='photo', user_id='55347865'),
 DTask(url='https://pp.userapi.com/c629305/v629305966/14316/A6CwT4uOXGg.jpg', type='photo', user_id='55347865'),
 DTask(url='https://pp.userapi.com/c623831/v623831966/230bf/530ncxOJ6C0.jpg', type='photo', user_id='55347865'),
 DTask(url='https://sun9-12.userapi.com/Idc5ZQFAQK4X_vG3y7tZT7BlR0OnP05iTw7SdA/A2fI6_vqxX4.jpg', type='photo', user_id='148074166'),
 DTask(url='https://pp.userapi.com/mVIfxCtMqOQlEoCuC6ItIV__zB9MnS1myroeEg/AARn-M6140s.jpg', type='photo', user_id='148074166'),
 DTask(url='https://pp.userapi.com/cOvMezWklHuWpdvdcucNOfmL1Cf-_ifvRKgWKQ/hhjN8a71Igk.jpg', type='photo', user_id='148074166'),
 DTask(url='https://pp.userapi.com/Qe3GGnnWqiZS5E3-1sr9ZunVnhw8PgQ2zsBMlQ/Y7r2NQEJB78.jpg', type='photo', user_id='148074166'),
 DTask(url='https://pp.userapi.com/j1L82o2bBL_cPtUpMtKetdSXjuZ50CzT6kgrVg/cnCRLc_xKgk.jpg', type='photo', user_id='148074166'),
 DTask(url='http

 DTask(url='https://pp.userapi.com/I7226_8aJkq-F-gmVi3stkMeMt7_cLByriD4Qw/6YJh6EWd_Ug.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/c317930/v317930636/7e0c/WheLi4ch6YM.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/WP2gzGNoopR5AVC9WNBTnnqzvWoh3fdedWgCfw/PsRSo2wcXKU.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/46cnnnSXWSQlGCjb0ilE7mHszwnX7s2vhRHfbQ/QowO0_JtvvA.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/AWFiw49oZ12HGqlUhvJrKFFiAVw-HkxaPM_19g/YWniOWRsB8E.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/Ve_-dgcPJIJwUdkSMU8jeYf6v-ULWADhvl4CIg/olTNsnd_Ptk.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/-Jgn4HHj4c2w2vs3AdjoG_Jci6DzMRbnNHDZOA/XAsd0IrrnQY.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/WCrnt1FzEa2ZfqE0H2n6KQ03cR1ZNypY7MTYjw/fh067qRfMyQ.jpg', type='photo', user_i

 DTask(url='https://cs9-21v4.vkuseraudio.net/p15/21633c4f69600c.mp3', type='audio', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/xUSDSUepPD2sTsCRYm1eJ-IYbLV--XZywLfrOg/5RBduhdnRs4.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/c316326/v316326636/4e95/_7-pZ0oy9wk.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/qhuT14Z3gfveMYZbTAr-pIokk3fhGjW9725fiA/vO7vfewOFF4.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/Yh6QHyfd0ev6rkfBaiJaoRjvxpgioKNidJUkGQ/iP94F7s-ewY.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/k5OppF38dZd2jjQ6ilY3k20m-EMhMtMhl_eVWQ/RKVqT_V0bRc.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/krXjsMZXEFz7U19D9wOYfUA_uboXnRAR5jcc5w/EEfIfESQqUE.jpg', type='photo', user_id='2000000027'),
 DTask(url='https://pp.userapi.com/Yh6QHyfd0ev6rkfBaiJaoRjvxpgioKNidJUkGQ/iP94F7s-ewY.jpg', type='photo', user_id='2000000027'),
 DTas

 DTask(url='https://pp.userapi.com/c639624/v639624316/552/h36gUaNNKvs.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c639624/v639624316/55c/LXYz-RbvCvg.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c604831/v604831304/291b1/w1peX4MFeL4.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c637730/v637730122/29fb7/Sa3L9USARmg.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c836334/v836334316/1b393/uzJf35LACWA.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c837521/v837521386/19d9a/Dz1U6ihKJOY.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c837521/v837521386/19d91/-ckdJsc0cYg.jpg', type='photo', user_id='2000000091'),
 DTask(url='https://psv4.vkuseraudio.net/c422318/u172996706/audios/5a46940823a0.mp3', type='audio', user_id='2000000091'),
 DTask(url='https://pp.userapi.com/c836229/v836229966/1824d/uESKwwB-EQA.jpg'

 DTask(url='https://pp.userapi.com/c630921/v630921966/38774/i1gKHsEX3Qc.jpg', type='photo', user_id='9468306'),
 DTask(url='https://pp.userapi.com/c845523/v845523173/14f40b/_YOsKaWD8C0.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c850524/v850524622/60d98/cMvoIheAH8U.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c846520/v846520024/149669/_e5hQIFqnKI.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c844720/v844720024/144306/7ByWf43BJGw.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c849336/v849336024/ccf2e/g47nZ7xpwVA.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c830401/v830401024/1d30a8/oYrc4ghQ3Ok.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c847021/v847021024/147cce/wk-RaIysjUw.jpg', type='photo', user_id='2000000136'),
 DTask(url='https://pp.userapi.com/c849420/v849420024/d3c82/1XBBoGyyAhs.jpg', 

 DTask(url='https://cs9-21v4.vkuseraudio.net/p14/d22ffa9e16493e.mp3', type='audio', user_id='10506587'),
 DTask(url='https://psv4.vkuseraudio.net/c521421/u448999/audios/72fc32cb6419.mp3', type='audio', user_id='10506587'),
 DTask(url='https://psv4.vkuseraudio.net/c521104/u4120450/audios/05d7ee3347df.mp3', type='audio', user_id='10506587'),
 DTask(url='https://psv4.vkuseraudio.net/c5106/u52161701/audios/c93f489f50dd.mp3', type='audio', user_id='10506587'),
 DTask(url='https://psv4.vkuseraudio.net/c6176/u7948298/audios/58b8ed51d0b9.mp3', type='audio', user_id='10506587'),
 DTask(url='https://psv4.vkuseraudio.net/c521408/u155550845/audios/93ee2a795f72.mp3', type='audio', user_id='10506587'),
 DTask(url='https://psv4.vkuseraudio.net/c1624/u1113743/audios/85646f683d9b.mp3', type='audio', user_id='10506587'),
 DTask(url='https://cs9-18v4.vkuseraudio.net/p14/31855babd8c4ec.mp3', type='audio', user_id='10506587'),
 DTask(url='https://cs9-18v4.vkuseraudio.net/p5/84fc6e5d691189.mp3', type='audio

 DTask(url='https://pp.userapi.com/c837426/v837426962/3f3ac/pRcMzB-sy2Q.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426132/5a61f/R_MgjRer2qU.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426132/5a629/MCWAUtf4Q2o.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426464/51475/_pBB_idQOsQ.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426464/51483/cSMgSqVyBik.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426464/5141b/4ClC70AZ33o.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426464/51425/Usr4Xbz-NJ0.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426464/5142f/1f9YOjWd7w0.jpg', type='photo', user_id='2000000130'),
 DTask(url='https://pp.userapi.com/c837426/v837426464/51439/M7MdiV3dvrI.jpg', ty

In [98]:
len(dt)

9652

In [103]:
os.path.exists('dumps/nikkorobk/1300.json')

False