In [42]:
import vk
import tqdm

import time
import math
from collections import deque
import json
import logging
import sys
import os


In [93]:
# Page size sent as `count=` on every paginated VK API call.
MAX_COUNT = 200
# Profile fields sent as `fields=` when the first page of a conversation is requested.
FIELDS = ','.join((
    'first_name',
    'last_name',
    'screen_name',
    'bdate',
    'common_count',
    'is_friend',
    'photo_max',
    'photo_50',
))
# API version sent as `v=` with the requests.
API_VERSION = '5.87'

# Run-time switches read by the functions below.
flags = dict(
    get_chats=True,            # when False, non-direct conversations are dropped
    min_len=500,               # conversations with fewer messages than this are skipped
    test_run=True,             # when True, only the first 5 direct and 5 other peers are processed
    creds_path='creds.json',   # JSON file read by get_password_and_id()
    save_path='dumps/',        # base directory used by organize_filestructure()
)

# Send timestamped INFO-level records to stdout so they show up in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)
logger = logging.getLogger(__name__)

def get_password_and_id(cp):
    """Read the password and login id from a JSON credentials file.

    Args:
        cp: Path to a JSON file containing an object with the keys
            ``'pass'`` and ``'id'``.

    Returns:
        A ``(password, id)`` tuple holding the values of those two keys.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if either key is missing.
    """
    # The context manager closes the handle as soon as parsing is done;
    # a bare open() inside json.load() leaves that to the garbage collector.
    with open(cp) as creds_file:
        creds = json.load(creds_file)
    return creds['pass'], creds['id']

# Credentials are loaded from the JSON file named by flags['creds_path'].
# NOTE: `id` shadows the Python builtin of the same name for the rest of the notebook.
password, id = get_password_and_id(flags['creds_path'])

# With a non-empty password, log in with the `messages` scope;
# otherwise fall back to vk.AuthSession() called with no arguments.
if password:
    session = vk.AuthSession(
        app_id='6787646',
        user_login=id,
        scope='messages',
        user_password=password,
    )
else:
    session = vk.AuthSession()

# API handle used by every request in this notebook.
vkapi = vk.API(session)

In [68]:
class Throater:
    """Client-side request throttle.

    Call ``ready()`` immediately before each request. The k-th call is
    released no earlier than ``ti`` seconds after the (k - ``mrc``)-th call
    was released, so at most ``mrc`` requests start within any ``ti``-second
    window.

    Attributes:
        total_sleep: Cumulative number of seconds spent sleeping in ``ready()``.
        time_interval: Length of the rate window in seconds (``ti``).
        max_req_c: Number of requests allowed per window (``mrc``).
        history: Release timestamps of the last ``max_req_c`` calls, oldest
            first. It is seeded with zeros so the first calls never wait.
    """

    def __init__(self, mrc = 3, ti = 1.5):
        """Create a throttle allowing ``mrc`` calls per ``ti`` seconds."""
        self.total_sleep = 0
        self.time_interval = ti
        self.max_req_c = mrc
        self.history = deque([0]*self.max_req_c)

    def ready(self):
        """Block until the next request may be sent, then record its release time.

        Returns:
            None.
        """
        now = time.time()
        self.history.append(now)
        # Timestamp of the call made `max_req_c` calls ago.
        prev = self.history.popleft()
        to_sleep = prev + self.time_interval - now
        if to_sleep > 0:
            self.total_sleep += to_sleep
            time.sleep(to_sleep)
            # Record when the request is actually released, not when the caller
            # arrived. Keeping the pre-sleep time makes later calls measure the
            # window from a moment that is too early, letting more than
            # `max_req_c` requests through at once.
            if self.history:
                self.history[-1] = now + to_sleep
        return


# Shared throttle instance used by every API call in this notebook.
t = Throater()

In [69]:
def get_list(func, initial_offset = 0, **kwargs):
    """Download every item of a paginated API listing, starting at an offset.

    ``func`` is called repeatedly with ``v=API_VERSION``, ``count=MAX_COUNT``,
    an increasing ``offset`` and the extra ``kwargs``. Each response must be a
    mapping with a ``'count'`` entry (total number of items) and an ``'items'``
    list. The shared throttle ``t`` is consulted before every call.

    Args:
        func: Callable performing one page request.
        initial_offset: Offset of the first item to fetch.
        **kwargs: Extra keyword arguments forwarded unchanged to ``func``.

    Returns:
        The ``'items'`` list of the first response, extended in place with the
        items of all following pages.
    """
    t.ready()
    first_page = func(v=API_VERSION, count = MAX_COUNT, offset = initial_offset, **kwargs)
    count = first_page['count']
    things = first_page['items']
    while len(things) < count - initial_offset:
        t.ready()
        new_things = func(v=API_VERSION, count = MAX_COUNT,
                          offset = initial_offset + len(things), **kwargs)
        if not new_things['items']:
            # The server reported more items than it is handing out. Without
            # this guard the same empty page would be requested forever.
            logging.getLogger(__name__).warning(
                'Listing ended early: got %d of %d expected items',
                len(things), count - initial_offset)
            break
        things.extend(new_things['items'])
    return things

def get_all_messages(peer_id, initial_offset = 0):
    """Download the message history of one conversation via ``get_list``.

    Args:
        peer_id: Conversation peer id, as found in ``conversation.peer.id``.
        initial_offset: Number of leading messages to skip.

    Returns:
        The list of message items returned by ``get_list``.
    """
    # The id is forwarded as `peer_id`, the same parameter get_data_draft()
    # uses for these ids; the ids include non-user peers, so `user_id` is the
    # wrong parameter for them.
    return get_list(vkapi.messages.getHistory, initial_offset, peer_id = peer_id)

def get_all_convs(initial_offset=0):
    """Download all conversation entries via ``get_list``.

    Args:
        initial_offset: Number of leading conversations to skip.

    Returns:
        The list of items that ``get_list`` collects from
        ``vkapi.messages.getConversations``.
    """
    fetch_page = vkapi.messages.getConversations
    return get_list(fetch_page, initial_offset=initial_offset)

In [70]:
def get_dm_and_chat_ids_from_convs(convs):
    """Split conversation entries into direct-message peers and everything else.

    Args:
        convs: Iterable of mappings, each exposing
            ``entry['conversation']['peer']`` with ``'type'`` and ``'id'`` keys.

    Returns:
        ``(direct_ids, other_ids)``: peer ids whose type is ``'user'``, and the
        peer ids of every other type, both in input order.
    """
    direct_conv_ids, chat_conv_ids = [], []
    for entry in convs:
        peer = entry['conversation']['peer']
        # Anything that is not a plain user goes into the "chat" bucket.
        bucket = direct_conv_ids if peer['type'] == 'user' else chat_conv_ids
        bucket.append(peer['id'])
    return direct_conv_ids, chat_conv_ids

In [71]:
def get_data_draft(ids):
    """Fetch the first page of history for every peer and total the message counts.

    One throttled ``messages.getHistory`` request is made per id, with
    ``extended=1`` and the profile fields listed in ``FIELDS``.

    Args:
        ids: Iterable of peer ids.

    Returns:
        ``{'total_msg_count': <sum of each response's 'count'>,
        'items': {peer_id: <raw first-page response>}}``.
    """
    first_pages = {}
    message_total = 0
    for peer in tqdm.tqdm_notebook(ids):
        t.ready()
        page = vkapi.messages.getHistory(
            v=API_VERSION,
            count=MAX_COUNT,
            extended=1,
            fields=FIELDS,
            peer_id=peer,
        )
        message_total += page['count']
        first_pages[peer] = page
    return {'total_msg_count': message_total, 'items': first_pages}
    

In [96]:
def estimate_requests(data):
    """Estimate how much is still left to download for a draft.

    Conversations whose total ``'count'`` is below ``flags['min_len']`` are
    ignored. A ``min_len`` of ``None`` disables that filter, matching the
    check in ``generate_full_conversations_from_draft``.

    Args:
        data: Draft mapping with an ``'items'`` dict whose values each carry a
            total ``'count'`` and the list of ``'items'`` fetched so far.

    Returns:
        ``(requests, messages)``: the number of page requests of size
        ``MAX_COUNT`` still needed, and the number of messages still missing.
    """
    min_len = flags['min_len']
    requests_left = 0
    messages_left = 0
    for conv in data['items'].values():
        if min_len is not None and conv['count'] < min_len:
            continue
        # Clamp at zero: a conversation holding more items than its reported
        # count must not subtract from the totals.
        left = max(conv['count'] - len(conv['items']), 0)
        requests_left += math.ceil(left / MAX_COUNT)
        messages_left += left
    return requests_left, messages_left

In [97]:
def complete_data_draft(data):
    """Download the missing messages of every conversation in a draft, in place.

    Conversations are processed in ascending order of their total ``'count'``.
    Each incomplete one has its ``'items'`` list extended with the result of
    ``get_all_messages(peer_id, already_fetched)``.

    NOTE(review): the progress-bar total comes from ``estimate_requests``,
    which skips conversations shorter than ``flags['min_len']``; this loop
    does not skip them, so the bar can run past its total.

    Args:
        data: Draft mapping with an ``'items'`` dict of
            ``{peer_id: {'count': int, 'items': list, ...}}``.

    Returns:
        The same ``data`` object, after mutation.
    """
    conversations = data['items']
    shortest_first = sorted(conversations, key=lambda pid: conversations[pid]['count'])

    request_total, message_total = estimate_requests(data)
    logger.info('Need to download {} messages in {} requests'.format(message_total, request_total))

    with tqdm.tqdm_notebook(total=request_total) as pbar:
        for user_id in shortest_first:
            conv = conversations[user_id]
            done = len(conv['items'])
            missing = conv['count'] - done
            if missing <= 0:
                continue
            expected_requests = math.ceil(missing / MAX_COUNT)
            # here we might miss some info about attached messages authors.
            conv['items'].extend(get_all_messages(user_id, done))
            pbar.update(expected_requests)

    return data

In [98]:
def get_all_messages_data():
    """Download all selected conversations into one in-memory structure.

    Steps: list the conversations, split them into direct and other peers,
    apply ``flags['get_chats']`` and ``flags['test_run']``, build the draft
    with ``get_data_draft`` and fill it with ``complete_data_draft``.

    Returns:
        Whatever ``complete_data_draft`` returns for the draft.
    """
    logger.info('Getting info about all conversations')
    direct_ids, chat_ids = get_dm_and_chat_ids_from_convs(get_all_convs())

    if not flags['get_chats']:
        chat_ids = []
    if flags['test_run']:
        # Test runs are limited to the first five peers of each kind.
        direct_ids = direct_ids[:5]
        chat_ids = chat_ids[:5]

    logger.info('Starting fetch \n Collecting meta info and estimates')
    draft = get_data_draft(direct_ids + chat_ids)

    summary = 'Meta info collected. \nTotal messages found: {}\nCollecting messages text'
    logger.info(summary.format(draft['total_msg_count']))
    return complete_data_draft(draft)

In [99]:
# Run the in-memory pipeline and keep the complete dump in `data`.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt,
# so `data` was not assigned by that run.
data = get_all_messages_data()

2018-12-18 21:43:28,702 | INFO : Getting info about all conversations


KeyboardInterrupt: 

In [100]:
def generate_full_conversations_from_draft(data):
    """Lazily complete the conversations of a draft, one at a time.

    Conversations are visited in ascending order of their total ``'count'``.
    Those shorter than ``flags['min_len']`` are skipped, unless ``min_len`` is
    ``None``. Each remaining one is topped up in place with
    ``get_all_messages(peer_id, already_fetched)`` when incomplete, then
    yielded.

    Args:
        data: Draft mapping with an ``'items'`` dict of
            ``{peer_id: {'count': int, 'items': list, ...}}``.

    Yields:
        ``(peer_id, conversation)`` pairs, where ``conversation`` is the
        mutated entry from ``data['items']``.
    """
    conversations = data['items']
    shortest_first = sorted(conversations, key=lambda pid: conversations[pid]['count'])

    request_total, message_total = estimate_requests(data)
    logger.info('Need to download {} messages in {} requests'.format(message_total, request_total))

    with tqdm.tqdm_notebook(total=request_total) as pbar:
        for user_id in shortest_first:
            obj = conversations[user_id]
            done = len(obj['items'])
            total = obj['count']

            # The threshold is re-read on every iteration, so a change made
            # between two yields takes effect immediately.
            too_short = flags['min_len'] is not None and total < flags['min_len']
            if too_short:
                continue

            if done < total:
                expected_requests = math.ceil((total - done) / MAX_COUNT)
                # here we might miss some info about attached messages authors.
                obj['items'].extend(get_all_messages(user_id, done))
                pbar.update(expected_requests)

            yield user_id, obj
            
            
    

In [101]:
def organize_filestructure():
    """Create the output directory for this run and return a file-name template.

    The directory is ``<flags['save_path']>/<screen_name> at <time.asctime()>``,
    where ``screen_name`` comes from a throttled ``account.getProfileInfo``
    request.

    NOTE(review): ``time.asctime()`` contains ``:`` characters, which Windows
    does not accept in path names.

    Returns:
        A path string ending in ``{}.json``; callers fill the placeholder via
        ``str.format`` with a conversation id.
    """
    t.ready()
    sc_name = vkapi.account.getProfileInfo(v=API_VERSION)['screen_name']
    dirname = os.path.join(flags['save_path'], '{} at {}'.format(sc_name, time.asctime()))
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(dirname, exist_ok=True)
    return os.path.join(dirname, "{}.json")

In [102]:
def save_all_messages_data():
    """Download the selected conversations and write each one to its own JSON file.

    Steps: list the conversations, split them into direct and other peers,
    apply ``flags['get_chats']`` and ``flags['test_run']``, build the draft
    with ``get_data_draft``, create the output directory with
    ``organize_filestructure``, then stream the conversations from
    ``generate_full_conversations_from_draft`` into ``<peer_id>.json`` files.

    Returns:
        None. The result is the set of files written.
    """
    logger.info('Getting info about all conversations')
    convs = get_all_convs()
    dmi, ci = get_dm_and_chat_ids_from_convs(convs)
    if not flags['get_chats']:
        ci = []
    if flags['test_run']:
        # Test runs are limited to the first five peers of each kind.
        dmi, ci = dmi[:5], ci[:5]
    logger.info('Starting fetch \n Collecting meta info and estimates')
    data = get_data_draft(dmi + ci)
    name_template = organize_filestructure()

    # '\n' replaces the invalid escape '\S', which printed a literal backslash.
    logger.info('Meta info collected. \nTotal messages found: {}\nSaving messages text in {}'
                .format(data['total_msg_count'], name_template))
    for user_id, user_data in generate_full_conversations_from_draft(data):
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is fixed to UTF-8 instead of depending on the locale.
        with open(name_template.format(user_id), 'w', encoding='utf-8') as file:
            json.dump(user_data, file, ensure_ascii=False)


In [103]:
# Run the streaming pipeline: each conversation is written to its own JSON file
# in a new directory under flags['save_path'].
save_all_messages_data()

2018-12-18 21:43:36,031 | INFO : Getting info about all conversations
2018-12-18 21:43:38,028 | INFO : Starting fetch 
 Collecting meta info and estimates


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


2018-12-18 21:43:42,664 | INFO : Meta info collected. 
Total messages found: 64957\Saving messages text in dumps/nikkorobk at Tue Dec 18 21:43:42 2018/{}.json
2018-12-18 21:43:42,667 | INFO : Need to download 63088 messages in 319 requests


HBox(children=(IntProgress(value=0, max=319), HTML(value='')))




In [329]:
# Persist the in-memory dump held in `data`. The context manager flushes and
# closes the file even if serialisation fails part-way.
with open('nikkorobk_data_first_iteration.json', 'w') as dump_file:
    json.dump(data, dump_file)

In [47]:
# Scratch check of the timestamp string that organize_filestructure() embeds
# in the dump directory name.
time.asctime()

'Tue Dec 18 20:50:20 2018'

'dumps/nikkorobk at Tue Dec 18 20:52:25 2018/{}.json'

In [85]:
# Load one saved JSON file; the handle is closed as soon as parsing finishes.
with open('100.json', 'r') as source_file:
    a = json.load(source_file)

In [86]:
# Re-save the loaded object with non-ASCII characters kept readable. UTF-8 is
# set explicitly because ensure_ascii=False writes raw non-ASCII text, and the
# context manager guarantees the file is flushed and closed.
with open('110.json', 'w', encoding='utf-8') as target_file:
    json.dump(a, target_file, ensure_ascii=False)