In [17]:
import vk
import time
import math
from pprint import pprint
from collections import deque
import tqdm
import json
import logging
import sys


# Page size passed as `count` to every VK request made in this notebook.
MAX_COUNT = 200
# Profile fields requested together with message history (comma-separated for the API).
FIELDS = ','.join((
    'first_name', 'last_name', 'screen_name', 'bdate',
    'common_count', 'is_friend', 'photo_max', 'photo_50',
))
# API version string sent with the requests.
API_VERSION = '5.87'

# Run-time switches read by the cells below.
flags = dict(
    get_chats=True,            # False drops every non-'user' peer before fetching
    min_len=None,              # not referenced by any code visible in this notebook
    test_run=True,             # True keeps only the first 5 direct and first 5 other peers
    creds_path='creds.json',   # JSON file with the 'pass' and 'id' keys
)


# Send INFO-and-above records to stdout so they show up in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)
logger = logging.getLogger(__name__)

def get_password_and_id(cp):
    """Read the login credentials from a JSON file.

    Args:
        cp: Path to a JSON file containing an object with 'pass' and 'id' keys.

    Returns:
        A ``(password, id)`` tuple built from those two keys.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If 'pass' or 'id' is missing from the object.
    """
    # Use a context manager so the handle is closed deterministically instead of
    # being left to the garbage collector.
    with open(cp) as creds_file:
        creds = json.load(creds_file)
    return creds['pass'], creds['id']

# NOTE: the name `id` shadows the builtin of the same name for the rest of the notebook.
password, id = get_password_and_id(flags['creds_path'])

# With a non-empty password, authenticate as that user and request the 'messages'
# scope; otherwise fall back to an AuthSession created without arguments.
if password:
    session = vk.AuthSession(app_id='6787646', user_login=id,
                             scope='messages', user_password=password)
else:
    session = vk.AuthSession()
vkapi = vk.API(session)

In [18]:
class Throater:
    """Sliding-window rate limiter: allow at most ``mrc`` calls per ``ti`` seconds.

    ``history`` always holds the start times of the last ``mrc`` permitted calls.
    ``ready()`` blocks until the oldest of them is at least ``ti`` seconds old.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, mrc = 3, ti = 1.5):
        """Create a limiter.

        Args:
            mrc: Maximum number of calls allowed inside one window.
            ti: Window length in seconds.
        """
        # Cumulative number of seconds spent sleeping inside ready().
        self.total_sleep = 0
        self.time_interval = ti
        self.max_req_c = mrc
        # Pre-filled with zeros so that the first `mrc` calls never wait.
        self.history = deque([0]*self.max_req_c)

    def ready(self):
        """Block until one more call is allowed, then register that call."""
        now = time.time()
        self.history.append(now)
        prev = self.history.popleft()
        to_sleep = prev + self.time_interval - now
        if to_sleep > 0:
            self.total_sleep += to_sleep
            time.sleep(to_sleep)
            # The call really goes out after the sleep. Recording the pre-sleep
            # timestamp made later calls believe the window was emptier than it
            # was, letting more than `mrc` calls through in one interval.
            if self.history:
                self.history[-1] = time.time()
        return
t = Throater()

In [19]:
def get_list(func, initial_offset = 0, **kwargs):
    """Fetch every item of a paginated API method, starting at ``initial_offset``.

    ``func`` is called with ``v``, ``count`` and ``offset`` keyword arguments plus
    ``kwargs`` and must return a mapping with a 'count' (total number of items)
    and an 'items' list. Each request is preceded by ``t.ready()``.

    Args:
        func: The paginated callable to query.
        initial_offset: Number of leading items to skip.
        **kwargs: Extra keyword arguments forwarded to every ``func`` call.

    Returns:
        The list of items from ``initial_offset`` onwards. If a page comes back
        empty before the reported total is reached, the items collected so far
        are returned and a warning is logged.
    """
    t.ready()
    first_page = func(v=API_VERSION, count = MAX_COUNT, offset = initial_offset, **kwargs)
    count = first_page['count']
    things = first_page['items']
    while len(things) < count - initial_offset:
        t.ready()
        new_things = func(v=API_VERSION, count = MAX_COUNT, offset = initial_offset + len(things), **kwargs)
        if not new_things['items']:
            # No progress is possible; without this guard the loop would spin
            # forever whenever the reported total exceeds what is returned.
            logger.warning('Expected %d items but only %d were returned; stopping early',
                           count - initial_offset, len(things))
            break
        things.extend(new_things['items'])
    return things

def get_all_messages(peer_id, initial_offset = 0):
    """Download the message history of one conversation.

    Args:
        peer_id: Conversation peer id as found in ``conversation['peer']['id']``.
        initial_offset: Number of leading messages to skip.

    Returns:
        The list of messages returned by ``get_list`` for this peer.
    """
    # Send the id as `peer_id`, matching get_data_draft: callers pass ids of
    # non-user peers as well, which the `user_id` parameter does not address.
    return get_list(vkapi.messages.getHistory, initial_offset, peer_id = peer_id)

def get_all_convs(initial_offset=0):
    """Return every conversation entry, skipping the first ``initial_offset``.

    Thin wrapper that pages through ``vkapi.messages.getConversations`` with
    ``get_list``.
    """
    list_conversations = vkapi.messages.getConversations
    return get_list(list_conversations, initial_offset)

In [20]:
def get_dm_and_chat_ids_from_convs(convs):
    """Split conversation entries into direct-message peers and all other peers.

    Args:
        convs: Iterable of entries shaped like
            ``{'conversation': {'peer': {'type': ..., 'id': ...}}}``.

    Returns:
        ``(direct_ids, other_ids)``: ids whose peer type is 'user', and ids of
        every other peer type, each in input order.
    """
    direct_ids, other_ids = [], []
    for entry in convs:
        peer = entry['conversation']['peer']
        bucket = direct_ids if peer['type'] == 'user' else other_ids
        bucket.append(peer['id'])
    return direct_ids, other_ids

In [21]:
def get_data_draft(ids):
    """Fetch the first history page of every peer and total up the message counts.

    One ``messages.getHistory`` request (with ``extended=1`` and ``FIELDS``) is
    made per id, each preceded by ``t.ready()``.

    Args:
        ids: Iterable of peer ids.

    Returns:
        ``{'total_msg_count': <sum of the responses' 'count'>,
        'items': {peer_id: <raw first-page response>}}``.
    """
    draft = {'total_msg_count': 0, 'items': {}}
    for peer in tqdm.tqdm_notebook(ids):
        t.ready()
        first_page = vkapi.messages.getHistory(
            v=API_VERSION,
            count=MAX_COUNT,
            extended=1,
            fields=FIELDS,
            peer_id=peer,
        )
        page_total = first_page['count']
        draft['total_msg_count'] += page_total
        draft['items'][peer] = first_page
    return draft
    

In [26]:
def estimate_requests(data):
    """Estimate the work left to complete a draft.

    Args:
        data: Draft whose ``data['items']`` values each carry a 'count' and an
            'items' list.

    Returns:
        ``(requests, messages)``: the number of ``MAX_COUNT``-sized requests and
        the number of messages still missing, summed over all conversations.
    """
    missing_per_conv = [
        conv['count'] - len(conv['items'])
        for conv in data['items'].values()
    ]
    requests_needed = sum(math.ceil(missing / MAX_COUNT) for missing in missing_per_conv)
    return requests_needed, sum(missing_per_conv)

In [30]:
def complete_data_draft(data):
    """Download the messages still missing from every conversation in a draft.

    Conversations are processed in ascending order of their 'count'. Each
    incomplete one has its 'items' list extended in place with the result of
    ``get_all_messages``.

    Args:
        data: Draft as produced by ``get_data_draft``.

    Returns:
        The same ``data`` object, mutated in place.
    """
    conversations = data['items']
    ordered_ids = sorted(conversations, key=lambda peer: conversations[peer]['count'])

    requests_needed, messages_left = estimate_requests(data)
    logger.info('Need to download {} messages in {} requests'.format(messages_left, requests_needed))

    with tqdm.tqdm_notebook(total = requests_needed) as pbar:
        for peer in ordered_ids:
            conv = conversations[peer]
            already_have = len(conv['items'])
            missing = conv['count'] - already_have
            if missing <= 0:
                continue
            batches = math.ceil(missing / MAX_COUNT)
            # The follow-up pages are requested without extended/fields, so author
            # info for attached messages on those pages may be missing.
            conv['items'].extend(get_all_messages(peer, already_have))
            pbar.update(batches)

    return data

In [31]:
def get_all_messages_data():
    """Run the whole download pipeline and return the collected data.

    Steps: list all conversations, split them into direct and other peers, apply
    the ``flags`` switches ('get_chats', 'test_run'), fetch the first page of
    every selected conversation, then download the remaining messages.

    Returns:
        The draft dict returned by ``complete_data_draft``.
    """
    logger.info('Getting info about all conversations')
    direct_ids, chat_ids = get_dm_and_chat_ids_from_convs(get_all_convs())

    if not flags['get_chats']:
        chat_ids = []
    if flags['test_run']:
        # Keep a test run small: five peers of each kind at most.
        direct_ids = direct_ids[:5]
        chat_ids = chat_ids[:5]

    logger.info('Starting fetch \n Collecting meta info and estimates')
    draft = get_data_draft(direct_ids + chat_ids)

    summary = 'Meta info collected. \nTotal messages found: {}\nCollecting messages text'
    logger.info(summary.format(draft['total_msg_count']))
    return complete_data_draft(draft)

In [33]:
# Run the full download pipeline (issues network requests through `vkapi`) and keep
# the resulting draft dict in `data` for the cells below.
data = get_all_messages_data()

2018-12-18 20:10:26,401 | INFO : Starting fetch 
 Collecting meta info and estimates


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


2018-12-18 20:10:31,397 | INFO : Meta info collected. 
Total messages found: 64932
Collecting messages text
2018-12-18 20:10:31,398 | INFO : Need to download 63324 messages in 321 requests


HBox(children=(IntProgress(value=0, max=321), HTML(value='')))




In [None]:
def generate_full_conversations_from_draft(data):
    """Lazily complete a draft, yielding each conversation once it is whole.

    Conversations are visited in ascending order of their 'count'. An incomplete
    one has its 'items' list extended in place with the result of
    ``get_all_messages`` before it is yielded; complete ones are yielded as is.

    Args:
        data: Draft as produced by ``get_data_draft``.

    Yields:
        The per-conversation dicts stored in ``data['items']``.
    """
    conversations = data['items']
    ordered_ids = sorted(conversations, key=lambda peer: conversations[peer]['count'])

    requests_needed, messages_left = estimate_requests(data)
    logger.info('Need to download {} messages in {} requests'.format(messages_left, requests_needed))

    with tqdm.tqdm_notebook(total = requests_needed) as pbar:
        for peer in ordered_ids:
            conv = conversations[peer]
            already_have = len(conv['items'])
            missing = conv['count'] - already_have
            if missing > 0:
                batches = math.ceil(missing / MAX_COUNT)
                # The follow-up pages are requested without extended/fields, so
                # author info for attached messages on those pages may be missing.
                conv['items'].extend(get_all_messages(peer, already_have))
                pbar.update(batches)
            yield conv
            
            
    

In [None]:
def save_all_messages_data():
    """Collect all message data by running ``get_all_messages_data``.

    NOTE(review): despite the name, nothing is written to disk here; the body
    was a line-for-line copy of ``get_all_messages_data``. Persisting the result
    still has to be implemented (or done by the caller).

    Returns:
        The draft dict returned by ``get_all_messages_data``.
    """
    # Delegate instead of duplicating the pipeline so the two cannot drift apart.
    return get_all_messages_data()

In [329]:
# Persist the collected data. The context manager guarantees the file is flushed
# and closed even if serialization fails part-way.
with open('nikkorobk_data_first_iteration.json', 'w') as out_file:
    json.dump(data, out_file)

In [37]:
# Debug cell: inspect the credentials file.
# WARNING: the bare `creds` expression below echoes the stored password and login
# into the cell output; clear that output before saving or sharing the notebook.
with open('creds.json') as creds_file:
    creds = json.load(creds_file)
creds

{'pass': 'your_pass_here', 'id': 'your_id_here'}

In [35]:
# Scratch cell: dummy id list used to try out the flag check in the next cell.
ci = [1,2,3]

In [36]:
# Scratch cell: same check as in the pipeline; empties `ci` when 'get_chats' is off.
if not flags['get_chats']:
    ci = []
