In [1]:
import json
import os
import re
import pytz
import emoji
import requests
import pandas as pd

from time import sleep
from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime
from rauth import OAuth2Service

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# VK application credentials, used only by the interactive OAuth flow in get_user_token.
# They are read from the environment so that no secret is stored in the notebook itself;
# both fall back to an empty string when the variable is not set.
APP_ID = os.environ.get('VK_APP_ID', '')
SECRET_KEY_VK = os.environ.get('VK_SECRET_KEY', '')

In [32]:
def _dig(mapping, *keys, default=''):
    """Follow ``keys`` through nested dicts; return ``default`` as soon as one level is missing."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def parse_group_info(info):
    """Flatten one raw group object into a dict of plain columns.

    Args:
        info: dict describing a single group (presumably one element of the
            ``groups.getById`` response - verify against the caller). ``id``,
            ``screen_name``, ``is_closed`` and ``type`` are required; every other
            field is optional.

    Returns:
        dict with the keys ``id``, ``description``, ``members_count``, the counter
        columns, ``site``, ``name``, ``screen_name``, ``is_closed``, ``type``,
        ``country``, ``city`` and ``address``. Missing text fields become ``''``,
        missing numbers become ``0``.

    Raises:
        KeyError: when one of the required fields is absent.
    """
    counter_names = ('albums', 'audios', 'audio_playlists', 'docs', 'photos', 'videos',
                     'articles', 'narratives', 'clips', 'clips_followers')

    item = {
        'id': abs(info['id']),  # the sign of the id is dropped
        'description': cleaning_text(info['description']) if 'description' in info else '',
        'members_count': _dig(info, 'members_count', default=0),
    }
    for counter in counter_names:
        item[counter] = _dig(info, 'counters', counter, default=0)

    item['site'] = _dig(info, 'site')
    item['name'] = cleaning_text(info['name']) if 'name' in info else ''
    item['screen_name'] = info['screen_name']
    item['is_closed'] = info['is_closed']
    item['type'] = info['type']

    # Country and city come from the top-level object first and fall back to the main address.
    # The fallback only needs the nested ``title`` to exist; it no longer depends on the
    # unrelated ``address`` key being present and cannot raise on a missing ``title``.
    item['country'] = (_dig(info, 'country', 'title')
                       or _dig(info, 'addresses', 'main_address', 'country', 'title'))
    item['city'] = (_dig(info, 'city', 'title')
                    or _dig(info, 'addresses', 'main_address', 'city', 'title'))
    item['address'] = _dig(info, 'addresses', 'main_address', 'address')

    return item

In [33]:
def parse_group_post(post):
    """Turn one raw wall post into a flat record plus the list of its parsed attachments.

    Args:
        post: dict describing a single wall post. ``comments``, ``likes``, ``views`` and
            ``post_source`` are optional; every other field read below is required.

    Returns:
        tuple ``(record, attachments)`` where ``record`` is a dict of plain columns and
        ``attachments`` is whatever ``parse_attachments(post)`` returns.
    """

    def count_of(section):
        # ``count`` of an optional sub-object; 0 when the sub-object is absent
        return post[section]['count'] if section in post else 0

    record = {
        'donut': int(post['donut']['is_donut']),
        'comments': count_of('comments'),
        'marked_as_ads': post['marked_as_ads'],
        'short_text_rate': post['short_text_rate'],
        'type': post['type'],
        # timestamps are interpreted in the Europe/Moscow zone
        'date': datetime.fromtimestamp(post['date'], tz=pytz.timezone('Europe/Moscow')),
        # ids are stored without their sign
        'from_id': abs(post['from_id']),
        'id': int(post['id']),
        'is_favorite': int(post['is_favorite']),
        'likes': count_of('likes'),
        'owner_id': abs(int(post['owner_id'])),
        'post_source': post['post_source']['type'] if 'post_source' in post else '',
        'post_type': post['post_type'],
        'reposts': post['reposts']['count'],
        'user_reposted': post['reposts']['user_reposted'],
        'post_text': cleaning_text(post['text']),
        'views': count_of('views'),
    }

    return record, parse_attachments(post)

In [34]:
def parse_url_photo(atts):
    """Return the URL of the widest rendition listed in ``atts['sizes']``.

    Args:
        atts: dict describing a photo; ``sizes`` is expected to be a list of dicts
            that each carry ``width`` and ``url``.

    Returns:
        The ``url`` of the first entry with the largest ``width``, or ``''`` when
        ``sizes`` is missing or empty (an empty list used to raise ``ValueError``).
    """
    sizes = atts.get('sizes')
    if not sizes:
        return ''
    # max() keeps the first of several equally wide entries
    widest = max(sizes, key=lambda size: size['width'])
    return widest['url']

In [35]:
def parse_attachments(post):
    """Extract the photo and document attachments of a wall post as flat records.

    Args:
        post: dict describing a wall post; ``id`` is required as soon as a photo or
            doc attachment is present. ``attachments`` may be missing or ``None``.

    Returns:
        list of dicts, one per ``photo``/``doc`` attachment, in their original order.
        Attachments of any other type are skipped. A post without an ``attachments``
        entry yields ``[]`` instead of raising ``KeyError``.
    """
    items = []
    for attachment in post.get('attachments') or []:
        _type = attachment['type']
        if _type not in ('photo', 'doc'):
            continue

        payload = attachment[_type]  # the attachment body is stored under its type name
        item = {
            'type': _type,
            'id': payload['id'],
            'owner_id': abs(int(payload['owner_id'])),
            'title': cleaning_text(payload['title']) if 'title' in payload else '',
            'post_id': int(post['id']),
            'date': datetime.fromtimestamp(payload['date'], tz=pytz.timezone('Europe/Moscow')),
        }

        _url = ''
        if _type == 'photo':
            item['album_id'] = abs(int(payload['album_id']))
            item['text'] = cleaning_text(payload['text'])
            item['has_tags'] = int(payload['has_tags'])
            _url = parse_url_photo(payload)
        elif payload['type'] == 1:
            # only documents whose own ``type`` equals 1 keep their URL
            # NOTE(review): presumably 1 means "text document" in the VK API - confirm
            _url = payload['url']

        item['url'] = _url
        items.append(item)
    return items


In [36]:
def parse_post_comments(comments, post_id, owner_id):
    """Flatten a single raw comment into a record tied to its post and wall owner.

    Args:
        comments: dict describing ONE comment (despite the plural name).
        post_id: id of the post the comment belongs to, stored unchanged.
        owner_id: id of the wall owner, stored without its sign.

    Returns:
        dict with the keys ``id``, ``from_id``, ``date``, ``text``, ``post_id``,
        ``owner_id``, ``count_likes``, ``reply_user``, ``reply_comment`` and
        ``parents_stack``. The three reply-related values are ``None`` when the
        corresponding field is absent.
    """
    record = {}
    record['id'] = comments['id']
    record['from_id'] = abs(comments['from_id'])
    record['date'] = datetime.fromtimestamp(comments['date'], tz=pytz.timezone('Europe/Moscow'))
    record['text'] = cleaning_text(comments['text'])
    record['post_id'] = post_id
    record['owner_id'] = abs(owner_id)

    if 'likes' in comments:
        record['count_likes'] = comments['likes']['count']
    else:
        record['count_likes'] = 0

    if 'reply_to_user' in comments:
        record['reply_user'] = abs(comments['reply_to_user'])
    else:
        record['reply_user'] = None

    record['reply_comment'] = comments.get('reply_to_comment')

    if 'parents_stack' in comments:
        # stored as one comma-separated string
        record['parents_stack'] = ','.join(map(str, comments['parents_stack']))
    else:
        record['parents_stack'] = None

    return record

In [37]:
def parse_users(user_info):
    """Flatten one raw user profile (optionally merged with subscriptions) into a record.

    Args:
        user_info: dict describing a user. Only ``id`` is required. ``users`` and
            ``groups``, when present, are expected to be dicts with an ``items`` list
            (``get_user_info`` merges the subscriptions response into the profile).

    Returns:
        dict with ``user_id``, ``age``, ``education``, ``sex``, location columns, the
        counter columns, cleaned ``about``/``status`` and the comma-separated
        ``pages_list``/``groups_list``.

        * ``age`` is the number of full years since ``bdate`` (month and day are taken
          into account); it stays ``0`` when the year is missing or the date is invalid.
        * ``education`` is ``3`` when the first university has a truthy id, ``1`` or ``2``
          depending on the first school's ``type`` (``<= 4`` gives ``1``), ``1`` when a
          ``schools`` key exists without a usable type, else ``0``.
    """

    def nested(*keys, default=''):
        # walk nested dicts, giving up with ``default`` at the first missing level
        node = user_info
        for key in keys:
            if not isinstance(node, dict) or key not in node:
                return default
            node = node[key]
        return node

    item = {'user_id': user_info['id'], 'age': 0}

    # the date is split on '.', and read as day, month, year; it is used only when
    # all three parts are present
    birth_parts = (user_info.get('bdate') or '').split('.')
    if len(birth_parts) > 2:
        try:
            born = datetime(int(birth_parts[2]), int(birth_parts[1]), int(birth_parts[0]))
        except ValueError:
            born = None  # malformed or impossible date: keep age 0 instead of crashing
        if born is not None:
            today = datetime.now()
            birthday_pending = (today.month, today.day) < (born.month, born.day)
            item['age'] = today.year - born.year - int(birthday_pending)

    item['education'] = 0
    if user_info.get('universities'):
        item['education'] = 3 if user_info['universities'][0]['id'] else 0
    elif 'schools' in user_info:
        item['education'] = 1
        if user_info['schools'] and 'type' in user_info['schools'][0]:
            item['education'] = 1 if user_info['schools'][0]['type'] <= 4 else 2

    item['sex'] = nested('sex', default=0)
    item['country'] = nested('country', 'title')
    item['city'] = nested('city', 'title')
    item['home_town'] = nested('home_town')

    for counter in ('friends', 'albums', 'audios', 'followers', 'gifts', 'pages', 'photos',
                    'subscriptions', 'groups', 'videos'):
        item[counter] = nested('counters', counter, default=0)

    item['about'] = cleaning_text(user_info['about']) if 'about' in user_info else ''
    item['status'] = cleaning_text(user_info['status']) if 'status' in user_info else ''

    # subscription ids are stored as comma-separated strings ('' when there are none)
    item['pages_list'] = ','.join(str(x) for x in nested('users', 'items', default=[]) or [])
    item['groups_list'] = ','.join(str(x) for x in nested('groups', 'items', default=[]) or [])

    return item

In [38]:
def cleaning_text(string):
    """Reduce a piece of text to Cyrillic words, digits and basic punctuation.

    Steps, in order:
      1. remove emoji;
      2. remove bracketed ``[id...]`` and ``[club...]`` spans;
      3. turn newlines and every character outside ``А-я``, ``Ё``, ``ё``, digits,
         ``. , ! ?`` and space into a space;
      4. drop every whitespace-free token that still has a dot between two characters
         (what is left of links, but also things like ``3.14``);
      5. collapse repeated ``!``, ``?``, ``.`` and whitespace;
      6. strip leading punctuation and surrounding whitespace.

    Args:
        string: the text to clean.

    Returns:
        The cleaned text, or ``''`` when no Cyrillic letter is left.
    """
    # remove pictographic emoji
    string = emoji.replace_emoji(string, r'')

    # remove [id text] / [club text]; excluding ']' keeps each match inside its own pair of
    # brackets, so the text between two bracketed spans is no longer swallowed
    string = re.sub(r'\[id[^()\]]*\]', '', string)
    string = re.sub(r'\[club[^()\]]*\]', '', string)

    # keep only the whitelisted characters (capital Ё is outside the А-я range and has to be
    # listed explicitly, just like ё), then drop tokens with an inner dot
    string = re.sub(r'[^А-яЁё0-9.,!? ]', ' ', string.replace('\n', ' '))
    string = re.sub(r'[^ ]+\.[^ ]+', '', string)

    # collapse repeated punctuation and whitespace
    string = re.sub(r'!+', '!', string)
    string = re.sub(r'\?+', '?', string)
    string = re.sub(r'[.]+', '.', string)
    string = re.sub(r'\s+', ' ', string)

    string = string.lstrip(',./";:)}({<>!?@#$%^&*_+').strip()

    # text without a single Cyrillic letter is treated as empty
    if not re.search(r'[а-яА-ЯёЁ]', string):
        string = ''

    return string

In [39]:
def get_group_info(domains):
    """Request group descriptions for ``domains`` in one call and parse each of them.

    Args:
        domains: iterable of group ids or screen names; they are joined with commas
            into a single ``group_ids`` value.

    Returns:
        list of dicts produced by ``parse_group_info``; empty when the reply has no
        ``response`` entry. An ``error`` entry in the reply is printed.

    Relies on the module-level ``params`` dict (API version and access token).
    """
    query = {'group_ids': ','.join(map(str, domains))}
    fields = {
        'fields': 'id,name,screen_name,type,counters,description,fixed_post,links,members_count,site,wiki_page,counters,country,contacts, addresses'}

    reply = requests.post('https://api.vk.com/method/groups.getById', data=query | params | fields).json()

    parsed_groups = []
    if 'response' in reply:
        parsed_groups = [parse_group_info(raw_group) for raw_group in reply.get('response')]

    if 'error' in reply:
        print(reply.get('error'))

    return parsed_groups

In [40]:
def get_group_posts(domain):
    """Download every owner post of a wall, 100 per request, and parse them.

    Args:
        domain: short address of the wall, sent as the ``domain`` parameter of ``wall.get``.

    Returns:
        tuple ``(posts, attachments)``: the records produced by ``parse_group_post`` and
        the flattened list of all their attachments.

    An ``error`` reply is printed and the same page is requested again; after
    ``max_retries`` failed requests in a row the download stops and whatever has been
    collected so far is returned (previously such a reply made the loop run forever).

    Relies on the module-level ``params`` dict (API version and access token).
    """
    max_retries = 3  # consecutive error replies tolerated before giving up

    uploaded_posts = []
    uploaded_attachments = []
    offset = 0
    failures = 0

    data = {'domain': domain, 'filter': 'owner', 'count': 100}
    pbar = tqdm(desc='POSTS')

    try:
        while True:
            data['offset'] = offset
            payload = requests.post('https://api.vk.com/method/wall.get', data=data | params).json()
            sleep(0.2)  # pause between requests

            if 'error' in payload:
                print(payload['error'])
                failures += 1
                if failures >= max_retries:
                    break
                continue
            failures = 0

            group_posts = payload.get('response')
            items = group_posts['items'] if group_posts else None
            if not items:
                break  # an empty page marks the end of the wall

            pbar.total = group_posts['count']
            pbar.update(len(items))
            offset += len(items)

            for raw_post in items:
                record, attachments = parse_group_post(raw_post)
                uploaded_posts.append(record)
                uploaded_attachments.extend(attachments)
    finally:
        pbar.close()

    return uploaded_posts, uploaded_attachments

In [41]:
def get_group_members(group_id):
    """Download the raw member list of a group, 1000 members per request.

    Args:
        group_id: id of the group, sent as ``group_id`` to ``groups.getMembers``.

    Returns:
        list with the unparsed ``items`` of every page, in ``id_asc`` order.

    The total is read from the ``count`` of each page, so no separate probing request
    is needed. An ``error`` reply is printed and the page is requested again; after
    ``max_retries`` failed requests in a row the download stops with what has been
    collected (previously the loop never ended because the offset did not advance).

    Relies on the module-level ``params`` dict (API version and access token).
    """
    max_retries = 3  # consecutive error replies tolerated before giving up

    uploaded_members = []
    offset = 0
    failures = 0

    data = {'group_id': group_id, 'sort': 'id_asc', 'count': 1000}
    fields = {
        'fields': 'bdate,city,common_count,connections,contacts,country,domain,education,last_seen,lists,relation,relatives,schools,sex,site,status,universities'}

    while True:
        data['offset'] = offset
        payload = requests.get('https://api.vk.com/method/groups.getMembers', data | params | fields).json()

        if 'error' in payload:
            print(payload['error'])
            failures += 1
            if failures >= max_retries:
                break
            sleep(0.1)
            continue
        failures = 0

        members = payload.get('response')
        items = members['items'] if members else None
        if not items:
            break

        uploaded_members.extend(items)
        offset += len(items)
        if offset >= members['count']:
            break  # everything reported by the API has been fetched
        sleep(0.1)  # pause between requests

    return uploaded_members

In [42]:
def get_user_info(user_id):
    """Fetch a user's profile and subscriptions and return them as one parsed record.

    Two requests are always sent (``users.get`` and ``users.getSubscriptions``), each
    followed by a 0.2 s pause.

    Args:
        user_id: id of the user to look up.

    Returns:
        * the dict built by ``parse_users`` from the profile merged with the
          subscriptions (subscriptions count as ``{}`` when that request failed);
        * ``{}`` when the profile reply holds an empty ``response``;
        * ``None`` when the profile reply has no ``response`` at all; an ``error``
          entry, if present, is printed.

    Relies on the module-level ``params`` dict (API version and access token).
    """
    query = {'user_id': user_id}

    profile_fields = {'fields': 'bdate,sex,universities,education,schools,counters,about, status, city, country, home_town'}
    profile_http = requests.post('https://api.vk.com/method/users.get', data=query | params | profile_fields)
    sleep(0.2)

    subs_fields = {'fields': 'user, group'}
    subs_http = requests.post('https://api.vk.com/method/users.getSubscriptions', data=query | params | subs_fields)
    sleep(0.2)

    profile_reply = profile_http.json()
    if 'response' not in profile_reply:
        if 'error' in profile_reply:
            print(profile_reply.get('error'))
        return None

    subs_reply = subs_http.json()
    subscriptions = subs_reply.get('response') if 'response' in subs_reply else {}

    profiles = profile_reply.get('response')
    if not profiles:
        return {}

    # the subscription keys are merged on top of the first (only) profile
    return parse_users(profiles[0] | subscriptions)

In [43]:
def get_posts_comments(owner_id, post_id):
    """Download the comments of one post, 100 top-level comments per request.

    Args:
        owner_id: id of the wall owner; it is negated before being sent.
            NOTE(review): presumably because the API addresses community walls by
            negative ids - confirm.
        post_id: id of the post.

    Returns:
        list of records built by ``parse_post_comments``: every top-level comment
        followed by the thread replies delivered with it (``thread_items_count`` is 10,
        so longer threads are not fetched completely).

    An ``error`` reply is printed and the page is requested again; after ``max_retries``
    failed requests in a row the download stops with what has been collected
    (previously such a reply made the loop run forever).

    Relies on the module-level ``params`` dict (API version and access token).
    """
    max_retries = 3  # consecutive error replies tolerated before giving up

    uploaded_comments = []
    offset = 0
    failures = 0

    data = {'owner_id': -int(owner_id), 'post_id': int(post_id), 'need_likes': 1, 'extended': 1,
            'thread_items_count': 10, 'count': 100}

    while True:
        data['offset'] = offset
        payload = requests.post('https://api.vk.com/method/wall.getComments', data=data | params).json()
        sleep(0.2)  # pause between requests

        if 'error' in payload:
            print(payload['error'])
            failures += 1
            if failures >= max_retries:
                break
            continue
        failures = 0

        comments_posts = payload.get('response')
        items = comments_posts['items'] if comments_posts else None
        if not items:
            break  # an empty page marks the end of the comments

        offset += len(items)

        for item in items:
            uploaded_comments.append(parse_post_comments(item, post_id, owner_id))
            replies = item['thread']['items'] if 'thread' in item else None
            if replies:
                uploaded_comments.extend(parse_post_comments(reply, post_id, owner_id) for reply in replies)

    return uploaded_comments

In [44]:
def get_pages_and_groups_info(df_list):
    """Look up group info for every id found in the ``pages_list``/``groups_list`` columns.

    Args:
        df_list: list of DataFrames that each have ``pages_list`` and ``groups_list``
            columns holding comma-separated ids (empty or NaN cells are ignored).

    Returns:
        list of dicts returned by ``get_group_info`` for the distinct ids, which are
        requested in batches of at most 500.
    """
    subscriptions = pd.concat(df_list)[['pages_list', 'groups_list']]

    raw_ids = []
    for row in subscriptions.itertuples():
        for cell in (row.pages_list, row.groups_list):
            if pd.notna(cell) and cell:
                raw_ids.extend(str(cell).split(','))

    numeric_ids = [int(raw_id) for raw_id in raw_ids]
    # unique() keeps the order of first appearance
    unique_ids = pd.Series(numeric_ids).unique().tolist()

    batch_size = 500
    collected = []
    pbar = tqdm(desc='groups info', total=len(unique_ids))
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        collected.extend(get_group_info(batch))
        pbar.update(len(batch))
    return collected

In [45]:
def get_user_token(APP_ID, SECRET_KEY_VK):
    """Return a VK access token, asking the user to authorize the app when necessary.

    When the ``TOKEN_VK`` environment variable holds a non-empty value it is returned
    right away. Otherwise an authorization link is printed, the user pastes the URL
    they were redirected to, and the ``code`` found in it is exchanged for a token.

    Args:
        APP_ID: client id of the VK application.
        SECRET_KEY_VK: client secret of the VK application.

    Returns:
        The ``access_token`` value of the token reply (``None`` when the reply has none).

    Raises:
        ValueError: when the pasted text contains no ``code=`` parameter.
    """
    cached_token = os.getenv('TOKEN_VK')
    if cached_token:
        return cached_token

    # create the OAuth2 service object
    service = OAuth2Service(
        client_id=APP_ID,
        client_secret=SECRET_KEY_VK,
        name='vk',
        authorize_url='https://oauth.vk.com/authorize',
        access_token_url='https://oauth.vk.com/access_token',
        base_url='https://api.vk.com/method/'
    )

    # build the authorization link (named auth_params so it cannot be confused with the
    # module-level ``params`` used by the API helpers)
    auth_params = {'scope': 'photos,wall,friends, email, offline, groups'}
    url = service.get_authorize_url(**auth_params)

    # send the user to the authorization page
    print(f'Перейдите по этой ссылке и разрешите доступ: {url}')
    redirect_url = input('Введите URL перенаправления: ')

    # Take only the value of ``code``: it ends at the next '&', '#' or whitespace, so
    # parameters that follow it in the pasted URL are not sent as part of the code.
    code_match = re.search(r'(?:^|[?&#])code=([^&#\s]+)', redirect_url.strip())
    if code_match is None:
        raise ValueError('the redirect URL does not contain a "code" parameter')

    # exchange the code for a token
    oauth_session = service.get_raw_access_token(
        data={'code': code_match.group(1),
              'client_id': APP_ID,
              'client_secret': SECRET_KEY_VK, },
    )
    token = oauth_session.json().get('access_token')

    return token

In [46]:
def save_to_excel(df, name, domain):
    """Write ``df`` to ``storage/output/<domain>/<domain>_<name>.xlsx``.

    Args:
        df: DataFrame to export. A ``date`` column, when present, is written as plain
            calendar dates. The conversion is done on a copy, so the caller's frame
            keeps its original timestamps.
            NOTE(review): presumably needed because the Excel writer rejects
            timezone-aware datetimes - confirm.
        name: used both as the file-name suffix and as the sheet name.
        domain: used as the sub-directory and as the file-name prefix.

    The output directory is created when it does not exist; a confirmation line is
    printed after the file has been written.
    """
    if 'date' in df:
        # assign() returns a new frame instead of overwriting the column in place
        df = df.assign(date=df['date'].apply(lambda a: pd.to_datetime(a).date()))

    path = f'storage/output/{domain}'
    Path(path).mkdir(parents=True, exist_ok=True)
    df.to_excel(f'{path}/{domain}_{name}.xlsx', sheet_name=name)

    print(f'saving the {domain}_{name}.xlsx file is done')

In [56]:
def run(domain, stages=()):
    """Collect data about one community and export it to Excel files.

    The group info is always fetched and saved as ``<domain>_info.xlsx``. Further
    stages run only when they are named in ``stages``, so ``run(domain)`` behaves
    exactly like before.

    Args:
        domain: short address of the community.
        stages: a stage name or an iterable of stage names out of
            ``'posts'``, ``'attachments'``, ``'comments'``, ``'members'`` and
            ``'users_from_comments'``. Stages that need the posts or the comments
            download them even when those are not saved themselves.

    Raises:
        ValueError: for an unknown stage name, or when no group info is returned
            for ``domain`` (this used to surface as a bare ``IndexError``).
    """
    known_stages = {'posts', 'attachments', 'comments', 'members', 'users_from_comments'}
    wanted = {stages} if isinstance(stages, str) else set(stages)
    unknown = wanted - known_stages
    if unknown:
        raise ValueError(f'unknown stages: {sorted(unknown)}')

    # group info
    res_group_info = get_group_info([domain])
    if not res_group_info:
        raise ValueError(f'no group info was returned for {domain!r}')
    group_id = res_group_info[0]['id']
    save_to_excel(domain=domain, name='info', df=pd.DataFrame(res_group_info))

    # posts and their attachments (also the input of the comment stages)
    df_posts = None
    if wanted & {'posts', 'attachments', 'comments', 'users_from_comments'}:
        res_posts, res_attachments = get_group_posts(domain)
        df_posts = pd.DataFrame(res_posts)
        if 'posts' in wanted:
            save_to_excel(domain=domain, name='posts', df=df_posts)
        if 'attachments' in wanted:
            save_to_excel(domain=domain, name='attachments_post', df=pd.DataFrame(res_attachments))

    # posts comments
    df_comments = None
    if wanted & {'comments', 'users_from_comments'}:
        res_posts_comments = []
        for post in tqdm(df_posts.itertuples(), total=len(df_posts), desc='COMMENTS'):
            res_posts_comments.extend(get_posts_comments(post.owner_id, post.id))
        df_comments = pd.DataFrame(res_posts_comments)
        if 'comments' in wanted:
            save_to_excel(domain=domain, name='comments', df=df_comments)

    # members info
    if 'members' in wanted:
        res_group_members = get_group_members(group_id)
        member_ids = []
        if res_group_members:
            member_ids = pd.DataFrame(res_group_members).drop_duplicates(subset=['id'])['id'].tolist()
        res_members_info = []
        for user in tqdm(member_ids, desc='MEMBERS'):
            members_info = get_user_info(user)
            if members_info:
                res_members_info.append({'group_id': group_id} | members_info)
        save_to_excel(domain=domain, name='members', df=pd.DataFrame(res_members_info))

    # users info from comments
    if 'users_from_comments' in wanted:
        res_users_from_comments = []
        if not df_comments.empty:
            # one lookup per distinct (author, wall) pair
            commenters = df_comments[['from_id', 'owner_id']].drop_duplicates().reset_index(drop=True)
            for comment in tqdm(commenters.itertuples(), total=len(commenters), desc='USERS FROM COMMENTS'):
                user_info = get_user_info(comment.from_id)
                if user_info:
                    res_users_from_comments.append({'group_id': comment.owner_id} | user_info)
        save_to_excel(domain=domain, name='users_from_comments', df=pd.DataFrame(res_users_from_comments))

In [49]:
# A TOKEN_VK value that is already present in the environment is reused by get_user_token;
# otherwise the interactive authorization runs once and its result is stored here.
# To force a fresh login, remove TOKEN_VK from os.environ before re-running this cell.
os.environ["TOKEN_VK"] = get_user_token(APP_ID, SECRET_KEY_VK)

# request parameters shared by every API helper in this notebook
params = {'v': '5.131', 'access_token': os.environ["TOKEN_VK"]}

# the token itself is never printed: cell output is saved together with the notebook
print('VK access token loaded')

[access token redacted]


In [58]:
# export the data of the 'sikhotezap' community
run(domain='sikhotezap')

saving the sikhotezap_info.xlsx file is done


MEMBERS: 100%|██████████████████████████████| 2170/2170 [28:44<00:00,  1.26it/s]


saving the sikhotezap_members.xlsx file is done
