In [None]:
# Imports, Setup, Functions
from TwitterAPI import TwitterAPI, TwitterPager
import yaml
import json
import os
import tqdm.notebook as tqdm
import json_lines
import time
from random import randint

# Application credentials are read from config.yaml so they stay out of the notebook.
with open("config.yaml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)

# App-only (oAuth2) client built from the 'twitter' section of the config.
# new_api() later rebinds the global `api` to a client that uses a donated token.
api = TwitterAPI(config['twitter']['api_key'],
                 config['twitter']['api_secret_key'],
                 auth_type='oAuth2'
                )

# Files this project works with (presumably located below DIRECTORY -- TODO confirm):
# - ANALYSIS.jsonl (users/lookup)
# - fdat/ID.f (friends/ids)
# - .gv, .gdf, .gexf

DIRECTORY = 'local_data/'
FDAT_DIR = '{0}fdat/'.format(DIRECTORY)

# Row position of the donated token currently in use; advanced by new_api().
api_number = 0

# Account ids whose friend collection raised an exception. create_bags() skips
# these ids and lazy_collect_from_ids() adds to the set, so it has to exist
# before either of them runs (otherwise a fresh kernel hits a NameError).
failed_ids = set()

In [None]:
# donated Tokens
import pandas as pd

# One row per donated token; the code below reads the 'oauth_token' and
# 'oauth_secret' columns of this table.
token = pd.read_csv('{0}token.csv'.format(DIRECTORY))

In [None]:
def drop_broken_token():
    """Drop donated tokens that can no longer be used from the global ``token`` table.

    Every token is probed with a single ``friends/ids`` request. A token is
    treated as broken when the request (or decoding its JSON body) raises, or
    when the decoded body contains an ``errors`` or ``suspended`` key.
    Broken rows are removed from ``token`` in place; nothing is returned.
    """
    global token
    broken_token = []

    # iterrows() has no length, so pass total= to get a proper progress bar.
    for label, row in tqdm.tqdm(token.iterrows(), total=len(token)):
        # Take the credentials from the row itself. iterrows() yields index
        # *labels*; once rows have been dropped they no longer equal row
        # positions, so token.iloc[label] would read the wrong row or raise
        # IndexError when this function is run a second time.
        probe = TwitterAPI(config['spenden']['api_key'],
                           config['spenden']['api_secret_key'],
                           row['oauth_token'],
                           row['oauth_secret']
                           )
        try:
            r = probe.request('friends/ids', {'user_id': 19277941})
            payload = r.json()
            if 'errors' in payload or 'suspended' in payload:
                broken_token.append(label)
        except Exception:
            # Any request/decoding failure marks the token as unusable.
            # KeyboardInterrupt is deliberately not caught so the kernel can
            # still be interrupted.
            broken_token.append(label)
    if len(broken_token) > 0:
        # `label` values are index labels, which is what drop(index=...) expects.
        token.drop(index=broken_token, inplace=True)

# Probe every donated token once and remove the unusable ones from `token`.
drop_broken_token()
# Number of tokens that remain (last expression, so it is shown as the cell output).
len(token)

In [None]:
def api_request(*args, max_retries=None):
    """Send a request through the current global ``api`` client, rotating tokens on failure.

    Parameters
    ----------
    *args
        Passed unchanged to ``api.request``.
    max_retries : int or None, keyword-only
        Maximum number of token rotations before giving up. ``None`` (the
        default) keeps retrying until a request succeeds.

    Returns
    -------
    The response object of the first request whose JSON body has no
    ``errors`` key, or the error response itself when the error code is 34
    (resource does not exist), because repeating that request cannot succeed.

    Raises
    ------
    RuntimeError
        Only when ``max_retries`` is set and has been exceeded.
    """
    attempts = 0
    while True:
        try:
            # `api` is looked up on every pass because new_api() rebinds it.
            r = api.request(*args)
            payload = r.json()
            if 'errors' not in payload:
                return r

            code = payload['errors'][0]['code']
            if code == 34:
                # 34: not exist. The original retried the identical request
                # forever; hand the error response back to the caller instead.
                return r
            if code != 326:
                # 326: temp locked -> rotate silently; anything else is logged.
                print(payload)
        except Exception:
            # Request or decoding failure: log which key was active, then rotate.
            print('Error with Key {}'.format(api_number))

        attempts += 1
        if max_retries is not None and attempts > max_retries:
            raise RuntimeError(
                'api_request gave up after {} token rotations'.format(max_retries)
            )
        # Retry in a loop rather than recursively so long outages cannot
        # exhaust the interpreter's recursion limit.
        new_api(silent=True)

    
def new_api(silent=True):
    """Switch the global ``api`` client to the next donated token that verifies.

    Advances the global ``api_number`` (wrapping around at the end of the
    ``token`` table), builds a client from the ``'spenden'`` application keys
    plus that row's ``oauth_token`` / ``oauth_secret``, and checks it with
    ``account/verify_credentials``. Tokens whose check returns an ``errors``
    key are skipped.

    Parameters
    ----------
    silent : bool
        When false, print the number of every key that is tried.

    Returns
    -------
    tuple
        Always the empty tuple (kept from the original implementation).

    Raises
    ------
    RuntimeError
        If the token table is empty or no token passes verification after
        each one has been tried once.
    """
    global api_number
    global api

    token_count = len(token)
    if token_count == 0:
        raise RuntimeError('No donated tokens available')

    # Try every token at most once. The original recursed without a bound,
    # which never terminates when all tokens are rejected, and the recursive
    # call also fell back to silent=True regardless of the caller's choice.
    for _ in range(token_count):
        # Short randomised pause (1.0 - 2.9 s) before each switch.
        time.sleep(1 + randint(0, 19)/10)

        api_number += 1
        if api_number >= token_count:
            api_number = 0
        # Printed after the wrap-around so the message names the key in use.
        if not silent:
            print('API Key: {}'.format(api_number))

        credentials = token.iloc[api_number]
        api = TwitterAPI(config['spenden']['api_key'],
                         config['spenden']['api_secret_key'],
                         credentials['oauth_token'],
                         credentials['oauth_secret']
                         )
        r = api.request('account/verify_credentials')
        if 'errors' not in r.json():
            return ()

    raise RuntimeError(
        'None of the {} donated tokens passed account/verify_credentials'.format(token_count)
    )


def collect_friends(account_id, cursor=-1, over5000=False):
    """Fetch the ids an account follows via ``friends/ids`` using the global ``api``.

    Parameters
    ----------
    account_id
        Sent as ``user_id`` in the request.
    cursor
        Cursor of the first page to request (``-1`` by default).
    over5000 : bool
        When true, keep following ``next_cursor`` until it is 0 or missing.
        When false, only the first page is fetched.

    Returns
    -------
    list
        The integer ids collected so far. Error code 34 ends the collection
        immediately. Other error responses contribute no ids, so the result
        may be empty or partial; their messages are printed.
    """
    ids = []
    # Bounded, so a token pool that is rate limited everywhere cannot loop forever.
    rate_limit_retries = 3

    while True:
        r = api.request('friends/ids', {'user_id': account_id, 'cursor': cursor})
        payload = r.json()  # decode once instead of on every access

        if 'errors' in payload:
            code = payload['errors'][0]['code']
            if code == 34:
                return ids
            elif code == 326:
                pass
            elif code == 88:
                time.sleep(5)
                new_api()
                print('sleeping')
                if rate_limit_retries > 0:
                    # Repeat the same page with the new token. The original
                    # rotated the token but then went on to read the error
                    # response, yielding an empty result for the account.
                    rate_limit_retries -= 1
                    continue
            else:
                print(payload['errors'])

        # Iterating the response yields the ids, or the error entries for an error response.
        for item in r:
            if isinstance(item, int):
                ids.append(item)
            elif 'message' in item:
                print('{0} ({1})'.format(item['message'], item['code']))

        if over5000 and 'next_cursor' in payload and payload['next_cursor'] != 0:
            # Keep paging. The original recursive call did not forward
            # over5000, so collection stopped after the second page.
            cursor = payload['next_cursor']
            continue

        return ids


def save_friends(user, ids):
    """Write ``ids`` to ``<FDAT_DIR><user>.f``, one id per line.

    The file is opened in write mode (an existing file is replaced) and the
    ids are joined with newlines, so there is no trailing newline.
    """
    target_path = '{0}{1}.f'.format(FDAT_DIR, user)
    with open(target_path, 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(map(str, ids)))

        
def collect_and_save_friends(user, refresh = False):
    """Collect the friends of ``user`` and store them, unless a file already exists.

    With ``refresh`` false, an existing ``<FDAT_DIR><user>.f`` file means the
    account is skipped. With ``refresh`` true the friends are always fetched
    again and the file is rewritten.

    Returns a status string: ``'Already saved: <user>'`` or
    ``'Friends saved: <user>'``.
    """
    friends_file = '{0}{1}.f'.format(FDAT_DIR, user)

    # Fetch when forced, or when nothing has been stored for this account yet.
    if refresh or not os.path.exists(friends_file):
        save_friends(user, collect_friends(user))
        return 'Friends saved: {}'.format(user)

    return 'Already saved: {}'.format(user)

                
def create_bags(all_ids, bag_size=15):
    """Split the account ids that still need collecting into fixed-size batches.

    An id is kept when it is not in the global ``failed_ids`` collection and
    no ``<FDAT_DIR><id>.f`` file exists yet.

    Parameters
    ----------
    all_ids : sequence
        Candidate account ids.
    bag_size : int
        Maximum number of ids per batch. Defaults to 15, the value that was
        previously hard-coded. lazy_collect_from_ids() switches token once per
        batch, so this is presumably tied to the per-token request limit of
        friends/ids -- TODO confirm before changing it.

    Returns
    -------
    list of list
        Batches in input order; the last one may be shorter.

    Raises
    ------
    ValueError
        If ``bag_size`` is smaller than 1.
    """
    if bag_size < 1:
        raise ValueError('bag_size must be a positive integer')

    print(f'Total: {len(all_ids)}')

    # Single pass: the failed-id check comes first, so the filesystem is only
    # consulted for ids that have not failed.
    unsaved_ids = [
        account_id
        for account_id in all_ids
        if account_id not in failed_ids
        and not os.path.exists(f'{FDAT_DIR}{account_id}.f')
    ]

    print(f'Unsaved: {len(unsaved_ids)}')

    return [
        unsaved_ids[start:start + bag_size]
        for start in range(0, len(unsaved_ids), bag_size)
    ]


def lazy_collect_from_ids():
    '''
    Collect the followings of all accounts in the global ``bags`` list.

    Batches are popped from the end of ``bags`` until it is empty. The token
    is switched once per batch via new_api(). Every account whose collection
    raises an exception is added to the global ``failed_ids`` set and the loop
    carries on with the next account. Progress and running counters are shown
    on a tqdm progress bar.
    '''
    global bags
    global failed_ids

    success_count = 0
    error_count = 0
    last_error = ''

    progressbar = tqdm.tqdm(total=len(bags))

    # try/finally: the bar is closed even when new_api() raises or the kernel
    # is interrupted.
    try:
        while len(bags) > 0:

            bag = bags.pop()
            progressbar.update(1)
            new_api(silent=True)
            for user_id in bag:
                try:
                    collect_and_save_friends(user_id)
                    success_count += 1
                except Exception:
                    # Only real errors count as failures. A bare `except:`
                    # also caught KeyboardInterrupt, which made the loop
                    # impossible to stop and recorded the account as failed.
                    failed_ids.add(user_id)
                    last_error = user_id
                    error_count += 1
                progressbar.set_postfix_str(f'\nFetched: {success_count}\nFailed: {error_count}\n Last error with {last_error}')
    finally:
        progressbar.close()
    
    
def load_account_ids_from_jsonl(file):
    """Return the author id of every qualifying tweet in a JSON-lines file.

    A tweet qualifies when its ``user`` is not ``protected`` and has a
    ``friends_count`` strictly between 0 and 5000. The id is appended once per
    qualifying tweet, so an account can occur several times in the result.

    ``broken=True`` is passed through to ``json_lines.reader``; presumably it
    makes the reader tolerate damaged input -- confirm in the json_lines docs.
    """
    account_ids = []

    with open(file, 'rb') as handle:
        for tweet in json_lines.reader(handle, broken=True):
            author = tweet['user']
            if author['protected']:
                continue
            if 0 < author['friends_count'] < 5000:
                account_ids.append(author['id'])

    return account_ids

In [None]:
import itertools
from multiprocessing import Pool
# https://medium.com/@grvsinghal/speed-up-your-python-code-using-multiprocessing-on-windows-and-jupyter-or-ipython-2714b49d6fac

# Tweet dumps (one JSON document per line) to scan for account ids.
# load_account_ids_from_jsonl() opens each name as given, without prepending
# DIRECTORY, so the files are resolved relative to the working directory.
tweets_files = ['lang_de-2020-10-06.jsonl',
                'lang_de-2020-10-08.jsonl',
                'lang_de-2020-10-09.jsonl',
                'lang_de-2020-10-11.jsonl',
                'lang_de-2020-10-12.jsonl',
                'lang_de-2020-10-14.jsonl',
                'lang_de-2020-10-16.jsonl',
                'lang_de-2020-10-20.jsonl',
                'lang_de-2020-10-22.jsonl',
                'lang_de-2020-10-23.jsonl',
                'lang_de-2020-10-24.jsonl',
                'lang_de-2020-10-25.jsonl',
                'lang_de-2020-10-26.jsonl',
                'lang_de-2020-10-28.jsonl',
                'lang_de-2020-10-29.jsonl',
                'lang_de-2020-10-30.jsonl',
                'lang_de-2020-10-31.jsonl',
                'lang_de-2020-11-01.jsonl',
                'lang_de-2020-11-02.jsonl',
                'lang_de-2020-11-03.jsonl',
                'lang_de-2020-11-04.jsonl',
                'lang_de-2020-11-05.jsonl'
               ]

def load_user_ids(tweets_files):
    """Load account ids from several tweet files using a multiprocessing pool.

    Each file is handed to load_account_ids_from_jsonl() in a worker process;
    the per-file lists are concatenated in the order of ``tweets_files``.
    Prints the number of ids and the elapsed wall-clock time, then returns
    the combined list.
    """
    started_at = time.time()
    with Pool() as workers:
        per_file_ids = workers.map(load_account_ids_from_jsonl, list(tweets_files))
        ids = list(itertools.chain.from_iterable(per_file_ids))
    print(f'Got {len(ids):,} in {(time.time() - started_at)} seconds')
    return ids
    
# One entry per qualifying tweet, so an account id repeats for every tweet it authored.
ids = load_user_ids(tweets_files)

In [None]:
# Accounts that tweeted at least x times
#https://stackoverflow.com/a/15862037/1158702
import collections

# Tally how many of the collected tweets each account id contributed.
users = collections.Counter(ids)
print(f'Total {len(users)}')

# Keep only the accounts with at least X tweets.
X = 2
minX = collections.Counter({
    account_id: tweet_count
    for account_id, tweet_count in users.items()
    if tweet_count >= X
})
print(f'Min {X} Tweets: {len(minX)}')

In [None]:
# Batches of account ids that still need their friends collected.
bags = create_bags(list(minX))

In [None]:
# Work through `bags` until it is empty; accounts that raise end up in `failed_ids`.
lazy_collect_from_ids()