In [2]:
import numpy as np
import pandas as pd
import pickle
import time
import requests

from os import listdir
from os.path import isfile, join
from datetime import datetime

In [3]:
OUTPUT_PATH = 'output'
TOKEN_POOL = ['AAAAAAAAAAAAAAAAAAAAADrgBQEAAAAA5CoiD%2F1y7eYo6keNRgsk9bvC3G0%3DsmVZEpdiiRqA1UZNOPUV1UFJut7Xxp1AkifiSi9CpNIUDbjNcq',
              'AAAAAAAAAAAAAAAAAAAAAHrHUQEAAAAAEuCpSGtduf0OzCGziFepoTneKfs%3DFSjYWNvOhr5R5Rakgky4IVQt0g011PILwSqJtpK5ZvRcJlkJqu',
              'AAAAAAAAAAAAAAAAAAAAANOPVgEAAAAAcmgH4HRq%2FyRg5485gFbcUqTso0Y%3DB2LKEzoL9HphAFn6jqwVRNIecLClzuPed4yYlcHBYETq8QqxJv']

In [4]:
def get_last_checkpoint(prefix='data'):
  files = [f for f in listdir(OUTPUT_PATH) if isfile(join(OUTPUT_PATH, f)) and f.startswith(f'{prefix}_') and f.endswith('.pickle')]
  
  if not files:
    return None
  
  files.sort(reverse=True)
  path = join(OUTPUT_PATH, files[0])
  print(f'loading {path} ...')
  # read pickle file
  with open(path, 'rb') as f:
    df = pickle.load(f)
  
  return df

def save_checkpoint(df, prefix='data'):
  path = join(OUTPUT_PATH, f'{prefix}_{ datetime.now().strftime("%m%d_%H%M%S") }.pickle')
  # save dataframe to pickle
  with open(path, 'wb') as f:
    pickle.dump(df, f)
    print('checkpoint saved on', path)

def filter_dataframe(df):
  path = join(OUTPUT_PATH, 'id_list.txt')
  ids = []
  with open(path) as f:
    for l in f:
      ids.append(l.strip())

  return df.drop(index=ids, errors = 'ignore')


def translate(data):
    data['followers'] = data['public_metrics']['followers_count']
    data['following'] = data['public_metrics']['following_count']
    data['following_ids'] = None
    data['log_ratio'] = None
    data.pop('public_metrics')
    return data

def next_level(df, mode='followers', last_level=False, max_pagination=1000):
    df = df.copy()
    
    i=0
    for idx in df[df.following_ids.isnull()].index:

        if mode == 'following':
          res = twitter.get_user_following(idx, max_pagination=max_pagination)
        else:
          res = twitter.get_user_followers(idx, max_pagination=max_pagination)

        data = []
        ids = []
        for r in res:
            data.append( translate(r) )
            ids.append( str(r['id']) )
        
        df.loc[idx, 'following_ids'] = ','.join(ids)
        
        if not last_level and data:
            new_df = pd.DataFrame.from_dict(data).set_index('id')
            new_ids = set(new_df.index) - set(df.index)
            df = df.append(new_df.loc[new_ids])
        
        if i % 25 == 0:
            save_checkpoint(df)
        
        i += 1
    
    return df

In [5]:
class TwitterAPI:

    def __init__(self, BEARER_TOKEN=None, POOL=None):
        self.PATH = 'https://api.twitter.com/2'
        self.BEARER_TOKEN = POOL[0] if POOL else BEARER_TOKEN
        self.POOL_IDX = 0
        self.REQUEST_COUNT = 0 # Contador de requests
        self.REQUEST_MAX = 15 # Maximo 15 requests
        self.POOL = POOL if POOL else [BEARER_TOKEN]
        self.headers =  {"Authorization": f"Bearer {self.BEARER_TOKEN}"}

    def update_bearer_token(self):
        self.POOL_IDX = (self.POOL_IDX + 1) % len(self.POOL)
        self.BEARER_TOKEN = self.POOL[self.POOL_IDX]
        self.headers =  {"Authorization": f"Bearer {self.BEARER_TOKEN}"}
        print(f'Changing token to {self.BEARER_TOKEN}...')


    def get_user(self, user_id):
        params = { 'user.fields':'public_metrics,profile_image_url' }
        res = requests.get(f'{self.PATH}/users/{user_id}', params=params, headers=self.headers)
        return res.json()

    def get_user_followers(self, user_id, max_results=1000, max_pagination=1000):
        params = { 'max_results':max_results, 'user.fields':'public_metrics,profile_image_url' }
        data = []
        next_token = True

        i = 0
        while next_token and i < max_pagination:
            try:
                res = requests.get(f'{self.PATH}/users/{user_id}/followers', params=params, headers=self.headers)
                
                self.REQUEST_COUNT = (self.REQUEST_COUNT + 1) % self.REQUEST_MAX
                if self.REQUEST_COUNT == 0:
                  self.update_bearer_token()

                if res.status_code != 200:
                    print('Taking a nap...')
                    time.sleep( 15*60  + 10) # Sleep 15 mins and 10 seconds
                    self.REQUEST_COUNT = 0
                    print('Waking up again!')

                elif res.json().get('errors'):
                    print('There was an error')
                    print(res.json()['errors'])
                    next_token = False

                else:
                    data += res.json().get('data', [])
                    next_token = res.json().get('meta', {}).get('next_token')
                    params['pagination_token'] = next_token
                    i += 1
            except Exception as e:
                print('There was an error! Taking a nap...')
                print(e)
                time.sleep(60) # Sleep 1 min
                print('Waking up again!')
        return data

    def get_user_following(self, user_id, max_results=1000, max_pagination=1000):
        params = { 'max_results':max_results, 'user.fields':'public_metrics,profile_image_url' }
        data = []
        next_token = True

        i = 0
        while next_token and i < max_pagination:
            try:
                res = requests.get(f'{self.PATH}/users/{user_id}/following', params=params, headers=self.headers)
                
                self.REQUEST_COUNT = (self.REQUEST_COUNT + 1) % self.REQUEST_MAX
                if self.REQUEST_COUNT == 0:
                  self.update_bearer_token()

                if res.status_code != 200:
                    print('Taking a nap...')
                    time.sleep( 15*60  + 10) # Sleep 15 mins and 10 seconds
                    self.REQUEST_COUNT = 0
                    print('Waking up again!')

                elif res.json().get('errors'):
                    print('There was an error')
                    print(res.json()['errors'])
                    next_token = False

                else:
                    data += res.json().get('data', [])
                    next_token = res.json().get('meta', {}).get('next_token')
                    params['pagination_token'] = next_token
                    i += 1
            except Exception as e:
                print('There was an error! Taking a nap...')
                print(e)
                time.sleep(60) # Sleep 1 min
                print('Waking up again!')
        return data

In [6]:
twitter = TwitterAPI(POOL=TOKEN_POOL)

##  Extraccion del primer nivel

In [None]:
# Nodos Iniciales
starting_nodes = [2391563839,
 1192537548641443840,
 94529497,
 1131926131559546881,
 42274372,
 105197405,
 99324417,
 193504277,
 1679468156,
 1305910250613202946]

df = pd.DataFrame.from_dict([ translate(twitter.get_user(n)['data']) for n in starting_nodes]).set_index('id')
df = next_level(df)#activa la búsqueda
df['log_ratio'] = np.log10(df.followers / (df.following + 1)) # relacion entre seguidos y seguidores
#aveces twitter no te da el numero correcto de seguidores o seguidos y coloca -1, y te da warning jeje por division entre 0

save_checkpoint(df, 'first_level')#guardar porque dentro de la funcion next_level guarda por defecto cada 25 usuarios, 
                                #y el proceso puede terminar antes del numero 25, así se guardan los restantes
                                #first_level contiene tooooodos los usuarios, los archivos anteriores son copias, tienen info redundante


Taking a nap...
Waking up again!
Changing token to AAAAAAAAAAAAAAAAAAAAAHrHUQEAAAAAEuCpSGtduf0OzCGziFepoTneKfs%3DFSjYWNvOhr5R5Rakgky4IVQt0g011PILwSqJtpK5ZvRcJlkJqu...
Changing token to AAAAAAAAAAAAAAAAAAAAANOPVgEAAAAAcmgH4HRq%2FyRg5485gFbcUqTso0Y%3DB2LKEzoL9HphAFn6jqwVRNIecLClzuPed4yYlcHBYETq8QqxJv...
checkpoint saved on output\data_0404_145729.pickle
Changing token to AAAAAAAAAAAAAAAAAAAAADrgBQEAAAAA5CoiD%2F1y7eYo6keNRgsk9bvC3G0%3DsmVZEpdiiRqA1UZNOPUV1UFJut7Xxp1AkifiSi9CpNIUDbjNcq...
Taking a nap...


## Primer Filtrado

In [None]:
df = get_last_checkpoint('first_level')
df = df[df.following < 1e3].sort_values('following', ascending=False)
df = filter_dataframe(df)
save_checkpoint(df, 'filtered_first_level')

loading /content/drive/MyDrive/Colab Notebooks/mbu-graph/output_client/first_level_0326_044823.pickle ...
checkpoint saved on /content/drive/MyDrive/Colab Notebooks/mbu-graph/output_client/filtered_first_level_0326_200128.pickle


##  Extraccion del segundo nivel

In [None]:
df = get_last_checkpoint('filtered_first_level')
df = next_level(df, last_level=True, max_pagination=15)

In [15]:
df = get_last_checkpoint('data')
df = next_level(df, last_level=True, max_pagination=15)
save_checkpoint(df, 'second_level')

KeyboardInterrupt: ignored