In [1]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket

# Params

In [2]:
# Number of successfully downloaded timelines accumulated before a batch is written to disk
cutoff = 1000
print(f'Save Data After Downloading {cutoff} Timelines')

Save Data After Downloading 1000 Timelines


In [3]:
def get_env_var(varname,default):
    """Read an integer setting from the environment, with a fallback.

    Parameters
    ----------
    varname : str
        Name of the environment variable to read.
    default : int
        Value returned (and reported as default) when the variable is unset.

    Returns
    -------
    int
        The parsed value of the variable, or `default` when it is not set.

    Raises
    ------
    ValueError
        If the variable is set but does not start with an integer.
    """
    raw = os.environ.get(varname)

    if raw is None:
        var = default
        print(varname,':', var,'(Default)')
        return var

    try:
        var = int(raw)
    except ValueError:
        # SLURM can report composite values such as "20(x2)" or "16,8" for
        # SLURM_JOB_CPUS_PER_NODE: keep the leading integer instead of crashing.
        digits = ''.join(itertools.takewhile(str.isdigit, raw.strip()))
        if not digits:
            raise ValueError(
                'Environment variable '+varname+' is not an integer: '+repr(raw)) from None
        var = int(digits)

    print(varname,':', var)
    return var

# Job layout read from SLURM, used to distribute credentials across nodes:
# e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100)
# Defaults apply when the notebook runs outside of a SLURM job array.
(SLURM_JOB_ID,
 SLURM_ARRAY_TASK_ID,
 SLURM_ARRAY_TASK_COUNT,
 SLURM_JOB_CPUS_PER_NODE) = (
    get_env_var('SLURM_JOB_ID', 0),
    get_env_var('SLURM_ARRAY_TASK_ID', 0),
    get_env_var('SLURM_ARRAY_TASK_COUNT', 1),
    get_env_var('SLURM_JOB_CPUS_PER_NODE', mp.cpu_count()),
)

SLURM_JOB_ID : 7059205
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 1


In [5]:
# Countries to process: each job-array task handles one active entry
# (commented-out codes are currently disabled).
country_codes=[
'US',
# 'ID',
'BR',
# 'TR',
'MX',
'AR',
# 'PH',
'CO',
# 'MY',
# 'VE',
# 'TH',
]

# The job array must provide exactly one task per active country
if len(country_codes)!=SLURM_ARRAY_TASK_COUNT:
    sys.exit('Check jobarray size...exit.')

# Task ids are not guaranteed to be 0-based (e.g. --array=1-5): reject ids that
# would overflow the list or silently wrap around through negative indexing
if not 0 <= SLURM_ARRAY_TASK_ID < len(country_codes):
    sys.exit('Check jobarray task id...exit.')

country_code=country_codes[SLURM_ARRAY_TASK_ID]

print('Country:', country_code)

Country: BR


In [6]:
# Data root depends on the host: relative path on the 'samuel' machine,
# scratch storage everywhere else
path_to_data = (
    '../../data'
    if 'samuel' in socket.gethostname().lower()
    else '/scratch/spf248/twitter/data'
)

# Input folders (users, locations, API keys) and the per-country output folder
path_to_users = os.path.join(path_to_data, 'users')
path_to_locations = os.path.join(path_to_data, 'locations', 'profiles')
path_to_keys = os.path.join(path_to_data, 'keys', 'twitter')
path_to_timelines = os.path.join(path_to_data, 'timelines', 'API', country_code)

# Only the output folder is created here
os.makedirs(path_to_timelines, exist_ok=True)

print(path_to_users, path_to_locations, path_to_keys, path_to_timelines, sep='\n')

/scratch/spf248/twitter/data/users
/scratch/spf248/twitter/data/locations/profiles
/scratch/spf248/twitter/data/keys/twitter
/scratch/spf248/twitter/data/timelines/API/BR


# Credentials

In [7]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):
    """Select the credential files assigned to this node.

    All `*.json` files under `path_to_keys` are shuffled with a constant seed,
    split into SLURM_ARRAY_TASK_COUNT chunks, and the chunk at index
    SLURM_ARRAY_TASK_ID is kept. At most SLURM_JOB_CPUS_PER_NODE files are
    returned.

    Parameters
    ----------
    SLURM_ARRAY_TASK_ID : int
        Index of the chunk to keep.
    SLURM_ARRAY_TASK_COUNT : int
        Number of chunks the key files are split into.
    SLURM_JOB_CPUS_PER_NODE : int
        Maximum number of key files kept for this node.

    Returns
    -------
    numpy.ndarray
        Paths of the key files allocated to this node.
    """

    # Randomize set of key files using constant seed.
    # glob returns paths in arbitrary order, so sort first: otherwise the seeded
    # shuffle may differ between nodes and their chunks could overlap.
    np.random.seed(0)
    all_key_files = np.random.permutation(sorted(glob(os.path.join(path_to_keys,'*.json'))))

    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]

    # Check that the node has at least as many CPUs as key files
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files))
    else:
        print('Check environment variables:')
        print('# Credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only keeping', SLURM_JOB_CPUS_PER_NODE, 'credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]

    return key_files

# Credentials handled by this node, listed one path per line
key_files = get_key_files(
    SLURM_ARRAY_TASK_ID,
    SLURM_ARRAY_TASK_COUNT,
    SLURM_JOB_CPUS_PER_NODE,
)
print(*key_files, sep='\n')

Check environment variables:
# Credentials ( 49 ) > # CPU ( 1 )
Only keeping 1 credentials
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-zohar.json


In [8]:
def get_auth(key_file):
    """Build an authenticated tweepy API client from a JSON key file.

    Parameters
    ----------
    key_file : str
        Path to a JSON file providing the keys 'consumer_key',
        'consumer_secret', 'access_token' and 'access_token_secret'.

    Returns
    -------
    tweepy.API
        Client created with wait_on_rate_limit and wait_on_rate_limit_notify
        enabled, returned only after the credentials were verified.

    Raises
    ------
    SystemExit
        If the credentials cannot be verified.
    """

    # Import Key
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    try:
        api.verify_credentials()
        print(key_file,": Authentication checked")
    # Catch Exception (not a bare except) so that KeyboardInterrupt still
    # propagates, and report the underlying reason for the failure
    except Exception as e:
        print(key_file,": error during authentication", '-', repr(e))
        # NOTE(review): this function is also called from multiprocessing
        # workers; confirm that exiting there does not stall the pool.
        sys.exit('Exit')

    return api

# for key_file in np.random.permutation(glob(os.path.join(path_to_keys,'*.json'))):
#     get_auth(key_file)
# print('Credentials Checked!')

# User List

In [9]:
print('Import Users By Account Locations')
start = timer()

# Read every JSON-lines file, skipping (and reporting) the ones that fail to parse
l = []
for filename in sorted(glob(os.path.join(path_to_users,'user-ids-by-account-location-verified/*.json'))):
    try:
        df = pd.read_json(filename,lines=True)
        l.append(df)
    # Catch Exception (not a bare except) so that interrupting this long loop
    # still works, and show why the file was rejected
    except Exception as e:
        print('error importing', filename, '-', repr(e))
users_by_account_location=pd.concat(l, axis=0, ignore_index=True)

# Series indexed by account location, holding the list of user ids of each location
users_by_account_location=users_by_account_location.set_index('user_location')['user_id']

# NOTE(review): eval executes whatever expression is stored in the files. If the
# values are plain list literals, ast.literal_eval would be the safe equivalent
# - confirm the file format before switching.
users_by_account_location=users_by_account_location.apply(eval).apply(lambda x:[str(y) for y in x])
print('# Locations:', len(users_by_account_location))
print('# Users Total:', users_by_account_location.apply(len).sum())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users By Account Locations
# Locations: 39779
# Users Total: 92088032
Computing Time: 175 sec


In [10]:
print('Import Locations')

# Pickled table of identified account locations, stored under path_to_locations
account_locations = pd.read_pickle(
    os.path.join(path_to_locations, 'account-locations-identified.pkl')
)
print(f'# Locations: {len(account_locations)}')

Import Locations
# Locations: 39779


In [11]:
def get_users(country_code):
    """Return the shuffled ids of all users located in a country.

    Reads the module-level `account_locations` table and the
    `users_by_account_location` series.

    Parameters
    ----------
    country_code : str
        Value matched against the 'country_short' column of
        `account_locations`.

    Returns
    -------
    numpy.ndarray
        User ids of the matching locations, sorted and then shuffled with a
        constant seed.
    """

    # Account locations identified as belonging to the country
    locations = account_locations.loc[
        account_locations['country_short']==country_code,'LOCATION']

    # reindex fills locations that have no user list with NaN, which is not
    # iterable and would break the flattening below: skip those entries
    users_per_location = users_by_account_location.reindex(locations).dropna()

    # Select Country Users
    users = sorted(itertools.chain.from_iterable(users_per_location))

    # Randomize All Users
    np.random.seed(0)
    users=np.random.permutation(users)

    return users
    
start = timer()
print('Select Country Users...')

users = get_users(country_code)

# The lookup tables are no longer used after the selection: release them
del users_by_account_location, account_locations

print(f'# Users in {country_code} : {len(users)}')

end = timer()
print(f'Computing Time: {round(end - start)} sec')

Select Country Users...
# Users in BR : 4863404
Computing Time: 11 sec


Number of verified users in the US: 21,205,171

In [12]:
# Users Whose Timelines Were Successfully Downloaded
def get_success(path_to_timelines):
    """Collect the user ids recorded in the 'success' file of a folder.

    Each line of the file is tab-separated and only its first field (the
    user id) is kept.

    Parameters
    ----------
    path_to_timelines : str
        Folder that may contain a file named 'success'.

    Returns
    -------
    set
        First field of every line, or an empty set when the file is absent.
    """
    success_file = os.path.join(path_to_timelines, 'success')

    # Nothing recorded yet
    if not os.path.exists(success_file):
        return set()

    with open(success_file, 'r', encoding='utf-8') as handle:
        return {record.strip('\n').split('\t')[0] for record in handle}

# Ids already present in the success file of this country's output folder
success = get_success(path_to_timelines)
print(f'# Downloaded Timelines: {len(success)}')

# Downloaded Timelines: 12000


In [13]:
# Drop users whose timeline was already downloaded.
# Sort the remainder: iteration order of a set of strings changes between
# interpreter runs (hash randomization), which would make the seeded shuffle
# below irreproducible.
users=sorted(set(users).difference(success))

# Randomize Remaining Users
np.random.seed(0)
users=list(np.random.permutation(users))

print('# Remaining Users For this Node:', len(users))

# Remaining Users For this Node: 4851404


# Get Timelines

In [14]:
def get_timeline(user_id,api):
    """Download the statuses of one user through the user_timeline endpoint.

    Parameters
    ----------
    user_id : str
        Id of the user whose timeline is requested.
    api : tweepy.API
        Authenticated client used for the requests.

    Returns
    -------
    (pandas.DataFrame, str or None)
        One row per status (raw JSON fields) and an error message. The error
        is None on success; after a TweepError it holds the error text and the
        DataFrame only contains the statuses collected before the failure.
    """

    timeline = []
    error = None

    # Collect All Statuses in Timeline
    try:
        cursor = tweepy.Cursor(
        api.user_timeline,
        user_id=user_id,
        # 200 is the documented per-request maximum for this endpoint;
        # the cursor keeps paging until the timeline is exhausted
        count=200,
        tweet_mode="extended",
        include_rts=True).items()

        for status in cursor:
            timeline.append(status._json)

    except tweepy.error.TweepError as e:
        error = str(e)

    return pd.DataFrame(timeline), error

# timeline, error = get_timeline('12',get_auth(key_files[0]))

In [15]:
def _save_timelines_batch(index_key, frames, downloaded_ids, output_id):
    """Write one batch of timelines to disk and record its users in the success file."""

    print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output id', output_id)

    filename = \
    'timelines-'+\
    str(SLURM_JOB_ID)+'-'+\
    str(SLURM_ARRAY_TASK_ID)+'-'+\
    str(index_key)+'-'+\
    str(len(downloaded_ids))+'-'+\
    output_id+'.json.bz2'

    # Concatenate once per batch instead of once per user
    timelines = pd.concat(frames, sort=False)

    # Save as list of dict discarding index
    timelines.to_json(
    os.path.join(path_to_timelines,filename),
    orient='records',
    force_ascii=False,
    date_format=None,
    double_precision=15)

    # Save User Id and File In Which Its Timeline Was Saved
    with open(os.path.join(path_to_timelines,'success'), 'a', encoding='utf-8') as file:
        file.writelines(downloaded_id+'\t'+filename+'\n' for downloaded_id in downloaded_ids)


def get_timelines_by_block(index_key):
    """Download and save the timelines of one block of users.

    The module-level `users` list is split into len(key_files) blocks; this
    call processes the block at position `index_key` with the credential
    key_files[index_key]. Users whose download returns an error are skipped.
    Timelines are written in batches of `cutoff` users, plus one final batch
    with whatever is left at the end of the block.

    Parameters
    ----------
    index_key : int
        Position of both the credential and the block of users to process.

    Returns
    -------
    int
        Always 0.
    """

    # Create Access For Block of Users
    api = get_auth(key_files[index_key])

    # Select Block of Users
    users_block = np.array_split(users,len(key_files))[index_key]

    # Initialize Output File ID
    output_id = str(uuid.uuid4())

    # Timelines and ids of the users downloaded since the last save
    frames = []
    downloaded_ids = []

    for user_id in users_block:

        # Try Downloading Timeline
        timeline, error = get_timeline(user_id,api)

        if error is not None:
            continue

        frames.append(timeline)
        downloaded_ids.append(user_id)

        # Save after <cutoff> timelines
        if len(downloaded_ids) == cutoff:
            _save_timelines_batch(index_key, frames, downloaded_ids, output_id)

            # Reset Output File ID, Data, and Downloaded Users
            output_id = str(uuid.uuid4())
            frames = []
            downloaded_ids = []

    # Save the remaining timelines. Doing this after the loop (rather than when
    # reaching the last user) keeps the batch even if the last download failed.
    if downloaded_ids:
        _save_timelines_batch(index_key, frames, downloaded_ids, output_id)

    return 0

In [16]:
print('Extract Timelines...\n')

# One worker per credential. The default pool size comes from the machine's
# core count, which need not match the number of key files kept for this node.
with mp.Pool(processes=max(1, len(key_files))) as pool:
    pool.map(get_timelines_by_block, range(len(key_files)))

Extract Timelines...

/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-zohar.json : Authentication checked
Process 0 saving 10 timelines with output id cfce2357-11f3-4b71-8b69-705a409610d3
