In [15]:
import networkx as nx
import random
import numpy
from collections import deque
import csv
from threading import Thread
from Queue import Queue

In [73]:
def parse_twitter_users():
    # return: ({ user: followers (#out) }, { user: following (#in) })
    outdegrees = dict()
    indegrees = dict()
    with open('twitter-analysis/users.csv', 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for row in reader:
            if not row or row[0] == '':
                continue
            user_id = row[0]
            outdegree = row[13]
            indegree = row[14]
            outdegrees[user_id] = outdegree
            indegrees[user_id] = indegree

    return set(outdegrees.keys()), outdegrees, indegrees

In [74]:
users, followers, following = parse_twitter_users()
# print(users)
# print(outdegrees)

In [69]:
def build_twitter_network(users, outdegrees, indegrees):
    edges = { user:[] for user in users }
    down_counter = indegrees.copy()
    
    # for each user, populate its followers
    for idx, user in enumerate(list(users)):
        
        if idx % 100 == 0 and idx > 0:
            print('finished building followers for %d users' % idx)
        
        # for each follower count, select a follower, dec their indegree
        followers = int(outdegrees[user])
        for follower in range(followers):
            
            if follower % 100 == 0 and follower > 0:
                print('-------- CHECKPOINT --------> finished {0}/{1} on user {2}'.format(follower, followers, idx))
            
            selected = False
            while not selected:
                
                # should get invoked last, when we finally run out
                if down_counter.keys() == []:
                    print('RAN OUT OF USERS TO ASSIGN')
                    return edges
                
                new_follower = random.choice(list(down_counter.keys()))
                if int(down_counter[new_follower]) > 0:
                    edges[user].append(new_follower)
                    down_counter[new_follower] = int(down_counter[new_follower]) - 1
                    if down_counter[new_follower] == 0:
                        del down_counter[new_follower]
                    selected = True
                else:
                    continue
                
    return edges

In [70]:
users, followers, following = parse_twitter_users()
print('building network now...')
edges = build_twitter_network(users, followers, following)

building network now...
-------- CHECKPOINT --------> finished 100/1067 on user 4
-------- CHECKPOINT --------> finished 200/1067 on user 4
-------- CHECKPOINT --------> finished 300/1067 on user 4
-------- CHECKPOINT --------> finished 400/1067 on user 4
-------- CHECKPOINT --------> finished 500/1067 on user 4
-------- CHECKPOINT --------> finished 600/1067 on user 4
-------- CHECKPOINT --------> finished 700/1067 on user 4
-------- CHECKPOINT --------> finished 800/1067 on user 4
-------- CHECKPOINT --------> finished 900/1067 on user 4
-------- CHECKPOINT --------> finished 1000/1067 on user 4
-------- CHECKPOINT --------> finished 100/102 on user 27
-------- CHECKPOINT --------> finished 100/161 on user 32
-------- CHECKPOINT --------> finished 100/184 on user 40
-------- CHECKPOINT --------> finished 100/289 on user 45
-------- CHECKPOINT --------> finished 200/289 on user 45
-------- CHECKPOINT --------> finished 100/271 on user 46
-------- CHECKPOINT --------> finished 200/271 

-------- CHECKPOINT --------> finished 100/110 on user 598
finished building followers for 600 users
-------- CHECKPOINT --------> finished 100/153 on user 613
-------- CHECKPOINT --------> finished 100/7131 on user 617
-------- CHECKPOINT --------> finished 200/7131 on user 617
-------- CHECKPOINT --------> finished 300/7131 on user 617
-------- CHECKPOINT --------> finished 400/7131 on user 617
-------- CHECKPOINT --------> finished 500/7131 on user 617
-------- CHECKPOINT --------> finished 600/7131 on user 617
-------- CHECKPOINT --------> finished 700/7131 on user 617
-------- CHECKPOINT --------> finished 800/7131 on user 617
-------- CHECKPOINT --------> finished 900/7131 on user 617
-------- CHECKPOINT --------> finished 1000/7131 on user 617
-------- CHECKPOINT --------> finished 1100/7131 on user 617
-------- CHECKPOINT --------> finished 1200/7131 on user 617
-------- CHECKPOINT --------> finished 1300/7131 on user 617
-------- CHECKPOINT --------> finished 1400/7131 on user

-------- CHECKPOINT --------> finished 200/239 on user 946
-------- CHECKPOINT --------> finished 100/332 on user 956
-------- CHECKPOINT --------> finished 200/332 on user 956
-------- CHECKPOINT --------> finished 300/332 on user 956
-------- CHECKPOINT --------> finished 100/131 on user 973
-------- CHECKPOINT --------> finished 100/1011 on user 978
-------- CHECKPOINT --------> finished 200/1011 on user 978
-------- CHECKPOINT --------> finished 300/1011 on user 978
-------- CHECKPOINT --------> finished 400/1011 on user 978
-------- CHECKPOINT --------> finished 500/1011 on user 978
-------- CHECKPOINT --------> finished 600/1011 on user 978
-------- CHECKPOINT --------> finished 700/1011 on user 978
-------- CHECKPOINT --------> finished 800/1011 on user 978
-------- CHECKPOINT --------> finished 900/1011 on user 978
-------- CHECKPOINT --------> finished 1000/1011 on user 978
finished building followers for 1000 users
-------- CHECKPOINT --------> finished 100/385 on user 1023
-

-------- CHECKPOINT --------> finished 200/253 on user 1682
-------- CHECKPOINT --------> finished 100/167 on user 1689
-------- CHECKPOINT --------> finished 100/168 on user 1698
finished building followers for 1700 users
-------- CHECKPOINT --------> finished 100/203 on user 1709
-------- CHECKPOINT --------> finished 200/203 on user 1709
-------- CHECKPOINT --------> finished 100/142 on user 1722
-------- CHECKPOINT --------> finished 100/6103 on user 1729
-------- CHECKPOINT --------> finished 200/6103 on user 1729
-------- CHECKPOINT --------> finished 300/6103 on user 1729
-------- CHECKPOINT --------> finished 400/6103 on user 1729
-------- CHECKPOINT --------> finished 500/6103 on user 1729
-------- CHECKPOINT --------> finished 600/6103 on user 1729
-------- CHECKPOINT --------> finished 700/6103 on user 1729
-------- CHECKPOINT --------> finished 800/6103 on user 1729
-------- CHECKPOINT --------> finished 900/6103 on user 1729
-------- CHECKPOINT --------> finished 1000/6103

-------- CHECKPOINT --------> finished 400/480 on user 1813
-------- CHECKPOINT --------> finished 100/510 on user 1851
-------- CHECKPOINT --------> finished 200/510 on user 1851
-------- CHECKPOINT --------> finished 300/510 on user 1851
-------- CHECKPOINT --------> finished 400/510 on user 1851
-------- CHECKPOINT --------> finished 500/510 on user 1851
-------- CHECKPOINT --------> finished 100/746 on user 1862
-------- CHECKPOINT --------> finished 200/746 on user 1862
-------- CHECKPOINT --------> finished 300/746 on user 1862
-------- CHECKPOINT --------> finished 400/746 on user 1862
-------- CHECKPOINT --------> finished 500/746 on user 1862
-------- CHECKPOINT --------> finished 600/746 on user 1862
-------- CHECKPOINT --------> finished 700/746 on user 1862
-------- CHECKPOINT --------> finished 100/231 on user 1863
-------- CHECKPOINT --------> finished 200/231 on user 1863
-------- CHECKPOINT --------> finished 100/550 on user 1875
-------- CHECKPOINT --------> finished 2

ValueError: invalid literal for int() with base 10: 'following_count'

In [None]:
# checkpoint: have set of users, have dict of { user: followers }
# generate artificial tweet cascade -> analyze first k retweets, 