# Who Follows Trump on Twitter? A Graphical Approach to Social Media Analysis

In this notebook, we use network analysis libraries to cluster social network users and produce a simple analysis of the resulting clusters. The goal is to show you how to use `networkx`, together with the `python-louvain` community detection package, to discover clusters in any network.

### First get the data

In [1]:
from zipfile import ZipFile
import urllib2
from StringIO import StringIO
import os
import time

# URL of the zip archive that is downloaded by download_and_extract_content().
DATA_URL = 'https://www.dropbox.com/s/mow0ru0duq3l1dl/trump_followers.zip?dl=1'
# Local directory the archive is extracted into; the data helpers read from
# its 'friends' and 'users' subdirectories.
DATA_DIR = './data'

def download_and_extract_content(url, directory):
    """Download a zip archive and extract it into a directory.

    The whole archive is read into memory before extraction, and the time
    taken by each of the two steps is printed.

    Args:
        url: address of the zip archive to download.
        directory: path the archive content is extracted to.
    """
    print('Downloading the data...')
    s = time.time()
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # always release the connection, even if the read fails half-way
        response.close()
    print('Finished in %.2f seconds.' % (time.time()-s))

    print('Extracting the archive to %s' % directory)
    s = time.time()
    # ZipFile needs a seekable file-like object, hence the in-memory buffer
    with ZipFile(StringIO(content)) as zip_file:
        zip_file.extractall(path=directory)
    print('Done in %.2f seconds.' % (time.time()-s))

# Fetch the data only when it is not already on disk. The check looks for the
# 'friends' and 'users' subdirectories that the helper functions read from,
# rather than for DATA_DIR itself: DATA_DIR is created before the download
# starts, so a failed download would otherwise never be retried.
if not (os.path.isdir(os.path.join(DATA_DIR, 'friends')) and
        os.path.isdir(os.path.join(DATA_DIR, 'users'))):
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
    download_and_extract_content(DATA_URL, DATA_DIR)

Downloading the data...
Finished in 56.31 seconds.
Extracting the archive to ./data
Done in 795.38 seconds.


### Helper functions to interact with and read the data

In [4]:
import codecs
import json
from os import listdir
from os.path import isfile, join

def get_trump_followers(directory=DATA_DIR+'/friends'):
    """Iterate over the Trump followers that have a non-empty friends file.

    Every regular file in `directory` is treated as one follower, the file
    name being the follower id. Files holding only whitespace are skipped;
    they belong to private accounts whose friends could not be fetched.

    Args:
        directory: directory holding one friends file per follower.

    Yields:
        (fname, fpath) tuples: the file name and the full path to the file.
    """
    for fname in listdir(directory):
        fpath = join(directory, fname)
        if not isfile(fpath):
            continue
        # Read and close the file *before* yielding so that no file handle
        # stays open while the generator is suspended.
        with open(fpath, 'r') as f:
            has_ids = f.read().strip() != ''
        if has_ids:
            yield fname, fpath

def get_friends(trump_follower, fpath):
    """Iterate over the friend ids stored in a follower's friends file.

    The file is expected to hold comma-separated ids. Whitespace around each
    id (for example a trailing newline) is stripped and empty entries are
    skipped, so that every yielded value is a usable id.

    Args:
        trump_follower: id of the follower; unused, kept for the callers.
        fpath: path to the follower's friends file.

    Yields:
        Each non-empty friend id as a string, in file order.
    """
    with open(fpath, 'r') as f:
        raw_ids = f.read().split(',')
    for raw_id in raw_ids:
        friend = raw_id.strip()
        if friend:
            yield friend

def get_user_data(user, directory=DATA_DIR+'/users'):
    """Load the JSON profile stored for a user.

    Args:
        user: user id, which is also the name of the profile file.
        directory: directory holding one JSON profile file per user.

    Returns:
        The decoded JSON content, or None when there is no file for `user`.
    """
    profile_path = join(directory, user)
    if isfile(profile_path):
        with codecs.open(profile_path, 'r', 'utf-8') as profile_file:
            return json.load(profile_file)
    return None

def get_users_data(users):
    """Iterate over the profiles of several users.

    Args:
        users: iterable of user ids.

    Yields:
        For each id, in order, whatever get_user_data() returns for it
        (None when the user has no profile file).
    """
    for u in users:
        # the loop variable is `u`; the previous `user` was an undefined name
        yield get_user_data(u)

def id_to_str(id_):
    """Return the display name of a user, or the id itself when unknown.

    Args:
        id_: user id to look up with get_user_data().
    """
    profile = get_user_data(id_)
    if profile is None:
        # no profile on disk: fall back to the raw id
        return id_
    return '%s' % profile['name']

def id_to_screenname(id_):
    """Return the screen name of a user, or an empty string when unknown.

    Args:
        id_: user id to look up with get_user_data().
    """
    profile = get_user_data(id_)
    if profile is None:
        # no profile on disk: there is no screen name to show
        return ''
    return '%s' % profile['screen_name']

### Helpers to build the network

In [28]:
import networkx as nx # pip install networkx
import community      # pip install python-louvain
import numpy as np    # pip install numpy

# Module-level counter that get_network() resets and then increments once per
# non-outlier Trump follower; it is later used to compute percentages.
num_trump_followers = 0 # this counts the final number of Trump followers in the network

def get_followers_and_friends():
    """Load the follower -> friends relation and its inverse.

    Returns:
        A (followers, their_friends) tuple of dicts:
        followers maps each Trump follower id to the set of its friend ids,
        their_friends maps each friend id to the set of Trump followers
        that follow it.
    """
    followers = {}
    their_friends = {}
    for follower_id, friends_path in get_trump_followers():
        friend_ids = set()
        followers[follower_id] = friend_ids
        for friend_id in get_friends(follower_id, friends_path):
            friend_ids.add(friend_id)
            # create the inverse entry on first sight, then record the follower
            their_friends.setdefault(friend_id, set()).add(follower_id)

    return followers, their_friends

def get_outliers(followers, their_friends, min_num_followers, num_supernodes_to_drop):
    ''' Collect the ids of all nodes that should be left out of the network.

        Three filters are applied and their results are merged:
        1. Trump's followers whose number of friends is below the first quartile
           (Q1) of all friend counts, or above Q3 + 1.5 * interquartile range.
        2. Their friends: any account followed by fewer than `min_num_followers`
           of the Trump followers in the data. Accounts with very few followers
           are not useful in the analysis.
        3. The supernodes: the `num_supernodes_to_drop` accounts followed by the
           most Trump followers. Trump will likely be at the top, but removing
           others as well should help get rid of some noise.

        `followers` maps a follower id to its friends and `their_friends` maps a
        friend id to its followers, as built by get_followers_and_friends().
        Progress is printed along the way. Returns a set of ids.
    '''

    print('Outlier detection...')

    # 1. followers with too few or too many friends
    friends_counts = [len(ffriends) for ffriends in followers.values()]
    friends_q1, friends_q3 = np.percentile(friends_counts, [25, 75])
    min_friends = friends_q1
    max_friends = friends_q3 + 1.5 * (friends_q3 - friends_q1)
    print('\tAllowing followers with friends between %.1f and %.1f' % (min_friends, max_friends))

    outliers = set()
    for follower in followers:
        num_friends = len(followers[follower])
        if num_friends < min_friends or num_friends > max_friends:
            outliers.add(follower)

    print('\tNumber of outlier Trump followers that will be ignored: %d' % len(outliers))

    # 2. friends that are followed by too few of Trump's followers
    friends_outliers = set(friend for friend in their_friends
                           if len(their_friends[friend]) < min_num_followers)
    print('\tNumber of outlier friends of Trump\'s followers that will be ignored: %d' % len(friends_outliers))

    outliers.update(friends_outliers)

    # 3. the most followed accounts
    ranked_friends = list(their_friends.keys())
    ranked_friends.sort(key=lambda friend: len(their_friends[friend]), reverse=True)
    top_followed = ranked_friends[:num_supernodes_to_drop]
    print('\tRemoving top %d followed accounts:' % num_supernodes_to_drop)
    print(', '.join(id_to_screenname(id_) for id_ in top_followed))

    outliers.update(top_followed)

    return outliers

def get_network(followers, outliers=None):
    """Build the directed follower -> friend graph, leaving out the outliers.

    An edge is added from every non-outlier Trump follower to each of its
    non-outlier friends. Every node that gets an edge carries 'name' and
    'screenname' attributes (empty strings when the user has no profile file)
    so that the nodes can be inspected in Gephi.

    Side effects: resets and updates the module-level `num_trump_followers`
    counter, and prints summary statistics of the graph.

    Args:
        followers: dict mapping a Trump follower id to its friend ids.
        outliers: optional collection of ids to leave out; defaults to none.

    Returns:
        The resulting networkx.DiGraph.
    """
    global num_trump_followers
    num_trump_followers = 0

    if outliers is None:
        outliers = set()

    G = nx.DiGraph()
    annotated = set()  # ids whose profile was already loaded and attached

    def annotate(user_id):
        # Reading a profile is slow disk IO, so do it once per node rather
        # than once per edge.
        if user_id in annotated:
            return
        annotated.add(user_id)
        data = get_user_data(user_id)
        # add_node() on an existing node only updates its attributes
        G.add_node(user_id,
                   name=data['name'] if data else '',
                   screenname=data['screen_name'] if data else '')

    for trump_follower in followers:
        if trump_follower in outliers:
            continue
        # NOTE: counted even if all of the follower's friends are outliers,
        # in which case the follower does not end up as a node in the graph.
        num_trump_followers += 1

        for friend in followers[trump_follower]:
            if friend in outliers:
                continue

            G.add_edge(trump_follower, friend)

            # add some attributes to the nodes so we can inspect the nodes in Gephi
            annotate(trump_follower)
            annotate(friend)

    # dict(...) accepts both the plain dict returned by networkx 1.x and the
    # degree view returned by networkx 2.x
    degrees = list(dict(nx.degree(G)).values())

    print('Number of Trump followers in the network: %d' % num_trump_followers)
    print('Number of nodes: %d' % G.order())
    print('Number of edges: %d' % G.size())
    print('Average degree: %s' % np.mean(degrees))

    return G

def build_network(min_num_followers=10, num_supernodes_to_drop=50):
    """Load the data, filter out the outliers and build the network.

    Args:
        min_num_followers: friends followed by fewer Trump followers than
            this are treated as outliers (passed to get_outliers()).
        num_supernodes_to_drop: number of most-followed accounts to remove
            (passed to get_outliers()).

    Returns:
        The graph built by get_network() from the remaining nodes.
    """
    # 1. first get all of trump's followers and their friends
    followers, their_friends = get_followers_and_friends()
    print('Found %d Trump followers.' % len(followers))

    # 2. then get outliers
    outliers = get_outliers(followers, their_friends,
                            min_num_followers=min_num_followers,
                            num_supernodes_to_drop=num_supernodes_to_drop)
    print('\tTotal outlier nodes: %d' % len(outliers))

    # 3. finally, build the graph from the remaining followers
    return get_network(followers, outliers)

In [29]:
# Build the filtered follower -> friend graph; progress and summary
# statistics are printed while it runs.
G = build_network()

Found 12902 Trump followers.
Outlier detection...
	Allowing followers with friends between 48.0 and 822.4
	Number of outlier Trump followers that will be ignored: 4619
	Number of outlier friends of Trump's followers that will be ignored: 2092741
	Removing top 50 followed accounts:
realDonaldTrump, BarackObama, cnnbrk, jimmyfallon, TheEllenShow, nytimes, katyperry, taylorswift13, CNN, BillGates, Forbes, KimKardashian, ladygaga, justinbieber, WSJ, jtimberlake, FoxNews, SportsCenter, YouTube, KingJames, Oprah, espn, rihanna, KevinHart4real, TheEconomist, WhiteHouse, ActuallyNPH, kanyewest, ConanOBrien, selenagomez, danieltosh, BBCBreaking, richardbranson, Reuters, NFL, Drake, HillaryClinton, NASA, MileyCyrus, POTUS, twitter, khloekardashian, MarketWatch, NBA, kourtneykardash, blakeshelton, wizkhalifa, BBCWorld, britneyspears, TIME
	Total outlier nodes: 2097261
Number of Trump followers in the network: 8178
Number of nodes: 48008
Number of edges: 813916
Average degree: 33.9075154141


### Optional: Save the network for use with Gephi

In [11]:
# Export the graph, including its node attributes, to a GraphML file inside DATA_DIR.
nx.write_graphml(G, DATA_DIR+'/network.graphml')

### Calculate the communities in the network

In [30]:
# Louvain community detection works on undirected graphs, so edge direction
# is dropped first. The algorithm is randomised: a fixed random_state keeps
# the partition (and therefore the community ids reported below) the same
# from one run to the next. This requires a python-louvain release whose
# best_partition() accepts the `random_state` argument.
communities = community.best_partition(G.to_undirected(), resolution=1.0, random_state=42)
print('Number of communities: %d' % len(set(communities.values())))

Number of communities: 18


### Calculate the size of each community and how many Trump followers are in each

In [31]:
# Number of nodes in each community.
sizes = dict()
for c in communities.values():
    sizes[c] = sizes.get(c, 0) + 1

# Ids of all Trump followers that have a non-empty friends file. This also
# includes the followers that were dropped from the graph as outliers; they
# have no entry in `communities`.
trump_followers = list(follower_id for follower_id, _fpath in get_trump_followers())

# Number of Trump followers in each community.
followers_per_community = dict()
for c in sizes:
    followers_per_community[c] = sum(1 for follower in trump_followers
                                     if communities.get(follower, -1) == c)

### Compute a centrality score for each node

In [32]:
# PageRank score of every node in the directed graph, keyed by node id.
# Alternative centrality measure, kept for reference:
# scores = nx.eigenvector_centrality(G, max_iter=1000)
scores = nx.pagerank(G)

# Results

### For each community, print the top N nodes based on their scores

In [33]:
N = 10

# Go through the communities, starting with the one that holds the most Trump followers.
ranked_communities = sorted(followers_per_community, key=followers_per_community.get, reverse=True)
for community_id in ranked_communities:

    print('Community %d' % (community_id,))

    community_size = sizes[community_id]
    community_followers = followers_per_community[community_id]
    percent_overall = 100.0 * community_size / len(communities)
    percent_trump   = 100.0 * community_followers / num_trump_followers

    summary = '\tNumber of nodes: %d (%%%.2f), out of which, %d are Trump followers (%%%.2f)'
    print(summary % (community_size, percent_overall, community_followers, percent_trump))

    # nodes of this community, best PageRank score first
    ranked_nodes = sorted((n for n in scores if communities[n] == community_id),
                          key=scores.get, reverse=True)
    print('\tTop %d Nodes (PageRank):' % N)
    for node in ranked_nodes[:N]:
        print('\t\t%s\t%.12f\t' % (id_to_str(node), scores[node]))

Community 4
	Number of nodes: 11234 (%23.40), out of which, 1742 are Trump followers (%21.30)
	Top 10 Nodes (PageRank):
		Dr. Ben Carson	0.000265679505	
		Ted Cruz	0.000217594350	
		Marco Rubio	0.000190006283	
		SHAQ	0.000174564531	
		Kevin Durant	0.000149938824	
		Megyn Kelly	0.000140340060	
		Tiger Woods	0.000136802297	
		Kobe Bryant	0.000125326549	
		Sean Hannity	0.000124281932	
		Mitt Romney	0.000115294762	
Community 1
	Number of nodes: 5387 (%11.22), out of which, 1547 are Trump followers (%18.92)
	Top 10 Nodes (PageRank):
		Ariana Grande	0.000299297842	
		Kendall	0.000281915571	
		Kylie Jenner	0.000278941210	
		Stephen Colbert	0.000271917690	
		Aziz Ansari	0.000269557233	
		Seth MacFarlane	0.000247524672	
		Funny Or Die	0.000246660854	
		Mark Cuban	0.000237261108	
		Bill Clinton	0.000228092338	
		UberFacts	0.000227512505	
Community 5
	Number of nodes: 7376 (%15.36), out of which, 1352 are Trump followers (%16.53)
	Top 10 Nodes (PageRank):
		Anderson Cooper	0.000215303009	
		Ivank

### The final result: for each community, calculate the top N friends for Trump's followers in that community.

In [34]:
N = 10

def get_friends_counts_from_graph(users, G):
    """Count how many of the given users follow each friend.

    Args:
        users: iterable of node ids whose outgoing edges are inspected.
        G: graph object providing out_edges(node), which yields
           (source, target) pairs.

    Returns:
        A dict mapping each target node to the number of outgoing edges
        that point to it from the given users.
    """
    counts = {}
    for user in users:
        for _source, target in G.out_edges(user):
            counts[target] = counts.get(target, 0) + 1
    return counts

# For each community, starting with the one that holds the most Trump
# followers, print the accounts most followed by its Trump followers.
# The loop variable must not be called `community`: that name is the imported
# python-louvain module, and rebinding it would break re-running the cell
# that computes the partition.
for community_id in sorted(followers_per_community.keys(), key=followers_per_community.get, reverse=True):
    print('Community %d' % (community_id,))

    percent_overall = 100.0 * sizes[community_id] / len(communities)
    percent_trump   = 100.0 * followers_per_community[community_id] / num_trump_followers

    print('\tNumber of nodes: %d (%%%.2f), out of which, %d are Trump followers (%%%.2f)' %
                (sizes[community_id], percent_overall, followers_per_community[community_id], percent_trump))

    # Trump followers that were assigned to this community; followers left out
    # of the graph have no community and never match
    trump_followers_in_community = list(follower for follower in trump_followers
                                        if communities.get(follower, -1) == community_id)

    friend_counts = get_friends_counts_from_graph(trump_followers_in_community, G)
    print('\tTotal number of friends: %d' % len(friend_counts))
    print('\tTop %d Friends of Trump\'s followers:' % N)
    for friend in sorted(friend_counts.keys(), key=friend_counts.get, reverse=True)[:N]:
        print('\t\t%s\t%d' % (id_to_str(friend), friend_counts[friend]))

Community 4
	Number of nodes: 11234 (%23.40), out of which, 1742 are Trump followers (%21.30)
	Total number of friends: 21683
	Top 10 Friends of Trump's followers:
		Adam Schefter	431
		Dr. Ben Carson	424
		Sean Hannity	407
		Megyn Kelly	392
		Marco Rubio	386
		Ted Cruz	358
		Ann Coulter	328
		Bill O'Reilly	320
		Mark Cuban	314
		SHAQ	299
Community 1
	Number of nodes: 5387 (%11.22), out of which, 1547 are Trump followers (%18.92)
	Total number of friends: 15483
	Top 10 Friends of Trump's followers:
		Aziz Ansari	518
		United Nations	470
		Mike Bloomberg	438
		Vogue Magazine	396
		Mindy Kaling	395
		NPR	390
		Mark Cuban	389
		Twitter TV	383
		CHANEL	382
		Funny Or Die	379
Community 5
	Number of nodes: 7376 (%15.36), out of which, 1352 are Trump followers (%16.53)
	Total number of friends: 20753
	Top 10 Friends of Trump's followers:
		ashton kutcher	423
		Tom Hanks	403
		Ryan Seacrest	396
		Anderson Cooper	362
		P!nk	358
		Ivanka Trump	350
		Jim Carrey	344
		Jimmy Kimmel	328
		Charlie Sh