In [None]:
# Examples from Mining the Social Web, section 7

In [6]:
import json
import sys
import os
from operator import itemgetter
from collections import Counter
import webbrowser
from ConfigParser import ConfigParser

import requests
from github import Github
import networkx as nx  # pip install networkx
from networkx.readwrite import json_graph

# Load config.ini and pull the three GitHub credentials out of its [github] section
config = ConfigParser()
config.read('config.ini')

GITHUB_SECTION = 'github'
username = config.get(GITHUB_SECTION, 'username')
password = config.get(GITHUB_SECTION, 'password')
token = config.get(GITHUB_SECTION, 'token')

def create_access_token():
    # Note that credentials will be transmitted over a secure SSL connection
    url = 'https://api.github.com/authorizations'
    note = 'Mining the Social Web, 2nd Ed.'
    post_data = {'scopes': ['repo'], 'note': note}

    response = requests.post(url, auth=(username, password), data=json.dumps(post_data), )

    print "API response:", response.text
    if response.json().has_key('errors'):
        print response.json()['errors'][0]['code']
    else:
        print "Your OAuth token is", response.json()['token']

create_access_token()

API response: {"message":"Validation Failed","errors":[{"resource":"OauthAccess","code":"already_exists","field":"description"}],"documentation_url":"https://developer.github.com/v3/oauth_authorizations/#create-a-new-authorization"}
already_exists


Create the GitHub client object and fetch the raw API response for the repository's stargazers:

In [11]:
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'

# PyGithub client built from the configured token, then the user and repo objects
client = Github(token, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Derive the URL from USER/REPO so the raw request always targets the same
# repository as the PyGithub objects above.
url = "https://api.github.com/repos/%s/%s/stargazers" % (USER, REPO)
response = requests.get(url)
# Surface HTTP failures as a clear HTTPError here; an error payload is a dict,
# so indexing it with [0] below would otherwise fail with a confusing KeyError.
response.raise_for_status()
# Cell output: the first stargazer record, pretty-printed
json.dumps(response.json()[0], indent=1)

'{\n "following_url": "https://api.github.com/users/georgebellos/following{/other_user}", \n "events_url": "https://api.github.com/users/georgebellos/events{/privacy}", \n "organizations_url": "https://api.github.com/users/georgebellos/orgs", \n "url": "https://api.github.com/users/georgebellos", \n "gists_url": "https://api.github.com/users/georgebellos/gists{/gist_id}", \n "html_url": "https://github.com/georgebellos", \n "subscriptions_url": "https://api.github.com/users/georgebellos/subscriptions", \n "avatar_url": "https://avatars.githubusercontent.com/u/13009?v=3", \n "repos_url": "https://api.github.com/users/georgebellos/repos", \n "received_events_url": "https://api.github.com/users/georgebellos/received_events", \n "gravatar_id": "", \n "starred_url": "https://api.github.com/users/georgebellos/starred{/owner}{/repo}", \n "site_admin": false, \n "login": "georgebellos", \n "type": "User", \n "id": 13009, \n "followers_url": "https://api.github.com/users/georgebellos/followers"

Print response headers:

In [10]:
for (k, v) in response.headers.items():
    print k, "=>", v

Server => GitHub.com
Date => Sat, 05 Nov 2016 18:54:37 GMT
Content-Type => application/json; charset=utf-8
Transfer-Encoding => chunked
Status => 200 OK
X-RateLimit-Limit => 60
X-RateLimit-Remaining => 58
X-RateLimit-Reset => 1478375436
Cache-Control => public, max-age=60, s-maxage=60
Vary => Accept, Accept-Encoding
ETag => W/"ccecbb8f902b5668c3d206bef821fc7c"
X-GitHub-Media-Type => github.v3
Link => <https://api.github.com/repositories/1040700/stargazers?page=2>; rel="next", <https://api.github.com/repositories/1040700/stargazers?page=37>; rel="last"
Access-Control-Expose-Headers => ETag, Link, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval
Access-Control-Allow-Origin => *
Content-Security-Policy => default-src 'none'
Strict-Transport-Security => max-age=31536000; includeSubdomains; preload
X-Content-Type-Options => nosniff
X-Frame-Options => deny
X-XSS-Protection => 1; mode=block
X-Served-By => bae57

Get number of stargazers:

In [12]:
stargazers = [s for s in repo.get_stargazers()]
print "Number of stargazers", len(stargazers)

Number of stargazers 1084


Create stargazers graph:

In [13]:
g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)
for sg in stargazers:
    g.add_node(sg.login + '(user)', type='user')
    g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')

print nx.info(g)

Name: 
Type: DiGraph
Number of nodes: 1085
Number of edges: 1084
Average in degree:   0.9991
Average out degree:   0.9991


Query this graph:

In [16]:
print g.node['Mining-the-Social-Web(repo)']
print g.node['ptwobrussell(user)']
print
print g['ptwobrussell(user)']['Mining-the-Social-Web(repo)']
# The next line would throw a KeyError since no such edge exists:
# print g['Mining-the-Social-Web(repo)']['ptwobrussell(user)']
print g['ptwobrussell(user)']
print g['Mining-the-Social-Web(repo)']
print
print g.in_edges(['ptwobrussell(user)'])
print g.out_edges(['ptwobrussell(user)'])

{'lang': u'JavaScript', 'owner': u'ptwobrussell', 'type': 'repo'}
{'type': 'user'}

{'type': 'gazes'}
{u'Mining-the-Social-Web(repo)': {'type': 'gazes'}}
{}

[]
[('ptwobrussell(user)', u'Mining-the-Social-Web(repo)')]


Print the repository's stargazers (extract them from the repo node's incoming edges):

In [19]:
print [i for (i,o) in g.in_edges(['Mining-the-Social-Web(repo)'])]

[u'yobo000(user)', u'gregmoreno(user)', u'SathishRaju(user)', u'beinvest(user)', u'gawry(user)', u'decadef20(user)', u'darkfall(user)', u'daimajia(user)', u'henri-nourel(user)', u'chihuanqi(user)', u'program247365(user)', u'xiaodiu2010(user)', u'yoghi(user)', u'dylanthomas(user)', u'EugeneLiang(user)', u'longislandicetea(user)', u'jagguli(user)', u'ConceptKreator(user)', u'rgtjf(user)', u'daseme(user)', u'keokilee(user)', u'azuranop(user)', u'enriquesanchezb(user)', u'chhzhangs(user)', u'jfca(user)', u'terry2012(user)', u'yy(user)', u'batasrki(user)', u'yhj8341(user)', u'zerojarvis(user)', u'royburns(user)', u'coodoing(user)', u'bootstrapt(user)', u'ZoomQuiet(user)', u'trietptm(user)', u'bryantchan(user)', u'metllord(user)', u'flashus(user)', u'TomiToivio(user)', u'hammer(user)', u'paulbersch(user)', u'tranminhan(user)', u'amckenna(user)', u'zihaolucky(user)', u'khaing211(user)', u'thulio(user)', u'virajkulkarni14(user)', u'khurchla(user)', u'timmyshen(user)', u'jujubepalm(user)', u'ac

The classic Krackhardt kite graph

In [20]:
kkg = nx.generators.small.krackhardt_kite_graph()
print "Degree Centrality"
print sorted(nx.degree_centrality(kkg).items(), key=itemgetter(1), reverse=True)
print
print "Betweenness Centrality"
print sorted(nx.betweenness_centrality(kkg).items(), key=itemgetter(1), reverse=True)
print
print "Closeness Centrality"
print sorted(nx.closeness_centrality(kkg).items(), key=itemgetter(1), reverse=True)

Degree Centrality
[(3, 0.6666666666666666), (5, 0.5555555555555556), (6, 0.5555555555555556), (0, 0.4444444444444444), (1, 0.4444444444444444), (2, 0.3333333333333333), (4, 0.3333333333333333), (7, 0.3333333333333333), (8, 0.2222222222222222), (9, 0.1111111111111111)]

Betweenness Centrality
[(7, 0.38888888888888884), (5, 0.23148148148148148), (6, 0.23148148148148148), (8, 0.2222222222222222), (3, 0.10185185185185183), (0, 0.023148148148148143), (1, 0.023148148148148143), (2, 0.0), (4, 0.0), (9, 0.0)]

Closeness Centrality
[(5, 0.6428571428571429), (6, 0.6428571428571429), (3, 0.6), (7, 0.6), (0, 0.5294117647058824), (1, 0.5294117647058824), (2, 0.5), (4, 0.5), (8, 0.42857142857142855), (9, 0.3103448275862069)]


Add "follows" edges between stargazers in the graph if any relationships exist

In [None]:
    for i, sg in enumerate(stargazers):
        try:
            for follower in sg.get_followers():
                if follower.login + '(user)' in g:
                    g.add_edge(follower.login + '(user)', sg.login + '(user)', type='follows')
        except Exception, e:  # ssl.SSLError
            print >> sys.stderr, "Encountered an error fetching followers for", sg.login, "Skipping."
            print >> sys.stderr, e
        print "Processed", i + 1, " stargazers. Num nodes/edges in graph", \
            g.number_of_nodes(), "/", g.number_of_edges()
        print "Rate limit remaining", client.rate_limiting

nx.write_gpickle(g, "data/github.gpickle.1")

Load graph and analyze centrality measures:

In [25]:
g = nx.read_gpickle("data/github.gpickle.1")
# Create a copy of the graph so that we can iteratively mutate the copy as needed for experimentation
h = g.copy()

# Remove the seed of the interest graph, which is a supernode, in order to get a better idea of the network dynamics
h.remove_node('Mining-the-Social-Web(repo)')

# XXX: Remove any other nodes that appear to be supernodes. Filter any other nodes that you can by threshold
# criteria or heuristics from inspection.
dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Degree Centrality"
print dc[:10]
print

bc = sorted(nx.betweenness_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Betweenness Centrality"
print bc[:10]
print

print "Closeness Centrality"
cc = sorted(nx.closeness_centrality(h).items(), key=itemgetter(1), reverse=True)
print cc[:10]

Degree Centrality
[(u'kennethreitz(user)', 0.1218568665377176), (u'ptwobrussell(user)', 0.11121856866537717), (u'trietptm(user)', 0.05609284332688588), (u'JT5D(user)', 0.024177949709864605), (u'daimajia(user)', 0.02321083172147002), (u'rohithadassanayake(user)', 0.022243713733075435), (u'hammer(user)', 0.0183752417794971), (u'isnowfy(user)', 0.0183752417794971), (u'japerk(user)', 0.015473887814313346), (u'gawbul(user)', 0.013539651837524178)]

Betweenness Centrality
[(u'kennethreitz(user)', 0.0006747044501158732), (u'daimajia(user)', 0.000385723728188353), (u'hupili(user)', 0.000301463690477305), (u'rohithadassanayake(user)', 0.0002649510074691842), (u'mcroydon(user)', 0.0002552767068431009), (u'douglas(user)', 0.0002078414263539184), (u'acdha(user)', 0.00017179685466641448), (u'tswicegood(user)', 0.00011655971883361639), (u'trietptm(user)', 0.00011531142197863787), (u'isnowfy(user)', 0.00010298449053572532)]

Closeness Centrality
[(u'trietptm(user)', 0.0602264325668581), (u'ido(user)'

Add each stargazer's additional starred repos and add edges to find additional interests

In [None]:
MAX_REPOS = 500
for i, sg in enumerate(stargazers):
    print sg.login
    try:
        for starred in sg.get_starred()[:MAX_REPOS]:  # Slice to avoid supernodes
            g.add_node(starred.name + '(repo)', type='repo', lang=starred.language, \
                       owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
    except Exception, e:  # ssl.SSLError:
        print "Encountered an error fetching starred repos for", sg.login, "Skipping."
    print "Processed", i + 1, "stargazers' starred repos"
    print "Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
    print "Rate limit", client.rate_limiting

# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.2")

Load graph:

In [27]:
# Reload the snapshot that includes the starred-repo nodes and summarise it
snapshot_path = "data/github.gpickle.2"
g = nx.read_gpickle(snapshot_path)
print(nx.info(g))

Name: 
Type: DiGraph
Number of nodes: 82898
Number of edges: 202431
Average in degree:   2.4419
Average out degree:   2.4419


Analyse repositories:

In [28]:
# Get a list of repositories from the graph.
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']
# Most popular repos
print "Popular repositories"
print sorted([(n, d) for (n, d) in g.in_degree_iter()
              if g.node[n]['type'] == 'repo'], key=itemgetter(1), reverse=True)[:10]
print

# Projects gazed at by a user
print "Respositories that ptwobrussell has bookmarked"
print [(n, g.node[n]['lang']) for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'gazes']
print

# Programming languages for each user
print "Programming languages ptwobrussell is interested in"
print list(set([g.node[n]['lang'] for n in g['ptwobrussell(user)']
                if g['ptwobrussell(user)'][n]['type'] == 'gazes']))
print

# Find supernodes in the graph by approximating with a high number of outgoing edges
print "Supernode candidates"
print sorted([(n, len(g.out_edges(n))) for n in g.nodes_iter()
              if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500], key=itemgetter(1), reverse=True)

Popular repositories
[(u'Mining-the-Social-Web(repo)', 1035), (u'bootstrap(repo)', 281), (u'd3(repo)', 231), (u'dotfiles(repo)', 191), (u'free-programming-books(repo)', 158), (u'storm(repo)', 155), (u'node-v0.x-archive(repo)', 137), (u'requests(repo)', 135), (u'Mining-the-Social-Web-2nd-Edition(repo)', 133), (u'scrapy(repo)', 130)]

Respositories that ptwobrussell has bookmarked
[(u'gsoc-freebase-graph-importer(repo)', u'Java'), (u'schema(repo)', u'Clojure'), (u'dj-database-url(repo)', u'Python'), (u'stackdio(repo)', u'Python'), (u'x6502(repo)', u'C'), (u'TextBlob(repo)', u'Python'), (u'ipython-notebox(repo)', u'Ruby'), (u'networkx(repo)', u'Python'), (u'tiddlylisp(repo)', u'Python'), (u'fastly-py(repo)', u'Python'), (u'elasticsearch-py(repo)', u'Python'), (u'scikit-tensor(repo)', u'Python'), (u'webscalesql-5.6(repo)', u'C++'), (u'x-editable(repo)', u'JavaScript'), (u'premailer(repo)', u'Python'), (u'vowpal_wabbit(repo)', u'C++'), (u'CoreNLP(repo)', u'Java'), (u'torch-demos(repo)', u'L

Update graph to include nodes for programming languages by iterating over all of the repos, and adding edges for programming languages for each person in the graph. We'll also add edges back to repos so that we have a good point to "pivot" upon.

In [None]:
# Snapshot the repo node names first: the loop below adds nodes and edges to g
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Loop variables are deliberately not named `repo` / `stargazers`, so this cell
# does not overwrite the PyGithub objects of the same names that the crawling
# cells rely on when they are re-run.
for repo_node in repos:
    # A repo whose language is None/empty maps to the node named "(lang)"
    lang_node = (g.node[repo_node]['lang'] or "") + "(lang)"
    gazer_nodes = [u for (u, r, d) in g.in_edges_iter(repo_node, data=True)
                   if d['type'] == 'gazes']
    if not gazer_nodes:
        # No "gazes" edges: nothing is added for this repo
        continue
    # These two calls do not depend on the individual stargazer, so do them once per repo
    g.add_node(lang_node, type='lang')
    g.add_edge(lang_node, repo_node, type='implements')
    for gazer_node in gazer_nodes:
        g.add_edge(gazer_node, lang_node, type='programs')

nx.write_gpickle(g, "data/github.gpickle.3")

Load graph from pickle:

In [29]:
g = nx.read_gpickle("data/github.gpickle.3")
print nx.info(g)

Name: 
Type: DiGraph
Number of nodes: 83080
Number of edges: 301663
Average in degree:   3.6310
Average out degree:   3.6310


Query the graph to analyse use of programming languages:

In [30]:
# What languages exist in the graph?
print [n for n in g.nodes_iter() if g.node[n]['type'] == 'lang']
print

print 'What languages do users program with?'
print [n for n in g['ptwobrussell(user)'] if g['ptwobrussell(user)'][n]['type'] == 'programs']

print "Most popular languages"
print sorted([(n, g.in_degree(n)) for n in g.nodes_iter()
              if g.node[n]['type'] == 'lang'], key=itemgetter(1), reverse=True)[:10]
print

# How many users program in a particular language?
python_programmers = [u for (u, l) in g.in_edges_iter('Python(lang)') if g.node[u]['type'] == 'user']
print "Number of Python programmers:", len(python_programmers)

javascript_programmers = [u for (u, l) in g.in_edges_iter('JavaScript(lang)')
                          if g.node[u]['type'] == 'user']
print "Number of JavaScript programmers:", len(javascript_programmers)
print

# What users program in both Python and JavaScript?
print "Number of programmers who use JavaScript and Python"
print len(set(python_programmers).intersection(set(javascript_programmers)))

# Programmers who use JavaScript but not Python
print "Number of programmers who use JavaScript but not Python"
print len(set(javascript_programmers).difference(set(python_programmers)))

[u'CoffeeScript(lang)', u'Clojure(lang)', u'Objective-C++(lang)', u'Smali(lang)', u'PHP(lang)', u'Elixir(lang)', u'Liquid(lang)', u'Processing(lang)', u'Gnuplot(lang)', u'HTML(lang)', u'Rust(lang)', u'Pure Data(lang)', u'SuperCollider(lang)', u'Hy(lang)', u'Julia(lang)', u'Groovy(lang)', u'Haxe(lang)', u'CSS(lang)', u'Pascal(lang)', u'JavaScript(lang)', u'ApacheConf(lang)', u'Protocol Buffer(lang)', u'ActionScript(lang)', u'Common Lisp(lang)', u'QML(lang)', u'Xtend(lang)', u'Visual Basic(lang)', u'Objective-C(lang)', u'Delphi(lang)', u'KiCad(lang)', u'Objective-J(lang)', u'Scala(lang)', u'Smalltalk(lang)', u'Nginx(lang)', u'CMake(lang)', u'ASP(lang)', u'XML(lang)', u'Ruby(lang)', u'Logtalk(lang)', u'VHDL(lang)', u'LOLCODE(lang)', u'C++(lang)', u'RAML(lang)', u'PostScript(lang)', u'Vala(lang)', u'Bison(lang)', u'SaltStack(lang)', u'Frege(lang)', u'ChucK(lang)', u'Perl(lang)', u'Groff(lang)', u'Racket(lang)', u'Oz(lang)', u'SQLPL(lang)', u'LilyPond(lang)', u'F#(lang)', u'Opa(lang)', u'Pu

Create a subgraph from a collection of nodes. In this case, the collection is all of the users in the original interest graph

In [31]:
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)

print "Stats on the extracted subgraph"
print nx.info(h)

Stats on the extracted subgraph
Name: 
Type: DiGraph
Number of nodes: 1035
Number of edges: 716
Average in degree:   0.6918
Average out degree:   0.6918


Visualize the social network of all people from the original interest graph.

In [35]:
# IPython.display is the public home of both names
from IPython.display import IFrame, display

# Serialise the user subgraph in node-link form
# (presumably the data file that force.html loads; confirm against the page)
d = json_graph.node_link_data(h)
# Use a context manager so the file is flushed and closed before the page is shown
with open('visualization/force.json', 'w') as json_file:
    json.dump(d, json_file)

viz_file = 'visualization/force.html'
display(IFrame(viz_file, '100%', '900px'))