Skip to content

Commit

Permalink
add Wikipedia example
Browse files Browse the repository at this point in the history
  • Loading branch information
minrk committed Jul 6, 2014
1 parent df3a04f commit 42ae138
Show file tree
Hide file tree
Showing 5 changed files with 789 additions and 0 deletions.
232 changes: 232 additions & 0 deletions examples/wikipedia/Wikipedia.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
{
"metadata": {
"name": "",
"signature": "sha256:07b06af47856760f4b8fd7d0ee04bd3d6d2038a9a1be22dcd94ee62d8df15fe3"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Crawling Wikipedia"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook crawls links on Wikipedia\n",
"and visualizes the graph with NetworkX and d3."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.html import widgets\n",
"from IPython.display import display\n",
"from eventful_graph import EventfulGraph\n",
"from widget_forcedirectedgraph import ForceDirectedGraphWidget, publish_js\n",
"publish_js()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import networkx as nx"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython import parallel\n",
"rc = parallel.Client()\n",
"lbv = rc.load_balanced_view()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%px --local\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"import re\n",
"wiki_pat = re.compile(r'^/wiki/([^:]*)$')\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def links_for_page(title):\n",
" page = BeautifulSoup(requests.get('http://en.wikipedia.org/wiki/%s' % title).text)\n",
" links = page.find(\"div\", id=\"content\").findAll(\"a\", href=wiki_pat)\n",
" \n",
" titles = []\n",
" for link in links:\n",
" title = wiki_pat.match(link['href']).group(1)\n",
" titles.append(title)\n",
" \n",
" return titles"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def add_node(g, label, **kwargs):\n",
" \"\"\"add a node to a graph, with some default fill and color\"\"\"\n",
" kwargs.setdefault('fill', '#ccc')\n",
" kwargs.setdefault('color', 'black')\n",
" g.add_node(label, label=label, **kwargs)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def add_links(graph, src, links):\n",
" \"\"\"Add links from src to links in graph\"\"\"\n",
" new_nodes = []\n",
" add_node(graph, src)\n",
" n = len(links)\n",
" for i,link in enumerate(links):\n",
" if link not in graph:\n",
" new_nodes.append(link)\n",
" add_node(graph, link)\n",
" \n",
" graph.add_edge(src, link)#, distance=(i+0.2))\n",
" return new_nodes"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def wikipedia_graph(lbview, root, limit=32, in_degree_limit=3):\n",
" \"\"\"build a graph by crawling Wikipedia from a root page\n",
" \n",
" The visualized graph will be limited to pages linked from several other pages\n",
" \"\"\"\n",
" graph = nx.DiGraph()\n",
" egraph = EventfulGraph()\n",
"\n",
" graph_widget = ForceDirectedGraphWidget(egraph, width=800, height=600)\n",
" display(graph_widget)\n",
" \n",
" add_node(graph, root)\n",
" add_node(egraph, root, r=16, fill='#aef')\n",
" surface = [root]\n",
" while len(egraph) < limit:\n",
" surface = [ node for node in graph if graph.out_degree(node) == 0 ]\n",
" amr = lbview.map_async(links_for_page, surface)\n",
" for i, links in enumerate(amr):\n",
" src = surface[i]\n",
" links = links[:20]\n",
" add_links(graph, src, links)\n",
" for node in links:\n",
" if graph.in_degree(node) >= in_degree_limit:\n",
" path = nx.shortest_path(graph, root, node)\n",
" prv = root\n",
" for nxt in path[1:]:\n",
" if nxt not in egraph:\n",
" add_node(egraph, nxt)\n",
" egraph.add_edge(prv, nxt)\n",
" egraph.node[nxt]['r'] = min(3 * graph.in_degree(nxt), 24)\n",
" prv = nxt\n",
" for parent in graph.predecessors(node):\n",
" if parent in egraph:\n",
" egraph.add_edge(parent, node)\n",
" egraph.node[node]['r'] = min(3 * graph.in_degree(node), 24)\n",
" for child in graph.successors(node):\n",
" if child in egraph:\n",
" egraph.add_edge(node, child)\n",
" egraph.node[child]['r'] = min(3 * graph.in_degree(child), 24)\n",
" time.sleep(0.3)\n",
" if len(egraph) > limit:\n",
" return graph, egraph\n",
" print('%s: %i' % (src, len(graph)))\n",
" sys.stdout.flush()\n",
" return graph, egraph\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 73
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"g, eg = wikipedia_graph(lbv, 'SciPy', limit=20)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"g, eg = wikipedia_graph(lbv, 'Austin, TX', limit=12)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
100 changes: 100 additions & 0 deletions examples/wikipedia/eventful_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import time

class EventfulDict(dict):
    """Dictionary that notifies registered callbacks when it is modified.

    Three kinds of events are emitted:

    * add -- a key that was not present is assigned; callbacks are called
      with ``(key, value)``.
    * set -- a key that was already present is assigned, or a nested
      dictionary stored under that key is modified; callbacks are called
      with ``(key, value)``.
    * del -- a key is removed; callbacks are called with ``(key)``.

    A plain ``dict`` assigned through ``__setitem__`` is stored as a new
    ``EventfulDict`` copy whose events are forwarded to this dictionary's
    set callbacks.
    """

    def __init__(self, *args, **kwargs):
        """Create the dictionary.

        Takes the same arguments as ``dict``, plus an optional ``sleep``
        keyword: a float that tells the dictionary to hang for the given
        number of seconds after each event. This is useful for animations.
        """
        self._sleep = kwargs.pop('sleep', 0.0)
        self._add_callbacks = []
        self._del_callbacks = []
        self._set_callbacks = []
        dict.__init__(self, *args, **kwargs)

    def on_add(self, callback, remove=False):
        """Register (or, with ``remove=True``, unregister) an add callback."""
        self._register_callback(self._add_callbacks, callback, remove)

    def on_del(self, callback, remove=False):
        """Register (or, with ``remove=True``, unregister) a del callback."""
        self._register_callback(self._del_callbacks, callback, remove)

    def on_set(self, callback, remove=False):
        """Register (or, with ``remove=True``, unregister) a set callback."""
        self._register_callback(self._set_callbacks, callback, remove)

    def _register_callback(self, callback_list, callback, remove=False):
        """Add ``callback`` to, or remove it from, ``callback_list``.

        Registering a callback twice, or removing one that is not
        registered, is a no-op. Raises ``Exception`` if ``callback`` is not
        callable.
        """
        if not callable(callback):
            raise Exception('Callback must be callable.')
        if remove:
            if callback in callback_list:
                callback_list.remove(callback)
        elif callback not in callback_list:
            callback_list.append(callback)

    def _handle_add(self, key, value):
        self._try_callbacks(self._add_callbacks, key, value)
        self._try_sleep()

    def _handle_del(self, key):
        self._try_callbacks(self._del_callbacks, key)
        self._try_sleep()

    def _handle_set(self, key, value):
        self._try_callbacks(self._set_callbacks, key, value)
        self._try_sleep()

    def _try_callbacks(self, callback_list, *pargs, **kwargs):
        """Call every callback in ``callback_list`` with the given arguments."""
        for callback in callback_list:
            callback(*pargs, **kwargs)

    def _try_sleep(self):
        """Pause for the configured delay, if one was requested."""
        if self._sleep > 0.0:
            time.sleep(self._sleep)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` and fire an add or set event."""
        # Must be decided before the store, which makes the key present.
        existed = key in self

        # If the user sets the property to a new dict, make the dict
        # eventful and listen to the changes of it ONLY if it is not
        # already eventful. Any modification to this new dict will
        # fire a set event of the parent dict.
        if isinstance(value, dict) and not isinstance(value, EventfulDict):
            nested = EventfulDict(value)

            def handle_change(*pargs, **kwargs):
                # Report whatever is currently stored under ``key``.
                self._try_callbacks(
                    self._set_callbacks, key, dict.__getitem__(self, key))

            nested.on_add(handle_change)
            nested.on_del(handle_change)
            nested.on_set(handle_change)
            dict.__setitem__(self, key, nested)
        else:
            dict.__setitem__(self, key, value)

        # Callbacks receive the value exactly as the caller passed it.
        if existed:
            self._handle_set(key, value)
        else:
            self._handle_add(key, value)

    def __delitem__(self, key):
        """Remove ``key`` and fire a del event; raises ``KeyError`` if absent."""
        dict.__delitem__(self, key)
        self._handle_del(key)

    def pop(self, key, *default):
        """Remove ``key`` and return its value, firing a del event.

        As with ``dict.pop``, an optional default is returned when the key
        is absent (no event is fired in that case); without a default a
        missing key raises ``KeyError``.
        """
        # Presence has to be checked BEFORE popping: afterwards the key is
        # gone and a post-hoc membership test could never succeed.
        if key in self:
            value = dict.pop(self, key)
            self._handle_del(key)
            return value
        # Let dict.pop supply the default or raise KeyError/TypeError.
        return dict.pop(self, key, *default)

    def popitem(self):
        """Remove and return an arbitrary ``(key, value)`` pair.

        Fires a del event for the removed key. Raises ``KeyError`` when the
        dictionary is empty.
        """
        popped = dict.popitem(self)
        # dict.popitem either raises or returns a pair, so a removal always
        # happened here -- including when the key itself is None.
        self._handle_del(popped[0])
        return popped

    def update(self, other_dict=(), **kwargs):
        """Assign every given item through ``__setitem__`` so events fire.

        ``other_dict`` may be a mapping (anything with ``items``) or an
        iterable of ``(key, value)`` pairs; keyword arguments are applied
        afterwards.
        """
        if hasattr(other_dict, 'items'):
            pairs = other_dict.items()
        else:
            pairs = other_dict
        for key, value in pairs:
            self[key] = value
        for key, value in kwargs.items():
            self[key] = value

    def clear(self):
        """Delete every key one at a time, firing a del event for each."""
        # Iterate over a snapshot: deleting while iterating the live dict
        # is an error.
        for key in list(self.keys()):
            del self[key]
52 changes: 52 additions & 0 deletions examples/wikipedia/eventful_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""NetworkX graphs do not have events that can be listened to. In order to
watch the NetworkX graph object for changes a custom eventful graph object must
be created. The custom eventful graph object will inherit from the base graph
object and use special eventful dictionaries instead of standard Python dict
instances. Because NetworkX nests dictionaries inside dictionaries, it's
important that the eventful dictionary is capable of recognizing when a
dictionary value is set to another dictionary instance. When this happens, the
eventful dictionary needs to also make the new dictionary an eventful
dictionary. This allows the eventful dictionary to listen to changes made to
dictionaries within dictionaries."""
import networkx
from networkx.generators.classic import empty_graph

from eventful_dict import EventfulDict

class EventfulGraph(networkx.Graph):
    """NetworkX graph whose internal dictionaries are ``EventfulDict`` objects.

    After construction the ``graph``, ``node`` and ``adj`` attributes are
    replaced with eventful dictionaries so that changes to them can be
    observed through ``on_add`` / ``on_set`` / ``on_del`` callbacks.
    """

    # Class-wide hook called with every newly constructed graph; None
    # means no hook is installed.
    _constructed_callback = None

    @staticmethod
    def on_constructed(callback):
        """Register a callback to be called when a graph is constructed.

        Passing ``None`` removes the current callback. Any other
        non-callable value is ignored and leaves the current callback in
        place.
        """
        if callback is None or callable(callback):
            EventfulGraph._constructed_callback = callback

    def __init__(self, *pargs, **kwargs):
        """Initialize a graph with edges, name, graph attributes.

        Parameters
        ----------
        sleep : float, optional
            Tells the eventful dictionaries to hang for the given number of
            seconds on each event. This is useful for animations.

        All other arguments are passed on to ``networkx.Graph``.
        """
        # Take ``sleep`` out before delegating: networkx.Graph stores extra
        # keyword arguments as graph attributes, so leaving it in would
        # add a spurious 'sleep' entry to ``self.graph``.
        sleep = kwargs.pop('sleep', 0.0)
        super(EventfulGraph, self).__init__(*pargs, **kwargs)

        # Override internal dictionaries with custom eventful ones.
        self.graph = EventfulDict(self.graph, sleep=sleep)
        self.node = EventfulDict(self.node, sleep=sleep)
        self.adj = EventfulDict(self.adj, sleep=sleep)

        # Notify callback of construction event.
        if EventfulGraph._constructed_callback:
            EventfulGraph._constructed_callback(self)


def empty_eventfulgraph_hook(*pargs, **kwargs):
    """Build a replacement for ``empty_graph`` that yields eventful graphs.

    The positional and keyword arguments given here are forwarded to the
    ``EventfulGraph`` constructor every time the returned function is
    called.
    """
    def wrapped(*wpargs, **wkwargs):
        """Wrapper for networkx.generators.classic.empty_graph(...)"""
        # A new EventfulGraph is built on each call and always replaces any
        # ``create_using`` supplied by the caller.
        target = EventfulGraph(*pargs, **kwargs)
        return empty_graph(*wpargs, **dict(wkwargs, create_using=target))
    return wrapped
Loading

0 comments on commit 42ae138

Please sign in to comment.