-
Notifications
You must be signed in to change notification settings - Fork 49
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
789 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,232 @@ | ||
{ | ||
"metadata": { | ||
"name": "", | ||
"signature": "sha256:07b06af47856760f4b8fd7d0ee04bd3d6d2038a9a1be22dcd94ee62d8df15fe3" | ||
}, | ||
"nbformat": 3, | ||
"nbformat_minor": 0, | ||
"worksheets": [ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "heading", | ||
"level": 1, | ||
"metadata": {}, | ||
"source": [ | ||
"Crawling Wikipedia" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"This notebook crawls links on Wikipedia\n", | ||
"and visualizes the graph with NetworkX and d3." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"%matplotlib inline\n", | ||
"import matplotlib.pyplot as plt" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"from IPython.html import widgets\n", | ||
"from IPython.display import display\n", | ||
"from eventful_graph import EventfulGraph\n", | ||
"from widget_forcedirectedgraph import ForceDirectedGraphWidget, publish_js\n", | ||
"publish_js()" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"import networkx as nx" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"from IPython import parallel\n", | ||
"rc = parallel.Client()\n", | ||
"lbv = rc.load_balanced_view()" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"%%px --local\n", | ||
"\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"\n", | ||
"import re\n", | ||
"wiki_pat = re.compile(r'^/wiki/([^:]*)$')\n" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"def links_for_page(title):\n", | ||
" page = BeautifulSoup(requests.get('http://en.wikipedia.org/wiki/%s' % title).text)\n", | ||
" links = page.find(\"div\", id=\"content\").findAll(\"a\", href=wiki_pat)\n", | ||
" \n", | ||
" titles = []\n", | ||
" for link in links:\n", | ||
" title = wiki_pat.match(link['href']).group(1)\n", | ||
" titles.append(title)\n", | ||
" \n", | ||
" return titles" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"def add_node(g, label, **kwargs):\n", | ||
" \"\"\"add a node to a graph, with some default fill and color\"\"\"\n", | ||
" kwargs.setdefault('fill', '#ccc')\n", | ||
" kwargs.setdefault('color', 'black')\n", | ||
" g.add_node(label, label=label, **kwargs)" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"def add_links(graph, src, links):\n", | ||
" \"\"\"Add links from src to links in graph\"\"\"\n", | ||
" new_nodes = []\n", | ||
" add_node(graph, src)\n", | ||
" n = len(links)\n", | ||
" for i,link in enumerate(links):\n", | ||
" if link not in graph:\n", | ||
" new_nodes.append(link)\n", | ||
" add_node(graph, link)\n", | ||
" \n", | ||
" graph.add_edge(src, link)#, distance=(i+0.2))\n", | ||
" return new_nodes" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"def wikipedia_graph(lbview, root, limit=32, in_degree_limit=3):\n", | ||
" \"\"\"build a graph by crawling Wikipedia from a root page\n", | ||
" \n", | ||
" The visualized graph will be limited to pages linked from several other pages\n", | ||
" \"\"\"\n", | ||
" graph = nx.DiGraph()\n", | ||
" egraph = EventfulGraph()\n", | ||
"\n", | ||
" graph_widget = ForceDirectedGraphWidget(egraph, width=800, height=600)\n", | ||
" display(graph_widget)\n", | ||
" \n", | ||
" add_node(graph, root)\n", | ||
" add_node(egraph, root, r=16, fill='#aef')\n", | ||
" surface = [root]\n", | ||
" while len(egraph) < limit:\n", | ||
" surface = [ node for node in graph if graph.out_degree(node) == 0 ]\n", | ||
" amr = lbview.map_async(links_for_page, surface)\n", | ||
" for i, links in enumerate(amr):\n", | ||
" src = surface[i]\n", | ||
" links = links[:20]\n", | ||
" add_links(graph, src, links)\n", | ||
" for node in links:\n", | ||
" if graph.in_degree(node) >= in_degree_limit:\n", | ||
" path = nx.shortest_path(graph, root, node)\n", | ||
" prv = root\n", | ||
" for nxt in path[1:]:\n", | ||
" if nxt not in egraph:\n", | ||
" add_node(egraph, nxt)\n", | ||
" egraph.add_edge(prv, nxt)\n", | ||
" egraph.node[nxt]['r'] = min(3 * graph.in_degree(nxt), 24)\n", | ||
" prv = nxt\n", | ||
" for parent in graph.predecessors(node):\n", | ||
" if parent in egraph:\n", | ||
" egraph.add_edge(parent, node)\n", | ||
" egraph.node[node]['r'] = min(3 * graph.in_degree(node), 24)\n", | ||
" for child in graph.successors(node):\n", | ||
" if child in egraph:\n", | ||
" egraph.add_edge(node, child)\n", | ||
" egraph.node[child]['r'] = min(3 * graph.in_degree(child), 24)\n", | ||
" time.sleep(0.3)\n", | ||
" if len(egraph) > limit:\n", | ||
" return graph, egraph\n", | ||
" print('%s: %i' % (src, len(graph)))\n", | ||
" sys.stdout.flush()\n", | ||
" return graph, egraph\n", | ||
" " | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [], | ||
"prompt_number": 73 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"g, eg = wikipedia_graph(lbv, 'SciPy', limit=20)" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [ | ||
"g, eg = wikipedia_graph(lbv, 'Austin, TX', limit=12)" | ||
], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"collapsed": false, | ||
"input": [], | ||
"language": "python", | ||
"metadata": {}, | ||
"outputs": [] | ||
} | ||
], | ||
"metadata": {} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import time | ||
|
||
class EventfulDict(dict):
    """Dictionary that notifies registered callbacks when it is modified.

    Three kinds of events are fired:

    * add -- a key that was not present is assigned; ``callback(key, value)``
    * set -- an existing key is reassigned; ``callback(key, value)``
    * del -- a key is removed; ``callback(key)``

    A plain ``dict`` assigned as a value is wrapped in its own
    ``EventfulDict``; any change to that nested dictionary is reported as a
    set event of the parent for the key it is stored under.
    """

    def __init__(self, *args, **kwargs):
        """Sleep is an optional float that allows you to tell the
        dictionary to hang for the given amount of seconds on each
        event.  This is useful for animations.

        All other arguments are forwarded to ``dict``.  No callbacks are
        registered yet at this point, so the initial contents fire no events.
        """
        self._sleep = kwargs.pop('sleep', 0.0)
        self._add_callbacks = []
        self._del_callbacks = []
        self._set_callbacks = []
        dict.__init__(self, *args, **kwargs)

    def on_add(self, callback, remove=False):
        """Register (or, with ``remove=True``, unregister) an add callback."""
        self._register_callback(self._add_callbacks, callback, remove)

    def on_del(self, callback, remove=False):
        """Register (or, with ``remove=True``, unregister) a del callback."""
        self._register_callback(self._del_callbacks, callback, remove)

    def on_set(self, callback, remove=False):
        """Register (or, with ``remove=True``, unregister) a set callback."""
        self._register_callback(self._set_callbacks, callback, remove)

    def _register_callback(self, callback_list, callback, remove=False):
        """Add ``callback`` to, or remove it from, ``callback_list``.

        Registering a callback twice or removing an unknown one is a no-op.
        Raises ``Exception`` when ``callback`` is not callable.
        """
        if not callable(callback):
            raise Exception('Callback must be callable.')
        if remove:
            if callback in callback_list:
                callback_list.remove(callback)
        elif callback not in callback_list:
            callback_list.append(callback)

    def _handle_add(self, key, value):
        """Fire the add callbacks, then apply the optional delay."""
        self._try_callbacks(self._add_callbacks, key, value)
        self._try_sleep()

    def _handle_del(self, key):
        """Fire the del callbacks, then apply the optional delay."""
        self._try_callbacks(self._del_callbacks, key)
        self._try_sleep()

    def _handle_set(self, key, value):
        """Fire the set callbacks, then apply the optional delay."""
        self._try_callbacks(self._set_callbacks, key, value)
        self._try_sleep()

    def _try_callbacks(self, callback_list, *pargs, **kwargs):
        """Call every callback in ``callback_list`` with the given arguments."""
        for callback in callback_list:
            callback(*pargs, **kwargs)

    def _try_sleep(self):
        """Block for ``self._sleep`` seconds when a positive delay is set."""
        if self._sleep > 0.0:
            time.sleep(self._sleep)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` and fire an add or set event."""
        # Must be decided before storing: afterwards the key always exists.
        exists = key in self

        # If the user sets the property to a new dict, make the dict
        # eventful and listen to the changes of it ONLY if it is not
        # already eventful.  Any modification to this new dict will
        # fire a set event of the parent dict.
        if isinstance(value, dict) and not isinstance(value, EventfulDict):
            new_dict = EventfulDict(value)

            def handle_change(*pargs, **kwargs):
                self._try_callbacks(self._set_callbacks, key,
                                    dict.__getitem__(self, key))

            new_dict.on_add(handle_change)
            new_dict.on_del(handle_change)
            new_dict.on_set(handle_change)
            dict.__setitem__(self, key, new_dict)
        else:
            dict.__setitem__(self, key, value)

        # The callbacks receive the value exactly as the caller passed it.
        if exists:
            self._handle_set(key, value)
        else:
            self._handle_add(key, value)

    def __delitem__(self, key):
        """Remove ``key`` and fire a del event; raises ``KeyError`` if absent."""
        dict.__delitem__(self, key)
        self._handle_del(key)

    def pop(self, key, *default):
        """Remove ``key`` and return its value, firing a del event.

        Like ``dict.pop``, an optional ``default`` is returned when the key is
        missing; without it a missing key raises ``KeyError``.  The del event
        fires only when a key was actually removed.
        """
        # Presence has to be recorded first: once dict.pop has run the key
        # is gone and can no longer be detected.
        existed = key in self
        return_val = dict.pop(self, key, *default)
        if existed:
            self._handle_del(key)
        return return_val

    def popitem(self):
        """Remove and return an arbitrary ``(key, value)`` pair.

        Fires a del event for the removed key.  Raises ``KeyError`` when the
        dictionary is empty.
        """
        popped = dict.popitem(self)
        # dict.popitem either raises or returns a pair, and None is a legal
        # key, so the event is fired unconditionally.
        self._handle_del(popped[0])
        return popped

    def update(self, other_dict=(), **kwargs):
        """Assign many items, firing an add or set event for each one.

        ``other_dict`` may be a mapping (anything with ``keys()``) or an
        iterable of ``(key, value)`` pairs; keyword arguments are applied
        afterwards, mirroring ``dict.update``.
        """
        if hasattr(other_dict, 'keys'):
            for key in other_dict.keys():
                self[key] = other_dict[key]
        else:
            for key, value in other_dict:
                self[key] = value
        for key, value in kwargs.items():
            self[key] = value

    def clear(self):
        """Remove every item, firing a del event for each key."""
        # Iterate over a snapshot: deleting while iterating a live view fails.
        for key in list(self.keys()):
            del self[key]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
"""NetworkX graphs do not have events that can be listened to. In order to | ||
watch the NetworkX graph object for changes a custom eventful graph object must | ||
be created. The custom eventful graph object will inherit from the base graph | ||
object and use special eventful dictionaries instead of standard Python dict | ||
instances. Because NetworkX nests dictionaries inside dictionaries, it's | ||
important that the eventful dictionary is capable of recognizing when a | ||
dictionary value is set to another dictionary instance. When this happens, the | ||
eventful dictionary needs to also make the new dictionary an eventful | ||
dictionary. This allows the eventful dictionary to listen to changes made to | ||
dictionaries within dictionaries.""" | ||
import networkx | ||
from networkx.generators.classic import empty_graph | ||
|
||
from eventful_dict import EventfulDict | ||
|
||
class EventfulGraph(networkx.Graph):
    """NetworkX graph whose internal dictionaries are ``EventfulDict``s.

    Replacing ``graph``, ``node`` and ``adj`` with eventful dictionaries lets
    observers register callbacks on them to follow changes to the graph.
    """

    # Class-wide hook invoked with every newly constructed instance.
    _constructed_callback = None

    @staticmethod
    def on_constructed(callback):
        """Register a callback to be called when a graph is constructed.

        Passing ``None`` clears the hook.  Any other non-callable value is
        ignored and the current hook is kept.
        """
        if callback is None or callable(callback):
            EventfulGraph._constructed_callback = callback

    def __init__(self, *pargs, **kwargs):
        """Initialize a graph with edges, name, graph attributes.

        Parameters
        ----------
        sleep: float
            optional float that allows you to tell the
            dictionary to hang for the given amount of seconds on each
            event.  This is useful for animations.

        All other arguments are forwarded to ``networkx.Graph``.
        """
        # networkx.Graph treats keyword arguments as graph attributes, so
        # ``sleep`` is taken out before delegating; otherwise the animation
        # delay would also be recorded as an attribute of the graph.
        sleep = kwargs.pop('sleep', 0.0)
        super(EventfulGraph, self).__init__(*pargs, **kwargs)

        # Override internal dictionaries with custom eventful ones.
        self.graph = EventfulDict(self.graph, sleep=sleep)
        self.node = EventfulDict(self.node, sleep=sleep)
        self.adj = EventfulDict(self.adj, sleep=sleep)

        # Notify callback of construction event.
        if EventfulGraph._constructed_callback:
            EventfulGraph._constructed_callback(self)
|
||
|
||
def empty_eventfulgraph_hook(*pargs, **kwargs):
    """Return a stand-in for ``empty_graph`` that builds ``EventfulGraph``s.

    ``pargs`` and ``kwargs`` are captured and handed to the ``EventfulGraph``
    constructor each time the returned function is called.
    """
    def wrapped(*wpargs, **wkwargs):
        """Wrapper for networkx.generators.classic.empty_graph(...)"""
        # A fresh graph per call; any caller-supplied create_using is replaced.
        eventful = EventfulGraph(*pargs, **kwargs)
        wkwargs['create_using'] = eventful
        return empty_graph(*wpargs, **wkwargs)

    return wrapped
Oops, something went wrong.