# The Wikipedia Hyperlink Graph of Pages in the Categories of Computational Biology & Bioinformatics
## By Moses Boudourides

In [1]:
import networkx as nx
import pygraphviz
from networkx.drawing.nx_agraph import graphviz_layout
from networkx.algorithms import community
import community as louvain
import numpy as np
import pickle
import sys 
import holoviews as hv
from holoviews import dim, opts
hv.extension('bokeh', 'matplotlib')

In [2]:
# Put the directory that holds the project-local `mytools` module at the front
# of the import search path so that the import below can resolve.
# NOTE(review): this is an absolute, machine-specific location; on any other
# machine `mytools.py` has to be made importable some other way.
_mytools_dir = '/Users/moses/WorkPlaces/Python Projects 2/WikipediaNets/ComputationalBiology&Bioinformatics'
sys.path.insert(0, _mytools_dir)
from mytools import (
    connected_component_subgraphs,
    g_diagnostics,
    g_diameter,
    HVNX_PLOT,
    GirvanNewmanComms,
    AsyncLabelPropagationComms,
    FluidComms,
)


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [3]:
Title="Categories of Computational Biology & Bioinformatics"


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(security): unpickling can execute arbitrary code, so this must only
    be pointed at files produced by this project, never at untrusted input.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Page collections read from the working directory.
# NOTE(review): presumably pcb / pbi / pco are collections of page titles for
# the individual categories -- confirm against the code that wrote these files.
pcb = _load_pickle("pcb.pkl")
pbi = _load_pickle("pbi.pkl")
pco = _load_pickle("pco.pkl")

# Hyperlinks among the pages; the file name is built from Title with its
# spaces removed.
eds = _load_pickle("Hyperlinks_Among_Wikipedia_"+Title.replace(" ","")+".pkl")

# Report how many hyperlinks were loaded (message typo "peges" corrected).
print('There are %i hyperlinks among the "%s" Wikipedia pages' %(len(eds),Title))

There are 1175 hyperlinks among the "Categories of Computational Biology & Bioinformatics" Wikipedia peges


In [4]:
# Build the directed hyperlink graph: every entry of `eds` becomes one edge,
# and the endpoints of each edge are added as nodes along the way.
Gd = nx.DiGraph()
Gd.add_edges_from(eds)

# Sanity check: show the nodes that have no incident edge at all.
isolated_pages = list(nx.isolates(Gd))
print(isolated_pages)

# One-line size summary of the graph.
n_pages = len(Gd.nodes())
n_links = len(Gd.edges())
ss = "The '%s' directed graph has %i nodes (webpages) and %i edges (hyperlinks)" % (
    Title,
    n_pages,
    n_links,
)
print(ss)

[]
The 'Categories of Computational Biology & Bioinformatics' directed graph has 282 nodes (webpages) and 1175 edges (hyperlinks)


In [5]:
# Label every page with the category it is filed under.
# Membership is tested against `pcb` first, so a page that appears in both
# `pcb` and `pbi` is labelled "Computational Biology"; a page found in neither
# collection receives the combined label.
type_d = {
    n: (
        "Computational Biology" if n in pcb
        else "Bioinformatics" if n in pbi
        else "Computational Biology and Bioinformatics"
    )
    for n in Gd.nodes()
}

# Store the label on the graph as the 'type' node attribute.
for n, label in type_d.items():
    Gd.nodes[n]['type'] = label
    
# Girvan-Newman communities:
# The generator is advanced twice and the second partition it yields is the
# one used below.
gncp = community.girvan_newman(Gd)
top_level_communities = next(gncp)   # first partition; not used further in this cell
next_level_communities = next(gncp)  # second partition; the one that is kept

# Normalise to a list of lists: members sorted inside each community, the
# communities sorted among themselves, then stably re-sorted by size so that
# the largest community comes first.
lc = [sorted(members) for members in next_level_communities]
lc.sort()
lc.sort(key=len, reverse=True)

# Map every node to the rank of its community (0 = largest).
gncp_d = {node: rank for rank, members in enumerate(lc) for node in members}

# Asynchronous Label Propagation communities:
# The algorithm is randomised, so an explicit seed is passed; without it the
# partition (and every attribute derived from it) changes from one run of the
# notebook to the next.
alpc=nx.algorithms.community.label_propagation.asyn_lpa_communities(Gd, seed=42)
# Largest community first; the sort is stable, so communities of equal size
# keep the order in which the algorithm produced them.
alpc=sorted(alpc, key=len, reverse=True)
# Map every node to the rank of its community (0 = largest).
alpc_d={n:i for i,c in enumerate(alpc) for n in c}

# Node-attribute names of the two partitions and the partitions themselves,
# kept as parallel lists: dictc[i] is the node -> community-index map that
# belongs to the label comms[i].
comms=["Girvan-Newman communities","Asynchronous label propagation communities"]
dictc=[gncp_d,alpc_d]

# Bind each partition by its position in the list.
# (The previous lookup, `dictc[c.index(c)]`, searched the label string inside
# itself, which always gives 0, so both names were bound to the Girvan-Newman
# map and the label-propagation result was never used.)
dictc0, dictc1 = dictc

# Community indices start at 0, so the number of communities is max index + 1.
nc0=max(dictc0.values())+1
nc1=max(dictc1.values())+1

# Degree views of the directed graph; indexing a view by node gives its degree.
indegree_d = Gd.in_degree
outdegree_d = Gd.out_degree

# Attach the degrees and both community indices to every node as attributes.
for n in Gd.nodes():
    attrs = Gd.nodes[n]
    attrs.update({
        'node in-degree': indegree_d[n],
        'node out-degree': outdegree_d[n],
        comms[0]: dictc0[n],
        comms[1]: dictc1[n],
    })

In [6]:
# --- Hyperlink digraph drawn with bundled edges ------------------------------
# Every setting below is forwarded positionally to the project helper
# HVNX_PLOT; see mytools for the exact meaning of each argument.

# Titles
name = "digraph of Wikipedia pages in the %s" % Title
title = "The %s (with bundled edges)" % name

# Layout: node positions computed by Graphviz.
pos = graphviz_layout(Gd)

# Canvas
width = 1000
height = 1000
bgcolor = 'white'
fontsize = {'title': '9pt'}

# Drawing mode flags
bundled = 1
nodelabels = 1

# Node appearance: colour by the 'type' attribute; size is an expression over
# the 'node in-degree' dimension, 2*log(5 + in-degree).
node_color = 'type'
node_cmap = ["lime", "dodgerblue", "gold", "orangered", "yellow", "green", "magenta"]
node_size = 2*np.log(5+hv.dim('node in-degree'))
node_hover_fill_color = 'red'

# Node-label text
xoffset = 0
yoffset = -5
text_font_size = '5pt'
text_color = "darkgrey"

# Edge appearance
edge_color = 'lightblue'
edge_line_width = 1
edge_hover_line_color = 'green'
arrowhead_length = 0.02

# Selection behaviour
selection_mode = 'nodes'
selection_policy = "nodes"

g = HVNX_PLOT(
    Gd, pos, width, height, bundled, nodelabels, title,
    node_size, node_color, node_cmap, edge_color, edge_line_width,
    xoffset, yoffset, arrowhead_length, selection_mode, selection_policy,
    edge_hover_line_color, node_hover_fill_color,
    fontsize, text_font_size, text_color, bgcolor,
)
g.opts(title=title)

# Write a standalone HTML copy of the figure, then display it inline.
hv.save(g, 'WikipediaNetworkHyperlinkComputationalBiology&Bioinformatics_11.html', backend='bokeh')
g

In [7]:
# --- Same digraph drawn without edge bundling --------------------------------
# Only the settings that differ are reassigned here; every other argument of
# the HVNX_PLOT call keeps whatever value it already has in the kernel.
title = "The %s" % name
bundled = 0
nodelabels = 1
arrowhead_length = 0.005

g = HVNX_PLOT(
    Gd, pos, width, height, bundled, nodelabels, title,
    node_size, node_color, node_cmap, edge_color, edge_line_width,
    xoffset, yoffset, arrowhead_length, selection_mode, selection_policy,
    edge_hover_line_color, node_hover_fill_color,
    fontsize, text_font_size, text_color, bgcolor,
)
g.opts(title=title)

# Write a standalone HTML copy of the figure, then display it inline.
hv.save(g, 'WikipediaNetworkHyperlinkComputationalBiology&Bioinformatics_01.html', backend='bokeh')
g