In [8]:
import pickle
import networkx
from itertools import combinations

creating_pref_attach_graph = False;
creating_pref_attach_reduced_graph = False;
creating_shortest_paths_graph = True;

### LINKS
Tutorial how to use networkX : <a href="https://networkx.github.io/documentation/stable/tutorial.html#accessing-edges-and-neighbors"> networkx.github.io</a> 

In [2]:
with open('base_data/G_california.p', 'rb') as f:
    G = pickle.load(f)

In [3]:
print("Number of nodes : ", G.number_of_nodes())
print("Number of edges : ", G.number_of_edges())
print("\n10 first nodes :\n", "\t"+ "\n\t".join([str(node) for node in list(G.nodes())[:10]]))
print("\n10 first edges :\n", "\t" +"\n\t".join([str(edge) for edge in list(G.edges())[:10]]))

Number of nodes :  9816
Number of edges :  16424

10 first nodes :
 	hortau
	spiritual-gangster
	brit
	seatninja
	bluedata-software
	village-defense
	netherfire-entertainment
	dubuc-motors
	reflektive
	bundled-bliss

10 first edges :
 	('hortau', 'inv_advantage-capital-partners')
	('hortau', 'inv_avrio-capital')
	('spiritual-gangster', 'inv_m3-ventures')
	('brit', 'inv_shervin-pishevar')
	('brit', 'inv_general-catalyst-partners')
	('brit', 'inv_jim-felding')
	('brit', 'inv_cowboy-ventures')
	('brit', 'inv_marissa-mayer')
	('brit', 'inv_jennifer-hyman')
	('brit', 'inv_ff-angel-llc')


In [4]:
invs = [node for node in list(G.nodes()) if node[:4]=='inv_']
vs = [node for node in list(G.nodes()) if node[:4]!='inv_']
n_invs= len(invs)
n_vs = len(vs)
print("Extracted {0} ventures and {1} investors so {2} (/{3}) nodes ".format(len(invs), len(vs), len(invs)+len(vs),  G.number_of_nodes()))

Extracted 5883 ventures and 3933 investors so 9816 (/9816) nodes 


### 2 - Preferential attachement based on degrees

In this part we are interested in preferential attachement, which states that two nodes are likely to be linked when they are already of *high degree* which means, that those nodes are already having a lot of links (edges).

In the following we show that **preferential attachement of (venture, inv) is 16 times bigger when indeed a link exists between the two stakeholders**. This results might be discussed because, nodes which are connected in the real graph, because of being connected, are likely to have high degree.

To verify more subtily our result, we should get rid of one existing link, calculate the pref attachement between the two nodes, and then suppose that this link is likely to happen, in the same graph as initial except that special edge. To do that, we should just reduce by one the degree of the two nodes in the measure of metric_pref_attach. The result falls back to **14 times larger**

In [None]:

max_dv_dinv_inverse = ((n_invs * n_vs)**(-1))
                       
def metric_pref_attach(node_v, node_inv, G,  max_dv_dinv_inverse):
    return  max_dv_dinv_inverse * G.degree(node_v) * G.degree(node_inv)
                       
def metric_pref_attach_reduced(node_v, node_inv, G,  max_dv_dinv_inverse):
    return  max_dv_dinv_inverse * (G.degree(node_v) -1) * (G.degree(node_inv)-1)

In [11]:
%%time
if creating_pref_attach_graph :
    print("Creating")
    G2 = networkx.Graph()
    G2.add_nodes_from(G)
    i = 0
    for v_i in vs :
        if not i%1000 :
            print(i)
        i+=1
        for inv_j in invs :
            G2.add_edge(v_i, inv_j, weight = metric_pref_attach(v_i, inv_j, G, max_dv_dinv_inverse))
    pickle.dump( G2, open( "output_data/pref_attach_edges_graph.p", "wb" ) )
else :
    print("Loading G2")
    with open('output_data/pref_attach_edges_graph.p', 'rb') as f:
        G2 = pickle.load(f)

Loading G2
CPU times: user 23.1 s, sys: 39.4 s, total: 1min 2s
Wall time: 1min 6s


In [12]:
existing_investment_edges = set(G.edges)
all_possible_edges = set(G2.edges)
all_not_existing_investment_edges = all_possible_edges - existing_investment_edges
print("Number of investment edges : ",len(existing_investment_edges))
print("Number of non-invested edges : ", len(all_not_existing_investment_edges))
print("Theoretically, possible edges are Nv x Ninv = ", len(vs)*len(invs))
print("Matching : ", len(existing_investment_edges) + len(all_not_existing_investment_edges) == len(vs)*len(invs))


Number of investment edges :  16424
Number of non-invested edges :  23121415
Theoretically, possible edges are Nv x Ninv =  23137839
Matching :  True


In [None]:
%%time
if creating_pref_attach_reduced_graph :
    print("Creating")
    G3 = networkx.Graph()
    G3.add_nodes_from(G)
    i = 0
    for v_i in vs :
        if not i%200 :
            print(i)
        i+=1
        for inv_j in invs :
            if (v_i,inv_j) in existing_investment_edges :
                G3.add_edge(v_i, inv_j, weight = metric_pref_attach_reduced(v_i, inv_j, G, max_dv_dinv_inverse))
            else : 
                G3.add_edge(v_i, inv_j, weight = metric_pref_attach(v_i, inv_j, G, max_dv_dinv_inverse))
    pickle.dump( G3, open( "output_data/pref_attach_reduced_edges_graph.p", "wb" ) )
else :
    print("Loading G3")
    with open('output_data/pref_attach_reduced_edges_graph.p', 'rb') as f:
        G3 = pickle.load(f)

In [None]:
sum_of_existing_edges_pref_value = 0
sum_of_non_existing_edges_pref_value = 0
for v, inv in existing_investment_edges :
    sum_of_existing_edges_pref_value += G2[v][inv]['weight']
for v, inv in all_not_existing_investment_edges :
    sum_of_non_existing_edges_pref_value += G2[v][inv]['weight']
mean_existing_pref = sum_of_existing_edges_pref_value / len(existing_investment_edges)
mean_non_existing_pref = sum_of_non_existing_edges_pref_value / len(all_not_existing_investment_edges)
print(("When a link exists, preferential attachement is on average {0}\n and {1} "
      + "when not existing").format(mean_existing_pref, mean_non_existing_pref))

In [None]:
sum_of_existing_edges_pref_value = 0
sum_of_non_existing_edges_pref_value = 0
for v, inv in existing_investment_edges :
    sum_of_existing_edges_pref_value += G3[v][inv]['weight']
for v, inv in all_not_existing_investment_edges :
    sum_of_non_existing_edges_pref_value += G3[v][inv]['weight']
mean_existing_pref = sum_of_existing_edges_pref_value / len(existing_investment_edges)
mean_non_existing_pref = sum_of_non_existing_edges_pref_value / len(all_not_existing_investment_edges)
print(("Method #2 : \nWhen a link exists, preferential attachement is on average {0}\n and {1} "
      + "when not existing").format(mean_existing_pref, mean_non_existing_pref))

In [None]:
6.82331770893643e-06 / 4.985113415602395e-07

### 3 - Measuring the connection between two nodes 

* Hitting time or Commute time (based on random walks).
* Personalized Pagerank.
* G4 - Shortest path length (or geodesic distance) Note that in weighted graph u can use **Djikstra**

In [16]:
%%time
from copy import deepcopy
Gtemp = deepcopy(G)
if creating_shortest_paths_graph :
    print("Creating shortest paths")
    G4 = networkx.Graph()
    G4.add_nodes_from(G)
    i = 0
    for v_i in vs :
        if not i%100 :
            print(i)
        i+=1
        for inv_j in invs :
            stored_edge = False
            if (v_i, inv_j) in existing_investment_edges :
                stored_edge = True
                Gtemp.remove_edge(v_i,inv_j)
            try :
                G4.add_edge(v_i, inv_j, weight = networkx.shortest_path_length(G, v_i, inv_j))
                Gtemp.add_edge(v_i,inv_j)
            except :
                Gtemp.add_edge(v_i,inv_j)
    pickle.dump( G4, open( "output_data/creating_shortest_paths_graph.p", "wb" ) )
else :
    print("Loading G4")
    with open('output_data/creating_shortest_paths_graph.p', 'rb') as f:
        G4 = pickle.load(f)

Creating shortest paths
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
CPU times: user 27min 19s, sys: 2min 19s, total: 29min 38s
Wall time: 29min 53s


In [None]:
G.