In [66]:
import pickle
import networkx as nx
from itertools import combinations
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import itertools
import math
from IPython.display import clear_output
with open('base_data/G_98-18_2.p', 'rb') as f:
    G = pickle.load(f)

THRESHOLD = 0.05
ENDYEAR = 2018
STARTYEAR = 2014
INVTHRESHOLDINT = 2**18

def isInv(inv):
    return inv > INVTHRESHOLDINT
def isVent(v):
    return 0 < v < INVTHRESHOLDINT

##### 0.1 Removing the investor 'inv_'

In [2]:
print(len(list(G.neighbors('inv_'))))
G.remove_node('inv_')

42783


In [3]:
investors = [inv for inv in G.nodes() if inv[:4]=='inv_']
ventures = [v for v in G.nodes() if v[:4] != 'inv_']
id_to_investor = {}
investor_to_id = {}
j = INVTHRESHOLDINT
k=0
for inv in investors :
    k+=1
    id_to_investor[j+k] = inv
    investor_to_id[inv] = j+k

j = 0
k = 0
id_to_ventures = {}
ventures_to_id = {}
for v in ventures :
    k+=1
    id_to_ventures[j+k] = v
    ventures_to_id[v] = j+k

nx.relabel_nodes(G, {**investor_to_id, **ventures_to_id}, copy=False )

<networkx.classes.graph.Graph at 0x7f592c165358>

In [4]:
%%time
def amount_correcter(GX):
    dates = []
    j = 0
    k = 0
    for v,i in GX.edges :
        try :
            dates.append(GX[v][i]['date'])
            k+=1
        except:
            j+=1
            continue
    divider = dict(zip(*np.unique(dates, return_counts=True)))
    print("No_dates edges", j)
    print("with date edges", k)

    for v,i in GX.edges :
        try :
            GX[v][i]['corrected_amount'] = GX[v][i]['amount']/(divider[GX[v][i]['date']])
        except:
            print(v,i, "no_date")
            continue
    return GX


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.87 µs


# I - Preparing Real Data

In this notebook, I will illustrate Zheng's paper process to predict links.
Please note that I took a smaller graph so that it's easy to understand
<strong>
* 1. Sample the data to have a precise time windows
* 2. Cleaning data : edges that are not present in 2008-2018 or with no dates
* 3. Taking the biggest subgraph
* 4. Are there investor - investor or v - v relations
* 5. Making 3months snapshots
* 6. Verifying that edges are not here twice (i,v and v,i are not present at the same time)</strong>

In [5]:
years = []
unknown2 = 0
for v,i in G.edges():
    years.append(int(str(G[v][i]['date'])[:4]))
    
unique_years1 =  dict(zip(*np.unique(years, return_counts=True)))

for year in range(1988, 2018):
    s = str(year) + "\t"
    try :
        s += str(unique_years1[year])+"\t edges this year"
    except:
        s+= "0\t"
    print(s)



1988	4	 edges this year
1989	2	 edges this year
1990	2	 edges this year
1991	4	 edges this year
1992	1	 edges this year
1993	3	 edges this year
1994	6	 edges this year
1995	8	 edges this year
1996	33	 edges this year
1997	81	 edges this year
1998	255	 edges this year
1999	1810	 edges this year
2000	8991	 edges this year
2001	3038	 edges this year
2002	2382	 edges this year
2003	2576	 edges this year
2004	3207	 edges this year
2005	4090	 edges this year
2006	5093	 edges this year
2007	6975	 edges this year
2008	6660	 edges this year
2009	5642	 edges this year
2010	8030	 edges this year
2011	11688	 edges this year
2012	15235	 edges this year
2013	20515	 edges this year
2014	26708	 edges this year
2015	32218	 edges this year
2016	32152	 edges this year
2017	31729	 edges this year


In [6]:
%%time
print("Number of nodes : " , len(G.nodes))
print("Number of edges : " , len(G.edges))
print("Number of investors : " , len([nod for nod in G.nodes if nod > INVTHRESHOLDINT]))

Number of nodes :  157405
Number of edges :  243928
Number of investors :  50293
CPU times: user 93.8 ms, sys: 0 ns, total: 93.8 ms
Wall time: 83.7 ms


##### What does the biggest connected subgraph looks like ?

In [7]:
%%time
connected_components= nx.connected_component_subgraphs(G)
graphs = sorted(list(connected_components), key = lambda x : len(x.node))
G2 = graphs[-1]
print("Number of nodes : " ,len(G2.nodes))
print("Number of edges : " , len(G2.edges))
print("Number of investors : " , len([nod for nod in G2.nodes if nod > INVTHRESHOLDINT]))

Number of nodes :  113224
Number of edges :  233371
Number of investors :  41210
CPU times: user 4.8 s, sys: 453 ms, total: 5.25 s
Wall time: 5.28 s


### 1. Subset by timestamp

We subset the Crunchbase data by taking advantage of the timestamps on each edge to create a network that includes all nodes that received or made an investment between 2005-2015. Due to the nature of the data collection, this preserved > 90% of the Crunchbase data on investments. To make this problem more tractable, we further restricted our analysis to the maximum weakly connected component of the graph (as observed in 2015) which consisted of ∼ 105k edges and ∼ 55k nodes (the next largest 
WCC has 20 nodes). In our network, there are ∼ 21k investors and ∼ 34k companies.


In [8]:
%%time
edges_08_18 = set([edge for edge in G.edges() if G[edge[0]][edge[1]] and str(G[edge[0]][edge[1]]['date'])[:4] in [str(x) for x in range(STARTYEAR, ENDYEAR)]])
not_edges_08_18_or_with_no_dates = set(G.edges()) - edges_08_18
print(len(edges_08_18), " between 2008 and 2018 versus {0} before".format(len(G2.edges)))
print(len(not_edges_08_18_or_with_no_dates), " deleted")

122807  between 2008 and 2018 versus 233371 before
121121  deleted
CPU times: user 1.53 s, sys: 93.8 ms, total: 1.62 s
Wall time: 1.63 s


### 2. Cleaning data : edges that are not present in 2008-2018 or with no dates

In [9]:
#Here we remove the edges with no data from G, and we verify that G has the good number of edges
G.remove_edges_from(not_edges_08_18_or_with_no_dates)
print(len(G.edges()))
subgraphs = nx.connected_component_subgraphs(G)
subgraphslist = list(subgraphs)

del(subgraphs)
del(not_edges_08_18_or_with_no_dates)
del(edges_08_18)

122807


### 3. Taking the biggest subgraph

In [10]:
number_of_subgraphs = len(subgraphslist)
sizes = [g.number_of_nodes() for g in subgraphslist]
mean_sizes = sum(sizes)/number_of_subgraphs
sizes2 = np.unique(sizes, return_counts=True)

print("Number of subgraphs :", number_of_subgraphs)
print("Mean size : ", mean_sizes)
print("Size \t Occurences")
for i in range(len(sizes2[0])):
    print(str(sizes2[0][i]) + "\t" + str(sizes2[1][i]))

biggest_subgraph = sorted(subgraphslist, key = lambda x : len(x.nodes()))[-1]
print("edges in biggest subgraph :", biggest_subgraph.number_of_edges())

del(sizes2)
del(sizes)
del(subgraphslist)

Number of subgraphs : 85052
Mean size :  1.8506913417673894
Size 	 Occurences
1	80391
2	3100
3	839
4	305
5	167
6	86
7	49
8	40
9	19
10	18
11	16
12	7
13	2
14	2
15	3
16	1
17	1
18	1
21	1
23	1
28	1
35	1
64195	1
edges in biggest subgraph : 114597


#### Observations :

The biggest connected subgraph between with only edges from 2008 - 2018 is 122K nodes, and 221K edges.

### 4. Are there investor - investor or v - v relations ?
While the majority of our data adhered to the binary division between investor and startups, ∼ 400 nodes had both
received and given investments over our 10-year observation period. To enforce the bipartite property, we assigned
entities with strictly higher out-degree to the investor group
(and removed their incoming investment edges) and the
other companies to the start-up group (and removed their
outgoing investment edges). This simple heuristic preserves
the maximum number of edges and, by manual inspection,
seems to perform well in distinguishing the primary function of an entity.

In [11]:
from time import time
if False:
    investors = [inv for inv in G.nodes() if isInv(inv)]
    print("n°investors:", len(investors))
    ventures = [v for v in G.nodes() if isVent(v)]
    print("Nventures:", len(ventures))
    abnormal_inv_inv_links = []
    abnormal_v_v_links = []
    i=0
    t0 = 0
    print("beginning detecting abnormal links inv-inv")
    existing_edges=set(G.edges())
    invComb = len(investors)**2
    for inv1, inv2 in combinations(investors, 2):
        if not(i%1000000):
            clear_output(wait=True)
            print(100*i/invComb, i, invComb, sep="\n")
        i+=1
        if (inv1,inv2) in existing_edges:
                print(inv1, inv2)
                abnormal_inv_inv_links.append((inv1,inv2))

    print("beginning detecting abnormal links v-v")
    i=0
    for v1, v2 in combinations(ventures,2):
        if (v1,v2) in existing_edges :
            print(v1,v2)
else :
    print('■ Jumping this part, because it\'s too long')

■ Jumping this part, because it's too long


### Observations :
Here there seem to be no invalid links. Zheng was working we some data where investors received some funds, and vis-versa, some ventures were investing.

### 5. Making 3months snapshots

In [12]:
G3 = biggest_subgraph
del G2
del biggest_subgraph

In [13]:
selectors = {}
for year in range(STARTYEAR, ENDYEAR+1):
    selectors[year] = {}
    for trimestre in [1,2,3,4] :
        mois = [3*(trimestre-1) + x for x in [1,2,3]]
        selectors[year][trimestre] = [(year, moi) for moi in mois]
        
def selectors_list(from_y_t, to_y_t) :
    list_of_matching_year_month = []
    if from_y_t[0] == to_y_t[0]:
        for trimester in range(from_y_t[1], to_y_t[1] +1) :
            list_of_matching_year_month+=(selectors[from_y_t[0]][trimester])
        return list_of_matching_year_month
    else :
        for year in range(from_y_t[0], to_y_t[0] +1):
            if year == from_y_t[0] :
                for trimester in range(from_y_t[1], 5):
                    list_of_matching_year_month+=(selectors[year][trimester])
            elif year == to_y_t[0] :
                for trimester in range(1, to_y_t[1]+1) :
                    list_of_matching_year_month+=(selectors[year][trimester])
            else:
                for trimester in range(1,5):
                    list_of_matching_year_month+=(selectors[year][trimester])
        return list_of_matching_year_month

def edges_with_year_trimester(G, from_y_t, to_y_t):
    yt_list = selectors_list(from_y_t, to_y_t)
    edges_corresponding = set([edge for edge in G.edges() if G[edge[0]][edge[1]] and (int(str(G[edge[0]][edge[1]]['date'])[:4]) , int(str(G[edge[0]][edge[1]]['date'])[5:7])) in yt_list])
    return(edges_corresponding)

### 6. Verifying that edges are not here twice (i,v and v,i are not present at the same time)
**Already_seen**

If (v,i) or (i,v) is twice in the graph, then it should print
<br/>Else nothing is printed. If everything is ok, nothing prints hereafter

In [14]:
already_seen = set()
j = 0
seen = 0
t0 = time()
for i,v in list(edges_with_year_trimester(G3, (STARTYEAR,1), (ENDYEAR,4))):
    j+=1
    if j%1000==0 :
        clear_output(wait=True)
        print(j, time()-t0)
        print("seen twice", seen)
        print("already_seen_size", len(already_seen))
    if (i,v) in already_seen or (v,i) in already_seen:
        print(i,v)
        seen+=1
    else :
        already_seen.add((i,v))

114000 1.0026047229766846
seen twice 0
already_seen_size 113999


# II - Scores used by Zheng's

### 0 - Candidates edges

In [15]:
print("■  STARTING DATE  to  LIMIT DATE - - - - PERC EDGES OF TOTAL")
for y in range(STARTYEAR,ENDYEAR):
    for t in range(1,5):
        edgesLocal = edges_with_year_trimester(G3, (STARTYEAR,1), (y, t))
        print("from {0} to {1} -- Nedges : ".format((STARTYEAR,1), (y,t)),round(100*len(edgesLocal)/len(G3.edges)), "% - - " , len(edgesLocal) )

■  STARTING DATE  to  LIMIT DATE - - - - PERC EDGES OF TOTAL
from (2014, 1) to (2014, 1) -- Nedges :  5 % - -  6156
from (2014, 1) to (2014, 2) -- Nedges :  11 % - -  12357
from (2014, 1) to (2014, 3) -- Nedges :  16 % - -  18904
from (2014, 1) to (2014, 4) -- Nedges :  22 % - -  25184
from (2014, 1) to (2015, 1) -- Nedges :  28 % - -  32657
from (2014, 1) to (2015, 2) -- Nedges :  35 % - -  40570
from (2014, 1) to (2015, 3) -- Nedges :  42 % - -  48340
from (2014, 1) to (2015, 4) -- Nedges :  48 % - -  55500
from (2014, 1) to (2016, 1) -- Nedges :  55 % - -  63456
from (2014, 1) to (2016, 2) -- Nedges :  62 % - -  71079
from (2014, 1) to (2016, 3) -- Nedges :  68 % - -  78377
from (2014, 1) to (2016, 4) -- Nedges :  74 % - -  85260
from (2014, 1) to (2017, 1) -- Nedges :  81 % - -  92839
from (2014, 1) to (2017, 2) -- Nedges :  87 % - -  100127
from (2014, 1) to (2017, 3) -- Nedges :  94 % - -  107731
from (2014, 1) to (2017, 4) -- Nedges :  100 % - -  114597


In [17]:
print("■ Write the (Year,trimester) as follow : YYYY,T")
limit70 = [int(x) for x in input("Please enter the data at which G is over 70% : ").split(",")]
print(limit70)
limit90 = [int(x) for x in input("Please enter the data at which G is over 90% : ").split(",")]
print(limit90)
limit100 = [int(x) for x in input("Please enter the data at which G is over 100% : ").split(",")]
print(limit100)

■ Write the (Year,trimester) as follow : YYYY,T
Please enter the data at which G is over 70% : 2016,4
[2016, 4]
Please enter the data at which G is over 90% : 2017,2
[2017, 2]
Please enter the data at which G is over 100% : 2017,4
[2017, 4]


In [18]:
def put_inv_on_right(set_of_links):
    S = set()
    r = 0
    n=0
    for v,i in set_of_links:
        if v > INVTHRESHOLDINT:
            S.add((i,v))
            r+=1
        else :
            S.add((v,i))
            n+=1
    print("Reversed / Not reversed : ", r, n)
    return S

print("■ Calculating G70, G90, G100 with Corrected Amounts on it")



edges70 = edges_with_year_trimester(G3, (STARTYEAR,1), limit70)
edges90 = edges_with_year_trimester(G3, (STARTYEAR,1), limit90)
edges100 = edges_with_year_trimester(G3, (STARTYEAR,1), limit100)
G70 = nx.Graph()
G90 = nx.Graph()
G100 = nx.Graph()
for v,i in put_inv_on_right(edges70) :
    G70.add_edge(v,i, amount = G[v][i]['amount'], date = G[v][i]['date'])
G70 = amount_correcter(G70)
for v,i in put_inv_on_right(edges90 ):
    G90.add_edge(v,i, amount = G[v][i]['amount'], date = G[v][i]['date'])
G90 = amount_correcter(G90)
for v,i in put_inv_on_right(edges100) :
    G100.add_edge(v,i, amount = G[v][i]['amount'], date = G[v][i]['date'])
G100 = amount_correcter(G100)

print("\n■ Calculating ventures, investors of G70")
ventures = [nod for nod in G70.nodes if nod< INVTHRESHOLDINT]
investors = [nod for nod in G70.nodes if nod > INVTHRESHOLDINT]

print("\n■ Calculating sets")
existing70 = set(put_inv_on_right(G70.edges()))
existing90 = set(put_inv_on_right(G90.edges()))
existing100 = set(put_inv_on_right(G100.edges()))


target_for_70 = existing90 - existing70
target_for_70_possible = [e for e in target_for_70 if e[0] in G70.nodes() and e[1] in G70.nodes()]
target_for_90 = existing100 - existing90
target_for_90_possible = [e for e in target_for_90 if e[0] in G90.nodes() and e[1] in G90.nodes()]

ventures70 = [v for v in G70.nodes() if v<INVTHRESHOLDINT]
investors70 = [i for i in G70.nodes() if i > INVTHRESHOLDINT]

ventures90 = [v for v in G90.nodes() if v<INVTHRESHOLDINT]
investors90 = [v for v in G90.nodes() if v>INVTHRESHOLDINT]

ventures100 = [v for v in G100.nodes() if v<INVTHRESHOLDINT]
investors100 = [i for i in G100.nodes() if i > INVTHRESHOLDINT]

print("\n■ DESCRIPTION")
desc = "■ DESCRIPTION OF UTILS \n\n"
desc += "G70 contains {0} nodes with\t {2} ventures, \t{1} edges, dates, and amounts (and corrected) are edges labels.".format(len(G70.nodes), len(G70.edges),len(ventures70))
desc += "\nG90 contains {0} nodes with\t {2} ventures, \t{1} edges, dates, and amounts (and corrected) are edges labels.".format(len(G90.nodes), len(G90.edges), len(ventures90))
desc += "\nG100 contains {0} nodes with\t {2} ventures, \t{1} edges, dates, and amounts (and corrected) are edges labels.".format(len(G100.nodes), len(G100.edges), len(ventures100))
desc += "\n\nThese graphs are extracted respectively between :"
desc += "\n\tG70  - YEAR :{} TRIMESTER : {} and YEAR:{} TRIMESTER : {}".format(STARTYEAR, 1, limit70[0],limit70[1])
desc += "\n\tG90  - YEAR :{} TRIMESTER : {} and YEAR:{} TRIMESTER : {}".format(STARTYEAR,1, limit90[0], limit90[1])
desc += "\n\tG100 - YEAR :{} TRIMESTER : {} and YEAR:{} TRIMESTER : {}".format(STARTYEAR, 1, limit100[0], limit100[1])
desc += "\nINVTHRESHOLDINT is {0} soit 2^18, if the number of the node is superior, it's an investor".format(INVTHRESHOLDINT)
desc += "\n\nBetween G70 and G90,\t {0} edges are created, but only {1} are from nodes in G70".format(len(target_for_70), len(target_for_70_possible))
desc += "\nBetween G90 and G100,\t {0} edges are created, but only {1} are from nodes in G90".format(len(target_for_90), len(target_for_90_possible))
desc += "\n↑ Above variables are contained in target_for_70 or _90 and target_for_70_possible and _90_possible"
desc += "\n\n It also means that there are 10^{0} candidates in G70 for target".format(round(math.log(len(ventures70)*len(investors70)-len(existing70),10)))
print(desc)

print("\n■ Dumping...")
utils = { "description": desc,
         "STARTDATE" : (STARTYEAR, 1),
         "INVTHRESHOLDINT" : INVTHRESHOLDINT, 
         "target_for_70":target_for_70, 
         "target_for_70_possible":target_for_70_possible, 
         "target_for_90":target_for_90, 
         "target_for_90_possible":target_for_90_possible, 
         #"target_edges":target_edges, 
         "ventures70":ventures70,
         "ventures90":ventures90,
         "ventures100":ventures100,
         "investors70":investors70,
         "investors90":investors90,
         "investors100":investors100,
         "G70" : G70, 
         "G90" : G90,
         "G100":G100,
         "id_to_investors": id_to_investor, "id_to_ventures" :id_to_ventures, "ventures_to_id":ventures_to_id, "investor_to_id":investor_to_id}
pickle.dump(utils, open('output_data/utils{}-{}.p'.format(STARTYEAR, ENDYEAR), 'wb'), protocol=2)
print("\n\t\t ...DONE ■ output_data/utils{}-{}.p".format(STARTYEAR, ENDYEAR))


■ Calculating G70, G90, G100 with Corrected Amounts on it
Reversed / Not reversed :  65565 19695
No_dates edges 0
with date edges 85260
Reversed / Not reversed :  77010 23117
No_dates edges 0
with date edges 100127
Reversed / Not reversed :  87964 26633
No_dates edges 0
with date edges 114597

■ Calculating ventures, investors of G70

■ Calculating sets
Reversed / Not reversed :  52515 32745
Reversed / Not reversed :  62535 37592
Reversed / Not reversed :  72117 42480

■ DESCRIPTION
■ DESCRIPTION OF UTILS 

G70 contains 52142 nodes with	 31730 ventures, 	85260 edges, dates, and amounts (and corrected) are edges labels.
G90 contains 58606 nodes with	 35560 ventures, 	100127 edges, dates, and amounts (and corrected) are edges labels.
G100 contains 64195 nodes with	 38836 ventures, 	114597 edges, dates, and amounts (and corrected) are edges labels.

These graphs are extracted respectively between :
	G70  - YEAR :2014 TRIMESTER : 1 and YEAR:2016 TRIMESTER : 4
	G90  - YEAR :2014 TRIMESTER :

# III - Meta Analyse 2014 - 2018

In [53]:
import pandas as pd
existing100List = list(existing100)
X = np.array([[x[0],x[1]] for x in existing100List])
df = pd.DataFrame(X, columns=["VID","IID"])


In [73]:
df['VIN70'] = df["VID"].map(lambda x : x in G70.nodes())
df['VIN90'] = df["VID"].map(lambda x : x in G90.nodes())
df['IIN90'] = df["IID"].map(lambda x : x in G90.nodes())
df['IIN70'] = df["IID"].map(lambda x : x in G70.nodes())
df['TOINV_'] = df["VID"].map(lambda x : G.has_edge('inv_',utils['id_to_ventures'][x]))
df['TOINV_ONLY'] = df["VID"].map(lambda x : G.has_edge('inv_',utils['id_to_ventures'][x]) and G.degree(utils['id_to_ventures'][x])==1)

df['BOTH70'] = df[['VID',"IID"]].apply(lambda x : G70.has_edge(x[0],x[1]), axis=1)
df['BOTH90'] = df[['VID',"IID"]].apply(lambda x : G90.has_edge(x[0],x[1]), axis=1)

In [74]:
def pc(series):
    return round(100*series.sum()/series.shape[0],1)

print("VIN70", pc(df['VIN70']))
print("VIN90", pc(df['VIN90']))
print("IIN70", pc(df['IIN70']))
print("IIN90", pc(df['IIN70']))

print("BOTH70", pc(df['BOTH70']))
print("BOTH90", pc(df['BOTH90']))

print("TOINV_", pc(df['TOINV_']))
print("TOINV_ONLY", pc(df['TOINV_ONLY']))

VIN70 86.3
VIN90 93.8
IIN70 94.4
IIN90 94.4
BOTH70 74.4
BOTH90 87.4
TOINV_ 23.0
TOINV_ONLY 0.0


In [72]:
G.degree("advocate")

1