In [218]:
import numpy as np
import math
import pprint as pp
from scipy import *
from scipy.sparse import *

## Generation de graphes

In [219]:
handmadeGraph1 = {1: {2:0.5,3:0.5},
                 2: {4:0.6},
                 3: {4:0.4,5:0.6},
                 4: {},
                 5: {},
                }

handmadeGraph=dok_matrix((7,7),dtype=float32)
handmadeGraph[0,1] =0.5
handmadeGraph[0,2] =0.5
handmadeGraph[1,3] =0.6
handmadeGraph[2,3] =0.4
handmadeGraph[2,4] =0.6

print("Handmade Graph")
print(handmadeGraph)


nodes = np.arange(handmadeGraph.shape[0])

def randomGraph(nodes):
    m=len(nodes)
    graph = dok_matrix((m,m),dtype=float32)
    for i in range(m):
        for j in range(m):
            if(i!=j):
                graph[i,j]=np.random.rand()
    return graph
print("Random Graph")
print(randomGraph(nodes))

Handmade Graph
  (0, 1)	0.5
  (0, 2)	0.5
  (1, 3)	0.6
  (2, 3)	0.4
  (2, 4)	0.6
Random Graph
  (0, 1)	0.35793808
  (0, 2)	0.9866501
  (0, 3)	0.013221352
  (0, 4)	0.1828696
  (0, 5)	0.03427853
  (0, 6)	0.38070202
  (1, 0)	0.8288135
  (1, 2)	0.055521414
  (1, 3)	0.21423219
  (1, 4)	0.77184093
  (1, 5)	0.112766765
  (1, 6)	0.0021552218
  (2, 0)	0.18658401
  (2, 1)	0.18757546
  (2, 3)	0.2970758
  (2, 4)	0.35188803
  (2, 5)	0.8878879
  (2, 6)	0.068491265
  (3, 0)	0.8192144
  (3, 1)	0.48997146
  (3, 2)	0.01632727
  (3, 4)	0.26126522
  (3, 5)	0.40720692
  (3, 6)	0.94067585
  (4, 0)	0.5607623
  (4, 1)	0.31207532
  (4, 2)	0.32916886
  (4, 3)	0.19208989
  (4, 5)	0.21371375
  (4, 6)	0.54073435
  (5, 0)	0.22994892
  (5, 1)	0.4682285
  (5, 2)	0.7443926
  (5, 3)	0.20502649
  (5, 4)	0.87337327
  (5, 6)	0.60775065
  (6, 0)	0.33565366
  (6, 1)	0.7427652
  (6, 2)	0.5562697
  (6, 3)	0.3382322
  (6, 4)	0.17150491
  (6, 5)	0.7333921


## Inference 
Generation de cascades

In [220]:

def genCascade(graph,startNode,startTime=0):
    ''' Receive graph and starting infected node generate a cascade 
        Args
            graph(node to child representation)
            cascade {node : time} 
    '''
    cascade = {startNode : startTime}
    lastInfected = [startNode]
    infected_next = {}
    time = startTime+1
    while len(lastInfected)> 0:
        for infected in lastInfected:
            for (_,child),pct in graph[infected,:].items():
                if (child not in cascade) and (child not in infected_next):
                    if (np.random.rand()<pct):
                        infected_next[child] = time
        cascade.update(infected_next)
        lastInfected = list(infected_next.keys())
        infected_next = {}
        time +=1
        
    return cascade

In [221]:
nbCascades = 200
cascades = [genCascade(handmadeGraph,0) for i in range(nbCascades)]
pp.pprint(cascades[:3])

[{0: 0, 1: 1, 2: 1, 4: 2}, {0: 0, 1: 1, 2: 1}, {0: 0, 1: 1, 2: 1, 3: 2, 4: 2}]


## Independant cascades apprentissage


### 2008 Saito
Les temps d'infections contiguë <br>
$D(i)$  all newly infected at time $i$  <br>
$D = D(0) \cup D(1) \cup ... \cup D(n)$
#### Maximimum likelyhood expectation
Etapes : 
1. On donne une première estimation des arêtes du graph : $k_{u,v}$ <br>
- On calcule $P_{w}$ en fonction de $k_{u,v}$
- On calcule par ML $k_{u,v}$ en fonction de $P_{w}$
- Retour à l'étape 2 jusqu'à convergence


In [222]:
def cascade_repr(cascade):
    ''' Transforme une cascade en list ayant pour indice 
        le temps d'infection t et pour contenu une list contenant les noeuds infectés au temps t'''
    
    maxT  = max(cascade.values())+1
    Ds = [[] for i in range(maxT)]
    for (n,t) in cascade.items():
        Ds[t].append(n)
    return Ds
D = [cascade_repr(c) for c in cascades]
print(cascades[:2],'\n',D[:2])

[{0: 0, 1: 1, 2: 1, 4: 2}, {0: 0, 1: 1, 2: 1}] 
 [[[0], [1, 2], [4]], [[0], [1, 2]]]


In [223]:
def nodes_in_Ds(Ds):
    uniques = []
    for nodes in Ds : 
        for n in nodes : 
            uniques.append(n)
    return uniques

In [224]:

def P_sw(g,Ds,w):
    ''' Vraisemblance de l infection d un noeud sachant le graphe 
    et les noeuds acivés à l'etape precedente '''
    not_activated = 1
    t = None
    for i,nodes in enumerate(Ds):
        if (w in nodes):
            t = i
    if (t == 0): # si le noeud est le premier
        return 1
    if (t is None): # si le noeud n'est pas dans l'episode de diffusion
        raise Exception(f"node {w} is not in diff episode : {Ds} ")
    else :  # si le noeud est dans l'épisode de diffusion
        return 1 - np.prod ([1-g[parent,w] for parent in Ds[t-1]])

In [225]:
def Expectation(g,D):
    ''' Calcule l'ensemble des P_ws 
        P_ws[idD][node]->proba'''
    p_sw = [{n:P_sw(g,Ds,n) for n in nodes_in_Ds(Ds)} for Ds in D]
    return p_sw

In [226]:
def D_plus_uv_id(D,u,v):
    ''' pour chaque couples u,v renvoit l'ensemble des 
    Ds (episode infections) ou u precede v'''
    D_plus = []
    for i,Ds in enumerate(D): 
        for t in range(1,len(Ds)):
            if (u in Ds[t-1] and v in Ds[t]):
                D_plus.append(i)
                break
    return D_plus

def D_minus_uv_len(D,u,v):
    '''Pour chaque couple u,v renvoit le cardinal de l'ensemble des 
    Ds(episode infection) ou u est present mais v ne le suit pas'''
    D_minus_len=0
    for Ds in D : 
        for t in range(1,len(Ds)):
            if (u in Ds[t-1] and v not in Ds[t]):
                D_minus_len+=1
                break
        if (u in Ds[-1]):
            D_minus_len+=1
    return D_minus_len

In [227]:
def Maximisation_uv(g,D_plus_id,Dminus_len,p_sw,u,v):
    '''Calcule les nouveaux paramètre pour l'arete u,v '''
    if ((len(D_plus_id[u][v])+Dminus_len[u][v]) == 0):
        raise Exception(f"{u}-{v} Division zero")
    return (1/(len(D_plus_id[u][v])+Dminus_len[u][v])) *sum([g[u,v]/p_sw[i][v] for i in D_plus_id[u][v]])


def Maximisation(g,D_plus_id,Dminus_len,p_sw):
    ''' Calcule les nouveaux paramètres pour le graphe'''
    gprime = dok_matrix(g.shape,dtype=float32)
    for u,v in g.keys():
        if u != v:
            gprime[u,v] = Maximisation_uv(g,D_plus_id,Dminus_len,p_sw,u,v)
    return gprime


In [228]:
def EM_IC(D,nodes):
        # initalisation
    g = randomGraph(nodes)    
    p_sw = None
    D_plus_id =   {v:{u:D_plus_uv_id(D,v,u) for u in nodes} for v in nodes}
    D_minus_len = {v:{u:D_minus_uv_len(D,v,u)for u in nodes} for v in nodes}
    for i in range(100):
        p_sw = Expectation(g,D)
        g = Maximisation(g,D_plus_id,D_minus_len,p_sw)
    return g

nodes = set()
for c in cascades:
    nodes.update(c.keys())
print(nodes)

D = [cascade_repr(c) for c in cascades]
finalGraph = EM_IC(D,nodes)
print(finalGraph)

{0, 1, 2, 3, 4}
  (0, 1)	0.46
  (0, 2)	0.54
  (1, 3)	0.5957176
  (1, 4)	4.4472476e-27
  (2, 3)	0.41914776
  (2, 4)	0.5740741


### Methode 2
Les temps d'infections non contiguë : 
Un noeud peut désorais être infecté non seulement par ceux qui ont été infecté

In [229]:
def P_sw2(g,Ds,w):
    ''' Vraisemblance de l infection d un noeud sachant le graphe 
    et les tous noeuds précedemment acivés '''
    not_activated = 1
    t = None
    for i,nodes in enumerate(Ds):
        if (w in nodes):
            t = i
    if (t == 0): # si le noeud est le premier
        return 1
    if (t is None): # si le noeud n'est pas dans l'episode de diffusion
        raise Exception(f"node {w} is not in diff episode : {Ds} ")
    else :  # si le noeud est dans l'épisode de diffusion
        preceding_nodes = [] # on regroupe les noeuds des épisodes précedents
        for nodes in Ds[:t]:
            for n in nodes : 
                preceding_nodes.append(n)
        
        return 1 - np.prod ([1-g[parent,w] for parent in preceding_nodes])

In [230]:
def Expectation2(g,D):
    ''' Calcule l'ensemble des P_ws 
        P_ws[idD][node]->proba'''
    p_sw = [{n:P_sw2(g,Ds,n) for n in nodes_in_Ds(Ds)} for Ds in D]
    return p_sw

In [231]:
def D_plus_uv_id2(D,u,v):
    ''' Pour chaque couples u,v renvoit l'ensemble des 
        Ds (episode infections) ou t_u < t_v'''
    D_plus = []
    
    for i,Ds in enumerate(D) : 
        preceding_nodes = []
        for t in range(0,len(Ds)):
            if (v in Ds[t]):
                if (u in preceding_nodes):
                    D_plus.append(i)
                    break
            else : 
                preceding_nodes +=Ds[t]
    return D_plus

def D_minus_uv2_len(D,u,v):
    '''Pour chaque couple u,v renvoit l'ensemble des 
    Ds(episode infection) ou u est present et non(t_u < t_v)'''
    D_minus_len = 0
    for Ds in D : 
        u_in_Ds = False
        v_in_Ds = False
        for t in range(0,len(Ds)):
            if (u in Ds[t]):
                u_in_Ds = True
            if (u_in_Ds and v in Ds[t]):
                v_in_Ds = True
                break
        if (u_in_Ds and not v_in_Ds):
            D_minus_len+=1
    return D_minus_len

In [232]:
def EM_IC_2(D,nodes):
    g = randomGraph(nodes)    
    p_sw = None
    D_plus_id =   {v:{u:D_plus_uv_id2(D,v,u) for u in nodes} for v in nodes}
    D_minus_len = {v:{u:D_minus_uv2_len(D,v,u)for u in nodes} for v in nodes}
    
    for i in range(100):
        p_sw = Expectation2(g,D)
        g = Maximisation(g,D_plus_id,D_minus_len,p_sw)
    return g

D = [cascade_repr(c) for c in cascades]
finalGraph = EM_IC_2(D,nodes)
print(finalGraph)

  (0, 1)	0.46
  (0, 2)	0.54
  (0, 3)	2.848232e-13
  (0, 4)	1.4095633e-27
  (1, 3)	0.5957176
  (1, 4)	4.2353156e-27
  (2, 3)	0.41914776
  (2, 4)	0.5740741


## Evaluation

On va évaluer les modèles par MeanAveragePrecision (MAP): <br>
On peut caculer $AP$ pour un épisode ($Ds$) en particulier : <br><br>
$ AP(Ds) =  \sum_{i=1}^{|U^{Ds}|} \frac{ |\{ U_1^{Ds},...,U_i^{Ds} \} \cap Ds |}{i}  dx$ <br>
$ AP(Ds) = \sum_{i=1}^{|U^{Ds}|} \frac{TruePositive}{TruePositive+FalseNegative}dx$ <br><br>
$ dx = recall(i) -recall(i-1) $<br>
<br>
$ MAP = \frac{1}{D}\sum_{Ds\in |D|} AP(Ds)$

On va d'abord calculer les probabilité d'infection d'un noeud connaissant la source 
par génération d'épisodes : 

In [233]:
def Pws_gs(graph,source,nbEpisode=100):
    ''' Calcule la probabilité qu'un noeud soit infecté connaissant 
        une source et un graph de diffusion
        On calcule par moyenne sur echantionnage d'episodes d'infections '''
    proba_infected = {n:0 for n in range(graph.shape[0])}
    for i in range(nbEpisode):
        c = genCascade(graph,source)
        for node in c : 
            proba_infected[node]+=1/nbEpisode
    return proba_infected

In [234]:
def AP(Ds,graph):
    '''Average Precision pour un episode Ds '''
    pws_gs =Pws_gs(graph,Ds[0][0])
    U_d = sorted(pws_gs,key=pws_gs.get,reverse=True) # sort par ordre decroissant
    ap = 0
    DsNodeSet = set(nodes_in_Ds(Ds)) # noeuds faisant partie de l'episode d'infection Ds
    
        # calcul des points de precision et de recall : 
    precision = np.ones((len(U_d)+1))
    recall = np.zeros((len(U_d)+1))
    for i in range(1,len(precision)):
        tp = len(DsNodeSet.intersection(U_d[:i])) # TruePositive
        precision[i] = tp/i
        recall[i] = tp/len(DsNodeSet)
        
        # smooth precision curve
    for i in range(len(precision)-2,-1,-1):
        precision[i] = max(precision[i],precision[i+1])
    
    ap = 0
    for i in range(1,len(precision)):
        dx = recall[i]-recall[i-1]
        ap+= precision[i]* dx

    return ap

def MAP(D,graph):
    return sum([AP(Ds,graph)for Ds in D])/len(D)    

In [237]:
print("original  graph MAP score : ",MAP(D,handmadeGraph))
print("inference graph MAP score : ",MAP(D,finalGraph))

original  graph MAP score :  0.9303888888888889
inference graph MAP score :  0.9361944444444444
