In [92]:
from random import choices

import networkx as nx
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Comparing Spread Across Different Datasets with Different Assortativities

## NetHept

In [159]:
# Load the NetHept edge list into a directed graph.
# The first line of the file is a header whose first two fields are read as
# the node count and the edge count; every following non-blank line is one
# "u v" pair. The context manager closes the file handle once the lines have
# been read (the bare open() used before left it open).
with open('weic-graphdata/hep.txt', 'r') as file:
    first_line = file.readline()
    meta = first_line.split()
    num_nodes = int(meta[0])
    num_edges = int(meta[1])
    print(f"|V|: {num_nodes}, |E|: {num_edges}")

    edges = [line.strip().split() for line in file.readlines()]

# Blank lines split to an empty list; they are dropped here instead of
# raising IndexError on e[0].
edges = [(int(e[0]), int(e[1])) for e in edges if e]

graph = nx.DiGraph()
graph.add_edges_from(edges)
# The header reports 58891 edges, but the file contains duplicated edges and a
# DiGraph keeps a single edge per ordered (u, v) pair, so the graph ends up
# with 32235 edges -- the ~32K edges that match the paper.
graph.number_of_nodes(), graph.number_of_edges()

|V|: 15233, |E|: 58891


(15233, 32235)

In [160]:
# Unweighted mean of the per-node clustering coefficients of `graph`.
clustering_by_node = nx.clustering(graph)
per_node_values = list(clustering_by_node.values())
print('average clustering coefficient:', np.mean(per_node_values))

average clustering coefficient: 0.26205258708522


In [161]:
# Longest shortest-path length over all (source, target) pairs for which
# nx.shortest_path_length reports a distance: take the farthest target of each
# source, then the maximum over sources.
# The label text is reproduced exactly as before so the printed output is unchanged.
farthest_per_source = (
    max(dist_by_target.values())
    for _source, dist_by_target in nx.shortest_path_length(graph)
)
print('dimaeter:', max(farthest_per_source))

dimaeter: 31


In [162]:
# run on original graph
# For each target visibility level: build an observable view of `graph`, pick
# 5 seeds from that view, simulate the spread on the full graph, and record
# the spread averaged over 5 trials.
# NOTE(review): construct_observable, construct_observable_graph,
# seed_weighted_degree and simulate_weighted_IC are not defined in the visible
# part of this notebook; they must already be in scope when this cell runs.
# The three result lists are re-created here, so after this cell they hold the
# results of this dataset only.
results_spreads_original = [] 
results_deg_assorts_original = []
results_visibility_original = []

nodes = list(graph.nodes)  # not referenced again in this cell
# Degree assortativity using the out-degree of both the source (x) and the
# target (y) of every edge.
deg_assort = nx.degree_assortativity_coefficient(graph, x = 'out', y = 'out')
print(f"Running for the NetHept graph with deg assort {deg_assort}\n")
for target_visibility in [0.01, 0.05, 0.11]:
    spreads = []
    # 5 independent trials per visibility level. No random seed is set in this
    # cell -- presumably the helpers are randomized, so a re-run will not
    # reproduce the recorded numbers exactly (TODO confirm).
    for t in range(5):
        fully_observ_O, boundary_nodes = construct_observable(graph, target_visibility)
        observable_graph = construct_observable_graph(graph, fully_observ_O, boundary_nodes)
        # Seeds are selected on the observable graph only ...
        seeds = seed_weighted_degree(observable_graph, weight = 3, num_seed = 5)
        # ... while the spread is simulated on the full graph.
        activated_nodes = simulate_weighted_IC(graph, seeds)
        spreads.append(len(activated_nodes))
    avg_spread = np.mean(spreads)
    # Mean number of activated nodes as a fraction of all nodes in the graph.
    avg_spread_percent_of_graph = avg_spread/graph.number_of_nodes()
    print(f"\tvisibility: {target_visibility}, avg. spread as a pct. of |V|: {100*avg_spread_percent_of_graph:.4f}%")
    # deg_assort is appended once per visibility level so that the three
    # result lists stay index-aligned.
    results_spreads_original.append(avg_spread)
    results_deg_assorts_original.append(deg_assort)
    results_visibility_original.append(target_visibility)

Running for the NetHept graph with deg assort 0.3161257813793155

	visibility: 0.01, avg. spread as a pct. of |V|: 0.3492%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.6499%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.6118%


----

## Gnutella Peer to Peer Network


A sequence of snapshots of the Gnutella peer-to-peer file sharing network from August 2002. There are a total of 9 snapshots of the Gnutella network collected in August 2002. Nodes represent hosts in the Gnutella network topology and edges represent connections between the Gnutella hosts.

```
Nodes	10876
Edges	39994
Nodes in largest WCC	10876 (1.000)
Edges in largest WCC	39994 (1.000)
Nodes in largest SCC	4317 (0.397)
Edges in largest SCC	18742 (0.469)
Average clustering coefficient	0.0062
Number of triangles	934
Fraction of closed triangles	0.001807
Diameter (longest shortest path)	9
90-percentile effective diameter	5.4
```

In [163]:
# Load the Gnutella edge list into a directed graph.
# The first four lines of the file are skipped (presumably the dataset's
# comment header -- confirm); every remaining non-blank line is one
# "source target" pair. The context manager closes the file handle once the
# lines have been read (the bare open() used before left it open).
with open('additional_datasets/p2p-Gnutella04.txt', 'r') as file:
    edges = [line.strip().split() for line in file.readlines()[4:]]
# Blank lines split to an empty list; they are dropped here instead of
# raising IndexError on e[0].
edges = [(int(e[0]), int(e[1])) for e in edges if e]
graph = nx.DiGraph()
graph.add_edges_from(edges)
graph.number_of_nodes(), graph.number_of_edges()

(10876, 39994)

In [164]:
# run on original graph
# For each target visibility level: build an observable view of `graph`, pick
# 5 seeds from that view, simulate the spread on the full graph, and record
# the spread averaged over 5 trials.
# NOTE(review): construct_observable, construct_observable_graph,
# seed_weighted_degree and simulate_weighted_IC are not defined in the visible
# part of this notebook; they must already be in scope when this cell runs.
# The three result lists are re-created here, so after this cell they hold the
# results of this dataset only.
results_spreads_original = [] 
results_deg_assorts_original = []
results_visibility_original = []

nodes = list(graph.nodes)  # not referenced again in this cell
# Degree assortativity using the out-degree of both the source (x) and the
# target (y) of every edge.
deg_assort = nx.degree_assortativity_coefficient(graph, x = 'out', y = 'out')
print(f"Running for the Gnutella graph with deg assort {deg_assort}\n")
for target_visibility in [0.01, 0.05, 0.11]:
    spreads = []
    # 5 independent trials per visibility level. No random seed is set in this
    # cell -- presumably the helpers are randomized, so a re-run will not
    # reproduce the recorded numbers exactly (TODO confirm).
    for t in range(5):
        fully_observ_O, boundary_nodes = construct_observable(graph, target_visibility)
        observable_graph = construct_observable_graph(graph, fully_observ_O, boundary_nodes)
        # Seeds are selected on the observable graph only ...
        seeds = seed_weighted_degree(observable_graph, weight = 3, num_seed = 5)
        # ... while the spread is simulated on the full graph.
        activated_nodes = simulate_weighted_IC(graph, seeds)
        spreads.append(len(activated_nodes))
    avg_spread = np.mean(spreads)
    # Mean number of activated nodes as a fraction of all nodes in the graph.
    avg_spread_percent_of_graph = avg_spread/graph.number_of_nodes()
    print(f"\tvisibility: {target_visibility}, avg. spread as a pct. of |V|: {100*avg_spread_percent_of_graph:.4f}%")
    # deg_assort is appended once per visibility level so that the three
    # result lists stay index-aligned.
    results_spreads_original.append(avg_spread)
    results_deg_assorts_original.append(deg_assort)
    results_visibility_original.append(target_visibility)

Running for the Gnutella graph with deg assort -0.0036630443245183283

	visibility: 0.01, avg. spread as a pct. of |V|: 0.4064%
	visibility: 0.05, avg. spread as a pct. of |V|: 7.6940%
	visibility: 0.11, avg. spread as a pct. of |V|: 10.1250%


----

## Stanford web graph


Nodes represent pages from Stanford University (stanford.edu) and directed edges represent hyperlinks between them. The data was collected in 2002.

```
Nodes	281903
Edges	2312497
Nodes in largest WCC	255265 (0.906)
Edges in largest WCC	2234572 (0.966)
Nodes in largest SCC	150532 (0.534)
Edges in largest SCC	1576314 (0.682)
Average clustering coefficient	0.5976
Number of triangles	11329473
Fraction of closed triangles	0.002889
Diameter (longest shortest path)	674
90-percentile effective diameter	9.7
```

In [165]:
# Load the Stanford web graph edge list into a directed graph.
# The first four lines of the file are skipped (presumably the dataset's
# comment header -- confirm); every remaining non-blank line is one
# "source target" pair. The context manager closes the file handle once the
# lines have been read (the bare open() used before left it open).
with open('additional_datasets/web-Stanford.txt', 'r') as file:
    edges = [line.strip().split() for line in file.readlines()[4:]]
# Blank lines split to an empty list; they are dropped here instead of
# raising IndexError on e[0].
edges = [(int(e[0]), int(e[1])) for e in edges if e]
graph = nx.DiGraph()
graph.add_edges_from(edges)
graph.number_of_nodes(), graph.number_of_edges()

(281903, 2312497)

In [166]:
# run on original graph
# For each target visibility level: build an observable view of `graph`, pick
# 5 seeds from that view, simulate the spread on the full graph, and record
# the spread averaged over 5 trials.
# NOTE(review): construct_observable, construct_observable_graph,
# seed_weighted_degree and simulate_weighted_IC are not defined in the visible
# part of this notebook; they must already be in scope when this cell runs.
# The three result lists are re-created here, so after this cell they hold the
# results of this dataset only.
results_spreads_original = [] 
results_deg_assorts_original = []
results_visibility_original = []

nodes = list(graph.nodes)  # not referenced again in this cell
# Degree assortativity using the out-degree of both the source (x) and the
# target (y) of every edge.
deg_assort = nx.degree_assortativity_coefficient(graph, x = 'out', y = 'out')
print(f"Running for the Stanford Web graph with deg assort {deg_assort}\n")
for target_visibility in [0.01, 0.05, 0.11]:
    spreads = []
    # 5 independent trials per visibility level. No random seed is set in this
    # cell -- presumably the helpers are randomized, so a re-run will not
    # reproduce the recorded numbers exactly (TODO confirm).
    for t in range(5):
        fully_observ_O, boundary_nodes = construct_observable(graph, target_visibility)
        observable_graph = construct_observable_graph(graph, fully_observ_O, boundary_nodes)
        # Seeds are selected on the observable graph only ...
        seeds = seed_weighted_degree(observable_graph, weight = 3, num_seed = 5)
        # ... while the spread is simulated on the full graph.
        activated_nodes = simulate_weighted_IC(graph, seeds)
        spreads.append(len(activated_nodes))
    avg_spread = np.mean(spreads)
    # Mean number of activated nodes as a fraction of all nodes in the graph.
    avg_spread_percent_of_graph = avg_spread/graph.number_of_nodes()
    print(f"\tvisibility: {target_visibility}, avg. spread as a pct. of |V|: {100*avg_spread_percent_of_graph:.4f}%")
    # deg_assort is appended once per visibility level so that the three
    # result lists stay index-aligned.
    results_spreads_original.append(avg_spread)
    results_deg_assorts_original.append(deg_assort)
    results_visibility_original.append(target_visibility)

Running for the Stanford Web graph with deg assort 0.04575903921600233

	visibility: 0.01, avg. spread as a pct. of |V|: 0.1188%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.1554%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.2595%


------

## Amazon product co-purchasing network, March 02 2003


Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j.

The data was collected on March 02, 2003.

```
Nodes	262111
Edges	1234877
Nodes in largest WCC	262111 (1.000)
Edges in largest WCC	1234877 (1.000)
Nodes in largest SCC	241761 (0.922)
Edges in largest SCC	1131217 (0.916)
Average clustering coefficient	0.4198
Number of triangles	717719
Fraction of closed triangles	0.09339
Diameter (longest shortest path)	32
90-percentile effective diameter	11
```

In [167]:
# Load the Amazon co-purchasing edge list into a directed graph.
# The first four lines of the file are skipped (presumably the dataset's
# comment header -- confirm); every remaining non-blank line is one
# "source target" pair. The context manager closes the file handle once the
# lines have been read (the bare open() used before left it open).
with open('additional_datasets/amazon0302.txt', 'r') as file:
    edges = [line.strip().split() for line in file.readlines()[4:]]
# Blank lines split to an empty list; they are dropped here instead of
# raising IndexError on e[0].
edges = [(int(e[0]), int(e[1])) for e in edges if e]
graph = nx.DiGraph()
graph.add_edges_from(edges)
graph.number_of_nodes(), graph.number_of_edges()

(262111, 1234877)

In [168]:
# run on original graph
# For each target visibility level: build an observable view of `graph`, pick
# 5 seeds from that view, simulate the spread on the full graph, and record
# the spread averaged over 5 trials.
# NOTE(review): construct_observable, construct_observable_graph,
# seed_weighted_degree and simulate_weighted_IC are not defined in the visible
# part of this notebook; they must already be in scope when this cell runs.
# The three result lists are re-created here, so after this cell they hold the
# results of this dataset only.
results_spreads_original = [] 
results_deg_assorts_original = []
results_visibility_original = []

nodes = list(graph.nodes)  # not referenced again in this cell
# Degree assortativity using the out-degree of both the source (x) and the
# target (y) of every edge.
deg_assort = nx.degree_assortativity_coefficient(graph, x = 'out', y = 'out')
print(f"Running for the Amazon graph with deg assort {deg_assort}\n")
for target_visibility in [0.01, 0.05, 0.11]:
    spreads = []
    # 5 independent trials per visibility level. No random seed is set in this
    # cell -- presumably the helpers are randomized, so a re-run will not
    # reproduce the recorded numbers exactly (TODO confirm).
    for t in range(5):
        fully_observ_O, boundary_nodes = construct_observable(graph, target_visibility)
        observable_graph = construct_observable_graph(graph, fully_observ_O, boundary_nodes)
        # Seeds are selected on the observable graph only ...
        seeds = seed_weighted_degree(observable_graph, weight = 3, num_seed = 5)
        # ... while the spread is simulated on the full graph.
        activated_nodes = simulate_weighted_IC(graph, seeds)
        spreads.append(len(activated_nodes))
    avg_spread = np.mean(spreads)
    # Mean number of activated nodes as a fraction of all nodes in the graph.
    avg_spread_percent_of_graph = avg_spread/graph.number_of_nodes()
    print(f"\tvisibility: {target_visibility}, avg. spread as a pct. of |V|: {100*avg_spread_percent_of_graph:.4f}%")
    # deg_assort is appended once per visibility level so that the three
    # result lists stay index-aligned.
    results_spreads_original.append(avg_spread)
    results_deg_assorts_original.append(deg_assort)
    results_visibility_original.append(target_visibility)

Running for the Amazon graph with deg assort 0.10270867847558518

	visibility: 0.01, avg. spread as a pct. of |V|: 0.0041%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.0033%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.0038%


## Wikipedia vote network


Wikipedia is a free encyclopedia written collaboratively by volunteers around the world. A small part of Wikipedia contributors are administrators, who are users with access to additional technical features that aid in maintenance. In order for a user to become an administrator a Request for adminship (RfA) is issued and the Wikipedia community via a public discussion or a vote decides who to promote to adminship. Using the latest complete dump of Wikipedia page edit history (from January 3 2008) we extracted all administrator elections and vote history data. This gave us 2,794 elections with 103,663 total votes and 7,066 users participating in the elections (either casting a vote or being voted on). Out of these 1,235 elections resulted in a successful promotion, while 1,559 elections did not result in the promotion. About half of the votes in the dataset are by existing admins, while the other half comes from ordinary Wikipedia users.

The network contains all the Wikipedia voting data from the inception of Wikipedia till January 2008. Nodes in the network represent wikipedia users and a directed edge from node i to node j represents that user i voted on user j.

```
Nodes	7115
Edges	103689
Nodes in largest WCC	7066 (0.993)
Edges in largest WCC	103663 (1.000)
Nodes in largest SCC	1300 (0.183)
Edges in largest SCC	39456 (0.381)
Average clustering coefficient	0.1409
Number of triangles	608389
Fraction of closed triangles	0.04564
Diameter (longest shortest path)	7
90-percentile effective diameter	3.8
```

In [173]:
# Load the Wikipedia vote edge list into a directed graph.
# The first four lines of the file are skipped (presumably the dataset's
# comment header -- confirm); every remaining non-blank line is one
# "source target" pair. The context manager closes the file handle once the
# lines have been read (the bare open() used before left it open).
with open('additional_datasets/wiki-Vote.txt', 'r') as file:
    edges = [line.strip().split() for line in file.readlines()[4:]]
# Blank lines split to an empty list; they are dropped here instead of
# raising IndexError on e[0].
edges = [(int(e[0]), int(e[1])) for e in edges if e]
graph = nx.DiGraph()
graph.add_edges_from(edges)
graph.number_of_nodes(), graph.number_of_edges()

(7115, 103689)

In [174]:
# run on original graph
# For each target visibility level: build an observable view of `graph`, pick
# 5 seeds from that view, simulate the spread on the full graph, and record
# the spread averaged over 5 trials.
# NOTE(review): construct_observable, construct_observable_graph,
# seed_weighted_degree and simulate_weighted_IC are not defined in the visible
# part of this notebook; they must already be in scope when this cell runs.
# The three result lists are re-created here, so after this cell they hold the
# results of this dataset only.
results_spreads_original = [] 
results_deg_assorts_original = []
results_visibility_original = []

nodes = list(graph.nodes)  # not referenced again in this cell
# Degree assortativity using the out-degree of both the source (x) and the
# target (y) of every edge.
deg_assort = nx.degree_assortativity_coefficient(graph, x = 'out', y = 'out')
print(f"Running for the Wikipedia graph with deg assort {deg_assort}\n")
for target_visibility in [0.01, 0.05, 0.11]:
    spreads = []
    # 5 independent trials per visibility level. No random seed is set in this
    # cell -- presumably the helpers are randomized, so a re-run will not
    # reproduce the recorded numbers exactly (TODO confirm).
    for t in range(5):
        fully_observ_O, boundary_nodes = construct_observable(graph, target_visibility)
        observable_graph = construct_observable_graph(graph, fully_observ_O, boundary_nodes)
        # Seeds are selected on the observable graph only ...
        seeds = seed_weighted_degree(observable_graph, weight = 3, num_seed = 5)
        # ... while the spread is simulated on the full graph.
        activated_nodes = simulate_weighted_IC(graph, seeds)
        spreads.append(len(activated_nodes))
    avg_spread = np.mean(spreads)
    # Mean number of activated nodes as a fraction of all nodes in the graph.
    avg_spread_percent_of_graph = avg_spread/graph.number_of_nodes()
    print(f"\tvisibility: {target_visibility}, avg. spread as a pct. of |V|: {100*avg_spread_percent_of_graph:.4f}%")
    # deg_assort is appended once per visibility level so that the three
    # result lists stay index-aligned.
    results_spreads_original.append(avg_spread)
    results_deg_assorts_original.append(deg_assort)
    results_visibility_original.append(target_visibility)

Running for the Wikipedia graph with deg assort -0.018909153225472288

	visibility: 0.01, avg. spread as a pct. of |V|: 2.1785%
	visibility: 0.05, avg. spread as a pct. of |V|: 1.9508%
	visibility: 0.11, avg. spread as a pct. of |V|: 1.8665%


## email-Eu-core network


The network was generated using email data from a large European research institution. We have anonymized information about all incoming and outgoing email between members of the research institution. There is an edge (u, v) in the network if person u sent person v at least one email. The e-mails only represent communication between institution members (the core), and the dataset does not contain incoming messages from or outgoing messages to the rest of the world.

The dataset also contains "ground-truth" community memberships of the nodes. Each individual belongs to exactly one of 42 departments at the research institute.

This network represents the "core" of the email-EuAll network, which also contains links between members of the institution and people outside of the institution (although the node IDs are not the same).

```
Nodes	1005
Edges	25571
Nodes in largest WCC	986 (0.981)
Edges in largest WCC	25552 (0.999)
Nodes in largest SCC	803 (0.799)
Edges in largest SCC	24729 (0.967)
Average clustering coefficient	0.3994
Number of triangles	105461
Fraction of closed triangles	0.1085
Diameter (longest shortest path)	7
90-percentile effective diameter	2.9
```

In [175]:
# Load the email-Eu-core edge list into a directed graph.
# Unlike the other dataset loaders in this notebook, no leading lines are
# skipped: every non-blank line of the file is read as one "source target"
# pair. The context manager closes the file handle once the lines have been
# read (the bare open() used before left it open).
with open('additional_datasets/email-Eu-core.txt', 'r') as file:
    edges = [line.strip().split() for line in file.readlines()]
# Blank lines split to an empty list; they are dropped here instead of
# raising IndexError on e[0].
edges = [(int(e[0]), int(e[1])) for e in edges if e]
graph = nx.DiGraph()
graph.add_edges_from(edges)
graph.number_of_nodes(), graph.number_of_edges()

(1005, 25571)

In [176]:
# run on original graph
# For each target visibility level: build an observable view of `graph`, pick
# 5 seeds from that view, simulate the spread on the full graph, and record
# the spread averaged over 5 trials.
# NOTE(review): construct_observable, construct_observable_graph,
# seed_weighted_degree and simulate_weighted_IC are not defined in the visible
# part of this notebook; they must already be in scope when this cell runs.
# The three result lists are re-created here, so after this cell they hold the
# results of this dataset only.
results_spreads_original = [] 
results_deg_assorts_original = []
results_visibility_original = []

nodes = list(graph.nodes)  # not referenced again in this cell
# Degree assortativity using the out-degree of both the source (x) and the
# target (y) of every edge.
deg_assort = nx.degree_assortativity_coefficient(graph, x = 'out', y = 'out')
print(f"Running for the email EU graph with deg assort {deg_assort}\n")
for target_visibility in [0.01, 0.05, 0.11]:
    spreads = []
    # 5 independent trials per visibility level. No random seed is set in this
    # cell -- presumably the helpers are randomized, so a re-run will not
    # reproduce the recorded numbers exactly (TODO confirm).
    for t in range(5):
        fully_observ_O, boundary_nodes = construct_observable(graph, target_visibility)
        observable_graph = construct_observable_graph(graph, fully_observ_O, boundary_nodes)
        # Seeds are selected on the observable graph only ...
        seeds = seed_weighted_degree(observable_graph, weight = 3, num_seed = 5)
        # ... while the spread is simulated on the full graph.
        activated_nodes = simulate_weighted_IC(graph, seeds)
        spreads.append(len(activated_nodes))
    avg_spread = np.mean(spreads)
    
    # Mean number of activated nodes as a fraction of all nodes in the graph.
    avg_spread_percent_of_graph = avg_spread/graph.number_of_nodes()
    print(f"\tvisibility: {target_visibility}, avg. spread as a pct. of |V|: {100*avg_spread_percent_of_graph:.4f}%")
    # deg_assort is appended once per visibility level so that the three
    # result lists stay index-aligned.
    results_spreads_original.append(avg_spread)
    results_deg_assorts_original.append(deg_assort)
    results_visibility_original.append(target_visibility)

Running for the email EU graph with deg assort -0.0018398611298732078

	visibility: 0.01, avg. spread as a pct. of |V|: 19.2637%
	visibility: 0.05, avg. spread as a pct. of |V|: 16.5572%
	visibility: 0.11, avg. spread as a pct. of |V|: 19.3035%


-------

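## Summarizing Results

*Note: the figures in this section are taken from the earlier, longer run listed in full in the next section, so they differ slightly from the cell outputs shown above.*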

**NetHept** graph with deg assort $0.3161$

```
	visibility: 0.01, avg. spread as a pct. of |V|: 0.32%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.59%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.80%

```    
    
**Gnutella** graph with deg assort $-0.0037$
```
	visibility: 0.01, avg. spread as a pct. of |V|: 4.74%
	visibility: 0.05, avg. spread as a pct. of |V|: 5.28%
	visibility: 0.11, avg. spread as a pct. of |V|: 8.81%

 ```   
        
    
**Stanford graph** with deg assort $0.04576$
```
	visibility: 0.01, avg. spread as a pct. of |V|: 0.16%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.13%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.18%
  ```

**Amazon graph** with deg assort $0.1027$
```
	visibility: 0.01, avg. spread as a pct. of |V|: 0.0027%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.0033%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.0034%

    
```
    
**Wikipedia** graph with deg assort $-0.0189$

```
	visibility: 0.01, avg. spread as a pct. of |V|: 1.8552%
	visibility: 0.05, avg. spread as a pct. of |V|: 1.7484%
	visibility: 0.11, avg. spread as a pct. of |V|: 1.8327%
    
```

**Email EU** graph with deg assort $-0.0018$
```
	visibility: 0.01, avg. spread as a pct. of |V|: 19.4428%
	visibility: 0.05, avg. spread as a pct. of |V|: 18.3284%
    visibility: 0.11, avg. spread as a pct. of |V|: 20.3582%
```

## More Detailed General Results - From An Earlier Run That Took a Long Time

Running for the NetHept graph with deg assort 0.3161257813793155
```
	visibility: 0.01, avg. spread as a pct. of |V|: 0.32%
	visibility: 0.03, avg. spread as a pct. of |V|: 0.54%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.59%
	visibility: 0.07, avg. spread as a pct. of |V|: 0.71%
	visibility: 0.09, avg. spread as a pct. of |V|: 0.72%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.80%
	visibility: 0.13, avg. spread as a pct. of |V|: 0.80%
	visibility: 0.15, avg. spread as a pct. of |V|: 0.72%
	visibility: 0.17, avg. spread as a pct. of |V|: 0.56%
```    
    
Running for the Gnutella graph with deg assort -0.0036630443245183283
```
	visibility: 0.01, avg. spread as a pct. of |V|: 4.74%
	visibility: 0.03, avg. spread as a pct. of |V|: 7.89%
	visibility: 0.05, avg. spread as a pct. of |V|: 5.28%
	visibility: 0.07, avg. spread as a pct. of |V|: 12.73%
	visibility: 0.09, avg. spread as a pct. of |V|: 6.76%
	visibility: 0.11, avg. spread as a pct. of |V|: 8.81%
	visibility: 0.13, avg. spread as a pct. of |V|: 16.55%
	visibility: 0.15, avg. spread as a pct. of |V|: 11.20%
	visibility: 0.17, avg. spread as a pct. of |V|: 15.18%
 ```   
        
    
Running for the Stanford graph with deg assort 0.04575903921600233
```
	visibility: 0.01, avg. spread as a pct. of |V|: 0.16%
	visibility: 0.03, avg. spread as a pct. of |V|: 0.23%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.13%
	visibility: 0.07, avg. spread as a pct. of |V|: 0.20%
	visibility: 0.09, avg. spread as a pct. of |V|: 0.17%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.18%
	visibility: 0.13, avg. spread as a pct. of |V|: 0.16%
	visibility: 0.15, avg. spread as a pct. of |V|: 0.18%
	visibility: 0.17, avg. spread as a pct. of |V|: 0.20%
  
  ```
  

Running for the Amazon graph with deg assort 0.10270867847558518
```
	visibility: 0.01, avg. spread as a pct. of |V|: 0.0027%
	visibility: 0.03, avg. spread as a pct. of |V|: 0.0053%
	visibility: 0.05, avg. spread as a pct. of |V|: 0.0033%
	visibility: 0.07, avg. spread as a pct. of |V|: 0.0040%
	visibility: 0.09, avg. spread as a pct. of |V|: 0.0047%
	visibility: 0.11, avg. spread as a pct. of |V|: 0.0034%
	visibility: 0.13, avg. spread as a pct. of |V|: 0.0058%
	visibility: 0.15, avg. spread as a pct. of |V|: 0.0035%
	visibility: 0.17, avg. spread as a pct. of |V|: 0.0047%
    
```
    
Running for the Wikipedia graph with deg assort -0.018909153225472288

```
	visibility: 0.01, avg. spread as a pct. of |V|: 1.8552%
	visibility: 0.03, avg. spread as a pct. of |V|: 1.6894%
	visibility: 0.05, avg. spread as a pct. of |V|: 1.7484%
	visibility: 0.07, avg. spread as a pct. of |V|: 1.7344%
	visibility: 0.09, avg. spread as a pct. of |V|: 1.5348%
	visibility: 0.11, avg. spread as a pct. of |V|: 1.8327%
	visibility: 0.13, avg. spread as a pct. of |V|: 1.6416%
	visibility: 0.15, avg. spread as a pct. of |V|: 1.9789%
    
```


Running for the email EU graph with deg assort -0.0018398611298732078
```
	visibility: 0.01, avg. spread as a pct. of |V|: 19.4428%
	visibility: 0.03, avg. spread as a pct. of |V|: 22.9652%
	visibility: 0.05, avg. spread as a pct. of |V|: 18.3284%
	visibility: 0.07, avg. spread as a pct. of |V|: 18.0896%
	visibility: 0.09, avg. spread as a pct. of |V|: 18.7463%
	visibility: 0.11, avg. spread as a pct. of |V|: 20.3582%
	visibility: 0.13, avg. spread as a pct. of |V|: 15.0050%
	visibility: 0.15, avg. spread as a pct. of |V|: 19.6617%
```