# Social Network Analysis - Final Project
### Influence spread and virality II

# TODO
- save outbreak simulation data to file
- implement heap in CELF
- implement CELF++
- implement SA
- implement multi-processing for outbreak simulation

## Preliminaries

In [1]:
import numpy as np
import pandas as pd
import time
import networkx as nx
from functools import reduce
import operator
from random import choice
import functools
import os
import heapq 


## Load data

In [3]:
def edgelist_csv_to_graph(filename="./data/1jazz_edges.csv"):
    '''
    Converts a .csv file in edgelist format to a networkX graph. 
    :param filename: relative path to the csv file
    :return G: Returns a networkx undirected graph object
    '''

    df = pd.read_csv(filename, sep=";")
    G = nx.from_pandas_edgelist(df, source="Source", target="Target")
    del df
    return G

In [9]:
G = edgelist_csv_to_graph()

## Outbreak simulation

In [33]:
def simulate_outbreak(G, init_nodes=[], p=False, n_runs=10):
    '''
    Simulates an outbreak, either from a random node or a prespecified set of nodes.
    :param G: networkx graph object to use as network
    :param initial_infected nodes: list of nodes to start the outbreak from, if empty choose random node
    :param n_runs: how many outbreaks to simulate
    :return all_runs_list: returns a list of lists, where each 
                            inner list is a list of infected nodes resulting from that run
    '''
    
    # List with all runs output
    all_runs_list = []
    # Run the algorithm 'n_runs' times
    for run in range(n_runs):
        np.random.seed(run)
        initial_infected_nodes = []
        # Using a fixed p or sampling from a 20-60 distribution
        if p==False:
            prob = np.random.uniform(20,60,1)[0]/100
        else:
            prob = p

        if init_nodes==[]:
            initial_infected_nodes = [choice(list(G.nodes()))]
            
        else:
            initial_infected_nodes = init_nodes

        # Random seed equals to run so we always can recover the same output
        # np.random.seed(run)
        # Nodes that are infecting other nodes in this time step
        transmissible_nodes = initial_infected_nodes
        # Nodes that become infected in this time step - at start, by default, none
        just_infected = []
        # History of all nodes that become infected - at the start just the the initial nodes
        all_infected = [initial_infected_nodes]
        # The algorithm runs when there is at least one trasmissible node
        while transmissible_nodes:
            # For each node recently infected we are going to check its neighbors and infect new nodes with probability p
            for n in transmissible_nodes:
                infection = np.random.uniform(0,1,len(list(G.neighbors(n)))) < prob
                just_infected += list(np.extract(infection, list(G.neighbors(n))))
            # Now the recent infected become the trasmissible nodes (only if they were not infected before)
            transmissible_nodes = list(set(just_infected) - set(reduce(operator.concat, all_infected)))
            # And they are added to the list with the history of all nodes infected
            all_infected.extend([transmissible_nodes])
        # Removing the last blank element (the last element is always a blank list)
        all_infected = all_infected[:-1]
        # Appending t the list with the output from all runs
        all_runs_list.append(all_infected)

    return all_runs_list


## Evaluation functions

In [31]:
#Fraction of information cascades and contamination events detected by the selected nodes
def detection_likelihood(outbreak_simulations, placement, inverse=True):
    '''
    For a given run or multiple runs, calculate the placement score with detection likelihood as objective
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param placement: solution set of nodes to calculate score for
    :param inverse: if inverse is set to True, return 1 - detection likelihood score, such that a lower score is a better score
    :return: Returns the average detection likelihood score (between 0 and 1) over all runs
    '''
    detection_count = 0
    # For each run we need to compute the detection likelihood
    for run in outbreak_simulations:
        # If any node in the placement got infeceted then we detected the outbreak
        if list(set(placement) & set(reduce(operator.concat, run))):
            detection_count += 1
    # We need the detection likelihood thus the number of detection divded by the total number of simulations
    dl = (detection_count/len(outbreak_simulations))
    if inverse:
        return 1-dl
    
    else:
          return dl

def detection_time(outbreak_simulations, placement):
    '''
    For a given run or multiple runs, calculate the placement score with detection time as objective
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param placement: solution set of nodes to calculate score for
    :return: Returns the average detection time score over all runs
    '''
          
    # Output is a list with the time step in which the outbreak was detected for each run
    output = []
    # Creating a set for the placement
    set_placement = set(placement)
    # For each run we need to compute the detection time
    for run in outbreak_simulations:
        # If any node in the placement got infeceted then we detected the outbreak and we need to check the detection time
        intersection = list(set_placement & set(reduce(operator.concat, run)))
        if intersection:
            # For each step and nodes infected in this step we are going to check if the placement detected it
            for step,nodes in enumerate(run):
                # If any element in the intersection is in this set of nodes, than it was detected at this time step
                if list(set(intersection) & set(nodes)):
                    output.append(step)
                    break
        # If not detected, than the detection time is the max penalty
        else:
            max_penalty = len(run)
            output.append(max_penalty)
    return np.mean(output)

def population_affected(outbreak_simulations, placement):
    '''
    For a given run or multiple runs, calculate the placement score with population affected as objective
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param placement: solution set of nodes to calculate score for
    :return: Returns the average population affected score over all runs
    '''
          
    # Output is a list with the population affected in each outbreak run
    output = []
    # Creating a set for the placement
    set_placement = set(placement)
    # For each run we need to compute the population affected
    for run in outbreak_simulations:
        pa = 0
        # If any node in the placement got infeceted then we detected the outbreak and we need to check the population affected
        intersection = list(set_placement & set(reduce(operator.concat, run)))
        if intersection:
            # For each step and nodes infected in this step we are going to check if the placement detected it
            for nodes in run:
                # If any element in the intersection is in this set of nodes, then it was detected at this time step
                # and we can finish by appending the population affected to the output and breaking the run
                if list(set(intersection) & set(nodes)):
                    output.append(pa)
                    break
                else:
                # If not, then we need to sum the non-detected nodes to the pop affected
                    pa += len(nodes)
        # If not detected, than the pop affected is the size of the outbreak
        else:
            output.append(len(reduce(operator.concat, run)))
    return np.mean(output)

## Naive greedy algorithm

In [44]:
def naive_greedy(outbreak_simulations, budget, eval_function, G, verbosity=2):
    '''
    Using a naive greedy strategy: find the best placement, given outbreak simulation data and an objective function, constrained to a budget.
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param budget: The total cost of selecting nodes cannot exceed the budget
    :param eval_function: Which objective function to use to calculate placement score
    :param G: networkx graph object to use as network
    :return placement: Return best found solution set of nodes
    
    '''
    t_total = time.time()
    eval_function = functools.partial(eval_function, outbreak_simulations=outbreak_simulations)
    nodes = list(G.nodes())
    n_nodes = len(nodes)
    placement = []
#     total_gain = 0
    #Finding best placement
    for i in range(budget):
        t_iter = time.time()
        scores = []
        for n in nodes:
            placement.append(n)
            scores.append(eval_function(placement=placement))
            placement.remove(n)
        best_node = nodes[np.argmin(scores)]
        placement.append(best_node)
        nodes.remove(best_node)
        
        if verbosity >= 2:
            print("Finished iteration " + str(i+1) + " in " + str(time.time()-t_iter))
        
    if verbosity >= 1:
        print("Total time: " + str(time.time()-t_total))
    
    return placement, eval_function(placement=placement)

## CELF

1. Maintain priority queue (u, u.marginal_gain, u.iter)
2. If node chosen had its marginal_gain computer in the current iteration, then it must be the best node for the current iteration.

Handy link to understand min heap implementation: https://www.techbeamers.com/python-heapq/

In [63]:
def CELF(outbreak_simulations, budget, eval_function, G, verbosity = 2):
    '''
    Using the CELF algorithm: find the best placement, given outbreak simulation data and an objective function, constrained to a budget.
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param budget: The total cost of selecting nodes cannot exceed the budget
    :param eval_function: Which objective function to use to calculate placement score
    :param G: networkx graph object to use as network
    :return placement: Return best found solution set of nodes
    '''
  
    t_total = time.time()

    eval_function = functools.partial(eval_function, outbreak_simulations=outbreak_simulations)

    nodes = list(G.nodes())

    # Construct heap for first iteration of format (marginal gain, node)
    node_heap = []
    placement = []
    scores = []
    total_penalty = eval_function(placement = [])
    t_iter = time.time()
    for node in nodes:
        penalty = eval_function(placement = [node])
        marginal_gain = total_penalty - penalty
        # heapq implements a min heap, which keeps the smallest element at the top of the heap
        # but we need it the other way around, therefore we multiply the marginal_gain by -1
        heapq.heappush(node_heap, (-marginal_gain, node))
    
    # Remove best node from heap and add to solution set
    best_gain, best_node = heapq.heappop(node_heap)
    total_penalty = total_penalty -
    placement.append(best_node)
    
    if verbosity >= 2:
        print("Finished iteration " + str(len(placement)) + " in " + str(time.time()-t_iter))

    
    while len(placement) < budget:
        t_iter = time.time()
#         Recompute gain only for top node

#         Sort by gain

#         If a is still top node:
#         Add to solution
#         Else:
#         Recompute gain for new top node
        top_node_unchanged = False

        while not top_node_unchanged:
            _, current_node = heapq.heappop(node_heap)
            placement.append(current_node)
            current_penalty = eval_function(placement=placement)
            placement.remove(current_node)
            marginal_gain = total_penalty - current_penalty

            # check if the previous top node stayed on the top after pushing
            # the marginal gain to the heap
            heapq.heappush(node_heap, (-marginal_gain, current_node))
            _, top_node = node_heap[0]
            
            if top_node == current_node:
                top_node_unchanged = True
            
        marginal_gain, current_node = heapq.heappop(node_heap)
        # marginal gain is stored as negative, so use plus instead of minus
        total_penalty = total_penalty + marginal_gain
        
        placement.append(current_node)
        
        if verbosity >= 2:
            print("Finished iteration " + str(len(placement)) + " in " + str(time.time()-t_iter))
            print(eval_function(placement = placement))
    
    if verbosity >= 1:
        print("Total time: " + str(time.time()-t_total))
               
    return placement, eval_function(placement = placement)

In [75]:
# Generating outbreaks scenarios and other parameters
n_scenarios = 1000
outbreak_simulations = simulate_outbreak(G=G, n_runs=n_scenarios)
budget = 5

# Naive Algorithm
print("NAIVE")
naive_placement, naive_score = naive_greedy(outbreak_simulations, budget=budget, eval_function = detection_time, G=G)
print(naive_placement)
print(naive_score)
# CELF
print("CELF")
i_time = time.time()
celf_placement, celf_score = CELF(outbreak_simulations, budget=budget, eval_function = detection_time, G=G)
print(celf_placement)
print(celf_score)

NAIVE
Finished iteration 1 in 2.6556804180145264
Finished iteration 2 in 2.265080690383911
Finished iteration 3 in 2.1713662147521973
Finished iteration 4 in 2.155748128890991
Finished iteration 5 in 2.233853340148926
Total time: 11.481728792190552
[67, 7, 31, 20, 32]
1.448
CELF
Finished iteration 1 in 2.8431172370910645
Finished iteration 2 in 2.279677629470825
1.733
Finished iteration 3 in 0.015622854232788086
1.629
Finished iteration 4 in 0.01565098762512207
1.561
Finished iteration 5 in 0.015624523162841797
1.517
Total time: 5.216492176055908
[67, 7, 23, 20, 133]
1.517


In [73]:
population_affected(outbreak_simulations, [20])

21.45