# Social Network Analysis - Final Project
### Influence spread and virality II

## Preliminaries

In [1]:
import numpy as np
import pandas as pd
import time
import networkx as nx
from functools import reduce
import operator
from random import choice
import functools
import os

## Load data

In [2]:
def edgelist_csv_to_graph(filename="./data/1jazz_edges.csv"):
    '''
    Converts a .csv file in edgelist format to a networkX graph.
    The file is read with ";" as the column separator and must contain
    a "Source" and a "Target" column holding the two endpoints of each edge.
    :param filename: relative path to the csv file
    :return G: Returns a networkx undirected graph object
    '''

    edges = pd.read_csv(filename, sep=";")
    # from_pandas_edgelist builds an undirected nx.Graph by default
    return nx.from_pandas_edgelist(edges, source="Source", target="Target")

## Outbreak simulation

In [3]:
def simulate_outbreak(G, initial_infected_nodes=None, p=False, n_runs=10):
    '''
    Simulates outbreaks, either from a random node or a prespecified set of nodes.
    In every time step each newly infected node infects each of its neighbors
    independently with the run's transmission probability; a node can only be
    infected once per run.
    :param G: networkx graph object to use as network
    :param initial_infected_nodes: list of nodes to start the outbreak from; if None or empty,
        a new random start node is drawn for every run
    :param p: transmission probability; if falsy (default), a probability is sampled
        uniformly from [0.20, 0.60) for every run
    :param n_runs: how many outbreaks to simulate
    :return all_runs_list: returns a list of runs, where each run is a list of time steps and
        each time step is the list of nodes that became infected in that step
        (step 0 holds the initial nodes)
    '''

    random_start = initial_infected_nodes is None or len(initial_infected_nodes) == 0
    # Only materialise the node list when we actually have to draw from it
    node_pool = list(G.nodes()) if random_start else None

    # List with all runs output
    all_runs_list = []
    for run in range(n_runs):
        # Seed with the run index BEFORE any NumPy draw, so the sampled probability and
        # the transmission draws of a run can be recovered. This reseeds NumPy's global
        # generator. The random start node comes from `random.choice` and is not
        # controlled by this seed.
        np.random.seed(run)

        # Using a fixed p or sampling from a 20-60 distribution
        prob = p if p else np.random.uniform(20, 60) / 100

        # Pick the start nodes per run (and copy them) so that runs do not share a start
        # node by accident and the caller's list is never aliased into the output
        if random_start:
            start_nodes = [choice(node_pool)]
        else:
            start_nodes = list(initial_infected_nodes)

        # History of all nodes that became infected, one list per time step
        all_infected = [start_nodes]
        # Every node infected so far, for O(1) "already infected?" checks
        seen = set(start_nodes)
        # Nodes that are infecting other nodes in this time step
        transmissible_nodes = start_nodes
        # The algorithm runs while there is at least one transmissible node
        while transmissible_nodes:
            just_infected = []
            for n in transmissible_nodes:
                neighbors = list(G.neighbors(n))
                infection = np.random.uniform(0, 1, len(neighbors)) < prob
                # Keep the graph's own node objects instead of NumPy-coerced copies
                just_infected += [nb for nb, hit in zip(neighbors, infection) if hit]
            # Only nodes that were never infected before become transmissible
            transmissible_nodes = list(set(just_infected) - seen)
            seen.update(transmissible_nodes)
            if transmissible_nodes:
                all_infected.append(transmissible_nodes)
        all_runs_list.append(all_infected)

    return all_runs_list


## Evaluation functions

In [4]:
#Fraction of information cascades and contamination events detected by the selected nodes
def detection_likelihood(outbreak_simulations, placement, inverse=True):
    '''
    For a given run or multiple runs, calculate the placement score with detection likelihood as objective
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
        (a list of runs, each run being a list of per-time-step node lists)
    :param placement: solution set of nodes to calculate score for
    :param inverse: if inverse is set to True, return 1 - detection likelihood score, such that a lower score is a better score
    :return: Returns the average detection likelihood score (between 0 and 1) over all runs
    :raises ZeroDivisionError: if outbreak_simulations is empty
    '''
    sensors = set(placement)
    # A run is detected as soon as any of its time steps infected at least one placed node
    detection_count = sum(
        1
        for run in outbreak_simulations
        if any(not sensors.isdisjoint(step) for step in run)
    )
    # Detection likelihood = detected runs divided by the total number of simulations
    dl = detection_count / len(outbreak_simulations)
    return 1 - dl if inverse else dl

def detection_time(outbreak_simulations, placement):
    '''
    Score a placement by how quickly it notices each simulated outbreak.
    A run's detection time is the index of the first time step in which a placed
    node got infected; a run that never reaches the placement is charged its
    number of time steps as penalty.
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param placement: solution set of nodes to calculate score for
    :return: Returns the average detection time score over all runs
    '''

    sensors = set(placement)
    # One detection time (or penalty) per run
    times = []
    for run in outbreak_simulations:
        # Placed nodes that were infected at any point of this run
        hits = sensors & set(reduce(operator.concat, run))
        if not hits:
            # Never detected: the penalty is the length of the run
            times.append(len(run))
            continue
        # hits is non-empty, so some step is guaranteed to contain one of them
        first_step = next(
            step for step, nodes in enumerate(run) if hits.intersection(nodes)
        )
        times.append(first_step)
    return np.mean(times)

def population_affected(outbreak_simulations, placement):
    '''
    Score a placement by how many nodes get infected before an outbreak is noticed.
    For a detected run this is the number of nodes infected in the time steps
    before the first step that contains a placed node; for an undetected run
    it is the total number of infected nodes.
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param placement: solution set of nodes to calculate score for
    :return: Returns the average population affected score over all runs
    '''

    sensors = set(placement)
    # One population-affected value per run
    affected_per_run = []
    for run in outbreak_simulations:
        everyone = reduce(operator.concat, run)
        hits = sensors.intersection(everyone)
        if not hits:
            # Never detected: the whole outbreak counts as affected
            affected_per_run.append(len(everyone))
            continue
        affected = 0
        for nodes in run:
            # Stop counting at the first step that reaches a placed node
            if not hits.isdisjoint(nodes):
                break
            affected += len(nodes)
        affected_per_run.append(affected)
    return np.mean(affected_per_run)

## Naive greedy algorithm

In [5]:
def naive_greedy(outbreak_simulations, budget, eval_function, G):
    '''
    Using a naive greedy strategy: find the best placement, given outbreak simulation data and an objective function, constrained to a budget.
    In every round each remaining node is tried as an addition to the current
    placement and the one giving the lowest score is kept (first one wins on ties).
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param budget: The total cost of selecting nodes cannot exceed the budget (every node costs 1);
        if it exceeds the number of nodes in G, all nodes are selected
    :param eval_function: Which objective function to use to calculate placement score (lower is better);
        called with keyword arguments outbreak_simulations and placement
    :param G: networkx graph object to use as network
    :return placement: Return best found solution set of nodes, in order of selection

    '''
    score_placement = functools.partial(eval_function, outbreak_simulations=outbreak_simulations)
    candidates = list(G.nodes())
    placement = []
    # Clamp the number of rounds: there is nothing left to pick once the candidates run out
    for _ in range(min(budget, len(candidates))):
        scores = [score_placement(placement=placement + [n]) for n in candidates]
        # argmin returns the first minimum, so ties go to the earliest candidate
        best_node = candidates.pop(int(np.argmin(scores)))
        placement.append(best_node)

    return placement

## CELF

In [6]:
def CELF(outbreak_simulations, budget, eval_function, G):
    '''
    Using the CELF algorithm: find the best placement, given outbreak simulation data and an objective function, constrained to a budget.
    CELF is a lazy version of the greedy algorithm: every candidate keeps the score change
    it produced the last time it was evaluated, and only the most promising candidate is
    re-evaluated against the current placement. If it is still the most promising one
    after re-evaluation it is selected without evaluating the other candidates. This
    matches the naive greedy choice when marginal improvements can only shrink as the
    placement grows.
    :param outbreak_simulations: outbreak simulation data generated by simulate_outbreak
    :param budget: The total cost of selecting nodes cannot exceed the budget (every node costs 1);
        if it exceeds the number of nodes in G, all nodes are selected
    :param eval_function: Which objective function to use to calculate placement score (lower is better);
        called with keyword arguments outbreak_simulations and placement, and must accept an
        empty placement (used once as the baseline score)
    :param G: networkx graph object to use as network
    :return placement: Return best found solution set of nodes, in order of selection
    '''

    evaluate = functools.partial(eval_function, outbreak_simulations=outbreak_simulations)

    placement = []
    nodes = list(G.nodes())
    if budget <= 0 or not nodes:
        return placement

    def priority(entry):
        # Most negative score change first; original node order breaks ties
        return (entry[0], entry[1])

    # Score of the current placement (initially the empty placement)
    current_score = evaluate(placement=[])

    # First run: evaluate every node once.
    # Entry layout: [score change, node order, node, resulting score, placement size at evaluation]
    queue = []
    for order, n in enumerate(nodes):
        score = evaluate(placement=[n])
        queue.append([score - current_score, order, n, score, 0])
    queue.sort(key=priority)

    # Finding best placement
    while queue and len(placement) < budget:
        top = queue[0]
        if top[4] == len(placement):
            # The top entry was evaluated against the current placement: select it
            placement.append(top[2])
            current_score = top[3]
            del queue[0]
        else:
            # Stale entry: refresh it against the current placement and re-sort
            score = evaluate(placement=placement + [top[2]])
            top[0] = score - current_score
            top[3] = score
            top[4] = len(placement)
            queue.sort(key=priority)

    return placement