In [1]:
import pandas as pd
import numpy as np
import os

import networkx as nx
import re

In [2]:
# Read the three CSV inputs. `datasets` keeps them in a fixed order
# (edges, nodes, hero_edges) so later cells can address them by position.
edges, nodes, hero_edges = (
    pd.read_csv(csv_path)
    for csv_path in ('data/edges.csv', 'data/nodes.csv', 'data/hero-network.csv')
)
datasets = [edges, nodes, hero_edges]

In [3]:
# preprocessing
def strip_rightend(string):
    """Normalise one raw name taken from the CSV files.

    Surrounding whitespace is removed first, then any trailing '/' characters.
    Two longer spellings of the Spider-Man name are mapped onto the single
    form 'SPIDER-MAN/PETER PAR' (presumably so the name compares equal
    across the datasets).

    Parameters
    ----------
    string : str
        Raw cell value.

    Returns
    -------
    str
        The cleaned value.
    """
    spiderman_spellings = ('SPIDER-MAN/PETER PARKER', 'SPIDER-MAN/PETER PARKERKER')

    cleaned = string.strip().rstrip('/')
    return 'SPIDER-MAN/PETER PAR' if cleaned in spiderman_spellings else cleaned

In [4]:
# Clean every table in place (later cells keep using `edges`, `nodes` and
# `hero_edges`): drop incomplete rows, then normalise each cell of each column.
for frame in datasets:
    frame.dropna(inplace=True)
    for column in frame.columns:
        frame[column] = frame[column].apply(strip_rightend)

In [5]:
# `hero_edges` is the same object as datasets[2]; copy it into a NumPy array
# so rows can be compared and filtered positionally.
hero_edges_arr = np.array(hero_edges)
hero_edges_arr.shape

(574467, 2)

In [6]:
# A row is a self-loop when both columns hold the same name.
is_self_loop = hero_edges_arr[:, 0] == hero_edges_arr[:, 1]
idx_self = np.where(is_self_loop)
# we check on a sample
hero_edges.iloc[8889]

hero1    MISS AMERICA/MADELIN
hero2    MISS AMERICA/MADELIN
Name: 8889, dtype: object

In [7]:
# Keep only the rows whose two endpoints differ (i.e. remove self-loops).
# The mask is recomputed from the current array instead of reusing stored row
# positions, so re-running this cell cannot delete unrelated rows.
not_self_loop = hero_edges_arr[:, 0] != hero_edges_arr[:, 1]
hero_edges_arr = hero_edges_arr[not_self_loop]
print(hero_edges_arr.shape)
# New DataFrame built from the filtered array; `hero_edges` itself is left untouched.
hero_edge = pd.DataFrame(hero_edges_arr, columns=['hero1', 'hero2'])

(572235, 2)


# Creating the graph

In [8]:
# One edge per row of `hero_edge`; a MultiGraph keeps repeated hero pairs
# as parallel edges instead of collapsing them.
heroes_graph = nx.from_pandas_edgelist(
    hero_edge,
    source='hero1',
    target='hero2',
    create_using=nx.MultiGraph,
)

In [9]:
# counting edges for each node
# Maps every node of heroes_graph to the number of edges reported for it.
nodes_edgecount = {
    hero: len(heroes_graph.edges(hero))
    for hero in heroes_graph.nodes()
}

# Largest per-node edge count; used to scale the edge weights.
max_nr_edges = max(nodes_edgecount.values())

In [10]:
# Iterating `.edges` (the view itself, not `.edges()`) yields (u, v, key)
# triples; the key is kept because the weights are stored per parallel edge.
# weight = 1 - (edge count of u + edge count of v) / (2 * max_nr_edges)
node_tuple_weight = {
    (u, v, key): 1 - (nodes_edgecount[u] + nodes_edgecount[v]) / (2 * max_nr_edges)
    for u, v, key in heroes_graph.edges
}

In [11]:
# Store the computed value of each (u, v, key) edge under its 'weight' attribute.
nx.set_edge_attributes(heroes_graph, node_tuple_weight, 'weight')

In [12]:
# (node, {'type': ...}) pairs from the nodes table (datasets[1]).
node_data = [
    (node_name, {'type': node_type})
    for node_name, node_type in zip(datasets[1]['node'], datasets[1]['type'])
]
# (hero, comic) pairs from the edges table (datasets[0]).
edge_data = list(zip(datasets[0]['hero'], datasets[0]['comic']))

In [13]:
# Build the hero-comic graph: typed nodes first, then one edge per (hero, comic) pair.
comics = nx.MultiGraph()
comics.add_nodes_from(node_data)
# On a MultiGraph, add_edges_from returns the list of assigned edge keys; the
# trailing semicolon keeps that very long list out of the cell output.
comics.add_edges_from(edge_data);

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [14]:
#nx.draw_networkx(comics)

In [15]:
# making list of top_n sorted heroes
# Count the rows recorded for each hero, then rank heroes from most to fewest.
comic_counts = edges.groupby('hero')['comic'].size()
ls = comic_counts.sort_values(ascending=False)
# 2-D array of (hero, count) rows, best-ranked hero first.
# NOTE(review): names and counts share one array, so NumPy presumably stores
# the counts as strings too - confirm before doing arithmetic on column 1.
top_n_heroes = np.array(list(zip(ls.index, ls.values)))

# Functionality 2 - Find top superheroes!

In [16]:
#returns every comics where a list of heroes appear
def filter_comics(n_heroes, graph=None):
    """Collect the nodes that share an edge with any of the given heroes.

    Parameters
    ----------
    n_heroes : iterable
        Hero nodes whose neighbours are wanted.
    graph : graph-like, optional
        Object exposing ``edges(node)`` that yields edge tuples whose first two
        items are the endpoints. Defaults to the module-level ``comics`` graph.

    Returns
    -------
    list
        The neighbouring nodes, without duplicates, in first-seen order.
    """
    if graph is None:
        graph = comics

    # The set gives O(1) duplicate checks; the list preserves first-seen order.
    seen = set()
    filtered_comics = []
    for node in n_heroes:
        for e in graph.edges(node):
            # Pick whichever endpoint is not the hero itself.
            if e[0] == node:
                other = e[1]
            elif e[1] == node:
                other = e[0]
            else:
                continue
            if other not in seen:
                seen.add(other)
                filtered_comics.append(other)
    return filtered_comics

In [17]:
def _nodes_adjacent_to(graph, sources):
    """Return, without duplicates and in first-seen order, the nodes of `graph` sharing an edge with any of `sources`."""
    seen = set()
    adjacent = []
    for source in sources:
        for edge in graph.edges(source):
            # Pick whichever endpoint is not the source itself.
            if edge[0] == source:
                other = edge[1]
            elif edge[1] == source:
                other = edge[0]
            else:
                continue
            if other not in seen:
                seen.add(other)
                adjacent.append(other)
    return adjacent


def find_top_superheroes(graph, node, metric, N):
    """Compare one node's centrality with the average over the top-N hero subgraph.

    The subgraph of ``graph`` is induced on the first ``N`` heroes of the
    module-level ``top_n_heroes`` ranking plus every node adjacent to one of
    them in ``graph`` (their comics, in the hero-comic graph).

    Parameters
    ----------
    graph : networkx graph
        Graph to analyse. Neighbours are looked up in this graph, not in the
        global ``comics`` graph.
    node : hashable
        Node whose metric value is reported; it must belong to the subgraph.
    metric : int
        1 = betweenness centrality, 2 = PageRank, 3 = closeness centrality,
        4 = degree centrality.
    N : int
        Number of top-ranked heroes used to build the subgraph.

    Returns
    -------
    pandas.DataFrame or int
        A one-column table ('values') with rows 'average value' and
        'node value'. For an unknown ``metric`` the function returns 0.

    Raises
    ------
    KeyError
        If ``node`` is not part of the subgraph.
    """
    metric_functions = {
        1: nx.betweenness_centrality,
        2: nx.pagerank,
        3: nx.closeness_centrality,
        4: nx.degree_centrality,
    }
    compute_metric = metric_functions.get(metric)
    if compute_metric is None:
        # Unknown metric code: keep the historical sentinel return value.
        return 0

    #creates the subgraph
    n_heroes = list(top_n_heroes[:N][:, 0])
    comics_to_subgraph = _nodes_adjacent_to(graph, n_heroes)
    subgraph = graph.subgraph(n_heroes + comics_to_subgraph)

    #calculates the values of the requested metric for each node
    values = compute_metric(subgraph)
    if node not in values:
        raise KeyError(f"node {node!r} is not part of the top-{N} hero subgraph")

    node_value = values[node]
    avg_value = sum(values.values()) / len(values)

    #creates the table
    return pd.DataFrame(
        [[avg_value], [node_value]],
        index=pd.Index(['average value', 'node value']),
        columns=['values'],
    )

In [18]:
# Example: PageRank (metric 2) of node "IM2 6" within the top-3 hero subgraph.
value = find_top_superheroes(graph=comics, node="IM2 6", metric=2, N=3)

In [19]:
# Display the table returned by find_top_superheroes in the previous cell.
value

Unnamed: 0,values
average value,0.00029
node value,0.000226
