# Artists Centrality
## Learning from Networks project 2022-2023

# TODO: add info of our group

# TODO: add short descritpion of the project here

Let's start by importing libraries that we'll use

In [1]:
import networkx as nx 
import time
import random as rnd
import math

Let's build our graph starting with adding the nodes

In the file *nodes.csv* datas is rappresented in the following format: ___"spotify_id, name, followers, popularity, genres, chart_hits"___.

We are interested in the spotify ID, in the name and in the number of followers.

In the dataset there are some duplicates with incomplete data so we need to check we are not adding any by mistake and that the one we added are the one with the information we are interested (i.e. a duplicate can have the number of followers setted to 0)

Also to be more efficient and to be able to work with arrays and not hashmaps later on we have to map our IDs to ints

In [2]:
G = nx.Graph()

#Load dataset with nodes infos
f = open('dataset/nodes_reduced.csv', "r", encoding="utf8")

# skip the first line in the input file since it contains dataset description
f.readline()

#hashmap used for mapping spotify_ids to integers
mapping = {}
int_counter=0

while True:
    line = f.readline().strip()
    
    #empty line = EOF
    if line == '':
        break
        
    #extrapolate id and artist from the current line
    current_id, current_artist, current_followers, tmp = line.split(',', 3)
    
    #if the artist has already been added
    current_mapped_int = mapping.get(current_id,-1)
    if current_mapped_int != -1:
        
        #if the number of followers of the current line is greater than the numbers of followers already addeed
        #to the graph we just update the corresponding label
        if G.nodes[current_mapped_int]['followers'] < current_followers:
            G.nodes[current_mapped_int]['followers'] = current_followers
        
    #else we add current spotify id into the hashmap and update the id counter, then we add it to the graph
    else:
        mapping[current_id]=int_counter
        int_counter += 1
        
        #add new node with mapped int as key and artist and followers as lables
        G.add_node(int_counter, artist=current_artist, followers=current_followers)

#print results
print(G.number_of_nodes(),"nodes have been added successfully")

10 nodes have been added successfully


Now let's add the edges

In the file *edges.csv* datas is rappresented in the following format: ___"id_0,id_1"___.

We need to check the validity of the edge before adding it because some IDs are not on the *nodes.csv* dataset

In [3]:
#Load dataset with edges info
f = open('dataset/edges_reduced.csv', "r", encoding="utf8")

# skip the first line in the input file since it contains dataset description
f.readline()

while True:
    line = f.readline().strip()
    
    #empty line = EOF
    if line == '':
        break
        
    #extrapolate id_0 and id_1 from the current line
    id_0, id_1 = line.split(',', 1)
    
    #need to find the mapped ints that correspond to the ids
    int_0 = mapping.get(id_0,-1)
    int_1 = mapping.get(id_1,-1)
    
    #if the indices are both valid then we add the edge
    if int_0 != -1 and int_1 != -1:
        G.add_edge(int_0, int_1)
    
print(G.number_of_edges(),"edges have been added successfully")

12 edges have been added successfully


Let's run the exact algorithm for closeness centrality and let's also calculate the time of computation

In [4]:
start_time = time.time()
exact_closeness_centrality = nx.closeness_centrality(G)
end_time = time.time()

#let's print the results
print(exact_closeness_centrality)
print("the exact closeness centralities have been computed in %s seconds" %(end_time - start_time))

{1: 0.4263157894736842, 2: 0.45, 3: 0.4263157894736842, 4: 0.405, 5: 0.45, 6: 0.405, 7: 0.5785714285714286, 8: 0.3681818181818182, 9: 0.3681818181818182, 10: 0.0, 0: 0.675}
the exact closeness centralities have been computed in 0.0 seconds


Let's now define the function for computing the approximated closeness centrality based on Eppstein-Wang Algorithm

In [5]:
def ApproximateClosenessCentrality(G, k):
    sum_v = [0] * G.number_of_nodes()
    for i in range(k):
        #pick one node uniformally at random
        x = rnd.randint(0, G.number_of_nodes()-1)
        #solve sssp with picked node as source
        sssp = nx.shortest_path_length(G, source=x)
        #update partial sum of distancies for each node
        for n in sssp:
            sum_v[n] += sssp[n]
    centralities = {}
    #compute final approximation of centrality for each node
    for n in G:
        if sum_v[n] == 0:
            centralities[n] = 0
        else:
            centralities[n] = 1/((G.number_of_nodes()*sum_v[n])/(k*(G.number_of_nodes()-1)))
    #return dicionary containg pairs (node, approximatedcentrality)
    return centralities
        

Now let's run the approximated algorithm for closeness centrality and let's also calculate the time of computation

In [6]:
#let's compute the value k of the number of iteration we have to do with epsilon = 0.1
k = int(math.log(G.number_of_nodes(),2)/0.01)

start_time = time.time()
approximated_closeness_centrality = ApproximateClosenessCentrality(G,k)
end_time = time.time()

#let's print the results
print(approximated_closeness_centrality)
print("the approximated closeness centralities have been computed in %s seconds" %(end_time - start_time))

{1: 0.5091499409681228, 2: 0.5370485678704857, 3: 0.5108084098312111, 4: 0.4923647780790638, 5: 0.5492755930584302, 6: 0.4923647780790638, 7: 0.6862940123333997, 8: 0.43621191048172964, 9: 0.457863304578633, 10: 0, 0: 0.8021390374331552}
the approximated closeness centralities have been computed in 0.015631437301635742 seconds
