In [19]:
import os
import sys
import pandas as pd
import numpy as np
import igraph as ig
from igraph import Graph


from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

from sklearn.metrics import normalized_mutual_info_score
import community as community_louvain
import networkx as nx


In [2]:
dataset = pd.read_csv(
    'D:/Pablo/clases/UJM/2. Semester, 2021/Mining Uncertain Social Networks/Repository/Experiments/datasets/dancer_01/results/random_01.txt')

dataset.head()

Unnamed: 0,from,to,asim_likelihood,edge_exists_original,edge_exists_modified,seal_likelihood,dataset,modification
0,0,1,0.036421,0.0,0.0,0.0238,dancer_01,base
1,0,10,0.563449,1.0,0.0,0.5691,dancer_01,base
2,0,100,0.002656,0.0,0.0,0.0716,dancer_01,base
3,0,1000,0.000246,0.0,0.0,0.0236,dancer_01,base
4,0,1001,0.504137,0.0,0.0,0.0614,dancer_01,base


In [7]:
# Split into train/test
X_train, X_test, E_train, E_test, Y_train,Y_test = train_test_split(dataset['asim_likelihood'], dataset['edge_exists_original'], dataset['edge_exists_modified'],
                                         test_size=0.3, random_state=4269, stratify=dataset['edge_exists_modified'])

# 'Train', get best threshold, via f1_score
precision, recall, thresholds = precision_recall_curve(Y_train, X_train)
f1_scores = np.divide(2*recall*precision, (recall+precision), out=np.zeros_like(recall+precision), where=(recall+precision!=0))

# Get the threshold with the best results
threshold = thresholds[np.argmax(f1_scores)]
train_f1_score = np.max(f1_scores)
# Get test best score
test_f1_score = f1_score((X_test > threshold).astype('float64').values, Y_test)
# Comparing to the original graph (just the test ones)
exists_test_f1_score = f1_score((X_test > threshold).astype('float64').values, E_test)
# Comparing to the original graph (ALL OF THEM!)
exists_all_f1_score = f1_score(np.concatenate(((X_train > threshold).astype('float64').values, (X_test > threshold).astype('float64').values)), np.concatenate((E_train,E_test)))

print('asim_likelihood: '+str(threshold)+'\n', 'train_f1_score: '+str(train_f1_score)+'\n', 'test_f1_score: '+str(test_f1_score)+'\n', 'exists_f1_score: '+str(exists_test_f1_score)+'\n', 'exists_all_f1_score: '+str(exists_all_f1_score))

asim_likelihood: 0.8429287
 train_f1_score: 0.2742304309586631
 test_f1_score: 0.2681144288995615
 exists_f1_score: 0.27037773359840955
 exists_all_f1_score: 0.27584575348947243


In [10]:
# Split into train/test
X_train, X_test, E_train, E_test, Y_train,Y_test = train_test_split(dataset['seal_likelihood'], dataset['edge_exists_original'], dataset['edge_exists_modified'],
                                         test_size=0.3, random_state=4269, stratify=dataset['edge_exists_modified'])

# 'Train', get best threshold, via f1_score
precision, recall, thresholds = precision_recall_curve(Y_train, X_train)
f1_scores = np.divide(2*recall*precision, (recall+precision), out=np.zeros_like(recall+precision), where=(recall+precision!=0))

# Get the threshold with the best results
threshold = thresholds[np.argmax(f1_scores)]
train_f1_score = np.max(f1_scores)
# Get test best score
test_f1_score = f1_score((X_test > threshold).astype('float64').values, Y_test)
# Comparing to the original graph (just the test ones)
exists_test_f1_score = f1_score((X_test > threshold).astype('float64').values, E_test)
# Comparing to the original graph (ALL OF THEM!)
exists_all_f1_score = f1_score(np.concatenate(((X_train > threshold).astype('float64').values, (X_test > threshold).astype('float64').values)), np.concatenate((E_train,E_test)))

print('seal_likelihood: '+str(threshold)+'\n', 'train_f1_score: '+str(train_f1_score)+'\n', 'test_f1_score: '+str(test_f1_score)+'\n', 'exists_f1_score: '+str(exists_test_f1_score)+'\n', 'exists_all_f1_score: '+str(exists_all_f1_score))

seal_likelihood: 0.9324
 train_f1_score: 0.22650150931612367
 test_f1_score: 0.20456802383316783
 exists_f1_score: 0.21222768798313424
 exists_all_f1_score: 0.2290681502086231


In [12]:
selected_edges = dataset[(dataset['edge_exists_modified']==1) | ((dataset['asim_likelihood']>threshold))].copy()
selected_edges[['from','to']].to_csv('random_01_seal_graph.txt', sep=' ', header=None, index=False)

In [13]:
dataset_name = 'dancer_01'
community_file = 'D:/Pablo/clases/UJM/2. Semester, 2021/Mining Uncertain Social Networks/Experiments/datasets/dancer_01/dancer_01_comm.txt'

In [14]:
# Load community belonging
communities = {}
with open(community_file) as f:
    for line in f:
       (key, val) = line.split()
       communities[key] = val

In [20]:
# Unweighted analysis

results_rows = []

# Value to be used during all of the runs
p_size = len(set(communities.values()))

# Iterate generated graphs
for file_name in ['random_01_seal_graph.txt']:
    # Load graph
    graph = Graph.Read_Ncol(file_name, directed=False)
    
    # Add community belonging
    for vertex in graph.vs:
        vertex['community'] = communities[vertex['name']]

    # louvian
    method = 'Louvain-igraph'

    louvian = ig.Graph.community_multilevel(graph, return_levels=True)
    louvian = louvian[len(louvian)-1]
    p_louvian = len(set(louvian.membership))

    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P',
                 'value':p_louvian})    
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P*/P',
                 'value':p_size/p_louvian})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'Modularity',
                 'value':graph.modularity(louvian.membership)})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'NMI',
                 'value':normalized_mutual_info_score(graph.vs['community'], louvian.membership)})

    # FastGreedy
    method = 'Fastgreedy'

    fg = ig.Graph.community_fastgreedy(graph)
    p_fg = fg.optimal_count
    fg = fg.as_clustering()
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P',
                 'value':p_fg})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P*/P',
                 'value':p_size/p_fg})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'Modularity',
                 'value':graph.modularity(fg.membership)})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'NMI',
                 'value':normalized_mutual_info_score(graph.vs['community'], fg.membership)})
    # Infomap
    method = 'Infomap'

    infomap = ig.Graph.community_infomap(graph)
    p_im = len(set(infomap.membership))
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P',
                 'value':p_im})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P*/P',
                 'value':p_size/p_im})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'Modularity',
                 'value':graph.modularity(infomap.membership)})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'NMI',
                 'value':normalized_mutual_info_score(graph.vs['community'], infomap.membership)})

    # Label Propagation
    method = 'Label Propagation'

    lp = ig.Graph.community_label_propagation(graph)
    p_lp = len(set(lp.membership))
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P',
                 'value':p_lp})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P*/P',
                 'value':p_size/p_lp})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'Modularity',
                 'value':graph.modularity(lp.membership)})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'NMI',
                 'value':normalized_mutual_info_score(graph.vs['community'], lp.membership)})

    # Louvain
    method = 'Louvain'

    # Creating nx Graph to for other Louvain implementation
    nxG = nx.Graph()
    nxG.add_nodes_from([vertex.index for vertex in graph.vs])
    nxG.add_edges_from([edge.tuple for edge in graph.es])

    lv_partition = community_louvain.best_partition(nxG)
    p_lv = len(set(lv_partition.values()))
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P',
                 'value':p_lv})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'P*/P',
                 'value':p_size/p_lv})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'Modularity',
                 'value':graph.modularity(lv_partition.values())})
    results_rows.append({'dataset':dataset_name, 'file_name':file_name, 'method':method, 'metric':'NMI',
                 'value':normalized_mutual_info_score(graph.vs['community'], pd.Series(lv_partition.values()))})

community_algos = pd.DataFrame(results_rows)

community_algos.head()

Unnamed: 0,dataset,file_name,method,metric,value
0,dancer_01,random_01_seal_graph.txt,Louvain-igraph,P,8.0
1,dancer_01,random_01_seal_graph.txt,Louvain-igraph,P*/P,0.75
2,dancer_01,random_01_seal_graph.txt,Louvain-igraph,Modularity,0.638858
3,dancer_01,random_01_seal_graph.txt,Louvain-igraph,NMI,0.684448
4,dancer_01,random_01_seal_graph.txt,Fastgreedy,P,7.0


In [22]:
community_algos[community_algos['metric']=='NMI']

Unnamed: 0,dataset,file_name,method,metric,value
3,dancer_01,random_01_seal_graph.txt,Louvain-igraph,NMI,0.684448
7,dancer_01,random_01_seal_graph.txt,Fastgreedy,NMI,0.656665
11,dancer_01,random_01_seal_graph.txt,Infomap,NMI,0.547304
15,dancer_01,random_01_seal_graph.txt,Label Propagation,NMI,0.687467
19,dancer_01,random_01_seal_graph.txt,Louvain,NMI,0.687754
