# Homework 3

In [118]:
# !pip install cdlib
# !pip install leidenalg
# !pip uninstall community
# !pip install python-louvain

import networkx as nx
from networkx.algorithms import node_classification
import random
from cdlib import algorithms, evaluation, NodeClustering
from community import community_louvain
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import os

## 2. Who’s the winner?

### (i)

In [99]:
def build_synhtetic_graph(mu = 0.5):
    G = nx.Graph(name = "girvan_newman")
    n = 72
    cluster_div = 24
    for i in range(n):
        G.add_node(i, cluster = i // cluster_div + 1)
    
    nodes = list(G.nodes(data=True))
    for _ in range(1440):
        node = random.choice(nodes)
        if random.random() > mu:
            next_node = random.choice([n for (n, d) in nodes if d['cluster'] == node[1]['cluster']])
            G.add_edge(node[0], next_node)
        else:
            next_node = random.choice([n for (n, d) in nodes if d['cluster'] != node[1]['cluster']])
            G.add_edge(node[0], next_node)
    return G

def compare_on_synthetic(algo_fn, mus, iter_num=10):
    nmis = []
    for mu in mus:
        NMI = 0
        for _ in range(iter_num):
            G = build_synhtetic_graph(mu)
            clustered_G = algo_fn(G)
            partition = get_graph_ideal_partition(G) 
            NMI += clustered_G.normalized_mutual_information(partition).score / iter_num
            nmis.append(NMI)
        print("mu={:.2f}, NMI: {:5.3f}".format(mu, NMI))
    return nmis
        
def get_graph_ideal_partition(G):
    P = {}
    for node in G.nodes(data = True):
        if node[1]['cluster'] not in P:
            P[node[1]['cluster']] = []
        P[node[1]['cluster']].append(node[0])
    node_clusters = P.values()
    return NodeClustering(list(node_clusters), G, 'Ideal')

        
iter_num = 25
mus = [0.1 * i for i in range(0, 6)]
algs = {
    "Louvain": algorithms.louvain,
    "Walktrap": algorithms.walktrap,
    "Label propagation": algorithms.label_propagation
}

for algo_name in algs:
    print('ALGORITHM:', algo_name)
    nmis = compare_on_synthetic(algs[algo_name], mus, iter_num)
    print('======================'*3)  

ALGORITHM: Louvain
mu=0.00, NMI: 1.000
mu=0.10, NMI: 1.000
mu=0.20, NMI: 1.000
mu=0.30, NMI: 0.998
mu=0.40, NMI: 0.907
mu=0.50, NMI: 0.295
ALGORITHM: Walktrap
mu=0.00, NMI: 1.000
mu=0.10, NMI: 1.000
mu=0.20, NMI: 1.000
mu=0.30, NMI: 1.000
mu=0.40, NMI: 0.884
mu=0.50, NMI: 0.351
ALGORITHM: Label propagation
mu=0.00, NMI: 1.000
mu=0.10, NMI: 0.453
mu=0.20, NMI: 0.210
mu=0.30, NMI: 0.000
mu=0.40, NMI: 0.000
mu=0.50, NMI: 0.000


### (ii)

In [121]:
def read_net(folder, graph_name):
    """Read network"""
    file_name = graph_name + '.net'
    G = nx.MultiGraph(name = file_name)
    with open(os.path.join(folder, file_name), 'r', encoding='utf8') as f:
        f.readline()
        # add nodes
        for line in f:
            if line.startswith("*"):
                break
            else:
                node_info = line.split("\"")
                node = int(node_info[0]) - 1
                label = node_info[1]
                cluster = int(node_info[2]) if len(node_info) > 2 and len(node_info[2].strip()) > 0 else 0
                G.add_node(node, label=label, cluster=cluster)

        # add edges
        for line in f:
            node1_str, node2_str = line.split()[:2]
            G.add_edge(int(node1_str)-1, int(node2_str)-1)
    return G

def compare_on_lrf(algo_fn, mus, iter_num=10):
    nmis = []
    for mu in mus:
        NMI = 0
        for i in range(iter_num):
            G = read_net("data/LFR/", "LFR_0{}_{}".format(int(mu*10), i))
            clustered_G = algo_fn(G)
            partition = get_graph_ideal_partition(G) 
            NMI += clustered_G.normalized_mutual_information(partition).score / iter_num
            nmis.append(NMI)
        print("mu={:.2f}, NMI: {:5.3f}".format(mu, NMI))
    return nmis
        
def get_graph_ideal_partition(G):
    P = {}
    for node in G.nodes(data = True):
        if node[1]['cluster'] not in P:
            P[node[1]['cluster']] = []
        P[node[1]['cluster']].append(node[0])
    node_clusters = P.values()
    return NodeClustering(list(node_clusters), G, 'Ideal')

iter_num = 25
mus = [0.2 * i for i in range(0, 5)]

for algo_name in algs:
    print('ALGORITHM:', algo_name)
    nmis = compare_on_lrf(algs[algo_name], mus, iter_num)
    print('======================'*3)  

ALGORITHM: Louvain
mu=0.00, NMI: 0.995
mu=0.20, NMI: 0.949
mu=0.40, NMI: 0.868
mu=0.60, NMI: 0.508
mu=0.80, NMI: 0.133
ALGORITHM: Walktrap
mu=0.00, NMI: 1.000
mu=0.20, NMI: 0.975
mu=0.40, NMI: 0.820
mu=0.60, NMI: 0.607
mu=0.80, NMI: 0.427
ALGORITHM: Label propagation
mu=0.00, NMI: 0.998
mu=0.20, NMI: 0.965
mu=0.40, NMI: 0.888
mu=0.60, NMI: 0.000
mu=0.80, NMI: 0.000


## 3. Peers, ties and the Internet

In [48]:
def leiden(G, node_pairs):
    coms = algorithms.leiden(G)
    C = dict()
    for i in range(len(coms.communities)):
        for node in coms.communities[i]:
            C[node] = i
    part_G = community_louvain.induced_graph(C, G)

    scores = []
    for n1, n2 in node_pairs:
        score = 0
        if C[n1] == C[n2]:
            try: 
                nc = len(coms.communities[C[n1]])
                mc = part_G[C[n1]][C[n2]]['weight']
                score = mc / (nc * (nc-1) / 2)
            except KeyError:
                score = 0
        scores.append([n1, n2, score])
    return scores

def sample(G, p):
    nodes = list(G.nodes())
    N = int(G.number_of_edges()*p)
    
    Ln = []
    while len(Ln) != N:
        node1, node2 = random.choice(nodes), random.choice(nodes)
        if not G.has_edge(node1, node2) and (node1, node2) not in Ln:
            Ln.append((node1, node2))
    
    Lp = random.sample(list(G.edges()), N)
    G.remove_edges_from(Lp)  
    
    return Ln, Lp, G, N

def predict(G, Ln, Lp):
    LpLn = [*Lp, *Ln]
    return np.array([leiden(G, LpLn), nx.preferential_attachment(G, LpLn), nx.adamic_adar_index(G, LpLn)])

def AUC(G):
    Ln, Lp, G, N = sample(G.copy(), 0.1)
    
    index = len(Lp)
    predicted = predict(G, Ln, Lp)
    print(predicted)
    scores = []
    for pred in predicted:
        m1, m2 = 0, 0
        for n in range(N):
            s1 = random.sample(pred[:index], 1)[0]
            s2 = random.sample(pred[index:], 1)[0]
            if s2[2] > s1[2]:
                m1 += 1
            elif s1[2] == s2[2]:
                m2 +=1
        scores.append((m1 + m2/2) / N)
    return np.array(scores)

def AUC_runs(G, name, n=10):
    scores = [0, 0, 0]
    for i in range(n):
        scores += AUC(G)
    print(name + ':')
    print("  Leiden:", scores[0]/n)
    print("  Preferential attachment:", scores[1]/n)
    print("  Adamic adar index:", scores[2]/n)

In [49]:
G = nx.Graph(nx.read_pajek("data/gnutella.net")).to_undirected()
AUC_runs(G, "Gnutella", 10)

TypeError: 'generator' object is not subscriptable

In [50]:
Ln, Lp, G, N = sample(G.copy(), 0.1)

index = len(Ln)
predicted = predict(G, Ln, Lp)

In [54]:
pred = predicted[0]
random.sample(pred[:3], 1)[0]

['16889', '24967', 0]

In [7]:
G = nx.Graph(nx.read_pajek("data/circles.net")).to_undirected()
AUC_runs(G, "Facebook", 10)

Gnutella:
  Leiden: 0.9557350107673127
  Preferential attachment: 0.8314065510597303
  Adamic adar index: 0.9929049076277909


In [8]:
G = nx.Graph(nx.read_pajek("data/nec.net")).to_undirected()
AUC_runs(G, "Internet", 10)

Internet:
  Leiden: 0.8994864403459182
  Preferential attachment: 0.8232417228736951
  Adamic adar index: 0.6972222999636171


## 4. Get at least 70% right!

In [5]:
def read_and_split(filepath):
    G = nx.Graph() 

    with open(filepath, 'r') as f:
        f.readline()
        test = []
        for line in f:
            if line.startswith("*"):
                continue

            l = line.split()
            if len(l) == 3:
                if l[1][-3:-1] == '13':
                    G.add_node(int(l[0]))
                    test.append([int(l[0]), l[2]])
                else:
                    G.add_node(int(l[0]), label=l[2])
            else:
                G.add_edge(int(l[0]), int(l[1]))

    return G, test

In [6]:
G, test = read_and_split("data/aps_2008_2013.net")
predicted = node_classification.local_and_global_consistency(G)

count = 0
for (node, label) in test:
    if predicted[node-1] == label:
        count += 1

acc = count / len(test)
acc

0.7289582669640012