# Practical Assignment — Community detection

In [None]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In this assignment, we try to detect communities in the Les Miserables graph. But first, let us drop the edge weights.

In [None]:
# Rebuild the Les Miserables graph from its bare edge list: iterating `.edges`
# yields plain (u, v) pairs, so no edge attributes are carried over.
# Nodes are inserted in the order they first appear in the edge list, and
# the index-based checks further down rely on that `G.nodes` order.
G = nx.Graph()
G.add_edges_from(nx.les_miserables_graph().edges)
# Compute the node coordinates once so that every figure drawn with `pos`
# shares the same layout (nx_agraph relies on the pygraphviz bindings).
pos = nx.nx_agraph.graphviz_layout(G)

### Task. k-core decomposition
Let us apply k-core decomposition from networkx. Use `nx.k_core` and colorize every node by its k-shell.

Write a function `k_core_decompose` that takes a graph `G` and returns a np.array with the k-shell index k of every node. For example, if there are nodes 1, 2, 3, 4, node 1 is in the 1-shell and nodes 2, 3, 4 are in the 2-shell, then the output will be [1, 2, 2, 2].

In [None]:
def k_core_decompose(G):
    """Return the k-shell index of every node of ``G``.

    A node lies in the k-shell when it belongs to the k-core but not to the
    (k+1)-core, i.e. when its core number equals k.

    Parameters
    ----------
    G : networkx.Graph
        Graph without self-loops (a requirement of ``nx.core_number``).

    Returns
    -------
    np.ndarray
        Integer array of shape ``(len(G),)`` ordered like ``G.nodes``;
        entry ``i`` is the shell index of the ``i``-th node.
    """
    # The core number of a node is the largest k whose k-core contains it,
    # which is exactly its shell index; `nx.k_core` is built on these values.
    core_number = nx.core_number(G)
    return np.array([core_number[node] for node in G.nodes], dtype=int)

In [None]:
# Sanity checks for `k_core_decompose` on the full graph.
colors = k_core_decompose(G)
assert colors.shape == (77,)  # one shell index per node
assert colors.min() == 1  # lowest shell index present
assert colors.max() == 9  # highest shell index present
# Spot checks at fixed positions of the returned array.
assert colors[10] == 8
assert colors[65] == 9

Let us draw k-shells of the graph in some k-cores.

In [None]:
# Draw the k-shells found inside the 1-core ... 8-core, one panel per core.
plt.figure(figsize=(16, 32))

# Bounding box of the shared layout, so every panel uses identical axes.
coords = np.array(list(pos.values()))
x_min, y_min = coords.min(axis=0)
x_max, y_max = coords.max(axis=0)
# Margin around the bounding box: 5% of the horizontal extent on every side.
eps = (x_max - x_min) * 0.05

node_style = dict(
    cmap=plt.cm.gist_rainbow,
    node_size=100,
    linewidths=1,
    edgecolors='black',
)
edge_style = dict(alpha=0.2, width=1, edge_color='black')

for k in range(1, 9):
    plt.subplot(4, 2, k)
    subG = nx.k_core(G, k)
    # Colour each remaining node by its shell index within this core.
    nodes = nx.draw_networkx_nodes(
        subG,
        pos,
        node_color=k_core_decompose(subG),
        **node_style
    )
    nx.draw_networkx_edges(subG, pos, **edge_style)
    plt.xlim(x_min - eps, x_max + eps)
    plt.ylim(y_min - eps, y_max + eps)
    plt.legend(*nodes.legend_elements())
    plt.axis('off')
    plt.title('k-shells on {}-core'.format(k))

### Task. Clique detection

A clique is a complete subgraph — a simple model of a community. Find the largest cliques in the graph with `nx.find_cliques` and visualize them.

Write a function `largest_cliques` that takes a graph, finds all the largest cliques (those sharing the maximum size) and collects colors and widths by the following rule: all nodes are white except for the nodes in the largest clique, and all edges are thin except for the edges inside the clique. The function returns a tuple of 2 np.arrays: 
* rgb np.array has a shape [n, m, 3] where n is a number of cliques, m is a number of nodes, 3 is (r, g, b) values in the interval [0-1]. 
* width np.array has a shape [n, k] where n is a number of cliques, k is a number of edges. 

Colors should be ordered by `G.nodes`. Widths should be ordered by `G.edges`.

In [None]:
def largest_cliques(G):
    """Build node colours and edge widths that highlight each largest clique.

    Parameters
    ----------
    G : networkx.Graph
        Undirected graph to search for cliques.

    Returns
    -------
    colors : np.ndarray
        Float array of shape ``(n, m, 3)`` with RGB values in [0, 1], where
        ``n`` is the number of largest cliques and ``m`` the number of nodes
        (ordered like ``G.nodes``). Clique members are red, all other nodes
        are white.
    widths : np.ndarray
        Float array of shape ``(n, k)`` where ``k`` is the number of edges
        (ordered like ``G.edges``). Edges with both endpoints in the clique
        are thick, all other edges are thin.
    """
    clique_color = (1.0, 0.0, 0.0)
    thin, thick = 0.5, 2.0

    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()

    cliques = list(nx.find_cliques(G))
    if not cliques:
        # A graph without nodes has no cliques: return correctly shaped empties.
        return np.empty((0, n_nodes, 3)), np.empty((0, n_edges))

    # `find_cliques` enumerates maximal cliques; the ones of maximum size are
    # exactly the largest cliques of the graph.
    largest_size = max(map(len, cliques))
    largest = [set(clique) for clique in cliques if len(clique) == largest_size]

    colors = np.ones((len(largest), n_nodes, 3))  # white everywhere by default
    widths = np.full((len(largest), n_edges), thin)
    for row, members in enumerate(largest):
        is_member = np.array([node in members for node in G.nodes], dtype=bool)
        colors[row, is_member] = clique_color
        is_inner = np.array(
            [u in members and v in members for u, v in G.edges], dtype=bool
        )
        widths[row, is_inner] = thick
    return colors, widths

In [None]:
# Sanity checks for `largest_cliques` on the full graph.
colors, widths = largest_cliques(G)
# Two largest cliques are expected, each with an RGB triple for all 77 nodes.
assert colors.shape == (2, 77, 3)
# `np.unique(..., axis=0)` returns the distinct rows in sorted order together
# with their counts: the colour that sorts first must occur exactly 10 times
# (presumably the clique colour, which therefore has to sort before white).
assert np.unique(colors[0], axis=0, return_counts=True)[1][0] == 10
assert np.unique(colors[1], axis=0, return_counts=True)[1][0] == 10
# One width per edge of the graph for each clique.
assert widths.shape == (2, 254)
# Distinct widths come back sorted ascending: the second (larger) value must
# occur 45 times, the number of edges inside a 10-node clique.
assert np.unique(widths[0], axis=0, return_counts=True)[1][1] == 45
assert np.unique(widths[1], axis=0, return_counts=True)[1][1] == 45

Let us draw a few examples of found cliques.

In [None]:
# Draw one panel per largest clique returned by `largest_cliques`.
n_cliques = colors.shape[0]
# Size the figure and the subplot grid from the data instead of assuming
# exactly two cliques; a fixed 1x2 grid fails as soon as a third panel is
# requested. With two cliques this is the original 16x8 figure.
plt.figure(figsize=(8 * max(n_cliques, 1), 8))

for i in range(n_cliques):
    plt.subplot(1, n_cliques, i + 1)
    nx.draw_networkx_nodes(
        G,
        pos,
        node_color=colors[i],
        node_size=100,
        linewidths=1,
        edgecolors='black'
    )
    nx.draw_networkx_edges(
        G,
        pos,
        width=widths[i],
        edge_color='black'
    )
    plt.title('The largest clique')
    plt.axis('off')

### Task. Edge betweenness
Let us apply Girvan Newman algorithm with edge betweenness. Use `nx.algorithms.community.girvan_newman` to find communities.

Write a function `edge_betweenness` that takes a graph and the number of divisions and returns a np.array of (integer) labels of nodes at each iteration. The shape of the output is [n, m], where n is the number of iterations and m is the number of nodes.

In [None]:
def edge_betweenness(G, n):
    """Label the nodes after each of the first ``n`` Girvan-Newman splits.

    Parameters
    ----------
    G : networkx.Graph
        Graph to partition; it is not modified.
    n : int
        Number of successive partitions to take (must be non-negative).

    Returns
    -------
    np.ndarray
        Integer array of shape ``(n, m)`` where ``m`` is the number of nodes
        (ordered like ``G.nodes``). Row ``i`` holds, for every node, the index
        of its community within the ``i``-th partition. Fewer than ``n`` rows
        are returned if the algorithm runs out of partitions first.
    """
    from itertools import islice

    position = {node: idx for idx, node in enumerate(G.nodes)}
    # `girvan_newman` is a lazy generator; `islice` stops after `n`
    # partitions so no further (expensive) splits are computed.
    partitions = islice(nx.algorithms.community.girvan_newman(G), n)

    rows = []
    for communities in partitions:
        row = np.zeros(len(position), dtype=int)
        for label, members in enumerate(communities):
            row[[position[node] for node in members]] = label
        rows.append(row)
    # The reshape keeps the result two-dimensional even when no row was produced.
    return np.array(rows, dtype=int).reshape(len(rows), len(position))

In [None]:
# Sanity checks for `edge_betweenness` on the full graph.
labels = edge_betweenness(G, 6)
assert labels.shape == (6, 77)  # one row per iteration, one label per node
# The first iteration must produce exactly two distinct labels.
assert np.unique(labels[0]).shape == (2,)
# The number of distinct labels must grow from iteration 0 to 1 to 5.
assert np.unique(labels[0]).shape[0] < np.unique(labels[1]).shape[0] < np.unique(labels[5]).shape[0]

Let us draw Girvan Newman algorithm step-by-step.

In [None]:
# Show the first six Girvan-Newman partitions on a 3x2 grid of panels.
plt.figure(figsize=(16, 24))
colors = edge_betweenness(G, 6)

node_style = dict(
    cmap=plt.cm.rainbow,
    node_size=100,
    linewidths=1,
    edgecolors='black',
)
edge_style = dict(alpha=0.2, edge_color='black')

for panel, node_labels in enumerate(colors, start=1):
    plt.subplot(3, 2, panel)
    # Nodes are coloured by the community label they carry in this partition.
    nx.draw_networkx_nodes(G, pos, node_color=node_labels, **node_style)
    nx.draw_networkx_edges(G, pos, **edge_style)
    # The title counts communities as panel number + 1 (2 for the first panel).
    plt.title('Edge betweenness, {} communities'.format(panel + 1))
    plt.axis('off')

### Task. Modularity

Modularity helps to decide when to stop splitting the graph. The larger the modularity, the better the partitioning. Let us see how modularity changes during division.

Write a function `edge_betw_modularity` that takes a graph, number of iterations of Girvan Newman algorithm and returns a np.array with modularity after each iteration. Use `nx.algorithms.community.modularity`.

In [None]:
def edge_betw_modularity(G, n):
    """Compute the modularity after each of the first ``n`` Girvan-Newman splits.

    Parameters
    ----------
    G : networkx.Graph
        Graph to partition; it is not modified.
    n : int
        Number of successive partitions to evaluate (must be non-negative).

    Returns
    -------
    np.ndarray
        Float array of shape ``(n,)``; entry ``i`` is the modularity of the
        ``i``-th partition measured on ``G``. Fewer entries are returned if
        the algorithm runs out of partitions first.
    """
    from itertools import islice

    community = nx.algorithms.community
    # `girvan_newman` is lazy; `islice` stops after `n` partitions so no
    # further (expensive) splits are computed.
    partitions = islice(community.girvan_newman(G), n)
    return np.array(
        [community.modularity(G, partition) for partition in partitions],
        dtype=float,
    )

In [None]:
# Sanity checks for `edge_betw_modularity` on the full graph.
modularities = edge_betw_modularity(G, 8)
assert modularities.shape == (8, )  # one score per iteration
# Reference scores for the first and the last iteration, to 4 decimals.
assert round(modularities[0], 4) == 0.0746
assert round(modularities[-1], 4) == 0.4519

Let us plot modularity against the number of communities obtained at each iteration.

In [None]:
# Plot modularity against the number of communities and mark the maximum.
plt.figure(figsize=(8, 6))
# Iteration i is plotted at i + 2 communities (assumes the first split yields
# two communities, i.e. a connected graph). The range is derived from the
# data instead of being hard-coded for exactly 8 iterations.
n_communities = np.arange(2, len(modularities) + 2)
plt.plot(n_communities, modularities)
best_n = n_communities[np.argmax(modularities)]
# Use '--' rather than 'k--': the format string must not set a colour that
# conflicts with the explicit `c='tab:red'` (matplotlib warns about it).
plt.plot(
    [best_n, best_n], [min(modularities), max(modularities)],
    '--', c='tab:red', label='number of communities with max modularity {:.2f}'.format(max(modularities))
)
plt.ylabel('Modularity score')
plt.xlabel('Number of communities')
plt.legend(loc='upper left')
plt.ylim(min(modularities), 0.5)
plt.show()