# Experiments on Wikilinks

In [1]:
from time import time
t0 = time()

from functools import partial
from multiprocessing import Pool
import pickle

import numpy as np

from sknetwork.data import load_netset, block_model
from sknetwork.ranking import Dirichlet
from sknetwork.utils import bipartite2undirected
from sknetwork.utils.check import check_labels, check_n_jobs

from sklearn.metrics import f1_score

### Algorithms

In [2]:
def temperatures(A, seeds, n_jobs=2, n_iter=10):
    n_jobs = check_n_jobs(n_jobs)
    dirichlet = Dirichlet()
    
    classes = np.unique(seeds[seeds>=0])
    seeds_ova = []
    for c in classes:
        seeds_binary = -np.ones_like(seeds)
        seeds_binary[(seeds >= 0)] = 0.
        seeds_binary[(seeds == c)] = 1.
        seeds_ova.append(seeds_binary)
        
    local_function = partial(dirichlet.fit_transform, A)
    with Pool(n_jobs) as pool:
        T = np.array(pool.map(local_function, seeds_ova)).T
        
    return T

In [3]:
def vanilla(T):
    return T.argmax(axis=1)

In [4]:
def weighted(T, seeds):
    scores = T.copy()
    classes, counts = np.unique(seeds[seeds >= 0], return_counts=True)
    ix = np.argwhere(seeds <= 0).ravel()
    for i, c in enumerate(classes):
        scores[ix, i] *= counts[i] / scores[ix, i].sum()
    return scores.argmax(axis=1)

In [5]:
def centered(T):
    return (T - T.mean(axis=0)).argmax(axis=1)

### Seed sampling

In [6]:
def node_sampling(labels, frac = 0.01):
    n = len(labels)
    n_seeds = int(n * frac)
    labels_seeds = -np.ones(n)
    seeds = np.random.choice(n, n_seeds, replace=False)
    labels_seeds[seeds] = labels[seeds]
    
    return labels_seeds

In [7]:
def edge_sampling(adjacency, labels, frac = 0.01):
    n = len(labels)
    n_seeds = int(n * frac)
    labels_seeds = -np.ones(n)
    probas = adjacency.dot(np.ones(n))
    probas /= probas.sum()
    seeds = np.random.choice(n, n_seeds, replace=False, p=probas)
    labels_seeds[seeds] = labels[seeds]
    
    return labels_seeds

## Dataset

In [8]:
wikilinks = load_netset('wikilinks')

In [9]:
del wikilinks['names_col']
del wikilinks['names']
del wikilinks['wikidata_ids']

In [10]:
wikilinks.adjacency = bipartite2undirected(wikilinks.adjacency).astype(bool)

In [11]:
wikilinks

{'biadjacency': <3210346x3506207 sparse matrix of type '<class 'numpy.bool_'>'
 	with 138134642 stored elements in Compressed Sparse Row format>,
 'adjacency': <6420692x6420692 sparse matrix of type '<class 'numpy.bool_'>'
 	with 134392592 stored elements in Compressed Sparse Row format>,
 'meta': {'description': 'Partial graph of Wikipedia (2013 dump). The adjacency represents the links between articles. The biadjacency represents the articles (rows) and the stemmed factors (columns) they contain.',
  'sources': 'Built from https://github.com/tscheepers/Wikipedia-Summary-Dataset and https://snap.stanford.edu/data/enwiki-2013.html',
  'name': 'wikilinks'}}

## Computing labels for Wikilinks

In [12]:
from sknetwork.clustering import BiLouvain
bilouvain = BiLouvain(resolution=1.)
bilouvain.fit(wikilinks.biadjacency)

BiLouvain(resolution=1.0, modularity='dugue', tol_aggregation=0.001, n_aggregations=-1, shuffle_nodes=False, sort_clusters=True, return_membership=True, return_aggregate=True)

In [13]:
del wikilinks['biadjacency']

In [14]:
new_labels = bilouvain.labels_row_.copy()
new_labels[new_labels > 8] = 9

In [15]:
_, counts = np.unique(new_labels, return_counts=True)

In [16]:
counts[-1] / counts.sum()

0.004788580420926592

In [17]:
np.save('labels_wikilinks', new_labels)

In [18]:
labels = np.load('labels_wikilinks.npy')
wikilinks.labels = np.concatenate((labels, labels))

In [19]:
dataset = wikilinks

### Classification

In [20]:
def benchmark(dataset, seeds_set):
    ns = len(seeds_set)
    scores = {'Vanilla': np.zeros(ns),
              'Weighted': np.zeros(ns),
              'Centered': np.zeros(ns)}
    for i, seeds in enumerate(seeds_set):
        T = temperatures(dataset.adjacency, seeds)
        labels_v = vanilla(T)
        labels_w = weighted(T, seeds)
        labels_c = centered(T)
        
        scores['Vanilla'][i] = f1_score(labels_v, dataset.labels, average='macro')
        scores['Weighted'][i] = f1_score(labels_w, dataset.labels, average='macro')
        scores['Centered'][i] = f1_score(labels_c, dataset.labels, average='macro')
    return scores

In [21]:
def display(scores):
    μ = scores.mean()
    σ = scores.std()
    return '{:.2f} pm {:.2f}'.format(μ, σ)

In [22]:
n_runs = 10

#### Uniform sampling

In [23]:
np.random.seed(0)
seeds_ns = [node_sampling(dataset.labels) for i in range(n_runs)]

In [24]:
scores_ns = benchmark(dataset, seeds_ns)

for key, val in scores_ns.items():
    print(key, display(val))

Vanilla 0.19 pm 0.01
Weighted 0.18 pm 0.01
Centered 0.55 pm 0.01


#### Degree sampling

In [25]:
np.random.seed(0)
seeds_es = [edge_sampling(dataset.adjacency, dataset.labels) for i in range(n_runs)]

In [26]:
scores_es = benchmark(dataset, seeds_es)

for key, val in scores_es.items():
    print(key, display(val))

Vanilla 0.34 pm 0.00
Weighted 0.40 pm 0.00
Centered 0.45 pm 0.00


In [27]:
total_runtime = time() - t0
print(total_runtime / 60)

28.130960885683695
