# Social datasets from SNAP
* Download website: https://snap.stanford.edu/data/

In [1]:
from time import time
# Start the wall-clock timer; the elapsed time is reported at the end of the notebook.
t0 = time()

from csv import reader

import numpy as np
from scipy import sparse

from sknetwork.data import load_edge_list, Bunch
from sknetwork.utils import edgelist2biadjacency
from sknetwork.ranking import Dirichlet, top_k

from sklearn.metrics import f1_score

## Datasets

In [2]:
def snap_dataset(name):
    """Load a SNAP graph together with its top-5000 community file.

    Reads ``<name>.ungraph.txt`` (undirected edge list) and
    ``<name>.top5000.cmty.txt`` (one community per line, tab-separated node
    identifiers) relative to the current working directory.

    Parameters
    ----------
    name : str
        File prefix, e.g. ``'com-dblp'``.

    Returns
    -------
    dataset
        The object returned by ``load_edge_list``, with two extra attributes:
        ``ix``, a dict mapping each entry of ``dataset.names`` to its position,
        and ``membership``, a sparse CSR matrix with as many rows as the
        adjacency matrix and the columns produced by ``edgelist2biadjacency``
        (presumably one per community line read).

    Raises
    ------
    KeyError
        If the community file mentions a node identifier that is not in
        ``dataset.names``.
    """
    dataset = load_edge_list(name + '.ungraph.txt', directed=False)
    n = dataset.adjacency.shape[0]

    # Map original node identifiers to row indices of the adjacency matrix.
    dataset.ix = {node: i for i, node in enumerate(dataset.names)}

    # One (node index, community index) pair per membership.
    edgelist = []
    with open(name + '.top5000.cmty.txt', 'r', encoding='utf-8') as f:
        for i, line in enumerate(reader(f, delimiter='\t')):
            edgelist.extend((dataset.ix[int(node)], i) for node in line)

    B = edgelist2biadjacency(edgelist)
    # B may have fewer rows than the adjacency (presumably it stops at the
    # largest node index appearing in a community); pad with empty rows.
    delta = n - B.shape[0]
    block = sparse.csr_matrix((delta, B.shape[1]), dtype=bool)
    # Request CSR explicitly: downstream code slices columns of this matrix,
    # which requires an indexable sparse format.
    dataset.membership = sparse.bmat([[B], [block]], format='csr')

    return dataset

In [3]:
# Load the 'com-dblp' graph and its community membership matrix.
dblp = snap_dataset('com-dblp')

In [4]:
# Display a summary of the adjacency matrix (shape, dtype, stored elements).
dblp.adjacency

<317080x317080 sparse matrix of type '<class 'numpy.bool_'>'
	with 2099732 stored elements in Compressed Sparse Row format>

In [5]:
# Fraction of nodes that belong to at least one of the loaded communities.
B = dblp.membership
d = B.dot(np.ones(B.shape[1]))  # row sums: number of communities per node
(d > 0).mean()

0.2946638072410748

In [6]:
# Load the 'com-amazon' graph and its community membership matrix.
amazon = snap_dataset('com-amazon')

In [7]:
# Display a summary of the adjacency matrix (shape, dtype, stored elements).
amazon.adjacency

<334863x334863 sparse matrix of type '<class 'numpy.bool_'>'
	with 1851744 stored elements in Compressed Sparse Row format>

In [8]:
# Fraction of nodes that belong to at least one of the loaded communities.
B = amazon.membership
d = B.dot(np.ones(B.shape[1]))  # row sums: number of communities per node
(d > 0).mean()

0.04991892206663621

## Binary classification

As these datasets are multi-label (i.e. one node may belong to several classes), we perform an independent binary classification for each class.

In [9]:
def balanced_sampling(labels, frac=0.01):
    """Reveal the same fraction of labels in every class, hide the rest.

    Parameters
    ----------
    labels : np.ndarray
        1-D array of class labels, one per sample.
    frac : float
        Fraction of each class to keep as seeds; the per-class seed count is
        ``int(frac * class_size)``, so very small classes may get no seed.

    Returns
    -------
    np.ndarray
        Array shaped like ``labels`` holding the true label at the sampled
        positions and -1 everywhere else.

    Notes
    -----
    Sampling uses the global ``np.random`` state, without replacement.
    """
    labels_seeds = -np.ones_like(labels)
    classes, counts = np.unique(labels, return_counts=True)

    for label, count in zip(classes, counts):
        members = np.argwhere(labels == label).ravel()
        picked = np.random.choice(members, int(frac * count), replace=False)
        labels_seeds[picked] = label

    return labels_seeds

In [10]:
def get_labels(A, seeds):
    """Derive three binary labelings from the Dirichlet scores of a graph.

    Parameters
    ----------
    A : sparse matrix
        Adjacency matrix passed to ``Dirichlet.fit_transform``.
    seeds : np.ndarray
        Per-node seed labels; negative entries mark nodes without a label.
        NOTE(review): the weighting below treats non-negative seeds as 0/1
        labels -- confirm against the caller.

    Returns
    -------
    tuple of np.ndarray
        ``(labels_vanilla, labels_weighted, labels_centered)``, each cast to int:
        scores thresholded at 0.5; seed labels kept as-is with the remaining
        nodes decided by a prior-weighted comparison of normalized scores;
        scores thresholded at their mean.
    """
    scores = Dirichlet().fit_transform(A, seeds)

    # Vanilla rule: fixed threshold.
    labels_vanilla = (scores > 0.5).astype(int)

    # Weighted rule: only nodes without a seed are decided; seeds keep their label.
    prior = seeds[seeds >= 0].mean()  # share of positive labels among the seeds
    unlabeled = np.argwhere(seeds < 0).ravel()
    scores_free = scores[unlabeled]
    complement = 1 - scores_free
    positive_side = prior * scores_free / scores_free.sum()
    negative_side = (1 - prior) * complement / complement.sum()

    labels_weighted = seeds.copy()
    labels_weighted[unlabeled] = positive_side > negative_side
    labels_weighted = labels_weighted.astype(int)

    # Centered rule: threshold at the average score.
    labels_centered = (scores > scores.mean()).astype(int)

    return labels_vanilla, labels_weighted, labels_centered

In [11]:
def benchmark(dataset, n_classes=10, frac=0.01, n_runs=10):
    """Average F1 score of the three labeling rules over the top classes.

    For each run and each of the ``n_classes`` columns of
    ``dataset.membership`` selected by ``top_k`` on the column sums, a binary
    problem is built, seeds are drawn with ``balanced_sampling`` and the three
    labelings returned by ``get_labels`` are scored with ``f1_score`` against
    the full ground truth.

    Parameters
    ----------
    dataset
        Object exposing ``adjacency`` and ``membership`` sparse matrices.
    n_classes : int
        Number of classes (membership columns) to evaluate.
    frac : float
        Fraction of each binary class revealed as seeds.
    n_runs : int
        Number of independent repetitions.

    Returns
    -------
    tuple of np.ndarray
        Mean and standard deviation over runs of the class-averaged F1 scores,
        each of length 3 in the order (vanilla, weighted, centered).
    """
    membership = dataset.membership
    classes_sizes = membership.T.dot(np.ones(membership.shape[0]))
    top_classes = top_k(classes_sizes, n_classes)
    membership = membership[:, top_classes]

    adjacency = dataset.adjacency

    scores = np.zeros((n_runs, 3))
    for run in range(n_runs):
        scores_binary = np.zeros((n_classes, 3))
        for j in range(n_classes):
            # Binary ground truth of class j: 1 for members, 0 otherwise.
            y_true = membership[:, j].toarray().ravel().astype(int)
            seeds = balanced_sampling(y_true, frac)

            predictions = get_labels(adjacency, seeds)
            scores_binary[j] = [f1_score(y_true, labels) for labels in predictions]

        # Average over classes for this run.
        scores[run] = scores_binary.mean(axis=0)

    return scores.mean(axis=0), scores.std(axis=0)

In [12]:
# Fix the global NumPy seed so the seed sampling is reproducible.
np.random.seed(0)
# Benchmark on 3 classes with 1% of each class revealed (default number of runs).
μ, σ = benchmark(dblp, n_classes=3, frac=1e-2)

In [13]:
# Mean and standard deviation of the F1 scores (vanilla, weighted, centered).
μ, σ

(array([0.04100458, 0.03935462, 0.18731496]),
 array([0.00344166, 0.00299894, 0.0121849 ]))

In [14]:
# Fix the global NumPy seed so the seed sampling is reproducible.
np.random.seed(0)
# Benchmark on 3 classes with 1% of each class revealed (default number of runs).
μ, σ = benchmark(amazon, n_classes=3, frac=1e-2)

In [15]:
# Mean and standard deviation of the F1 scores (vanilla, weighted, centered).
μ, σ

(array([0.0532165 , 0.05361524, 0.17924359]),
 array([0.01293815, 0.01266617, 0.16127366]))

In [16]:
# Wall-clock time elapsed since t0 was set, printed in minutes.
total_runtime = time() - t0
print(total_runtime / 60)

0.3768748084704081
