In [2]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Use matplotlib's interactive notebook backend for the figures in this notebook.
%matplotlib notebook

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

In [4]:
# Pin the PyTorch and NumPy random number generators to one shared seed and
# ask cuDNN for deterministic behaviour (no auto-tuned algorithm selection).
seed = 42
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed)
# Kept as the final statement: it returns None, so the cell displays no output.
np.random.seed(seed)

In [5]:
from src.nettack.GCN import *
from src.nettack.utils import *
from src.nettack.nettack import *

# Tests

## Layers

In [26]:
# Smoke test: one graph-convolution layer with 8 input and 5 output features.
layer = GraphConvolution(in_features=8, out_features=5)

In [27]:
# Random test graph with N nodes: an adjacency entry is 1 wherever a uniform
# draw exceeds 0.7, stored as a sparse float32 tensor.
N = 30
Adjacency_matrix = torch.tensor((np.random.rand(N, N) > 0.7).astype(np.float32)).to_sparse()
# Random sparse features with 8 columns, matching the layer's in_features.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.rand(N, 8).to_sparse()

In [28]:
# Forward pass; the displayed shape has one row per node and out_features columns.
layer(input, Adjacency_matrix).shape

torch.Size([30, 5])

## GCN

In [29]:
# Synthetic data for the GCN smoke test.
# N: number of nodes, C: exclusive upper bound of the random labels,
# D: number of feature columns.
N = 30
C = 5
D = 20
# Random sparse adjacency: an entry is 1 wherever a uniform draw exceeds 0.7.
Adjacency_matrix = torch.tensor((np.random.rand(N, N) > 0.7).astype(np.float32)).to_sparse()

# Dense random features and random integer labels in [0, C).
X_observed = torch.rand(N, D)
Y_observed = torch.randint(low=0, high=C, size=(N,))

# First entry of the `sizes` list given to GCN — presumably the hidden width; confirm.
H = 10

In [30]:
# Build the test GCN on the synthetic graph and features, with ReLU enabled.
# NOTE(review): assumes sizes=[H, C] means [hidden width, number of classes] — confirm against GCN.
gcn = GCN(sizes=[H, C],
          An=Adjacency_matrix,
          X_obs=X_observed,
          name='test', with_relu=True, params_dict={})

In [31]:
# Wrap the network in GCN_Model (which exposes train()) with learning rate 1e-3.
gcn_model = GCN_Model(gcn, lr=1e-3)

In [32]:
# Sample node indices and labels.
# NOTE(review): neither tensor is used by the cells shown in this section, and
# `node_labels` is later rebound to a pandas DataFrame in the "Random" section.
node_ids = torch.tensor([1, 3, 7])
node_labels = torch.tensor([2, 0, 3])

In [33]:
# Node indices passed to train() as the training and validation splits.
train_nodes = [1, 2, 5, 6, 7, 9, 15]
val_nodes = [17, 18, 19, 20, 22]

In [34]:
# Train on the synthetic data; losses and metrics are printed per iteration.
gcn_model.train(split_train=train_nodes, split_val=val_nodes, Z_obs=Y_observed)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Iteration 0 : 
Training loss : 17.686004638671875
Validation loss : 17.04974937438965
Training metric : 0.43386243386243384
Validation metric : 0.0
Iteration 1 : 
Training loss : 16.561416625976562
Validation loss : 15.877942085266113
Training metric : 0.43386243386243384
Validation metric : 0.0
Iteration 2 : 
Training loss : 15.435795783996582
Validation loss : 14.719581604003906
Training metric : 0.43386243386243384
Validation metric : 0.0
Iteration 3 : 
Training loss : 14.317334175109863
Validation loss : 13.57121467590332
Training metric : 0.43386243386243384
Validation metric : 0.0
Iteration 4 : 
Training loss : 13.211156845092773
Validation loss : 12.431061744689941
Training metric : 0.43386243386243384
Validation metric : 0.0
Iteration 5 : 
Training loss : 12.116142272949219
Validation loss : 11.31371021270752
Training metric : 0.43386243386243384
Validation metric : 0.0
Iteration 6 : 
Training loss : 11.031316757202148
Validation loss : 10.21229076385498
Training metric : 0.433

## Random

In [36]:
# Citeseer node labels: headerless CSV read into the columns (id, label).
node_labels = pd.read_csv('datasets/citeseer/citeseer.node_labels', header=None, names=['id', 'label'])

In [37]:
# Citeseer edge list: headerless CSV read into the columns (e1, e2, weight).
edges = pd.read_csv('datasets/citeseer/citeseer.edges', header=None, names=['e1', 'e2', 'weight'])

In [38]:
# Build an undirected graph whose nodes carry their class label as an attribute.
# NOTE(review): `nx` (networkx) is not imported explicitly in this notebook; it is
# presumably provided by one of the star imports at the top — confirm.
G = nx.Graph()
for node_id, label in zip(node_labels['id'], node_labels['label']):
    G.add_node(node_id, label=label)
# Hand the endpoints over as plain (e1, e2) tuples. Iterating a DataFrame yields
# its column labels, so passing the frame itself would make add_edges_from unpack
# the strings 'e1' and 'e2' into the bogus edges ('e', '1') and ('e', '2').
G.add_edges_from(edges[['e1', 'e2']].itertuples(index=False, name=None))

## Nettack

### Data preprocessing

In [6]:
# Load the Citeseer data: adjacency matrix, feature matrix and label vector.
_A_obs, _X_obs, _z_obs = load_npz('src/data/citeseer.npz')

In [7]:
# Adjacency matrix (displayed as a sparse-matrix summary: shape, dtype, stored entries)
_A_obs

<3312x3312 sparse matrix of type '<class 'numpy.float32'>'
	with 4715 stored elements in Compressed Sparse Row format>

In [8]:
# Feature matrix (displayed as a sparse-matrix summary: shape, dtype, stored entries)
_X_obs

<3312x3703 sparse matrix of type '<class 'numpy.int64'>'
	with 105165 stored elements in Compressed Sparse Row format>

In [21]:
# Densify the first node's feature row into a 1-D array and show its length
# (the number of feature columns).
np.array(_X_obs[0].todense())[0].shape

(3703,)

In [8]:
# Labels: number of nodes per class, most frequent class first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts();
# the bare `_z_obs` line was dropped because a non-final expression in a cell
# is never displayed.
pd.Series(_z_obs).value_counts()

2    701
4    668
1    596
5    590
3    508
0    249
dtype: int64

In [9]:
# Symmetrize the adjacency matrix (A + A^T) and clip every entry above 1 back
# to 1. This makes the graph undirected and unweighted; it is not a normalization.
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1

In [10]:
# For the algorithm to work, we have to consider a connected graph.
# `lcc` is used in the next cell to index the rows/columns that are kept.
lcc = largest_connected_components(_A_obs)

Selecting 1 largest connected components


In [11]:
print(f'Largest connected component has {len(lcc)} nodes')
# Keep only the largest connected component of the graph.
# NOTE: this overwrites _A_obs, _X_obs and _z_obs, so the cell is not re-runnable:
# running it a second time would index the already-reduced arrays with `lcc` again.
_A_obs = _A_obs[lcc][:,lcc]
# The features are also cast to float32 here.
_X_obs = _X_obs[lcc].astype('float32')
_z_obs = _z_obs[lcc]

Largest connected component has 2110 nodes


In [12]:
# Sanity checks on the reduced graph.
# 1) The adjacency matrix equals its transpose.
assert np.abs(_A_obs - _A_obs.T).sum() == 0, "Input graph is not symmetric"
# 2) The maximum entry is 1 and all stored entries share a single value.
assert _A_obs.max() == 1 and len(np.unique(_A_obs[_A_obs.nonzero()].A1)) == 1, "Graph must be unweighted"
# 3) Every column sum is positive, i.e. no node is isolated.
assert _A_obs.sum(0).A1.min() > 0, "Graph contains singleton nodes"

In [13]:
# _N: number of nodes; _K: largest label value plus one, used as the class count.
_N = _A_obs.shape[0]
_K = _z_obs.max()+1

# In our pytorch implementation, labels are not one hot encoded
# _Z_obs = np.eye(_K)[_z_obs]
# _Z_obs is therefore just another name for the integer label array _z_obs.
_Z_obs = _z_obs
# Normalizing adjacency matrix
# NOTE(review): the exact normalization is defined by preprocess_graph — not visible here.
_An = preprocess_graph(_A_obs)
# Layer sizes handed to GCN: 16, then _K.
sizes = [16, _K]
# Column sums of the adjacency matrix, flattened to 1-D (node degrees for a 0/1 matrix).
degrees = _A_obs.sum(0).A1


# Re-seed NumPy immediately before the data split.
np.random.seed(seed)

In [14]:
# Split the node indices into roughly 10% train, 10% validation and 80%
# unlabeled, stratified by class label.
unlabeled_share = 0.8
val_share = 0.1
train_share = 1 - unlabeled_share - val_share
splits = train_val_test_split_tabular(np.arange(_N), train_size=train_share,
                                      val_size=val_share, test_size=unlabeled_share,
                                      stratify=_z_obs)
split_train, split_val, split_unlabeled = splits

In [15]:
# Report the size of each split.
print(f'Number of training node : {len(split_train)}')
print(f'Number of validation nodes : {len(split_val)}')
print(f'Number of unlabeled (unknown) nodes : {len(split_unlabeled)}')

Number of training node : 210
Number of validation nodes : 211
Number of unlabeled (unknown) nodes : 1688


### Choosing the node to attack

In [16]:
u = 0 # node to attack
# The target has to belong to the unlabeled split, not to the train/validation nodes.
assert u in split_unlabeled

### Train surrogate model

Initialization of Nettack: train a simple surrogate GCN (without ReLU) and keep its learned weights for the attack.

In [17]:
def sparse_numpy2sparse_torch(x):
    """Convert a SciPy sparse matrix into a float32 torch sparse COO tensor.

    Parameters
    ----------
    x : scipy.sparse matrix
        Any SciPy sparse matrix; it is converted to COO format internally.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype float32 with the same shape and the same
        stored entries as ``x``.
    """
    coo = x.tocoo()
    # Keep the indices integral end to end: the earlier detour through float
    # was unnecessary and is inexact for indices above 2**53.
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    # Cast explicitly so that integer-valued matrices also yield float32 values.
    values = torch.from_numpy(coo.data.astype(np.float32))
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

In [18]:
# Convert the preprocessed adjacency matrix and the feature matrix to torch sparse tensors.
An = sparse_numpy2sparse_torch(_An)
X_obs = sparse_numpy2sparse_torch(_X_obs)

In [19]:
# Surrogate network: built on the real graph with ReLU disabled (with_relu=False).
surrogate_nn = GCN(sizes, An, X_obs, with_relu=False, name="surrogate")

In [20]:
# Training wrapper for the surrogate, learning rate 1e-2.
surrogate_model = GCN_Model(surrogate_nn, lr=1e-2)

In [21]:
# Rebind the train/validation index collections as int64 NumPy arrays.
split_train = np.array(split_train).astype(np.int64)
split_val = np.array(split_val).astype(np.int64)

In [23]:
# Train the surrogate; the labels are passed as an int64 tensor and per-iteration printing is off.
surrogate_model.train(split_train, split_val, torch.tensor(_Z_obs.astype(np.int64)), print_info=False)

  'precision', 'predicted', average, warn_for)


converged after 11 iterations


In [24]:
# Accuracy of the surrogate's predicted classes against the validation labels.
# NOTE(review): assumes `predictions` holds the outputs for the validation nodes,
# in the same order as split_val, once train() has returned — confirm against GCN_Model.
preds = surrogate_model.predictions.detach().cpu().numpy().argmax(axis=1)
truth = _Z_obs[split_val]
(truth == preds).mean()

0.6208530805687204

In [25]:
# Weights of the surrogate's two layers (gc1, gc2); they are handed to Nettack below.
W1 = surrogate_model.gcn.gc1.weight
W2 = surrogate_model.gcn.gc2.weight

### Setup attack

In [26]:
# Create the attack object for target node u from the graph, features, labels
# and the surrogate weights.
nettack = Nettack(_A_obs, _X_obs, _z_obs, W1, W2, u, verbose=True)

In [28]:
# Attack configuration, consumed by nettack.attack_surrogate().
direct_attack = True
# One influencer when attacking the node directly, five otherwise.
n_influencers = 1 if direct_attack else 5
n_perturbations = int(degrees[u]) # How many perturbations to perform. Default: Degree of the node
# Both feature and structure perturbations are enabled.
perturb_features = True
perturb_structure = True

### Poison the data

In [29]:
# Reset the attack object before running the attack.
# NOTE(review): presumably discards perturbations from an earlier run — confirm against Nettack.reset.
nettack.reset()

In [35]:
# NOTE(review): this cell repeats, with identical values, the attack configuration
# already defined in the "Setup attack" section; one of the two copies can be removed.
direct_attack = True
n_influencers = 1 if direct_attack else 5
n_perturbations = int(degrees[u]) # How many perturbations to perform. Default: Degree of the node
perturb_features = True
perturb_structure = True

### Poison the data

In [36]:
# NOTE(review): second reset in a row with no attack run in between — looks like a duplicated cell.
nettack.reset()

In [37]:
# Run the attack against the surrogate with the configured perturbation budget;
# progress and the chosen perturbations are printed because verbose=True.
nettack.attack_surrogate(n_perturbations,
                         perturb_structure=perturb_structure,
                         perturb_features=perturb_features,
                         direct=direct_attack,
                         n_influencers=n_influencers)

##### Starting attack #####
##### Attack node with ID 0 using structure and feature perturbations #####
##### Attacking the node directly #####
##### Performing 12 perturbations #####
##### ...1/12 perturbations ... #####
Edge perturbation: [  0 526]
##### ...2/12 perturbations ... #####
Edge perturbation: [  0 597]
##### ...3/12 perturbations ... #####
Edge perturbation: [  0 885]
##### ...4/12 perturbations ... #####
Edge perturbation: [ 0 55]
##### ...5/12 perturbations ... #####
Edge perturbation: [   0 2012]
##### ...6/12 perturbations ... #####
Edge perturbation: [  0 594]
##### ...7/12 perturbations ... #####
Edge perturbation: [  0 781]
##### ...8/12 perturbations ... #####
Edge perturbation: [   0 1051]
##### ...9/12 perturbations ... #####
Edge perturbation: [  0 260]
##### ...10/12 perturbations ... #####
Edge perturbation: [   0 1797]
##### ...11/12 perturbations ... #####
Edge perturbation: [  0 939]
##### ...12/12 perturbations ... #####
Edge perturbation: [ 0 76]


In [38]:
# Structure perturbations recorded by the attack, one (node, node) pair per step.
print(nettack.structure_perturbations)

[(0, 526), (0, 597), (0, 885), (0, 55), (0, 2012), (0, 594), (0, 781), (0, 1051), (0, 260), (0, 1797), (0, 939), (0, 76)]


In [39]:
# Feature perturbations recorded by the attack (an empty tuple marks a step
# where no feature was changed).
print(nettack.feature_perturbations)

[(), (), (), (), (), (), (), (), (), (), (), ()]


### Train GCN without perturbations

In [40]:
# Number of times a fresh GCN is trained in each of the two evaluation loops below.
retrain_iters=5

In [42]:
# Train a fresh GCN on the clean graph `retrain_iters` times. For each run, record
# the target node's predicted class scores and its classification margin: the
# score of the true class minus the highest score among the other classes.
classification_margins_clean = []
class_distrs_clean = []

# The label tensor is the same for every run, so it is built once up front.
labels_torch = torch.tensor(_Z_obs.astype(np.int64))
u_nodes = torch.tensor([nettack.u])
u_labels = labels_torch[u_nodes]
# One-hot mask of the target's true class. Subtracting 1000x this mask pushes the
# true class out of the argmax, so the strongest *other* class is selected.
one_hot_encod_u = np.zeros(np.max(_Z_obs)+1)
one_hot_encod_u[_Z_obs[nettack.u]] = 1


# A named loop index is used instead of `_`: the index is read inside the loop,
# and `_` is the name IPython uses for the last cell output.
for retrain_ix in range(retrain_iters):
    print("... {}/{} ".format(retrain_ix+1, retrain_iters))
    gcn_before_nn = GCN(sizes, An, X_obs, with_relu=True,
                        name="gcn_orig")
    gcn_before = GCN_Model(gcn_before_nn, lr=1e-2)
    gcn_before.train(split_train, split_val, labels_torch, print_info=False)
    # NOTE(review): assumes `predictions` holds the validation-node outputs, in
    # the order of split_val, once train() has returned — confirm against GCN_Model.
    preds = gcn_before.predictions.detach().cpu().numpy().argmax(axis=1)
    truth = _Z_obs[split_val]
    print(f'Validation accuracy : {(truth == preds).mean():.3f}')
    # Evaluate the target node only, then read its row of predictions.
    # NOTE(review): the trailing False presumably disables the backward pass — confirm.
    gcn_before._compute_loss_and_backprop(u_nodes, u_labels, False)
    probs_before_attack = gcn_before.predictions.detach().cpu().numpy()[0]

    class_distrs_clean.append(probs_before_attack)
    best_second_class_before = (probs_before_attack - 1000*one_hot_encod_u).argmax()
    margin_before = probs_before_attack[_z_obs[nettack.u]] - probs_before_attack[best_second_class_before]
    classification_margins_clean.append(margin_before)
# Stack the per-run score vectors into a (runs, classes) array for plotting.
class_distrs_clean = np.array(class_distrs_clean)

... 1/5 


  'precision', 'predicted', average, warn_for)


converged after 50 iterations
Validation accuracy : 0.678
... 2/5 
converged after 19 iterations
Validation accuracy : 0.668
... 3/5 
converged after 58 iterations
Validation accuracy : 0.673
... 4/5 
converged after 24 iterations
Validation accuracy : 0.664
... 5/5 
converged after 11 iterations
Validation accuracy : 0.673


### Train GCN with perturbations

In [44]:
# Train a fresh GCN on the perturbed graph `retrain_iters` times. For each run,
# record the target node's predicted class scores and its classification margin:
# the score of the true class minus the highest score among the other classes.
# Relies on u_nodes, u_labels and one_hot_encod_u from the clean-data cell above.
classification_margins_corrupted = []
class_distrs_retrain = []

# Perturbed adjacency and features held by the attack object, as torch sparse tensors.
nettack_adj_torch = sparse_numpy2sparse_torch(nettack.adj_preprocessed)
nettack_X_torch = sparse_numpy2sparse_torch(nettack.X_obs.tocsr())
# The label tensor is the same for every run, so it is built once up front.
labels_torch = torch.tensor(_Z_obs.astype(np.int64))

# A named loop index is used instead of `_`: the index is read inside the loop,
# and `_` is the name IPython uses for the last cell output.
for retrain_ix in range(retrain_iters):
    print("... {}/{} ".format(retrain_ix+1, retrain_iters))
    gcn_retrain_nn = GCN(sizes, nettack_adj_torch, nettack_X_torch, with_relu=True,
                         name="gcn_retrain")
    gcn_retrain = GCN_Model(gcn_retrain_nn, lr=1e-2)

    gcn_retrain.train(split_train, split_val, labels_torch, print_info=False)
    # Evaluate the target node only, then read its row of predictions.
    # NOTE(review): the trailing False presumably disables the backward pass — confirm.
    gcn_retrain._compute_loss_and_backprop(u_nodes, u_labels, False)
    probs_after_attack = gcn_retrain.predictions.detach().cpu().numpy()[0]
    class_distrs_retrain.append(probs_after_attack)
    # Subtracting 1000x the one-hot mask removes the true class from the argmax.
    best_second_class_after = (probs_after_attack - 1000*one_hot_encod_u).argmax()
    margin_after = probs_after_attack[_z_obs[nettack.u]] - probs_after_attack[best_second_class_after]
    classification_margins_corrupted.append(margin_after)
# Stack the per-run score vectors into a (runs, classes) array for plotting.
class_distrs_retrain = np.array(class_distrs_retrain)

... 1/5 
converged after 10 iterations
... 2/5 
converged after 24 iterations
... 3/5 
converged after 19 iterations
... 4/5 
converged after 14 iterations
... 5/5 
converged after 92 iterations


### Visualize results

In [45]:
# Margins on the perturbed graph, then on the clean graph. A negative margin
# means another class scored higher than the true class of the target node.
classification_margins_corrupted, classification_margins_clean

([-0.99947315, -0.99796355, -0.99963593, -0.99908423, -0.9982646],
 [0.988104, 0.9429521, 0.98562205, 0.98493934, 0.93839175])

In [46]:
def make_xlabel(ix, correct):
    """Return the tick label for class ``ix``.

    The label is ``"Class <ix>"``; when ``ix`` equals ``correct`` a second
    line reading ``(correct)`` is appended.
    """
    label = "Class {}".format(ix)
    if ix == correct:
        label += "\n(correct)"
    return label

def _plot_class_distributions(ax, class_distrs, title, ylabel=None):
    """Draw grouped bars of the target node's predicted scores on ``ax``.

    Each class gets one group of bars, one bar per re-training. The group
    belonging to the target's true class (``nettack.label_u``) is drawn in
    dark green, all other groups in grey.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    class_distrs : numpy.ndarray
        2-D array with one row per re-training and one column per class.
    title : str
        Title of the panel.
    ylabel : str, optional
        Label of the y axis; omitted when ``None``.

    Returns
    -------
    list
        x position of the centre of each class group (used as tick positions).
    """
    centers = []
    for ix, block in enumerate(class_distrs.T):
        # Offset each group by its own width plus a gap of two bar positions.
        x_ixs = np.arange(len(block)) + ix*(len(block)+2)
        centers.append(np.mean(x_ixs))
        color = 'darkgreen' if ix == nettack.label_u else '#555555'
        ax.bar(x_ixs, block, color=color)

    ax.set_ylim((-.05, 1.05))
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.set_xticks(centers)
    ax.set_xticklabels([make_xlabel(k, nettack.label_u) for k in range(_K)])
    ax.set_title(title)
    return centers


# Two panels side by side: predictions on clean data (left) and after the
# attack (right). Both panels are drawn by the same helper, on explicit Axes
# objects rather than through the pyplot "current axes" state.
figure, (ax_clean, ax_retrain) = plt.subplots(1, 2, figsize=(12,4))
center_ixs_clean = _plot_class_distributions(
    ax_clean,
    class_distrs_clean,
    "Predicted class probabilities for node {} on clean data\n({} re-trainings)".format(nettack.u, retrain_iters),
    ylabel="Predicted probability",
)
center_ixs_retrain = _plot_class_distributions(
    ax_retrain,
    class_distrs_retrain,
    "Predicted class probabilities for node {} after {} perturbations\n({} re-trainings)".format(nettack.u, n_perturbations, retrain_iters),
)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>