In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import tqdm

In [3]:
import torch
import torch.nn as nn

In [4]:
# Use one shared seed for both NumPy's and PyTorch's global random generators.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
# cuDNN: turn off the benchmark autotuner and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
from src.nettack.GCN import *

# Tests

## Layers

In [5]:
# Smoke test: build a single graph-convolution layer with 8 input and 5 output features.
layer = GraphConvolution(in_features=8, out_features=5)

In [6]:
# Toy inputs for the layer smoke test.
N = 30
# Random sparse 0/1 matrix over N nodes: an entry is set when its uniform draw exceeds 0.7.
Adjacency_matrix = torch.from_numpy(
    (np.random.random_sample((N, N)) > 0.7).astype(np.float32)
).to_sparse()
# Sparse random feature matrix with 8 columns (note: this name shadows the builtin `input`).
input = torch.rand((N, 8)).to_sparse()

In [7]:
# Forward pass on the toy inputs; only the output shape is inspected.
layer(input, Adjacency_matrix).shape

torch.Size([30, 5])

## GCN

In [8]:
# Sizes for the GCN smoke test: N nodes, C classes, D input features.
# H is the first entry of the `sizes` list handed to GCN in the next cell.
N, C, D = 30, 5, 20
H = 10

# Random sparse 0/1 adjacency: an entry is set when its uniform draw exceeds 0.7.
Adjacency_matrix = torch.from_numpy(
    (np.random.random_sample((N, N)) > 0.7).astype(np.float32)
).to_sparse()

# Dense random features and uniformly drawn integer class labels in [0, C).
X_observed = torch.rand((N, D))
Y_observed = torch.randint(0, C, (N,))

In [9]:
# Instantiate the GCN under test on the toy graph and features.
# `sizes` is [H, C] — presumably hidden width and number of classes; confirm against src.nettack.GCN.
gcn = GCN(sizes=[H, C],
          An=Adjacency_matrix,
          X_obs=X_observed,
          name='test', with_relu=True, params_dict={})

In [10]:
# Wrap the network in GCN_Model with lr=1e-3; its train() method is called further down.
gcn_model = GCN_Model(gcn, lr=1e-3)

In [11]:
# NOTE(review): `node_ids` is not used by any later visible cell, and `node_labels`
# is rebound to a pandas DataFrame in the "Random" section before it is read again.
node_ids = torch.tensor([1, 3, 7])
node_labels = torch.tensor([2, 0, 3])

In [12]:
# Hand-picked node indices used as the train / validation split for the toy GCN.
train_nodes = [1, 2, 5, 6, 7, 9, 15]
val_nodes = [17, 18, 19, 20, 22]

In [13]:
# Train on the toy split; Z_obs receives the integer class labels (not one-hot).
gcn_model.train(split_train=train_nodes, split_val=val_nodes, Z_obs=Y_observed)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  'precision', 'predicted', average, warn_for)


New best performance : 0.283
New best performance : 0.300
New best performance : 0.750
converged after 59 iterations



## Random

In [28]:
# NOTE(review): `a` is not defined in any visible cell, so this cell depends on
# leftover kernel state and will raise NameError on a fresh Restart & Run All.
a.is_sparse

True

In [12]:
# Citeseer node labels: headerless CSV read into columns (id, label).
# This rebinds `node_labels`, which held a tensor in the GCN test section.
node_labels = pd.read_csv('datasets/citeseer/citeseer.node_labels', header=None, names=['id', 'label'])

In [13]:
# Citeseer edge list: headerless CSV read into columns (e1, e2, weight).
edges = pd.read_csv('datasets/citeseer/citeseer.edges', header=None, names=['e1', 'e2', 'weight'])

In [14]:
# Build an undirected NetworkX graph: one node per row of `node_labels`, with its
# class stored in the `label` node attribute, and one edge per row of `edges`.
G = nx.Graph()
G.add_nodes_from(
    (node_id, {'label': label})
    for node_id, label in zip(node_labels['id'], node_labels['label'])
)
# Iterating a DataFrame yields its column names, not its rows, so passing the frame
# itself would add edges derived from the strings 'e1' and 'e2'.
# Emit explicit (e1, e2) tuples instead; the `weight` column is ignored.
G.add_edges_from(edges[['e1', 'e2']].itertuples(index=False, name=None))

## Nettack

### Data preprocessing

In [44]:
from src.nettack.utils import *
from src.nettack.nettack import *

In [7]:
# Load the Citeseer dataset; the three returned values are unpacked as
# adjacency matrix, feature matrix and label vector (each is inspected in a following cell).
_A_obs, _X_obs, _z_obs = load_npz('src/data/citeseer.npz')

In [8]:
# Adjacency matrix (displayed to check its shape, dtype and number of stored entries)
_A_obs

<3312x3312 sparse matrix of type '<class 'numpy.float32'>'
	with 4715 stored elements in Compressed Sparse Row format>

In [9]:
# Feature matrix (displayed to check its shape, dtype and number of stored entries)
_X_obs

<3312x3703 sparse matrix of type '<class 'numpy.int64'>'
	with 105165 stored elements in Compressed Sparse Row format>

In [10]:
# Labels: number of nodes per class, most frequent class first.
# (The previous bare `_z_obs` line was never displayed, and the top-level
# `pd.value_counts` helper is deprecated in favour of the Series method.)
pd.Series(_z_obs).value_counts()

2    701
4    668
1    596
5    590
3    508
0    249
dtype: int64

In [11]:
# Symmetrise the adjacency matrix (treat the graph as undirected): add the transpose,
# then reset every entry larger than 1 (edge stored in both directions) back to 1.
# NOTE(review): this rebinds and mutates `_A_obs` in place, so the matrix displayed
# earlier in the notebook no longer matches the variable after this cell runs.
_A_obs = _A_obs + _A_obs.T
_A_obs[_A_obs > 1] = 1

In [12]:
# For the algorithm to work, we have to consider a connected graph.
# `lcc` is used below to index rows/columns, i.e. it holds the node indices to keep.
lcc = largest_connected_components(_A_obs)

Selecting 1 largest connected components


In [13]:
print(f'Largest connected component has {len(lcc)} nodes')
# Keep only the largest connected component of the graph: restrict the adjacency
# matrix to the selected rows and columns, and the features / labels to the same nodes.
# Features are cast to float32 at the same time.
# NOTE(review): the three inputs are overwritten, so re-running this cell on its own
# re-indexes already-filtered data.
_A_obs = _A_obs[lcc][:,lcc]
_X_obs = _X_obs[lcc].astype('float32')
_z_obs = _z_obs[lcc]

Largest connected component has 2110 nodes


In [14]:
# Sanity checks on the preprocessed graph.
# 1) The adjacency matrix equals its transpose (undirected graph).
assert np.abs(_A_obs - _A_obs.T).sum() == 0, "Input graph is not symmetric"
# 2) Every stored entry has the same value and the maximum is 1 (unweighted graph).
assert _A_obs.max() == 1 and len(np.unique(_A_obs[_A_obs.nonzero()].A1)) == 1, "Graph must be unweighted"
# 3) Every column sum is positive, i.e. no node is isolated.
assert _A_obs.sum(0).A1.min() > 0, "Graph contains singleton nodes"

In [15]:
# Number of nodes and number of classes (labels are assumed to be 0..K-1 — TODO confirm).
_N = _A_obs.shape[0]
_K = _z_obs.max()+1

# In our pytorch implementation, labels are not one hot encoded
# _Z_obs = np.eye(_K)[_z_obs]
_Z_obs = _z_obs
# Normalizing adjacency matrix
_An = preprocess_graph(_A_obs)
# Layer sizes handed to GCN: 16 followed by the number of classes.
sizes = [16, _K]
# Per-node degree, taken as the column sums of the unnormalised adjacency matrix.
degrees = _A_obs.sum(0).A1


# Re-seed NumPy so the split in the next cell does not depend on earlier random draws.
np.random.seed(seed)

In [16]:
# Stratified split of the node indices into train / validation / unlabeled sets.
unlabeled_share = 0.8
val_share = 0.1
# NOTE(review): in floating point this evaluates to slightly less than 0.1. The sizes
# printed below add up to 210 + 211 + 1688 = 2109 while the graph has 2110 nodes, so
# one node appears to land in no split — TODO confirm against train_val_test_split_tabular.
train_share = 1 - unlabeled_share - val_share
splits = train_val_test_split_tabular(np.arange(_N), train_size=train_share,
                                      val_size=val_share, test_size=unlabeled_share,
                                      stratify=_z_obs)
split_train, split_val, split_unlabeled = splits

In [17]:
# Report the size of each split.
print(f'Number of training node : {len(split_train)}')
print(f'Number of validation nodes : {len(split_val)}')
print(f'Number of unlabeled (unknown) nodes : {len(split_unlabeled)}')

Number of training node : 210
Number of validation nodes : 211
Number of unlabeled (unknown) nodes : 1688


### Choosing the node to attack

In [18]:
u = 0 # node to attack
# The target must belong to the unlabeled split; the assertion fails otherwise.
assert u in split_unlabeled

### Train surrogate model

Initialization of Nettack: train a simplified surrogate GCN (without ReLU) and use its learned weights for the attack.

In [19]:
def sparse_numpy2sparse_torch(x):
    """Convert a SciPy sparse matrix into a PyTorch sparse COO tensor.

    Args:
        x: any SciPy sparse matrix exposing ``tocoo()``.

    Returns:
        A ``torch.float32`` sparse COO tensor with the same shape and the same
        stored entries as ``x``; its indices are ``torch.int64``.
    """
    coo = x.tocoo()
    # Indices must be integers: build them directly as int64 instead of
    # round-tripping through float before handing them to torch.
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data.astype(np.float32))
    # torch.sparse_coo_tensor replaces the legacy torch.sparse.FloatTensor constructor.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

In [20]:
# Convert the normalised adjacency and the feature matrix from SciPy sparse
# matrices to torch sparse tensors for the PyTorch GCN.
An = sparse_numpy2sparse_torch(_An)
X_obs = sparse_numpy2sparse_torch(_X_obs)

In [21]:
# Surrogate network: same GCN class but built with with_relu=False.
surrogate_nn = GCN(sizes, An, X_obs, with_relu=False, name="surrogate", gpu_id=None)

In [22]:
# Wrap the surrogate in GCN_Model with learning rate 1e-3.
surrogate_model = GCN_Model(surrogate_nn, lr=1e-3)

In [31]:
# Cast the split index collections to int64 NumPy arrays before training.
# NOTE(review): the execution counter of this cell (31) is out of sequence with its
# neighbours, so it was run out of order; check that a fresh top-to-bottom run still works.
split_train = np.array(split_train).astype(np.int64)
split_val = np.array(split_val).astype(np.int64)

In [39]:
# Train the surrogate on the train split, validating on split_val; labels are
# passed as an int64 tensor of class indices.
# NOTE(review): the logged "performance" climbs above 1.0 in the saved output, so it
# is not a single accuracy-like score — presumably a sum of metrics; confirm in GCN_Model.train.
surrogate_model.train(split_train, split_val, torch.tensor(_Z_obs.astype(np.int64)))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

New best performance : 0.332
New best performance : 0.385
New best performance : 0.427
New best performance : 0.435
New best performance : 0.494
New best performance : 0.523
New best performance : 0.540
New best performance : 0.591
New best performance : 0.615
New best performance : 0.683
New best performance : 0.691


  'precision', 'predicted', average, warn_for)


New best performance : 0.709
New best performance : 0.725
New best performance : 0.738
New best performance : 0.750
New best performance : 0.762
New best performance : 0.787
New best performance : 0.809
New best performance : 0.840
New best performance : 0.852
New best performance : 0.871
New best performance : 0.882
New best performance : 0.902
New best performance : 0.920
New best performance : 0.931
New best performance : 0.940
New best performance : 0.961
New best performance : 0.972
New best performance : 0.980
New best performance : 0.991
New best performance : 1.000
New best performance : 1.022
New best performance : 1.031
New best performance : 1.039
New best performance : 1.039
New best performance : 1.048
New best performance : 1.092
New best performance : 1.102
New best performance : 1.105
New best performance : 1.113
New best performance : 1.135
New best performance : 1.145
New best performance : 1.153
New best performance : 1.162
New best performance : 1.170
New best perfo

In [47]:
# Weight objects of the two graph-convolution layers of the trained surrogate,
# handed to Nettack below.
# NOTE(review): these are taken as-is from the torch modules; verify that Nettack
# accepts them in this form (the commented-out TensorFlow version passed evaluated arrays).
W1 = surrogate_model.gcn.gc1.weight
W2 = surrogate_model.gcn.gc2.weight

In [None]:
# surrogate_model = GCN(sizes, _An, _X_obs, with_relu=False, name="surrogate", gpu_id=gpu_id)
# surrogate_model.train(split_train, split_val, _Z_obs)
# W1 =surrogate_model.W1.eval(session=surrogate_model.session)
# W2 =surrogate_model.W2.eval(session=surrogate_model.session)

### Setup attack

In [55]:
# Set up the attack on target node `u` from the unnormalised adjacency, the
# features, the labels and the two surrogate weight matrices.
nettack = Nettack(_A_obs, _X_obs, _z_obs, W1, W2, u, verbose=True)

In [56]:
# Attack configuration.
# Which parts of the data may be perturbed.
perturb_structure = True
perturb_features = True
# How many perturbations to perform. Default: Degree of the node
n_perturbations = int(degrees[u])
# Direct attacks use a single influencer node, indirect ones use five.
direct_attack = True
if direct_attack:
    n_influencers = 1
else:
    n_influencers = 5

### Poison the data

In [57]:
# Reset the attack object before running the attack (presumably clears earlier perturbations — confirm in Nettack.reset).
nettack.reset()

In [58]:
# Run the attack against the surrogate model with the budget and options chosen above.
nettack.attack_surrogate(n_perturbations,
                         perturb_structure=perturb_structure,
                         perturb_features=perturb_features,
                         direct=direct_attack,
                         n_influencers=n_influencers)

##### Starting attack #####
##### Attack node with ID 0 using structure and feature perturbations #####
##### Attacking the node directly #####
##### Performing 12 perturbations #####
##### ...1/12 perturbations ... #####
Edge perturbation: [  0 526]
##### ...2/12 perturbations ... #####
Edge perturbation: [ 0 55]
##### ...3/12 perturbations ... #####
Edge perturbation: [  0 594]
##### ...4/12 perturbations ... #####
Edge perturbation: [  0 260]
##### ...5/12 perturbations ... #####
Edge perturbation: [   0 1797]
##### ...6/12 perturbations ... #####
Edge perturbation: [   0 2012]
##### ...7/12 perturbations ... #####
Edge perturbation: [  0 597]
##### ...8/12 perturbations ... #####
Edge perturbation: [   0 1051]
##### ...9/12 perturbations ... #####
Edge perturbation: [   0 1309]
##### ...10/12 perturbations ... #####
Edge perturbation: [  0 939]
##### ...11/12 perturbations ... #####
Edge perturbation: [  0 781]
##### ...12/12 perturbations ... #####
Edge perturbation: [  0 254]


In [59]:
# Edge perturbations chosen by the attack, printed as (node, node) pairs.
print(nettack.structure_perturbations)

[(0, 526), (0, 55), (0, 594), (0, 260), (0, 1797), (0, 2012), (0, 597), (0, 1051), (0, 1309), (0, 939), (0, 781), (0, 254)]


In [60]:
# Feature perturbations chosen by the attack; empty tuples in the saved output mean
# that every one of the 12 steps picked a structure perturbation.
print(nettack.feature_perturbations)

[(), (), (), (), (), (), (), (), (), (), (), ()]


### Train GCN without perturbations

In [61]:
# Number of times the GCN is retrained from scratch in the loop below.
retrain_iters=5

In [20]:
# NOTE(review): this cell duplicates the one in "Train surrogate model" (same
# statements, same execution count); it recomputes identical tensors.
An = sparse_numpy2sparse_torch(_An)
X_obs = sparse_numpy2sparse_torch(_X_obs)

In [21]:
# NOTE(review): duplicate of the surrogate construction above. Re-running it replaces
# `surrogate_nn` with a fresh network under a heading about the non-surrogate GCN.
surrogate_nn = GCN(sizes, An, X_obs, with_relu=False, name="surrogate", gpu_id=None)

In [22]:
# NOTE(review): duplicate cell. This rebinds `surrogate_model` to a new wrapper that
# is not trained in any visible cell, while W1/W2 still refer to the earlier model.
surrogate_model = GCN_Model(surrogate_nn, lr=1e-3)

In [66]:
# NOTE(review): `gcn_before` is only assigned in the next cell, so this cell relies on
# state from an earlier kernel run and fails on a fresh top-to-bottom execution.
gcn_before.predictions.shape

torch.Size([210, 6])

In [67]:
# Retrain the GCN (with ReLU) `retrain_iters` times on the unperturbed graph.
# The two lists are meant to collect, per run, the classification margin and the
# class distribution of the attacked node; the code filling them is still the
# commented-out TensorFlow version below and has not been ported yet.
classification_margins_clean = []
class_distrs_clean = []


# gcn_before = GCN.GCN(sizes, _An, _X_obs, "gcn_orig", gpu_id=gpu_id)

# NOTE(review): the saved output of this cell ends with
# "TypeError: super(type, obj): obj must be an instance or subtype of type".
# This is presumably a side effect of `%autoreload 2` reloading the GCN module while
# older class objects are still referenced — restart the kernel and re-run; TODO confirm.

# Use a named loop variable: `_` is IPython's "last output" variable and is read below.
for run in range(retrain_iters):
    print("... {}/{} ".format(run+1, retrain_iters))
    gcn_before_nn = GCN(sizes, An, X_obs, with_relu=True,
                        name="gcn_orig", gpu_id=None)
    gcn_before = GCN_Model(gcn_before_nn, lr=1e-2)
    gcn_before.train(split_train, split_val, torch.tensor(_Z_obs.astype(np.int64)))
#     probs_before_attack = gcn_before.predictions.eval(session=gcn_before.session,
#                                                       feed_dict={gcn_before.node_ids: [nettack.u]})[0]
#     class_distrs_clean.append(probs_before_attack)
#     best_second_class_before = (probs_before_attack - 1000*_Z_obs[nettack.u]).argmax()
#     margin_before = probs_before_attack[_z_obs[nettack.u]] - probs_before_attack[best_second_class_before]
#     classification_margins_clean.append(margin_before)
# class_distrs_clean = np.array(class_distrs_clean)

... 1/5 


TypeError: super(type, obj): obj must be an instance or subtype of type