# Comparative study of Centrality based Adversarial Attacks on GCN model for Node classification

##Importing modules

In [324]:
import sys
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.2.1

In [325]:
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

In [326]:
import pandas as pd
import os

import networkx as nx

import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN
from stellargraph import StellarGraph
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [327]:
zip_file = keras.utils.get_file(
    fname="cora.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
    extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "cora")

citations = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)

print(citations.shape)

(5429, 2)


In [328]:
G = nx.from_pandas_edgelist(citations, "source", "target", create_using=nx.Graph())

In [329]:
nodes_2_attack = list(citations.sample(n=50,random_state=1).drop_duplicates(subset=["target"])['target'])

print(nodes_2_attack)

[3217, 35, 62389, 10796, 628667, 16437, 575292, 14429, 16008, 18777, 137868, 54131, 22835, 63915, 5348, 634938, 10177, 132821, 95594, 6385, 27543, 6818, 126926, 46491, 189708, 31479, 20534, 141160, 6213, 89547, 15984, 2440, 114, 8696, 116084, 3191, 6125, 644577, 94639, 29492, 561789, 31863, 75121, 7532, 20601, 910, 6214]


## Adversarial Attack

### Defining centrality measure

Below are the four centrality measures used to carry out the adversarial attacks, as well as the code for the attacks themselves. To carry out an attack the required centrality measure and attack have to be uncommented and run.

In [330]:
global_centrality = 'Eigenvector centrality'
e_centrality = nx.eigenvector_centrality(G)

e_top_influential = sorted(e_centrality.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)[:100]
e_top_influential_nodes = [n[0] for n in e_top_influential]
print(e_top_influential_nodes)

[35, 82920, 85352, 210871, 887, 1688, 12576, 287787, 84021, 35061, 103515, 54131, 41714, 66556, 33907, 575077, 503883, 38205, 69284, 259702, 141342, 273152, 1119708, 513189, 289780, 1154459, 1153280, 1152421, 593091, 1127913, 1153943, 210872, 1114331, 415693, 568857, 289781, 56119, 198653, 801170, 1129683, 634975, 54129, 98698, 307015, 608326, 132806, 787016, 573978, 647447, 44368, 28290, 503893, 640617, 1103960, 579008, 8865, 66563, 1136814, 1129778, 48766, 1153577, 190697, 578780, 135130, 573964, 634904, 634902, 1128453, 289779, 576725, 265203, 78511, 33904, 1130847, 634938, 248425, 97645, 69296, 1112911, 206371, 593260, 190706, 1129573, 1117476, 576795, 1113438, 646809, 197054, 263279, 1129027, 1127430, 574462, 561238, 593813, 1131360, 229635, 1033, 141347, 1153853, 263498]


In [331]:
# global_centrality = 'Betweenness centrality'
# b_centrality = nx.betweenness_centrality(G)
# print(b_centrality)
# b_top_influential = sorted(b_centrality.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)[:100]
# b_top_influential_nodes = [n[0] for n in b_top_influential]
# print(b_top_influential_nodes)

In [332]:
# global_centrality = 'Degree centrality'
# d_centrality = nx.degree_centrality(G)

# d_top_influential = sorted(d_centrality.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)[:100]
# d_top_influential_nodes = [n[0] for n in d_top_influential]
# print(d_top_influential_nodes)

In [333]:
# global_centrality = 'Closeness centrality'
# c_centrality = nx.closeness_centrality(G)

# c_top_influential = sorted(c_centrality.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)[:100]
# c_top_influential_nodes = [n[0] for n in c_top_influential]
# print(c_top_influential_nodes)

### Performing the attack

In [334]:
global_action = 'Adding ~100 edges'
edge_count = 0
for n in e_top_influential_nodes:
  for m in nodes_2_attack:
    if not G.has_edge(n, m):
      citations = citations.append({'source': n, 'target': m}, ignore_index = True)
      edge_count += 1
  if edge_count >= 100:
    break
print(edge_count)

136


In [335]:
# global_action = 'Adding ~150 edges'

# edge_count = 0

# for n in e_top_influential_nodes:
#   for m in nodes_2_attack:
#     if not G.has_edge(n, m):
#       citations = citations.append({'source': n, 'target': m}, ignore_index = True)
#       edge_count += 1
#   if edge_count >= 150:
#     break
# print(edge_count)

In [336]:
# global_action = 'Adding ~200 edges'
# edge_count = 0
# for n in e_top_influential_nodes:
#   for m in nodes_2_attack:
#     if not G.has_edge(n, m):
#       citations = citations.append({'source': n, 'target': m}, ignore_index = True)
#       edge_count += 1
#   if edge_count >= 200:
#     break
# print(edge_count)

In [337]:
# global_action = 'Deleting ~100 edges'

# edge_count = 0
# for n in nodes_2_attack:
#   if G.has_node(n):
#     neighbors = G[n]
#     if len(neighbors) < 10:
#       citations = citations[(citations.source != n) & (citations.target != n)]
#     else:
#       neighbor_centralities = {}
#       for m in neighbors:
#         neighbor_centralities[m] = e_centrality[m]
#       neighbor_centralities = sorted(neighbor_centralities.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)
#       print(neighbor_centralities)
#       top_neighbor_centralities = [n[0] for n in neighbor_centralities][:9]
#       for m in top_neighbor_centralities:
#         if G.has_edge(m, n):
#           citations = citations[~(((citations.source == n) & (citations.target == m)) | ((citations.source == m) & (citations.target == n)))]
#           edge_count += 1
#       if edge_count >= 100:
#         print(edge_count)
#         break

In [338]:
# global_action = 'Deleting ~150 edges'

# edge_count = 0
# for n in nodes_2_attack:
#   if G.has_node(n):
#     neighbors = G[n]
#     if len(neighbors) < 10:
#       citations = citations[(citations.source != n) & (citations.target != n)]
#     else:
#       neighbor_centralities = {}
#       for m in neighbors:
#         neighbor_centralities[m] = e_centrality[m]
#       neighbor_centralities = sorted(neighbor_centralities.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)
#       print(neighbor_centralities)
#       top_neighbor_centralities = [n[0] for n in neighbor_centralities][:9]
#       for m in top_neighbor_centralities:
#         if G.has_edge(m, n):
#           citations = citations[~(((citations.source == n) & (citations.target == m)) | ((citations.source == m) & (citations.target == n)))]
#           edge_count += 1
#       if edge_count >= 150:
#         break

# print(edge_count)

In [339]:
# global_action = 'Deleting ~200 edges'

# edge_count = 0
# for n in nodes_2_attack:
#   if G.has_node(n):
#     neighbors = G[n]
#     if len(neighbors) < 10:
#       citations = citations[(citations.source != n) & (citations.target != n)]
#     else:
#       neighbor_centralities = {}
#       for m in neighbors:
#         neighbor_centralities[m] = e_centrality[m]
#       neighbor_centralities = sorted(neighbor_centralities.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)
#       print(neighbor_centralities)
#       top_neighbor_centralities = [n[0] for n in neighbor_centralities][:9]
#       for m in top_neighbor_centralities:
#         if G.has_edge(m, n):
#           citations = citations[~(((citations.source == n) & (citations.target == m)) | ((citations.source == m) & (citations.target == n)))]
#           edge_count += 1
#       if edge_count >= 200:
#         break

# print(edge_count)

In [340]:
print(citations.shape)

(5565, 2)


In [341]:
from stellargraph.datasets import Cora

cora = Cora()
cora.download()

cora_content_file = os.path.join(cora.base_directory, "cora.content")

In [342]:
cora_feature_names = [f"w{i}" for i in range(1433)]

cora_raw_content = pd.read_csv(
    cora_content_file,
    sep="\t",
    header=None,
    names=["id", *cora_feature_names, "subject"],
)
cora_raw_content.head(5)

Unnamed: 0,id,w0,w1,w2,w3,w4,w5,w6,w7,w8,...,w1424,w1425,w1426,w1427,w1428,w1429,w1430,w1431,w1432,subject
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods


In [343]:
cora_content_str_subject = cora_raw_content.set_index("id")
cora_content_no_subject = cora_content_str_subject.drop(columns="subject")

In [344]:
Graph = StellarGraph({"paper": cora_content_no_subject}, {"cites": citations})
print(Graph.info())

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 5565

 Node types:
  paper: [2708]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5565]
        Weights: all 1 (default)
        Features: none


## Data Preparation

### Loading the CORA network

In [345]:
dataset = sg.datasets.Cora()  
display(HTML(dataset.description))
G, node_subjects = dataset.load()
print(node_subjects)

31336             Neural_Networks
1061127             Rule_Learning
1106406    Reinforcement_Learning
13195      Reinforcement_Learning
37879       Probabilistic_Methods
                    ...          
1128975        Genetic_Algorithms
1128977        Genetic_Algorithms
1128978        Genetic_Algorithms
117328                 Case_Based
24043             Neural_Networks
Name: subject, Length: 2708, dtype: object


In [346]:
node_subjects.value_counts().to_frame()

Unnamed: 0,subject
Neural_Networks,818
Probabilistic_Methods,426
Genetic_Algorithms,418
Theory,351
Case_Based,298
Reinforcement_Learning,217
Rule_Learning,180


### Splitting the data

In [347]:
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=500, test_size=None, stratify=node_subjects
)
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=700, test_size=None, stratify=test_subjects
)

# we have to remove the previously selected 20 nodes from train split and val split and add them to test split in both cases above
test_subjects

1102873           Neural_Networks
38205          Genetic_Algorithms
1130927           Neural_Networks
20178                  Case_Based
134315      Probabilistic_Methods
                    ...          
1136634                    Theory
308529     Reinforcement_Learning
189655      Probabilistic_Methods
139738            Neural_Networks
936                    Case_Based
Name: subject, Length: 1508, dtype: object

In [348]:
for node in nodes_2_attack:
  if node in train_subjects.index:
    test_subjects[node] = train_subjects[node]
    train_subjects.drop(node, inplace = True)
  elif node in val_subjects.index:
    test_subjects[node] = val_subjects[node]
    val_subjects.drop(node,inplace = True)

test_subjects

1102873           Neural_Networks
38205          Genetic_Algorithms
1130927           Neural_Networks
20178                  Case_Based
134315      Probabilistic_Methods
                    ...          
6125                       Theory
29492             Neural_Networks
75121                  Case_Based
7532              Neural_Networks
6214       Reinforcement_Learning
Name: subject, Length: 1534, dtype: object

In [349]:
train_subjects.value_counts().to_frame()

Unnamed: 0,subject
Neural_Networks,150
Genetic_Algorithms,77
Probabilistic_Methods,76
Theory,63
Case_Based,53
Reinforcement_Learning,36
Rule_Learning,33


### Converting to numeric arrays

In [350]:
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)

## Creating the GCN layers

In [351]:
generator = FullBatchNodeGenerator(Graph, method="gcn")

Using GCN (local pooling) filters...


In [352]:
train_gen = generator.flow(train_subjects.index, train_targets)

In [353]:
gcn = GCN(
    layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5
)

In [354]:
x_inp, x_out = gcn.in_out_tensors()

x_out

<KerasTensor: shape=(1, None, 16) dtype=float32 (created by layer 'gather_indices_7')>

In [355]:
predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

## Training and evaluating

### Training the model

In [356]:
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

  super(Adam, self).__init__(name, **kwargs)


In [357]:
val_gen = generator.flow(val_subjects.index, val_targets)

In [358]:
from tensorflow.keras.callbacks import EarlyStopping

es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

In [359]:
history = model.fit(
    train_gen,
    epochs=300,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,
    callbacks=[es_callback],
)

Epoch 1/300
1/1 - 1s - loss: 1.9459 - acc: 0.1434 - val_loss: 1.9071 - val_acc: 0.3542 - 1s/epoch - 1s/step
Epoch 2/300
1/1 - 0s - loss: 1.9043 - acc: 0.3484 - val_loss: 1.8572 - val_acc: 0.3601 - 143ms/epoch - 143ms/step
Epoch 3/300
1/1 - 0s - loss: 1.8499 - acc: 0.3934 - val_loss: 1.7956 - val_acc: 0.3994 - 141ms/epoch - 141ms/step
Epoch 4/300
1/1 - 0s - loss: 1.7767 - acc: 0.4283 - val_loss: 1.7214 - val_acc: 0.4286 - 210ms/epoch - 210ms/step
Epoch 5/300
1/1 - 0s - loss: 1.6893 - acc: 0.4488 - val_loss: 1.6347 - val_acc: 0.4621 - 192ms/epoch - 192ms/step
Epoch 6/300
1/1 - 0s - loss: 1.5930 - acc: 0.4713 - val_loss: 1.5405 - val_acc: 0.4883 - 143ms/epoch - 143ms/step
Epoch 7/300
1/1 - 0s - loss: 1.4986 - acc: 0.5205 - val_loss: 1.4481 - val_acc: 0.5306 - 143ms/epoch - 143ms/step
Epoch 8/300
1/1 - 0s - loss: 1.4246 - acc: 0.5656 - val_loss: 1.3558 - val_acc: 0.5802 - 154ms/epoch - 154ms/step
Epoch 9/300
1/1 - 0s - loss: 1.3049 - acc: 0.5963 - val_loss: 1.2643 - val_acc: 0.6312 - 145ms

In [360]:
test_gen = generator.flow(test_subjects.index, test_targets)

In [361]:
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.5910
	acc: 0.8442


### Making predictions with the model

In [362]:
all_nodes = node_subjects.index
all_gen = generator.flow(all_nodes)
all_predictions = model.predict(all_gen)

In [363]:
node_predictions = target_encoding.inverse_transform(all_predictions.squeeze())

In [364]:
df = pd.DataFrame({"Predicted": node_predictions, "True": node_subjects})
df.head()

Unnamed: 0,Predicted,True
31336,Neural_Networks,Neural_Networks
1061127,Rule_Learning,Rule_Learning
1106406,Reinforcement_Learning,Reinforcement_Learning
13195,Reinforcement_Learning,Reinforcement_Learning
37879,Probabilistic_Methods,Probabilistic_Methods


In [365]:
df_final = df.loc[nodes_2_attack]
df_final

Unnamed: 0,Predicted,True
3217,Theory,Theory
35,Genetic_Algorithms,Genetic_Algorithms
62389,Genetic_Algorithms,Case_Based
10796,Theory,Case_Based
628667,Reinforcement_Learning,Reinforcement_Learning
16437,Neural_Networks,Neural_Networks
575292,Genetic_Algorithms,Genetic_Algorithms
14429,Probabilistic_Methods,Probabilistic_Methods
16008,Probabilistic_Methods,Probabilistic_Methods
18777,Probabilistic_Methods,Probabilistic_Methods


In [366]:
c = 0
for i in range(len(nodes_2_attack)):
  if df_final.loc[nodes_2_attack[i]][0] == df_final.loc[nodes_2_attack[i]][1]:
    c += 1
target_accuracy = str(c * (100 / len(nodes_2_attack))) + '%'

In [367]:
print('Action: ' + global_action)
print('Centrality: '+ global_centrality)
print('Accuracy: ' + target_accuracy)

Action: Adding ~100 edges
Centrality: Eigenvector centrality
Accuracy: 85.1063829787234%


In [368]:
# confusion matrix of target nodes

confusion_matrix(df_final['Predicted'], df_final['True'])

array([[ 0,  0,  0,  0,  0,  0,  0],
       [ 1,  5,  0,  1,  0,  0,  2],
       [ 0,  0,  8,  0,  0,  0,  0],
       [ 0,  0,  0, 12,  0,  0,  0],
       [ 0,  0,  0,  0,  7,  0,  0],
       [ 1,  0,  1,  0,  0,  2,  0],
       [ 1,  0,  0,  0,  0,  0,  6]])

In [369]:
# confusion matrix of the entire model

confusion_matrix(node_predictions, node_subjects)

array([[262,   4,   9,   2,   1,   7,  18],
       [  6, 396,   9,   1,  13,   4,   4],
       [  7,   6, 726,  20,  11,   4,  25],
       [  2,   3,  33, 394,   8,   1,  20],
       [  2,   3,  13,   0, 176,   0,   8],
       [ 10,   2,   7,   0,   1, 139,   9],
       [  9,   4,  21,   9,   7,  25, 267]])