# Adversarial Attack on Node Classification with Graph Convolutional Network

## Importing modules

In [None]:
# Install the pinned StellarGraph release, but only when running inside Google Colab.
import sys
if 'google.colab' in sys.modules:
  # %pip (rather than !pip) targets the environment of the running kernel.
  %pip install -q stellargraph[demos]==1.2.1

In [None]:
import stellargraph as sg

# Fail fast when the installed StellarGraph is not the version this notebook targets.
try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    # The validation helper is not available in this install; raise an error that
    # names the installed version instead of surfacing a bare AttributeError.
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

In [None]:
import pandas as pd
import os

import networkx as nx

import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN
from stellargraph import StellarGraph
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from tensorflow import keras
%matplotlib inline

In [None]:
# Download the Cora archive with keras.utils.get_file (extract=True asks it to unpack
# the archive) and point at the "cora" directory next to the downloaded file.
zip_file = keras.utils.get_file(
    fname="cora.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
    extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "cora")

# cora.cites is read as a tab-separated file without a header row; its two columns
# are named "target" and "source" here.
citations = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)

# Number of citation rows and columns before any modification.
print(citations.shape)

(5429, 2)


Unnamed: 0,target,source
2706,31336,1129442
2707,31336,31349
2708,31336,686532


In [None]:
# Undirected networkx graph built from the citation edge list; the cells below use it
# for centrality scores and for neighbour/edge lookups.
G = nx.from_pandas_edgelist(citations, "source", "target", create_using=nx.Graph())

In [None]:
# Nodes to attack: sample 20 citation rows with a fixed seed and keep the distinct
# values of their "target" column, so the list can hold fewer than 20 nodes.
nodes_2_attack = list(citations.sample(n=20,random_state=1).drop_duplicates(subset=["target"])['target'])

print(nodes_2_attack)

[3217, 35, 62389, 10796, 628667, 16437, 575292, 14429, 16008, 18777, 137868, 54131, 22835, 63915, 5348, 634938, 10177, 132821, 95594, 6385]


In [None]:
# Rank every node of G by eigenvector centrality and keep the 110 highest-scoring ones.
e_centrality = nx.eigenvector_centrality(G)

e_ranked = sorted(
    e_centrality.items(),
    key=lambda item: (item[1], item[0]),  # order by score, ties broken by node id
    reverse=True,
)
e_top_influential = e_ranked[:110]
e_top_influential_nodes = [node for node, _score in e_top_influential]
print(e_top_influential_nodes)

[35, 82920, 85352, 210871, 887, 1688, 12576, 287787, 84021, 35061, 103515, 54131, 41714, 66556, 33907, 575077, 503883, 38205, 69284, 259702, 141342, 273152, 1119708, 513189, 289780, 1154459, 1153280, 1152421, 593091, 1127913, 1153943, 210872, 1114331, 415693, 568857, 289781, 56119, 198653, 801170, 1129683, 634975, 54129, 98698, 307015, 608326, 132806, 787016, 573978, 647447, 44368, 28290, 503893, 640617, 1103960, 579008, 8865, 66563, 1136814, 1129778, 48766, 1153577, 190697, 578780, 135130, 573964, 634904, 634902, 1128453, 289779, 576725, 265203, 78511, 33904, 1130847, 634938, 248425, 97645, 69296, 1112911, 206371, 593260, 190706, 1129573, 1117476, 576795, 1113438, 646809, 197054, 263279, 1129027, 1127430, 574462, 561238, 593813, 1131360, 229635, 1033, 141347, 1153853, 263498, 18582, 1123756, 594047, 254923, 1153065, 1133338, 1103985, 694759, 387795, 128540]


In [None]:
# Attack step 1: connect each attacked node to every one of the 10 most
# eigenvector-central nodes it has no edge to in the original graph G.
#
# The new rows are collected first and concatenated once. DataFrame.append is
# deprecated (removed in pandas 2.0), and appending row by row copies the whole
# frame on every iteration.
# NOTE(review): when an attacked node is itself one of the top-10 nodes this adds a
# self-citation (source == target), exactly as the original loop did - confirm intent.
new_edges = [
    {'source': n, 'target': m}
    for n in nodes_2_attack
    for m in e_top_influential_nodes[:10]
    if not G.has_edge(n, m)
]
if new_edges:  # skip the concat entirely when there is nothing to add
    citations = pd.concat(
        [citations, pd.DataFrame(new_edges, columns=citations.columns)],
        ignore_index=True,
    )

In [None]:
# Betweenness centrality for every node of G. The full score dictionary is printed,
# followed by the ids of the 100 highest-scoring nodes.
b_centrality = nx.betweenness_centrality(G)
print(b_centrality)
b_ranked = sorted(
    b_centrality.items(),
    key=lambda item: (item[1], item[0]),  # order by score, ties broken by node id
    reverse=True,
)
b_top_influential = b_ranked[:100]
b_top_influential_nodes = [node for node, _score in b_top_influential]
print(b_top_influential_nodes)

{1033: 0.002429835146308852, 35: 0.23248831450559274, 103482: 0.0045102893439409476, 103515: 0.0016807366772738268, 1050679: 0.02407837711834658, 1103960: 0.014910624854201108, 1103985: 0.0, 1109199: 0.008965254019513483, 1112911: 0.002413679750635914, 1113438: 0.003079337781980883, 1113831: 0.0006779390761298553, 1114331: 0.0016483521820987842, 1117476: 0.00019742465331897083, 1119505: 0.0006779390761298553, 1119708: 8.398698990101742e-05, 1120431: 0.006932163346417552, 1123756: 0.004005794956263533, 1125386: 0.009306054754118227, 1127430: 0.0, 1127913: 6.928223978106701e-06, 1128204: 0.00042752519839578217, 1128227: 0.0008988443953549828, 1128314: 0.0006779390761298553, 1128453: 0.0005059965109521121, 1128945: 0.0, 1128959: 0.00010610925157796256, 1128985: 0.0007002390060774327, 1129018: 0.000196598842235467, 1129027: 0.0, 1129573: 0.00024131119857443613, 1129683: 0.0039591245722197885, 1129778: 0.00021273206080028682, 1130847: 0.0, 1130856: 0.0, 1131116: 0.0020321790348910643, 11313

In [None]:
# Attack step 2: cut citations of the attacked nodes.
#  * A node with fewer than 10 neighbours in G loses every citation row that
#    mentions it (as source or as target).
#  * Otherwise only the citations shared with its 10 neighbours of highest
#    betweenness centrality are removed, in either direction.
for victim in nodes_2_attack:
    if not G.has_node(victim):
        continue

    adjacent = G[victim]
    if len(adjacent) < 10:
        mentions_victim = (citations.source == victim) | (citations.target == victim)
        citations = citations[~mentions_victim]
        continue

    # (neighbour, score) pairs, highest betweenness first, ties broken by node id.
    strongest = sorted(
        ((peer, b_centrality[peer]) for peer in adjacent),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )[:10]
    print(strongest)

    for peer, _score in strongest:
        if G.has_edge(peer, victim):
            forward = (citations.source == victim) & (citations.target == peer)
            backward = (citations.source == peer) & (citations.target == victim)
            citations = citations[~(forward | backward)]

[(887, 0.067917639987497), (1956, 0.025947703372358206), (1050679, 0.02407837711834658), (263279, 0.017794245911054034), (12576, 0.015028408547881312), (1103960, 0.014910624854201108), (254923, 0.013845931833827953), (198653, 0.013214129628605442), (82098, 0.012777942915544368), (175291, 0.012475431153616521)]
[(6213, 0.07637499735733924), (887, 0.067917639987497), (10798, 0.019434135916653134), (30895, 0.009124144893260563), (1113934, 0.007917550642566157), (8699, 0.0030982409534939742), (417017, 0.0025304554450957316), (1110947, 0.0023657682863394165), (180373, 0.002089508191431831), (1153148, 0.0014026828587898985)]
[(3191, 0.016007131770255315), (189577, 0.014252116688527234), (189571, 0.012981891759633156), (1105531, 0.0022054336503729228), (642827, 0.0021287395420463997), (3192, 0.00148663729639864), (189572, 0.0007533779242725587), (523394, 0.0005960816210004269), (158098, 0.00011828285137740308), (308920, 5.865009588261183e-05)]
[(94713, 0.005094093201374563), (73146, 0.0050030

In [None]:
# Size of the citation table after edges were added and removed by the attack cells.
print(citations.shape)

(5355, 2)


In [None]:
from stellargraph.datasets import Cora

# Download Cora through StellarGraph's dataset helper and build the path to the
# cora.content file inside the dataset's base directory.
cora = Cora()
cora.download()

cora_content_file = os.path.join(cora.base_directory, "cora.content")

In [None]:
# Column names w0 .. w1432 for the 1433 feature columns.
cora_feature_names = [f"w{i}" for i in range(1433)]

# cora.content is read as tab-separated text without a header row: the first column
# is named "id", followed by the feature columns, and the last column is "subject".
cora_raw_content = pd.read_csv(
    cora_content_file,
    sep="\t",
    header=None,
    names=["id", *cora_feature_names, "subject"],
)
cora_raw_content.head(5)

Unnamed: 0,id,w0,w1,w2,w3,w4,w5,w6,w7,w8,...,w1424,w1425,w1426,w1427,w1428,w1429,w1430,w1431,w1432,subject
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods


In [None]:
# Index the content table by paper id, then drop the "subject" label column so that
# only the feature columns remain.
cora_content_str_subject = cora_raw_content.set_index("id")
cora_content_no_subject = cora_content_str_subject.drop(columns="subject")

In [None]:
# Attacked graph: "paper" nodes come from the feature table, "cites" edges from the
# modified citations frame.
Graph = StellarGraph({"paper": cora_content_no_subject}, {"cites": citations})
print(Graph.info())

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 5355

 Node types:
  paper: [2708]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5355]
        Weights: all 1 (default)
        Features: none


## Data Preparation

### Loading the CORA network

In [None]:
# Load the stock Cora dataset shipped with StellarGraph together with the node labels.
# NOTE: this rebinds G, which until this cell referred to the networkx citation graph.
dataset = sg.datasets.Cora()  
display(HTML(dataset.description))
G, node_subjects = dataset.load()
print(node_subjects)

31336             Neural_Networks
1061127             Rule_Learning
1106406    Reinforcement_Learning
13195      Reinforcement_Learning
37879       Probabilistic_Methods
                    ...          
1128975        Genetic_Algorithms
1128977        Genetic_Algorithms
1128978        Genetic_Algorithms
117328                 Case_Based
24043             Neural_Networks
Name: subject, Length: 2708, dtype: object


In [None]:
# Summary of the graph returned by dataset.load() (not the attacked `Graph`).
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 5429

 Node types:
  paper: [2708]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5429]
        Weights: all 1 (default)
        Features: none


In [None]:
# Number of papers per subject across the whole dataset.
node_subjects.value_counts().to_frame()

Unnamed: 0,subject
Neural_Networks,818
Probabilistic_Methods,426
Genetic_Algorithms,418
Theory,351
Case_Based,298
Reinforcement_Learning,217
Rule_Learning,180


### Splitting the data

In [None]:
# Fixed seed so the train/validation/test partition is the same on every run.
# Without it each execution trains and evaluates on a different split, and the
# reported metrics cannot be reproduced.
SPLIT_SEED = 1

# 500 labelled nodes for training, stratified by subject.
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects,
    train_size=500,
    test_size=None,
    stratify=node_subjects,
    random_state=SPLIT_SEED,
)
# From the remainder: 700 nodes for validation, everything else for testing.
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects,
    train_size=700,
    test_size=None,
    stratify=test_subjects,
    random_state=SPLIT_SEED,
)

# The attacked nodes must not stay in the training or validation split. The next
# cell moves any of them found there into the test split.
test_subjects

1130847        Genetic_Algorithms
158172      Probabilistic_Methods
137790                     Theory
62329          Genetic_Algorithms
463825                     Theory
                    ...          
463                    Case_Based
1131360        Genetic_Algorithms
114        Reinforcement_Learning
1119742     Probabilistic_Methods
145215                 Case_Based
Name: subject, Length: 1508, dtype: object

In [None]:
# Move every attacked node that landed in the training or validation split into the
# test split, carrying its label along. Training is checked before validation, and
# nodes found in neither split are left untouched.
for node in nodes_2_attack:
    for split in (train_subjects, val_subjects):
        if node in split.index:
            test_subjects[node] = split[node]
            split.drop(node, inplace=True)
            break

test_subjects

1130847        Genetic_Algorithms
158172      Probabilistic_Methods
137790                     Theory
62329          Genetic_Algorithms
463825                     Theory
                    ...          
22835             Neural_Networks
63915             Neural_Networks
634938         Genetic_Algorithms
132821                     Theory
6385       Reinforcement_Learning
Name: subject, Length: 1517, dtype: object

In [None]:
# Class distribution of the training split after the attacked nodes were moved out.
train_subjects.value_counts().to_frame()

Unnamed: 0,subject
Neural_Networks,149
Probabilistic_Methods,79
Genetic_Algorithms,75
Theory,65
Case_Based,55
Reinforcement_Learning,38
Rule_Learning,33


### Converting to numeric arrays

In [None]:
# Binarize the subject labels. The encoder is fitted on the training labels only and
# then reused, unchanged, for the validation and test labels.
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)

## Creating the GCN layers

In [None]:
# Full-batch generator over the attacked graph (`Graph`), configured with method="gcn".
generator = FullBatchNodeGenerator(Graph, method="gcn")

Using GCN (local pooling) filters...


In [None]:
# Training flow: the training node ids paired with their binarized targets.
train_gen = generator.flow(train_subjects.index, train_targets)

In [None]:
# GCN stack with two layers of size 16, ReLU activation on both, and dropout 0.5.
gcn = GCN(
    layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5
)

In [None]:
# Input and output tensors of the GCN stack; x_out is shown as the cell's output.
x_inp, x_out = gcn.in_out_tensors()

x_out

<KerasTensor: shape=(1, None, 16) dtype=float32 (created by layer 'gather_indices_3')>

In [None]:
# Softmax classification head with one unit per column of the binarized training targets.
predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

## Training and evaluating

### Training the model

In [None]:
# Assemble the end-to-end Keras model (GCN stack plus softmax head) and compile it
# with categorical cross-entropy, tracking accuracy under the name "acc".
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    # `learning_rate` replaces the deprecated `lr` alias, which made the Adam
    # constructor emit a deprecation warning.
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Validation flow: the validation node ids paired with their binarized targets.
val_gen = generator.flow(val_subjects.index, val_targets)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping monitors validation accuracy ("val_acc") with a patience of 50
# epochs and is configured to restore the best weights.
es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

In [None]:
# Train for at most 300 epochs with the validation flow attached; the early-stopping
# callback may end training sooner. The per-epoch metrics are kept in `history`.
history = model.fit(
    train_gen,
    epochs=300,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,
    callbacks=[es_callback],
)

Epoch 1/300
1/1 - 1s - loss: 1.9550 - acc: 0.1154 - val_loss: 1.8969 - val_acc: 0.3400 - 1s/epoch - 1s/step
Epoch 2/300
1/1 - 0s - loss: 1.8977 - acc: 0.3462 - val_loss: 1.8372 - val_acc: 0.3415 - 147ms/epoch - 147ms/step
Epoch 3/300
1/1 - 0s - loss: 1.8324 - acc: 0.3603 - val_loss: 1.7595 - val_acc: 0.3615 - 177ms/epoch - 177ms/step
Epoch 4/300
1/1 - 0s - loss: 1.7532 - acc: 0.3866 - val_loss: 1.6745 - val_acc: 0.3759 - 155ms/epoch - 155ms/step
Epoch 5/300
1/1 - 0s - loss: 1.6671 - acc: 0.4008 - val_loss: 1.5917 - val_acc: 0.3945 - 147ms/epoch - 147ms/step
Epoch 6/300
1/1 - 0s - loss: 1.5959 - acc: 0.4150 - val_loss: 1.5103 - val_acc: 0.4218 - 147ms/epoch - 147ms/step
Epoch 7/300
1/1 - 0s - loss: 1.4827 - acc: 0.4433 - val_loss: 1.4303 - val_acc: 0.4534 - 145ms/epoch - 145ms/step
Epoch 8/300
1/1 - 0s - loss: 1.4377 - acc: 0.4777 - val_loss: 1.3494 - val_acc: 0.4907 - 145ms/epoch - 145ms/step
Epoch 9/300
1/1 - 0s - loss: 1.3257 - acc: 0.5020 - val_loss: 1.2694 - val_acc: 0.5265 - 151ms

In [None]:
# Test flow: the test node ids (including the relocated attacked nodes) with their targets.
test_gen = generator.flow(test_subjects.index, test_targets)

In [None]:
# Evaluate on the test flow and print each metric next to its name, four decimals each.
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print(f"\t{metric_name}: {metric_value:0.4f}")


Test Set Metrics:
	loss: 0.5567
	acc: 0.8425


### Making predictions with the model

In [None]:
# Predict for every labelled node. The flow comes from `generator`, i.e. it runs on
# the attacked graph.
all_nodes = node_subjects.index
all_gen = generator.flow(all_nodes)
all_predictions = model.predict(all_gen)

In [None]:
# Map the prediction scores back to subject names with the fitted binarizer.
# squeeze() removes size-1 axes - presumably the leading batch axis of the
# full-batch output; confirm against all_predictions.shape.
node_predictions = target_encoding.inverse_transform(all_predictions.squeeze())

In [None]:
# Side-by-side table of predicted and true subject; rows are labelled by the index
# of node_subjects.
df = pd.DataFrame({"Predicted": node_predictions, "True": node_subjects})
df.head()

Unnamed: 0,Predicted,True
31336,Neural_Networks,Neural_Networks
1061127,Rule_Learning,Rule_Learning
1106406,Reinforcement_Learning,Reinforcement_Learning
13195,Reinforcement_Learning,Reinforcement_Learning
37879,Probabilistic_Methods,Probabilistic_Methods


In [None]:
# Sanity check: one row per node, two columns (Predicted, True).
df.shape

(2708, 2)

In [None]:
# Restrict the comparison table to the attacked nodes, in the order of nodes_2_attack.
df_final = df.loc[nodes_2_attack]
df_final

Unnamed: 0,Predicted,True
3217,Theory,Theory
35,Genetic_Algorithms,Genetic_Algorithms
62389,Case_Based,Case_Based
10796,Genetic_Algorithms,Case_Based
628667,Theory,Reinforcement_Learning
16437,Neural_Networks,Neural_Networks
575292,Genetic_Algorithms,Genetic_Algorithms
14429,Genetic_Algorithms,Probabilistic_Methods
16008,Genetic_Algorithms,Probabilistic_Methods
18777,Genetic_Algorithms,Probabilistic_Methods


In [None]:
# Share of attacked nodes whose predicted subject equals the true subject.
#
# The percentage is derived from the actual number of rows instead of a hard-coded
# 20 nodes (and 5% per hit): nodes_2_attack can contain fewer than 20 entries after
# de-duplication, which made the fixed range index out of bounds or skew the figure.
# Columns are compared by name because positional [0]/[1] access on a
# string-labelled row is deprecated in recent pandas.
c = int((df_final["Predicted"] == df_final["True"]).sum())
accuracy = 100 * c / len(df_final) if len(df_final) else 0.0
# ":g" prints whole percentages without a trailing ".0" (e.g. "55").
print('Accuracy: ', f"{accuracy:g}", '%')

Accuracy:  55 %
