# 简单图网络后门检测任务 PoC

In [1]:
import torch
torch.__version__

'1.10.1+cu102'

In [1]:
from dgl.data.dgl_dataset import DGLBuiltinDataset
from model_lib.mnist_cnn_model import Model0 as Model
import os 
import re
import torch
from torch import nn
import dgl

class HomoStrucBackdoorDataset(DGLBuiltinDataset):
    def __init__(self, mode='train', raw_dir='/home/ubuntu/date/hdd4/shadow_model_ckpt/mnist/models/',
                 force_reload=False, verbose=False, transform=None):
        mode = mode.lower()
        assert mode in ['train', 'valid', 'test'], "Mode not valid."
        self.mode = mode    
        self.x = []
        self.y = []
        _url = None
        
        super(HomoStrucBackdoorDataset, self).__init__(name='HomoBackdoorDT',
                                           raw_dir=raw_dir,
                                           force_reload=force_reload,
                                           verbose=verbose,
                                           url=_url,
                                           transform=transform)
        self.load()
        
        
    def process(self):
        pass
    
    def has_cache(self):
        pass
    
    def load(self):
        '''load dataset info'''
        
        for filename in os.listdir(self.raw_dir):
            if '.model' not in filename:
                # not a model
                continue
            idx_pattern = '[0-9]+'
            idx = re.findall(idx_pattern, filename)
            if self.mode == 'train':
                if int(idx[0]) < 2048 and 'target' not in filename:
                    # is a training model
                    self.x.append(filename)
                else:
                    continue
                # print(filename)
            elif self.mode == 'valid':
                if int(idx[0]) >= 2048 and 'target' not in filename:
                    self.x.append(filename)
                else:
                    continue
            else:
                # self.mode == 'test'
                if 'target' in filename:# and 'B' not in filename
                    self.x.append(filename)
                else:
                    continue
            # add co
            if 'benign' in filename:
                self.y.append(0)
            else:
                self.y.append(1)
        
    def __getitem__(self, idx):
        assert idx < len(self.x), "Out of index when get item."
        # load data, process and return
        g, y = self.load_g(idx)
        return g, y
        
    def __len__(self):
        return len(self.x)
    
    def get_x_y(self):
        return self.x, self.y
    
    def iter_y(self):
        for y in self.y:
            yield y
            
    def is_correct_labeled(self):
        x = self.x
        y = self.y
        cnt = 0
        error = 0
        for i,j in zip(x,y):
            if 'benign' in i and j == 0:
                cnt += 1
            elif 'benign' not in i and j == 1:
                cnt += 1
            else:
                error += 1
        if cnt != len(x) or error > 0:
            return False
        else:
            return True
        
    def load_g(self, idx):
        x = os.path.join(self.raw_dir, self.x[idx])
        y = self.y[idx]
    #         print(label)
        CUDA_LAUNCH_BLOCKING=1
        basic_model = Model().cuda()
        t = torch.load(x)
        t = basic_model.load_state_dict(t)
        
        g = None
        with torch.no_grad():
            # nodes_feat 512 * 513
            nodes_feat = []
            cnt = 0
            # get conv1 nodes 
            conv1 = {}
            for weight in basic_model.conv1.weight:
                pad = nn.ZeroPad2d(padding=(254,254,253,254))
                feat = pad(weight[0])
                conv1[cnt] = feat
                nodes_feat.append(feat)
                cnt += 1

            # get conv2 nodes
            conv2 = {}
            for weight in basic_model.conv2.weight:
                pad = nn.ZeroPad2d(padding=(254,254,253,254))
                feat = pad(weight[0])
                conv2[cnt] = feat
                nodes_feat.append(feat)
                cnt += 1

            # get conv1 -> conv2 edges
            conv1_2 = []
            for src in conv1.keys():
                for dst in conv2.keys():
                    conv1_2.append([src, dst])


            # get fc node
            fc_index = cnt
            cnt += 1
            fc_node = torch.concat([basic_model.fc.weight, basic_model.fc.bias.reshape(512, 1)], 1)
            nodes_feat.append(fc_node)
            # print(fc_node.shape)

            # get conv2 -> fc edges
            conv2_fc = []
            for src in conv2.keys():
                conv2_fc.append([src, fc_index])

            # get output node
            out_index = cnt
            cnt += 1 
            out = torch.concat([basic_model.output.weight, basic_model.output.bias.reshape(10, 1)], 1)
            pad = nn.ZeroPad2d(padding=(0,0,251,251))
            out_node = pad(out)
            nodes_feat.append(out_node)

            # print(out_node.shape)

            # get fc -> output edge
            fc_out_edge = [[fc_index, out_index]]

            # get all nodes
            nodes_feat = torch.stack(nodes_feat)
            # print(nodes_feat.shape)
            # get all edges
            all_edges = torch.tensor(conv1_2 + conv2_fc + fc_out_edge).t().tolist()
            u, v = all_edges[0], all_edges[1]


            g = dgl.graph((u,v)).to('cuda')
            g.ndata['x'] = nodes_feat
        return g, y
    
    
dataset = HomoStrucBackdoorDataset(mode='valid')
print(len(dataset))
dataset.load()
x,y = dataset.get_x_y()
for i in x:
    print(i)

512
shadow_benign_2137.model
shadow_jumbo_2130.model
shadow_jumbo_2095.model
shadow_jumbo_2089.model
shadow_benign_2280.model
shadow_jumbo_2247.model
shadow_jumbo_2049.model
shadow_jumbo_2160.model
shadow_benign_2193.model
shadow_benign_2122.model
shadow_benign_2195.model
shadow_benign_2177.model
shadow_jumbo_2100.model
shadow_benign_2083.model
shadow_benign_2284.model
shadow_jumbo_2299.model
shadow_benign_2097.model
shadow_jumbo_2122.model
shadow_benign_2053.model
shadow_jumbo_2291.model
shadow_jumbo_2096.model
shadow_jumbo_2154.model
shadow_benign_2236.model
shadow_benign_2249.model
shadow_jumbo_2216.model
shadow_benign_2135.model
shadow_jumbo_2153.model
shadow_benign_2197.model
shadow_jumbo_2190.model
shadow_jumbo_2259.model
shadow_jumbo_2094.model
shadow_jumbo_2214.model
shadow_benign_2205.model
shadow_benign_2082.model
shadow_jumbo_2273.model
shadow_benign_2272.model
shadow_jumbo_2278.model
shadow_jumbo_2254.model
shadow_benign_2161.model
shadow_jumbo_2056.model
shadow_benign_2256

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import StratifiedKFold
from dgl.data import GINDataset
from dgl.dataloading import GraphDataLoader
from dgl.nn.pytorch.conv import GINConv
from dgl.nn.pytorch.glob import SumPooling, AvgPooling, MaxPooling, SortPooling
import argparse
from tqdm import tqdm

class MLP(nn.Module):
    """Construct two-layer MLP-type aggreator for GIN model"""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linears = nn.ModuleList()
        # two-layer MLP    
        self.linears.append(nn.Linear(input_dim, hidden_dim, bias=False))
        self.linears.append(nn.Linear(hidden_dim, output_dim, bias=False))
        self.batch_norm = nn.BatchNorm1d((hidden_dim))

    def forward(self, x):
        h = x
        h = F.relu(self.batch_norm(self.linears[0](h)))
        return self.linears[1](h)
    
class SGN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, pooling='sum'):
        super().__init__()
        assert pooling in ['sum', 'avg', 'max'], "Not supported pooling method."
        self.ginlayers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        num_layers = 2
        # five-layer GCN with two-layer MLP aggregator and sum-neighbor-pooling scheme
#         for layer in range(num_layers - 1): # excluding the input layer
#             if layer == 0:
#                 mlp = MLP(input_dim, hidden_dim, hidden_dim)
#             else:
#                 mlp = MLP(hidden_dim, hidden_dim, hidden_dim)
#             self.ginlayers.append(GINConv(mlp, learn_eps=False)) # set to True if learning epsilon
#             self.batch_norms.append(nn.BatchNorm1d(hidden_dim))
        # linear functions for graph sum poolings of output of each layer
        self.linear_prediction = nn.ModuleList()
        for layer in range(num_layers):
            if layer == 0:
                self.linear_prediction.append(nn.Linear(input_dim, output_dim))
            else:
                self.linear_prediction.append(nn.Linear(hidden_dim, output_dim))
        self.drop = nn.Dropout(0.8)
        if pooling == 'sum':
            self.pool = SumPooling() # change to mean readout (AvgPooling) on social network datasets
        elif pooling == 'avg':
            self.pool = AvgPooling()
        else:
            self.pool = MaxPooling()
#         self.topK = topK
#         self.pool = SortPooling(topK)
#         self.pool = AvgPooling()
        
    def forward(self, g, h):
        # list of hidden representation at each layer (including the input layer)
        hidden_rep = [h]
        for i, layer in enumerate(self.ginlayers):
            h = layer(g, h)
            h = self.batch_norms[i](h)
            h = F.relu(h)
            hidden_rep.append(h)
        score_over_layer = 0
        # perform graph sum pooling over all nodes in each layer
        for i, h in enumerate(hidden_rep):
            pooled_h = self.pool(g, h)
            score_over_layer += self.drop(self.linear_prediction[i](pooled_h))
        return score_over_layer
    
def split_fold10(labels, fold_idx=0):
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    idx_list = []
    for idx in skf.split(np.zeros(len(labels)), labels):
        idx_list.append(idx)
    train_idx, valid_idx = idx_list[fold_idx]
    return train_idx, valid_idx

def evaluate(dataloader, device, model):
    model.eval()
    total = 0
    total_correct = 0
    total_tp = 0
    total_fp = 0
    total_tn = 0
    total_fn = 0
    for batch, (batched_graph, labels) in enumerate(tqdm(dataloader)):
        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        feat = batched_graph.ndata.pop('x')
        total += len(labels)
        logits = model(batched_graph, feat.view(len(feat), -1))
        _, predicted = torch.max(logits, 1)
        total_correct += (predicted == labels).sum().item()
        m = predicted + labels
        total_tp += (m >= 2).sum().item()
        total_tn += (m == 0).sum().item()
        total_fp += (predicted > labels).sum().item()
        total_fn += (predicted < labels).sum().item()
        # print(m, predicted, labels, total_tp, total_tn, total_fp, total_fn)
    acc = 1.0 * total_correct / total
    pre = 1.0 * total_tp / (total_tp + total_fp)
    rec = 1.0 * total_tp / (total_tp + total_fn)
    f1 = 2.0 * pre * rec / (rec + rec)
    # print(acc, pre, rec, f1)
    return acc, pre, rec, f1

def train(train_loader, val_loader, test_loader, device, model):
    # loss function, optimizer and scheduler
    loss_fcn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
    info = {'train_acc':[], 'val_acc':[], 'test_acc':[], 
            'train_pre':[], 'val_pre':[], 'test_pre':[],
            'train_rec':[], 'val_res':[], 'test_res':[],
            'train_f1':[], 'val_f1':[], 'test_f1':[]}
    # training loop    
    for epoch in range(350):
        model.train()
        total_loss = 0
        for batch, (batched_graph, labels) in enumerate(tqdm(train_loader)):
            batched_graph = batched_graph.to(device)
            #print(batch, labels, type(labels))
            labels = labels.to(device)
            # print(labels)
            feat = batched_graph.ndata.pop('x')
            # print(feat.view(50,-1).shape)
            logits = model(batched_graph, feat.view(len(feat), -1))
            # print(logits)
            # print(logits.shape, labels.shape)
            loss = loss_fcn(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()
        # acc, pre, rec, f1
        train_acc, train_pre, train_rec, train_f1 = evaluate(train_loader, device, model)
        info['train_acc'], info['train_pre'], info['train_rec'], info['train_f1'] = train_acc, train_pre, train_rec, train_f1
        val_acc, val_pre, val_rec, val_f1 = evaluate(val_loader, device, model)
        info['val_acc'], info['val_pre'], info['val_rec'], info['val_f1'] = val_acc, val_pre, val_rec, val_f1
        test_acc, test_pre, test_rec, test_f1 = evaluate(test_loader, device, model)
        info['test_acc'], info['test_pre'], info['test_rec'], info['test_f1'] = test_acc, test_pre, test_rec, test_f1
        print("Epoch {:05d} | Loss {:.4f} | Train Acc. {:.4f} | Validation Acc. {:.4f}| Test Acc. {:.4f} "
              . format(epoch, total_loss / (batch + 1), train_acc, valid_acc, test_acc))
        
    from datetime import datetime
    import json
    now = datetime.now()
    date = now.strftime("%Y-%m-%d-%H:%M:%S")
    with open('./intermediate_data/train-%s.json' % date, 'w') as f:
        json.dump(info, f)
        
if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--dataset', type=str, default="MUTAG",
#                         choices=['MUTAG', 'PTC', 'NCI1', 'PROTEINS'],
#                         help='name of dataset (default: MUTAG)')
#     parser.add_argument('--pooling', type=str, default='sum', choices=['sum', 'avg', 'max'], help='pooling method, default:sum')
#     args = parser.parse_args()
    print(f'Training with DGL built-in GINConv module with a fixed epsilon = 0')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # load and split dataset
    # dataset = GINDataset(args.dataset, self_loop=True, degree_as_nlabel=False) # add self_loop and disable one-hot encoding for input features
    dataset = HomoStrucBackdoorDataset()
    val_dataset = HomoStrucBackdoorDataset(mode='valid')
    test_dataset = HomoStrucBackdoorDataset(mode='test')

    # train_idx, val_idx = split_fold10(labels)
    # print(train_idx, val_idx)
    
    # create dataloader
    train_loader = GraphDataLoader(dataset, batch_size=4, pin_memory=torch.cuda.is_available())
    val_loader = GraphDataLoader(val_dataset, batch_size=4, pin_memory=torch.cuda.is_available())
    test_loader = GraphDataLoader(test_dataset, batch_size=4, pin_memory=torch.cuda.is_available())
    
    # create GIN model
    in_size = 512 * 513
    #gin_dataset = GINDataset('MUTAG', self_loop=True, degree_as_nlabel=False) # add self_loop and disable one-hot encoding for input features
    # print(gin_dataset.dim_nfeats)
    out_size = 2
    model = SGN(in_size, 16, out_size).to(device)

    # model training/validating
    print('Training Procedure...')
    train(train_loader, val_loader, test_loader, device, model)
    

Training with DGL built-in GINConv module with a fixed epsilon = 0
Training Procedure...


100%|██████████| 1024/1024 [01:45<00:00,  9.74it/s]
100%|██████████| 1024/1024 [00:44<00:00, 22.90it/s]
100%|██████████| 128/128 [00:11<00:00, 11.03it/s]
100%|██████████| 192/192 [00:19<00:00, 10.01it/s]


NameError: name 'valid_acc' is not defined