In [2]:
import gc
from sklearn.decomposition import PCA
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold,KFold
import os
import pickle
from sklearn.metrics import precision_score
from catboost import CatBoostClassifier
import dgl
from sklearn.model_selection import StratifiedKFold
import math
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from copy import deepcopy
import torch

Using backend: pytorch


In [3]:
def load_dgl_graph_k_fold(base_path, fold=-1, k=10, seed=1996):
    """Load labels, split indices and node features from ``base_path``.

    Parameters
    ----------
    base_path : str
        Directory that contains ``labels.pkl`` and ``features.npy``.
    fold : int, default -1
        ``-1`` returns the train/validation split stored in ``labels.pkl``.
        A value in ``[0, k)`` pools the stored train and validation indices and
        returns the ``fold``-th split of a shuffled, stratified k-fold.
    k : int, default 10
        Number of folds used when ``fold != -1``.
    seed : int, default 1996
        ``random_state`` of the stratified k-fold.

    Returns
    -------
    labels : torch.Tensor
        Content of ``label_data['label']`` cast to ``int64``.
    tr_label_idx, val_label_idx, test_label_idx
        Index arrays of the training, validation and test nodes.
    node_feat : torch.Tensor
        Content of ``features.npy`` as a ``float32`` tensor.

    Raises
    ------
    ValueError
        If ``fold`` is neither ``-1`` nor in ``[0, k)``. Such a value used to
        fall through silently to the last split.
    """
    if fold != -1 and not 0 <= fold < k:
        raise ValueError(
            'fold must be -1 or in [0, {}), got {}'.format(k, fold))

    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(os.path.join(base_path, 'labels.pkl'), 'rb') as f:
        label_data = pickle.load(f)
    labels = torch.from_numpy(label_data['label']).to(torch.int64)
    test_label_idx = label_data['test_label_idx']

    if fold == -1:
        tr_label_idx = label_data['tr_label_idx']
        val_label_idx = label_data['val_label_idx']
    else:
        # Re-split the pooled train + validation nodes, stratified by label.
        train_idx = np.concatenate((label_data['tr_label_idx'], label_data['val_label_idx']))
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        # Give scikit-learn a NumPy array rather than a torch tensor.
        strat_labels = labels[train_idx].numpy()
        for i, (tr, val) in enumerate(folds.split(train_idx, strat_labels)):
            if i == fold:
                tr_label_idx, val_label_idx = train_idx[tr], train_idx[val]
                print('    ###      use      fold: {}'.format(fold))
                break

    # get node features
    features = np.load(os.path.join(base_path, 'features.npy'))
    node_feat = torch.from_numpy(features).float()
    print('################ Feature info: ###############')
    print('Node\'s feature shape:{}'.format(node_feat.shape))
    return labels, tr_label_idx, val_label_idx, test_label_idx, node_feat

In [4]:
class FastTensorDataLoader:
    """Minimal batch iterator over tensors that share their first dimension.

    Each iteration step yields a tuple holding one slice of ``batch_size`` rows
    per tensor; the final slice may be shorter. With ``shuffle=True`` the stored
    tensors are re-indexed with a fresh random permutation every time
    iteration starts.
    """

    def __init__(self, *tensors, batch_size=32, shuffle=False):
        n_rows = tensors[0].shape[0]
        assert all(tensor.shape[0] == n_rows for tensor in tensors)

        self.tensors = tensors
        self.dataset_len = n_rows
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Number of batches, counting a trailing partial batch as one more.
        full_batches, leftover = divmod(self.dataset_len, self.batch_size)
        self.n_batches = full_batches + 1 if leftover > 0 else full_batches

    def __iter__(self):
        """Reset the cursor (and reshuffle the stored tensors if requested)."""
        if self.shuffle:
            perm = torch.randperm(self.dataset_len)
            self.tensors = [tensor[perm] for tensor in self.tensors]
        self.i = 0
        return self

    def __next__(self):
        """Return the next tuple of slices, or stop once the cursor is past the end."""
        start = self.i
        if start >= self.dataset_len:
            raise StopIteration
        stop = start + self.batch_size
        self.i = stop
        return tuple(tensor[start:stop] for tensor in self.tensors)

    def __len__(self):
        """Number of batches per pass."""
        return self.n_batches

# Offset used inside the logarithm of custom_loss_function.
epsilon = 1 - math.log(2)
def custom_loss_function(x, labels):
    """Mean of ``log(epsilon + CE) - log(epsilon)`` over the batch.

    ``CE`` is the per-sample cross-entropy of logits ``x`` against the class
    indices ``labels``. Subtracting ``log(epsilon)`` makes the transformed
    loss zero wherever the cross-entropy itself is zero.
    """
    per_sample_ce = F.cross_entropy(x, labels, reduction="none")
    transformed = (epsilon + per_sample_ce).log() - math.log(epsilon)
    return transformed.mean()
# Plain cross-entropy criterion, moved to the first CUDA device.
# NOTE(review): nothing in the visible cells references `loss_fn` (training and
# validation call custom_loss_function) — confirm whether it is still needed.
loss_fn = nn.CrossEntropyLoss().to('cuda:0')

class ISO_Node_NN(nn.Module):
    """Fully connected classifier applied to each feature row independently.

    The network is a stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout``
    groups, one per entry of ``hidden_dims``, followed by a final ``Linear``
    layer that produces ``n_classes`` logits.

    Parameters
    ----------
    in_dim : int, default 300
        Width of an input feature row.
    n_classes : int, default 23
        Number of output logits.
    hidden_dims : sequence of int, default (2048, 1024, 512, 256, 128, 64)
        Output width of each hidden group, in order.
    dropout : float, default 0.5
        Dropout probability used after every hidden group.

    The defaults rebuild exactly the original architecture. The layers keep
    the same positions inside ``self.net``, so ``state_dict`` keys such as
    ``net.0.weight`` are unchanged and existing checkpoints still load.
    """

    def __init__(self, in_dim=300, n_classes=23,
                 hidden_dims=(2048, 1024, 512, 256, 128, 64), dropout=0.5):
        super(ISO_Node_NN, self).__init__()
        layers = []
        prev_dim = in_dim
        for width in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = width
        # Output layer: raw logits, no activation.
        layers.append(nn.Linear(prev_dim, n_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by ``self.net`` for the batch ``x``."""
        return self.net(x)

In [9]:
# ---- Score every node with the saved fold-0 model ----
base_path = 'E:/ZJL/DGL'
batch_size = 4096
device = 'cuda:0'
fold = 0
labels, tr_label_idx, val_label_idx, test_label_idx, node_feat = load_dgl_graph_k_fold(base_path, fold)

# Loader over ALL nodes (labelled or not), in node order.
test_data_loader = FastTensorDataLoader(
    node_feat,
    labels,
    batch_size=batch_size, shuffle=False)

# Minimum top-class probability for a prediction to count as "confident".
threshold = 0.99

model = ISO_Node_NN().to(device)
PATH = f'dnn_fold_{fold}'
# map_location keeps loading independent of the device the checkpoint was saved from.
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()

all_batch_list = []
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for X_sequence, target in test_data_loader:
        y_hat = model(X_sequence.to(device))
        all_batch_list.append(y_hat.cpu().numpy())
all_batch_list = np.vstack(all_batch_list)
# From here on `all_batch_list` is a tensor of per-node class probabilities.
all_batch_list = F.softmax(torch.tensor(all_batch_list), dim=1)

    ###      use      fold: 0
################ Feature info: ###############
Node's feature shape:torch.Size([3655452, 300])


In [39]:
# Node ids whose highest class probability exceeds `threshold`.
confident_nid = torch.arange(len(all_batch_list))[all_batch_list.max(1)[0] > threshold].numpy()
# Nodes that belong to none of the labelled splits (train / validation / test).
# The node count comes from the feature matrix instead of a hard-coded constant,
# and NumPy set routines return sorted, duplicate-free arrays.
label_nodes = np.union1d(np.union1d(tr_label_idx, val_label_idx), test_label_idx)
non_label_nodes = np.setdiff1d(np.arange(node_feat.shape[0]), label_nodes)
kf = KFold(n_splits=3, shuffle=True, random_state=fold)  # pick one third each time
# Only the first split is used.
train_index, test_index = next(iter(kf.split(non_label_nodes)))
# Confident nodes inside the selected third are added to the training indices.
add_nodes = np.intersect1d(confident_nid, non_label_nodes[test_index])
tr_label_idx = np.union1d(tr_label_idx, add_nodes)

In [43]:
# Print the number of training indices before and after merging in `add_nodes`.
print(len(tr_label_idx))
tr_label_idx = np.array(sorted(set(tr_label_idx).union(add_nodes)))
print(len(tr_label_idx))

939975
0


In [60]:
base_path = 'E:/ZJL/DGL'
def adjust_learning_rate(optimizer, epoch, base_lr=0.001, step=0.001, period=5):
    """Set a saw-tooth learning rate on every parameter group of ``optimizer``.

    The rate is ``base_lr + (epoch % period) * step``: it climbs by ``step``
    each epoch and drops back to ``base_lr`` every ``period`` epochs. With the
    defaults it cycles through 0.001, 0.002, 0.003, 0.004, 0.005.

    Parameters
    ----------
    optimizer : object exposing ``param_groups`` (a list of dicts with an ``"lr"`` key)
    epoch : int
        Zero-based epoch counter.
    base_lr, step : float
        Lowest rate of the cycle and per-epoch increment.
    period : int
        Length of one cycle in epochs.
    """
    lr = base_lr + (epoch % period) * step
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

# ---- Configuration ----
n_epochs = 1000
device = 'cuda:0'
batch_size = 4096
# Minimum top-class probability for accepting a pseudo-label. Defined here so
# this cell does not rely on a global left behind by an earlier cell.
threshold = 0.99
# Early stopping: number of consecutive epochs without a better validation loss.
patience = 20


def _predict_proba(model, data_loader, device):
    """Return softmax class probabilities (CPU tensor) for every row served by ``data_loader``."""
    model.eval()
    logits = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for X_sequence, _ in data_loader:
            logits.append(model(X_sequence.to(device)).cpu())
    return F.softmax(torch.cat(logits), dim=1)


for fold in range(10):
    labels, tr_label_idx, val_label_idx, test_label_idx, node_feat = load_dgl_graph_k_fold(base_path, fold)
    # Loader over ALL nodes, in node order, used to score them with the stage-0 model.
    test_data_loader = FastTensorDataLoader(
        node_feat,
        labels,
        batch_size=batch_size, shuffle=False)
    n_nodes = node_feat.shape[0]

    # Only stage 1 runs; the stage-0 branch is kept for reference.
    for stage in range(1, 2):
        if stage == 0:
            train_data_loader = FastTensorDataLoader(
                node_feat[tr_label_idx],
                labels[tr_label_idx],
                batch_size=batch_size, shuffle=True)

        else:
            # Expand the training set with pseudo-labelled nodes.
            model = ISO_Node_NN().to(device)
            PATH = f'dnn_fold_{fold}'
            model.load_state_dict(torch.load(PATH, map_location=device))
            all_batch_list = _predict_proba(model, test_data_loader, device)

            # Confident nodes: highest class probability above the threshold.
            confident_nid = torch.arange(len(all_batch_list))[all_batch_list.max(1)[0] > threshold].numpy()

            # Nodes outside the train and validation splits. Test nodes are NOT
            # excluded here, so they stay eligible for pseudo-labelling.
            # NOTE(review): an earlier, commented-out variant also removed
            # test_label_idx — confirm that including test nodes is intended.
            label_nodes = np.union1d(tr_label_idx, val_label_idx)
            non_label_nodes = np.setdiff1d(np.arange(n_nodes), label_nodes)

            # Each fold draws its candidates from a different tenth of those nodes.
            kf = KFold(n_splits=10, shuffle=True, random_state=2021)
            for split_no, (_, test_index) in enumerate(kf.split(non_label_nodes)):
                if split_no == fold:
                    break

            # Sorted, duplicate-free arrays (same content as the former set arithmetic).
            add_nodes = np.intersect1d(confident_nid, non_label_nodes[test_index])
            tr_label_idx = np.union1d(tr_label_idx, add_nodes)

            # Overwrite the labels of the added nodes with the predicted class.
            pseudo = torch.argmax(all_batch_list, dim=1)
            labels[add_nodes] = pseudo[add_nodes]

            train_data_loader = FastTensorDataLoader(
                node_feat[tr_label_idx],
                labels[tr_label_idx],
                batch_size=batch_size, shuffle=True)

        # Validation data needs no shuffling; a fixed order also keeps the
        # per-batch-averaged metrics identical from epoch to epoch for equal weights.
        valid_data_loader = FastTensorDataLoader(
            node_feat[val_label_idx],
            labels[val_label_idx],
            batch_size=batch_size, shuffle=False)

        model = ISO_Node_NN().to(device)
        # The lr passed here is overwritten by adjust_learning_rate() every epoch.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, weight_decay=0)
        print('# model parameters:', sum(param.numel() for param in model.parameters()))

        val_best = 1e9
        # Reset per fold/stage so a run that never improves cannot silently
        # reuse the best model of a previous fold.
        acc_best = None
        best_model = None
        stop_count = 0
        for epoch in range(n_epochs):
            adjust_learning_rate(optimizer, epoch)
            model.train()
            epoch_loss = []

            tra_acc_list = []
            for X_sequence, target in train_data_loader:
                X_sequence, target = X_sequence.to(device), target.to(device)

                y_hat = model(X_sequence)
                train_loss = custom_loss_function(y_hat, target)

                batch_acc = torch.sum(torch.argmax(y_hat, dim=1) == target) / torch.tensor(target.shape[0])
                tra_acc_list.append(batch_acc.detach().cpu().numpy())

                train_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                epoch_loss.append(train_loss.item())
            loss_tra = np.mean(epoch_loss)

            # Validation pass.
            model.eval()
            epoch_loss = []
            val_acc_list = []
            with torch.no_grad():  # evaluation only: no autograd graph needed
                for X_sequence, target in valid_data_loader:
                    X_sequence, target = X_sequence.to(device), target.to(device)
                    y_hat = model(X_sequence)
                    valid_loss = custom_loss_function(y_hat, target)
                    epoch_loss.append(valid_loss.item())

                    batch_acc = torch.sum(torch.argmax(y_hat, dim=1) == target) / torch.tensor(target.shape[0])
                    val_acc_list.append(batch_acc.detach().cpu().numpy())

            loss_val = np.mean(epoch_loss)
            if loss_val < val_best:
                # New best validation loss: snapshot the model and report progress.
                val_best = loss_val
                acc_best = np.mean(val_acc_list)
                stop_count = 0
                best_model = deepcopy(model)
                print("Epoch: {}/{}".format(epoch, n_epochs))
                print('loss_tra',loss_tra)
                print('acc train',np.mean(tra_acc_list))
                print('loss_val',loss_val)
                print('acc val',np.mean(val_acc_list))
            else:
                stop_count += 1
            if stop_count >= patience:
                break

        if best_model is None:
            raise RuntimeError(
                'validation loss never improved for fold {} stage {}; nothing to save'.format(fold, stage))
        print('val_best',acc_best)
        torch.save(best_model.state_dict(), f'dnn_fold_{fold}_stage_{stage}')

    ###      use      fold: 0
################ Feature info: ###############
Node's feature shape:torch.Size([3655452, 300])
# model parameters: 3421463
Epoch: 0/1000
loss_tra 2.023354425529639
acc train 0.28836715
loss_val 1.820481525017665
acc val 0.3669834
Epoch: 1/1000
loss_tra 1.799092699587345
acc train 0.3848114
loss_val 1.7346322719867413
acc val 0.4012891
Epoch: 2/1000
loss_tra 1.7343041206399599
acc train 0.41474813
loss_val 1.6768360229638906
acc val 0.42493743
Epoch: 3/1000
loss_tra 1.6966936950882276
acc train 0.43307754
loss_val 1.6511922753774202
acc val 0.4375749
Epoch: 4/1000
loss_tra 1.6717496966322263
acc train 0.44419926
loss_val 1.6468414664268494
acc val 0.44025266
Epoch: 5/1000
loss_tra 1.6284491350253423
acc train 0.46135342
loss_val 1.6084308624267578
acc val 0.45583534
Epoch: 6/1000
loss_tra 1.6149665529529253
acc train 0.46623704
loss_val 1.5989548472257762
acc val 0.4575419
Epoch: 10/1000
loss_tra 1.5832453514138858
acc train 0.47832778
loss_val 1.5820906070

Epoch: 36/1000
loss_tra 1.485824786623319
acc train 0.5160447
loss_val 1.5579036474227905
acc val 0.477315
Epoch: 46/1000
loss_tra 1.4677226727207502
acc train 0.5222145
loss_val 1.5563025566247792
acc val 0.47768733
Epoch: 51/1000
loss_tra 1.4596826071540514
acc train 0.52569467
loss_val 1.5560671595426707
acc val 0.47784784
Epoch: 56/1000
loss_tra 1.4511796777447066
acc train 0.5288111
loss_val 1.5545003139055693
acc val 0.47889897
Epoch: 65/1000
loss_tra 1.4486889004707337
acc train 0.53005856
loss_val 1.5539449682602515
acc val 0.47956535
Epoch: 66/1000
loss_tra 1.4392852425575255
acc train 0.53377783
loss_val 1.5526817303437452
acc val 0.48078454
Epoch: 75/1000
loss_tra 1.4364735434452693
acc train 0.5348007
loss_val 1.5506541316325848
acc val 0.48213735
Epoch: 81/1000
loss_tra 1.422570490837097
acc train 0.5402246
loss_val 1.5506257506517263
acc val 0.4824101
Epoch: 82/1000
loss_tra 1.4256247570117315
acc train 0.5392317
loss_val 1.5503655076026917
acc val 0.48117036
Epoch: 85/10

Epoch: 2/1000
loss_tra 1.7641168229312818
acc train 0.39954993
loss_val 1.7078076096681447
acc val 0.4135267
Epoch: 3/1000
loss_tra 1.7382691678169853
acc train 0.41439015
loss_val 1.684967068525461
acc val 0.42555612
Epoch: 4/1000
loss_tra 1.7151986147852853
acc train 0.42546967
loss_val 1.6841925657712495
acc val 0.42416716
Epoch: 5/1000
loss_tra 1.690974504621197
acc train 0.43608797
loss_val 1.6466369674755976
acc val 0.44044003
Epoch: 6/1000
loss_tra 1.674763802670839
acc train 0.44246298
loss_val 1.6375041879140413
acc val 0.44298923
Epoch: 7/1000
loss_tra 1.6636891706355874
acc train 0.4466238
loss_val 1.6323436911289508
acc val 0.4448563
Epoch: 10/1000
loss_tra 1.6397325141795938
acc train 0.4565262
loss_val 1.623840840963217
acc val 0.4496823
Epoch: 11/1000
loss_tra 1.6264906661639076
acc train 0.46166846
loss_val 1.6151302731954134
acc val 0.45257515
Epoch: 16/1000
loss_tra 1.6261490484490928
acc train 0.46124387
loss_val 1.614814960039579
acc val 0.4545192
Epoch: 20/1000
los

Epoch: 20/1000
loss_tra 1.5343831171592077
acc train 0.49663737
loss_val 1.5656411372698271
acc val 0.47350866
Epoch: 21/1000
loss_tra 1.5231879527370136
acc train 0.5011577
loss_val 1.5636269129239595
acc val 0.47410113
Epoch: 25/1000
loss_tra 1.5186150178313256
acc train 0.50308067
loss_val 1.5594194348041828
acc val 0.4752548
Epoch: 30/1000
loss_tra 1.5062346478303275
acc train 0.5078636
loss_val 1.5590249116604145
acc val 0.4773291
Epoch: 35/1000
loss_tra 1.4939485937356949
acc train 0.51223445
loss_val 1.5567901134490967
acc val 0.47772434
Epoch: 36/1000
loss_tra 1.4834247733155885
acc train 0.51642144
loss_val 1.5544045292414153
acc val 0.47785616
Epoch: 41/1000
loss_tra 1.4738111530741056
acc train 0.5203855
loss_val 1.554217292712285
acc val 0.47862655
Epoch: 42/1000
loss_tra 1.4766101777553557
acc train 0.51915276
loss_val 1.55333005464994
acc val 0.4778948
Epoch: 52/1000
loss_tra 1.4596451858679453
acc train 0.52524173
loss_val 1.55214531146563
acc val 0.47934777
Epoch: 55/10