In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sp
from copy import deepcopy
import warnings 
import os
from sklearn.model_selection import KFold
import json
warnings.filterwarnings("ignore") 
import random
random.seed(1234)
np.random.seed(1234)

In [2]:
def obtain_data(task, isbalance, balance):
    
    pwd = r'E:/MDA-GCNFTG-main/GDA/data/0_data/'
    if isbalance:
        node_feature_label = pd.read_csv(pwd + 'node_feature_label.csv', index_col = 0)
    
    train_test_id_idx = np.load(r'E:/MDA-GCNFTG-main/GDA/data/task_' + task + balance + '__testlabel0_knn_edge_train_test_index_all.npz', allow_pickle = True)
    train_index_all = train_test_id_idx['train_index_all']
    test_index_all = train_test_id_idx['test_index_all']
    
    num_node = node_feature_label.shape[0]
    node_feat = node_feature_label.iloc[:, 3:]
    label = node_feature_label['label']

    gene_ids = list(set(node_feature_label['gene_idx']))
    peco_ids = list(set(node_feature_label['peco_idx']))
    random.shuffle(gene_ids)
    random.shuffle(peco_ids)
    print('# gene = {} | peco = {}'.format(len(gene_ids), len(peco_ids)))
    
    gene_test_num = int(len(gene_ids) / 5)
    peco_test_num = int(len(peco_ids) / 5)
    print('# Test: gene = {} | peco = {}'.format(gene_test_num, peco_test_num))
    
    return node_feature_label, num_node, node_feat, label, gene_ids, peco_ids, train_index_all, test_index_all

In [3]:
def generate_graphsaint_data(task, train_index_all, test_index_all, node_feat, n_neigh, label, num_node, balance):

    fold = 0
    for train_idx, test_idx in zip(train_index_all, test_index_all): #train_index与test_index为下标
        # read knn_graph
        pwd = r'E:/MDA-GCNFTG-main/GDA/data/0_data/'
        knn_graph_file = 'task_' + task + balance + '__testlabel0_knn' + str(n_neigh) + 'neighbors_edge__fold' + str(fold) + '.npz'
        knn_neighbors_graph = sp.load_npz(pwd + knn_graph_file)

        #nonzero()用于得到数组array中非零元素的位置（数组索引）的函数。
        edge_src_dst = knn_neighbors_graph.nonzero()
        #print(edge_src_dst)
        
        # save dir
        save_dir = r'E:/MDA-GCNFTG-main/gene_peco_data_/task_' + task + balance + '__testlabel0_' + str(n_neigh) + 'knn_edge_fold' + str(fold) + '/'

        try:
            os.mkdir(save_dir)
        except OSError as error:
            print(error, save_dir)

        # feats.npy，不需要自己标准化！因为在utils.py中的load_data中有标准化的步骤哦！
        feats = np.array(node_feat,dtype='float32')
        np.save(save_dir + 'feats.npy', feats)
        
        try:
            train_idx, test_idx = train_idx.tolist(), test_idx.tolist()
        except:
            train_idx, test_idx = train_idx, test_idx
            
        # role.json
        role = dict()
        role['tr'] = train_idx
        role['va'] = test_idx
        role['te'] = test_idx
        with open(save_dir + 'role.json','w') as f:
            json.dump(role, f)

        # class_map.json
        y = np.array(label)
        class_map = dict()
        for i in range(num_node):
            class_map[str(i)] = y[i].tolist()
        with open(save_dir + 'class_map.json', 'w') as f:
            json.dump(class_map, f)

        # adj_*.npz
        train_idx_set = set(train_idx)
        test_idx_set = set(test_idx)
        
        row_full, col_full = edge_src_dst[0], edge_src_dst[1]
        
        row_train = []
        col_train = []
        row_val = []
        col_val = []
        for i in tqdm(range(row_full.shape[0])):
            if row_full[i] in train_idx_set and col_full[i] in train_idx_set:
                row_train.append(row_full[i])
                col_train.append(col_full[i])
            if row_full[i] in test_idx_set and col_full[i] in test_idx_set:
                row_val.append(row_full[i])
                col_val.append(col_full[i])

        row_train = np.array(row_train)
        col_train = np.array(col_train)
        row_val = np.array(row_val)
        col_val = np.array(col_val)
        dtype = np.bool

        #sp.coo_matrix根据行列坐标生成矩阵，.tocsr()对矩阵进行压缩
        adj_full = sp.coo_matrix(
            (
                np.ones(row_full.shape[0], dtype=dtype),
                (row_full, col_full),
            ),
            shape=(num_node, num_node)
        ).tocsr()

        adj_train = sp.coo_matrix(
            (
                np.ones(row_train.shape[0], dtype=dtype),
                (row_train, col_train),
            ),
            shape=(num_node, num_node)
        ).tocsr()

        adj_val = sp.coo_matrix(
            (
                np.ones(row_val.shape[0], dtype=dtype),
                (row_val, col_val),
            ),
            shape=(num_node, num_node)
        ).tocsr()

        print('adj_full  num edges:', adj_full.nnz)
        print('adj_val   num edges:', adj_val.nnz)
        print('adj_train num edges:', adj_train.nnz)
        sp.save_npz(save_dir + 'adj_full.npz', adj_full)
        sp.save_npz(save_dir + 'adj_train.npz', adj_train)
        sp.save_npz(save_dir + 'adj_val.npz', adj_val) # adj_val not used in GraphSAINT source code

        fold += 1
    
    print('--Complete--', fold)
    return feats, role, class_map, adj_full, adj_train, adj_val, edge_src_dst

In [8]:
def run(task, isbalance):
    
    if isbalance:
        balance = ''
    else:
        balance = '__nobalance'
    for n_neigh in [1, 3, 5, 7, 10, 15]:

        node_feature_label, num_node, node_feat, label, gene_ids, peco_ids, train_index_all, test_index_all = obtain_data(task, 
                                                                                                                              isbalance,
                                                                                                                              balance)
        feats, role, class_map, adj_full, adj_train, adj_val, edge_src_dst = generate_graphsaint_data(task, train_index_all, test_index_all, node_feat, n_neigh,
                                                                                        label, 
                                                                                        num_node, 
                                                                                        balance)
    return node_feature_label, num_node, node_feat, label, gene_ids, peco_ids, train_index_all, test_index_all, \
    feats, role, class_map, adj_full, adj_train, adj_val, edge_src_dst

# RUN balance

In [9]:
#for n_neigh in [1, 3, 5, 7, 15]:
node_feature_label, num_node, node_feat, label, gene_ids, peco_ids, train_index_all, test_index_all, \
feats, role, class_map, adj_full, adj_train, adj_val, edge_src_dst = run(task = 'Tpe', isbalance = True)

# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold0/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold0/


100%|██████████| 47116/47116 [00:00<00:00, 724898.13it/s]


adj_full  num edges: 47116
adj_val   num edges: 4866
adj_train num edges: 40006
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold1/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold1/


100%|██████████| 47116/47116 [00:00<00:00, 628222.92it/s]


adj_full  num edges: 47116
adj_val   num edges: 3712
adj_train num edges: 40832
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold2/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold2/


100%|██████████| 47116/47116 [00:00<00:00, 692893.70it/s]


adj_full  num edges: 47116
adj_val   num edges: 4164
adj_train num edges: 40715
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold3/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold3/


100%|██████████| 47116/47116 [00:00<00:00, 692918.00it/s]


adj_full  num edges: 47116
adj_val   num edges: 5479
adj_train num edges: 39381
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold4/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_1knn_edge_fold4/


100%|██████████| 47116/47116 [00:00<00:00, 736188.75it/s]


adj_full  num edges: 47116
adj_val   num edges: 22602
adj_train num edges: 21237
--Complete-- 5
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold0/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold0/


100%|██████████| 141348/141348 [00:00<00:00, 781007.42it/s]


adj_full  num edges: 141348
adj_val   num edges: 10242
adj_train num edges: 115509
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold1/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold1/


100%|██████████| 141348/141348 [00:00<00:00, 780837.69it/s]


adj_full  num edges: 141348
adj_val   num edges: 6630
adj_train num edges: 118508
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold2/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold2/


100%|██████████| 141348/141348 [00:00<00:00, 803110.92it/s]


adj_full  num edges: 141348
adj_val   num edges: 8068
adj_train num edges: 117632
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold3/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold3/


100%|██████████| 141348/141348 [00:00<00:00, 798491.09it/s]


adj_full  num edges: 141348
adj_val   num edges: 11599
adj_train num edges: 113665
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold4/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_3knn_edge_fold4/


100%|██████████| 141348/141348 [00:00<00:00, 780928.20it/s]


adj_full  num edges: 141348
adj_val   num edges: 61376
adj_train num edges: 56645
--Complete-- 5
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold0/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold0/


100%|██████████| 235580/235580 [00:00<00:00, 832475.36it/s]


adj_full  num edges: 235580
adj_val   num edges: 14078
adj_train num edges: 189933
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold1/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold1/


100%|██████████| 235580/235580 [00:00<00:00, 804027.35it/s]


adj_full  num edges: 235580
adj_val   num edges: 8644
adj_train num edges: 195725
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold2/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold2/


100%|██████████| 235580/235580 [00:00<00:00, 815156.96it/s]


adj_full  num edges: 235580
adj_val   num edges: 10770
adj_train num edges: 193503
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold3/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold3/


100%|██████████| 235580/235580 [00:00<00:00, 820843.23it/s]


adj_full  num edges: 235580
adj_val   num edges: 15551
adj_train num edges: 186347
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold4/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_5knn_edge_fold4/


100%|██████████| 235580/235580 [00:00<00:00, 862864.84it/s]


adj_full  num edges: 235580
adj_val   num edges: 97041
adj_train num edges: 87316
--Complete-- 5
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold0/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold0/


100%|██████████| 329812/329812 [00:00<00:00, 832850.76it/s]


adj_full  num edges: 329812
adj_val   num edges: 16890
adj_train num edges: 262973
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold1/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold1/


100%|██████████| 329812/329812 [00:00<00:00, 826597.63it/s]


adj_full  num edges: 329812
adj_val   num edges: 10245
adj_train num edges: 272185
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold2/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold2/


100%|██████████| 329812/329812 [00:00<00:00, 831690.07it/s]


adj_full  num edges: 329812
adj_val   num edges: 12630
adj_train num edges: 268528
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold3/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold3/


100%|██████████| 329812/329812 [00:00<00:00, 845629.69it/s]


adj_full  num edges: 329812
adj_val   num edges: 18229
adj_train num edges: 257038
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold4/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_7knn_edge_fold4/


100%|██████████| 329812/329812 [00:00<00:00, 877160.86it/s]


adj_full  num edges: 329812
adj_val   num edges: 128194
adj_train num edges: 114900
--Complete-- 5
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold0/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold0/


100%|██████████| 471160/471160 [00:00<00:00, 841359.02it/s]


adj_full  num edges: 471160
adj_val   num edges: 20574
adj_train num edges: 372092
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold1/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold1/


100%|██████████| 471160/471160 [00:00<00:00, 820859.25it/s]


adj_full  num edges: 471160
adj_val   num edges: 12546
adj_train num edges: 385748
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold2/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold2/


100%|██████████| 471160/471160 [00:00<00:00, 835415.31it/s]


adj_full  num edges: 471160
adj_val   num edges: 15223
adj_train num edges: 379854
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold3/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold3/


100%|██████████| 471160/471160 [00:00<00:00, 855366.49it/s]


adj_full  num edges: 471160
adj_val   num edges: 21770
adj_train num edges: 362722
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold4/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_10knn_edge_fold4/


100%|██████████| 471160/471160 [00:00<00:00, 889610.68it/s]


adj_full  num edges: 471160
adj_val   num edges: 171680
adj_train num edges: 154857
--Complete-- 5
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold0/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold0/


100%|██████████| 706740/706740 [00:00<00:00, 843350.46it/s]


adj_full  num edges: 706740
adj_val   num edges: 26100
adj_train num edges: 552879
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold1/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold1/


100%|██████████| 706740/706740 [00:00<00:00, 851509.22it/s]


adj_full  num edges: 706740
adj_val   num edges: 16139
adj_train num edges: 574414
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold2/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold2/


100%|██████████| 706740/706740 [00:00<00:00, 848432.28it/s]


adj_full  num edges: 706740
adj_val   num edges: 19081
adj_train num edges: 565992
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold3/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold3/


100%|██████████| 706740/706740 [00:00<00:00, 858721.70it/s]


adj_full  num edges: 706740
adj_val   num edges: 27489
adj_train num edges: 538241
[WinError 183] 当文件已存在时，无法创建该文件。: 'E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold4/' E:/MDA-GCNFTG-main/gene_peco_data_/task_Tpe__testlabel0_15knn_edge_fold4/


100%|██████████| 706740/706740 [00:00<00:00, 910730.61it/s]


adj_full  num edges: 706740
adj_val   num edges: 241882
adj_train num edges: 219385
--Complete-- 5


In [None]:
# for n_neigh in [1, 3, 5, 7, 15]: 
node_feature_label, num_node, node_feat, label, gene_ids, peco_ids, train_index_all, test_index_all, \
feats, role, class_map, adj_full, adj_train, adj_val, edge_src_dst = run(task = 'Tg', isbalance = True)

In [None]:
# for n_neigh in [1, 3, 5, 7, 15]: 
node_feature_label, num_node, node_feat, label, gene_ids, peco_ids, train_index_all, test_index_all, \
feats, role, class_map, adj_full, adj_train, adj_val, edge_src_dst = run(task = 'Tp', isbalance = True)