In [1]:
import gc
from sklearn.decomposition import PCA
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold,KFold
import os
import pickle
from sklearn.metrics import precision_score
from catboost import CatBoostClassifier
import dgl
from sklearn.model_selection import StratifiedKFold
import math
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from copy import deepcopy
import torch

Using backend: pytorch


In [2]:
def load_dgl_graph_k_fold(base_path, fold=-1, k=6, seed=2021):
    """Load labels, the train/validation/test index split and node features.

    Args:
        base_path: Directory containing ``labels.pkl`` and ``features.npy``.
        fold: ``-1`` to use the train/validation split stored in
            ``labels.pkl``; otherwise the 0-based index of the stratified fold
            whose held-out part becomes the validation set.
        k: Number of stratified folds built when ``fold != -1``.
        seed: ``random_state`` passed to ``StratifiedKFold``.

    Returns:
        Tuple ``(labels, tr_label_idx, val_label_idx, test_label_idx, node_feat)``
        where ``labels`` is an int64 tensor and ``node_feat`` a float32 tensor.

    Raises:
        ValueError: If ``fold`` is neither ``-1`` nor in ``range(k)``.
            Such values used to fall through the loop and silently select
            the last fold.
    """
    # Validate before any I/O so a bad fold index fails fast and loudly.
    if fold != -1 and not 0 <= fold < k:
        raise ValueError(
            'fold must be -1 or in [0, {}), got {}'.format(k, fold))

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(base_path, 'labels.pkl'), 'rb') as f:
        label_data = pickle.load(f)
    labels = torch.from_numpy(label_data['label'])
    labels = labels.to(torch.int64)
    test_label_idx = label_data['test_label_idx']
    if fold == -1:
        tr_label_idx = label_data['tr_label_idx']
        val_label_idx = label_data['val_label_idx']
    else:
        # Re-split the union of the stored train and validation indices.
        train_idx = np.concatenate((label_data['tr_label_idx'], label_data['val_label_idx']))
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        for i, (tr, val) in enumerate(folds.split(train_idx, labels[train_idx])):
            if i == fold:
                tr_label_idx, val_label_idx = train_idx[tr], train_idx[val]
                print('    ###      use      fold: {}'.format(fold))
                break
    # get node features
    features = np.load(os.path.join(base_path, 'features.npy'))
    node_feat = torch.from_numpy(features).float()
    print('################ Feature info: ###############')
    print('Node\'s feature shape:{}'.format(node_feat.shape))
    return labels, tr_label_idx, val_label_idx, test_label_idx, node_feat

class FastTensorDataLoader:
    """Batch iterator over in-memory tensors that share their first dimension.

    Every step yields a tuple holding the same slice (along dim 0) of each
    tensor. The object is its own iterator, so only one pass can be active at
    a time.
    """

    def __init__(self, *tensors, batch_size=32, shuffle=False):
        """Store the tensors and pre-compute the number of batches.

        Args:
            *tensors: Tensors whose ``shape[0]`` must all be equal.
            batch_size: Number of rows per batch; the last batch may be smaller.
            shuffle: If true, rows are re-permuted at the start of every pass.
        """
        distinct_lengths = {t.shape[0] for t in tensors}
        assert len(distinct_lengths) <= 1
        self.tensors = tensors
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.dataset_len = self.tensors[0].shape[0]

        # A partial trailing batch still counts as a batch.
        full_batches, leftover = divmod(self.dataset_len, self.batch_size)
        self.n_batches = full_batches + 1 if leftover > 0 else full_batches

    def __iter__(self):
        """Reset the cursor (re-permuting rows first when shuffling) and return self."""
        if self.shuffle:
            order = torch.randperm(self.dataset_len)
            self.tensors = [tensor[order] for tensor in self.tensors]
        self.i = 0
        return self

    def __next__(self):
        """Return the next tuple of row slices, or stop once all rows were served."""
        start = self.i
        if start >= self.dataset_len:
            raise StopIteration
        window = slice(start, start + self.batch_size)
        batch = tuple(tensor[window] for tensor in self.tensors)
        self.i = window.stop
        return batch

    def __len__(self):
        """Number of batches in one full pass."""
        return self.n_batches

# Directory passed to load_dgl_graph_k_fold below, which reads labels.pkl and
# features.npy from it.
# NOTE(review): hard-coded absolute local path; adjust per machine.
base_path = 'E:/ZJL/DGL'
def adjust_learning_rate(optimizer, epoch):
    """Set every parameter group's learning rate from a 5-epoch cyclic ramp.

    The rate is ``0.001 + (epoch % 5) * 0.001``: it rises in steps of 0.001
    and restarts every fifth epoch.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts.
        epoch: Current epoch number.
    """
    cyclic_lr = 0.001 + (epoch % 5)*0.001
    for group in optimizer.param_groups:
        group["lr"] = cyclic_lr

# Run configuration.
device = 'cuda:0'
batch_size = 4096
n_epochs = 1000  # not referenced by any cell visible in this notebook
fold = 0

# Labels, index split and node features for the chosen fold.
labels, tr_label_idx, val_label_idx, test_label_idx, node_feat = load_dgl_graph_k_fold(
    base_path, fold)

# Un-shuffled loader over every row of node_feat, so batches keep row order.
test_data_loader = FastTensorDataLoader(
    node_feat, labels, shuffle=False, batch_size=batch_size)

    ###      use      fold: 0
################ Feature info: ###############
Node's feature shape:torch.Size([3655452, 300])


In [4]:
class ISO_Node_NN(nn.Module):
    """Fully connected classifier mapping 300 input features to 23 output logits.

    Six hidden stages (2048, 1024, 512, 256, 128 and 64 units), each built as
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.5), followed by a final Linear
    layer. All layers live in ``self.net`` in that order, so state-dict keys
    have the form ``net.<position>.<name>``.
    """

    def __init__(self):
        super().__init__()
        widths = [300, 2048, 1024, 512, 256, 128, 64]
        stack = []
        # One Linear/BatchNorm/ReLU/Dropout stage per consecutive width pair.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(0.5),
            ]
        # Output layer: one logit per class.
        stack.append(nn.Linear(widths[-1], 23))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (pre-softmax) outputs of the layer stack for ``x``."""
        return self.net(x)

# Run each of the 10 stage-1 checkpoints over every row served by
# test_data_loader and save the resulting class probabilities per fold.
for fold in range(10):
    model = ISO_Node_NN().to(device)
    PATH = f'dnn_fold_{fold}_stage_1'
    # map_location places the loaded tensors on `device` regardless of where
    # the checkpoint was saved from.
    model.load_state_dict(torch.load(PATH, map_location=device))
    model.eval()  # evaluation behaviour for BatchNorm and Dropout
    all_batch_list = []
    # Inference only: no_grad avoids recording an autograd graph per batch.
    with torch.no_grad():
        # The labels yielded by the loader are not needed for prediction.
        for X_sequence, _labels in test_data_loader:
            y_hat = model(X_sequence.to(device))
            all_batch_list.append(y_hat.cpu().numpy())
    # Prediction results: stack the per-batch logits, then softmax each row.
    all_batch_list = np.vstack(all_batch_list)
    all_batch_list = F.softmax(torch.tensor(all_batch_list), dim=1).numpy()
    np.save(f'dnn_{fold}.npy',all_batch_list)

In [5]:
# Sanity check: the softmax outputs of the first row should sum to ~1.
sum(all_batch_list[0])

0.9999998869223461

In [6]:
# Equal-weight average of the 10 saved per-fold probability arrays:
# start from fold 0's share, then accumulate the remaining folds in place.
fold = 0
dnn_pred = np.load(f'dnn_{fold}.npy') / 10
for fold in range(1, 10):
    dnn_pred += np.load(f'dnn_{fold}.npy') / 10

In [7]:
# Shape of the averaged prediction array (rows, classes).
dnn_pred.shape

(3655452, 23)

In [8]:
# Load the first graph stored in graph.bin and print its summary.
base_path = 'E:/ZJL/DGL'
graphs, _ = dgl.load_graphs(os.path.join(base_path, 'graph.bin'))
graph = graphs[0]
print('################ Graph info: ###############')
print(graph)

# Total degree (in + out) per node; zero means the node has no edges.
degrees = (graph.in_degrees() + graph.out_degrees()).numpy()
iso_set = set(np.where(degrees == 0)[0])

# Labelled nodes = stored train indices plus validation indices, sorted.
label_nodes = np.array(sorted(set(tr_label_idx) | set(val_label_idx)))
label_set = set(label_nodes)
# Labelled nodes without edges vs. labelled nodes with at least one edge.
valid_nodes = np.array(sorted(label_set & iso_set))
train_nodes = np.array(sorted(label_set - iso_set))
# From here on iso_set is a sorted index array rather than a set.
iso_set = np.array(sorted(iso_set))

################ Graph info: ###############
Graph(num_nodes=3655452, num_edges=29168650,
      ndata_schemes={}
      edata_schemes={})


In [None]:
# Rebuild the sorted index array of zero-degree nodes and copy the DNN
# predictions into those rows of gat_pred.
# NOTE(review): `gat_pred` is not defined in any visible cell and this cell has
# no execution count, so it would raise NameError under Restart & Run All.
# The same overwrite is applied to `gat` in a later cell - confirm this cell
# is a leftover that can be removed.
iso_set = set(np.where(degrees==0)[0])
iso_set = np.array(sorted(list(iso_set)))
gat_pred[iso_set] = dnn_pred[iso_set]

In [9]:
# Inspect the averaged DNN prediction for row 0.
dnn_pred[0]

array([3.49307243e-07, 2.25985750e-05, 1.41264347e-04, 6.99978700e-05,
       2.89810159e-05, 1.19760125e-05, 8.93251781e-05, 4.13037960e-05,
       1.50599162e-05, 1.15041994e-05, 1.46285573e-04, 3.74739502e-05,
       6.67416825e-05, 1.83618286e-05, 4.82234873e-06, 3.49691436e-05,
       5.08569030e-07, 1.32556409e-06, 9.98904109e-01, 2.46741001e-06,
       1.10471717e-06, 3.43295134e-04, 6.19029333e-06], dtype=float32)

In [10]:
# Load the per-row class scores stored in gat_full.npy
# (presumably produced by a GAT model - confirm the file's origin).
gat = np.load('gat_full.npy')
# print(sum(gat[0]))
# Inspect row 0 before any rows are overwritten.
gat[0]


array([4.6558498e-07, 1.6007137e-04, 2.6909291e-04, 3.0858826e-04,
       3.3176594e-04, 1.4252805e-04, 2.5829382e-05, 1.7682193e-03,
       8.9285577e-06, 1.2256396e-03, 3.7529125e-04, 4.8739812e-04,
       1.2147106e-03, 1.1182405e-03, 1.7305930e-06, 9.2309747e-05,
       3.1169268e-04, 1.4099326e-04, 9.9908543e+00, 5.3259624e-05,
       8.1599894e-04, 3.2339729e-05, 2.6159891e-04], dtype=float32)

In [11]:
# For nodes with no edges, replace the loaded scores with the DNN predictions.
# NOTE(review): the gat[0] row displayed before this overwrite sums to about 10,
# while dnn_pred rows sum to about 1, so rows end up on different scales.
# That does not affect a per-row argmax, but confirm before blending the saved
# array with other probabilities.
gat[iso_set] = dnn_pred[iso_set]

In [12]:
# Row 0 again, to compare with the value shown before the overwrite.
gat[0]

array([3.49307243e-07, 2.25985750e-05, 1.41264347e-04, 6.99978700e-05,
       2.89810159e-05, 1.19760125e-05, 8.93251781e-05, 4.13037960e-05,
       1.50599162e-05, 1.15041994e-05, 1.46285573e-04, 3.74739502e-05,
       6.67416825e-05, 1.83618286e-05, 4.82234873e-06, 3.49691436e-05,
       5.08569030e-07, 1.32556409e-06, 9.98904109e-01, 2.46741001e-06,
       1.10471717e-06, 3.43295134e-04, 6.19029333e-06], dtype=float32)

In [13]:
# Keep only the last 592391 rows.
# NOTE(review): 592391 is hard-coded; presumably the number of test nodes
# stored at the end of the node array - confirm (e.g. against test_label_idx).
gat = gat[-592391:]

In [14]:
# Save the current `gat` array to gat_iso.npy.
np.save('gat_iso.npy',gat)

In [148]:
# Class index -> submission letter: 0 -> "A", ..., 22 -> "W".
prediction = list("ABCDEFGHIJKLMNOPQRSTUVW")
score_map = {class_idx: prediction[class_idx] for class_idx in range(23)}

# Hard label per row = column index of the highest score.
# Note: `gat` is rebound from the 2-D score array to a 1-D label array here, so
# re-running this cell without reloading `gat` fails (a 1-D array has no axis 1).
gat = np.argmax(gat, axis=1)

submit = pd.read_csv("E:/ZJL/DGL/sample_submission_for_validation.csv")
print(submit.shape)

# Use the first len(submit) labels, then translate indices to letters.
n_rows = submit.shape[0]
submit["label"] = gat[:n_rows]
submit["label"] = submit["label"].map(score_map)
print(submit.head())
submit.to_csv("E:/ZJL/DGL/submission_gat.csv", index=None)

(591972, 2)
                                 id label
0  c39457cc34fa969b03819eaa4f9b7a52     P
1  668b9d0c53e9b6e2c6b1093102f976b3     P
2  ca5c7bc1b40c0ef3c3f864aed032ca90     G
3  44f810c0c000cda27ce618add55e815f     F
4  3c206335d88637d36d83c2942586be98     K


In [98]:
# Score difference (0.558989 vs. 0.5584803) scaled by 591972, the row count
# printed for the submission file.
# NOTE(review): presumably the number of extra correct predictions over a
# baseline score - confirm what the two scores refer to.
591972*(0.558989-0.5584803)

301.1361563999512

In [114]:
# Same scaled score difference for the "10 fold" result (0.5590836 vs. 0.5584803).
591972*(0.5590836-0.5584803) # 10 fold

357.1367075999842

In [137]:
# Same scaled score difference for the pseudo-label result (0.5592412 vs. 0.5584803).
591972*(0.5592412-0.5584803)  # pseudo-labels

450.43149479997237