In [1]:
from rdkit import Chem
import torch
from torch_geometric.data import Dataset, Data, DataLoader
from torch_geometric import loader
import numpy as np
import os
import networkx as nx
import torch.nn as nn
import torch.nn.functional as F
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from torch_geometric.nn import MFConv
from torch.utils.data import random_split
import pickle
from torch_geometric.utils import from_networkx
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, f1_score, recall_score, jaccard_score, balanced_accuracy_score

In [2]:
train_datapath = './datasets/train_datsets.pkl'

In [3]:
fr = open(train_datapath, 'rb')
train_data = pickle.load(fr)

In [4]:
test_datapath = './datasets/test_datsets.pkl'

In [5]:
fe = open(test_datapath, 'rb')
test_data = pickle.load(fe)

In [6]:
len(train_data),len(test_data)

(544, 136)

In [7]:
class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        num_classses = 2

        conv_hidden = args['conv_hidden']
        cls_hidden = args['cls_hidden']
        self.n_layers = args['n_layers']
        # cls_drop = ['cls_drop']

#         self.conv_layers = nn.ModuleList([])

        self.conv1 = MFConv(29, conv_hidden, 5)

#         for i in range(self.n_layers):
#             self.conv_layers.append(
#                 MFConv(conv_hidden, conv_hidden)
#             )
        self.conv2 = MFConv(conv_hidden, conv_hidden, 5)
        self.conv3 = MFConv(conv_hidden, conv_hidden, 5)
        self.conv4 = MFConv(conv_hidden, conv_hidden, 5)

        self.linear1 = nn.Linear(conv_hidden, cls_hidden)
        self.linear2 = nn.Linear(cls_hidden, num_classses)
        self.relu = nn.ReLU()
        self.drop1 = nn.Dropout(p=0.5)

    
    def forward(self, mol):

        res = self.conv1(mol.x, mol.edge_index)
#         for i in range(self.n_layers):
#             res = self.conv_layers[i](res, mol.edge_index)
            
        
        res = self.conv2(res, mol.edge_index)
        res_2 = res
        res = self.conv3(res, mol.edge_index)
        res_3 = res
        res = self.conv4(res, mol.edge_index)
        
        
        res = res+res_3+res_2
#         res = torch.cat((res, res_3), dim=-1)
          
        res = self.linear1(res)
        
        res = self.relu(res)
        # fc relu fc relu
        res = self.drop1(res)
        res = self.linear2(res)

        return res

In [8]:
import random
import os
import numpy as np
# 可显示数组中的所有元素，不受默认的截断限制
np.set_printoptions(threshold=np.inf)
def seed_torch(seed=42):
    # 设置种子可以确保每次运行代码时生成的随机数相同。
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) 
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) 

In [9]:
# evaluation
def top2(output, label):
    sf = nn.Softmax(dim=1)
    preds = sf(output)
    preds = preds[:, 1]
    _, indices = torch.topk(preds, 2)
    pos_index = []
    for i in range(label.shape[0]):
        if label[i] == 1:
            pos_index.append(i)  
    for li in pos_index:
        if li in indices:
            return True
    return False

def MCC(output, label):
    tn,fp,fn,tp=confusion_matrix(label, output).ravel()
    up = (tp * tn) - (fp * fn)
    down = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    return up / down

def metrics(output, label):
    tn,fp,fn,tp=confusion_matrix(label, output).ravel()
    up = (tp * tn) - (fp * fn)
    down = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    mcc = up / down
    selectivity = tn / (tn + fp)
    recall = tp / (tp + fn)
    g_mean = (selectivity * recall) ** 0.5
    balancedAccuracy = (recall + selectivity) / 2
    return mcc, selectivity, recall, g_mean, balancedAccuracy

In [10]:
def Imbalanced_ratio(dataset_loader):
    som = 0
    no_som = 0
    for i in dataset_loader:
        for n in i.y:
            if n == 0:
                no_som = no_som+1
            else:
                som = som+1
    return('SOM vs no_SOM:', som, no_som,'1:{}'.format(no_som/som))

In [11]:
def train(args, model, device, training_set, optimizer, criterion, epoch):
    model.train()
    sf = nn.Softmax(dim=1)
    total_loss = 0
    all_pred = []
    all_pred_raw = []
    all_labels = []
    top2n = 0
    for mol in training_set:
        mol = mol.to(device)
        mol.x = mol.x.to(torch.float32)
        target = mol.y
        
        optimizer.zero_grad()
        output = model(mol)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # tracking
        top2n += top2(output, target)
        all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
        all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
        all_labels.append(target.cpu().detach().numpy())

    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    all_labels = np.concatenate(all_labels).ravel()

    mcc = MCC(all_pred, all_labels)
    print(f'Train Epoch: {epoch}, Ave Loss: {total_loss / len(training_set)} ACC: {accuracy_score(all_labels, all_pred)} Top2: {top2n / len(training_set)} AUC: {roc_auc_score(all_labels, all_pred_raw)} MCC: {mcc}')
    return top2n / len(training_set)

In [12]:
def val(args, model, device, val_set, optimizer, criterion, epoch):
    model.eval()
    sf = nn.Softmax(dim=1)
    total_loss = 0
    all_pred = []
    all_pred_raw = []
    all_labels = []
    top2n = 0
    for mol in val_set:
        mol = mol.to(device)
        mol.x = mol.x.to(torch.float32)
        target = mol.y
        optimizer.zero_grad()
        output = model(mol)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # tracking
        top2n += top2(output, target)
        all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
        all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
        all_labels.append(target.cpu().detach().numpy())
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    mcc = MCC(all_pred, all_labels)

    print(f'Val Epoch: {epoch}, Ave Loss: {total_loss / len(val_set)} ACC: {accuracy_score(all_labels, all_pred)} Top2: {top2n / len(val_set)} AUC: {roc_auc_score(all_labels, all_pred_raw)} MCC: {mcc}')
    return top2n / len(val_set)

In [13]:
def test(model, device, test_set):
    model.eval()
    sf = nn.Softmax(dim=1)
    all_pred = []
    all_pred_raw = []
    all_labels = []
    top2n = 0
    with torch.no_grad():
        for mol in test_set:
            mol = mol.to(device)
            mol.x = mol.x.to(torch.float32)
            mol.edge_attr = mol.edge_attr.to(torch.float32)
            target = mol.y
            output = model(mol)
            # squeeze
            output = torch.squeeze(output)
            # tracking
            top2n += top2(output, target)
            all_pred.append(np.argmax(output.cpu().detach().numpy(), axis=1))
            all_pred_raw.append(sf(output)[:, 1].cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())
    all_pred = np.concatenate(all_pred).ravel()
    all_pred_raw = np.concatenate(all_pred_raw).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    mcc, selectivity, recall, g_mean, balancedAcc = metrics(all_pred, all_labels)
    print(f'ACC: {accuracy_score(all_labels, all_pred)} \
        Top2: {top2n / len(test_set)} \
        AUC: {roc_auc_score(all_labels, all_pred_raw)}\
        MCC: {mcc} selectivity {selectivity} recall {recall} \
        g_mean {g_mean} balanced acc {balancedAcc} f1score {f1_score(all_labels, all_pred)} \
        precision score {precision_score(all_labels, all_pred)} jaccard score {jaccard_score(all_labels, all_pred)}')

In [14]:
training_set, validation_set = random_split(train_data, [int(len(train_data) * 0.85), len(train_data) - int(len(train_data) * 0.85)], generator=torch.Generator().manual_seed(12345))
batch_size = 1
train_loader = loader.DataLoader(training_set, batch_size, shuffle=True)
val_loader = loader.DataLoader(validation_set, batch_size, shuffle=True)
test_loader = loader.DataLoader(test_data, batch_size, shuffle=False)

In [15]:
len(train_loader), len(val_loader), len(test_loader)

(462, 82, 136)

In [16]:
def main(args, train_loader, val_loader):
    seed_torch(args['seed'])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.manual_seed(args['seed'])

    model = Model(args).to(device)
    print(model)
    weights = torch.tensor([1, args['pos_weight']], dtype=torch.float32).to(device)
    loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'])
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
    max_top2 = 0
    for epoch in range(1, args['epoch'] + 1):
        train(args, model, device, train_loader, optimizer, loss_fn, epoch)
        top2acc = val(args, model, device, val_loader, optimizer, loss_fn, epoch)
        scheduler.step()
        if top2acc > max_top2:
            max_top2 = top2acc
            print('Saving model (epoch = {:4d}, top2acc = {:.4f})'
                .format(epoch, max_top2))
            torch.save(model.state_dict(), args['save_path'])

In [25]:
args = {
    'lr': 0.01,
    'epoch': 400,
    'seed': 12345,
    'save_path': './model/train_sum',   # 对应修改模型
    'pos_weight': 3,
    'conv_hidden': 1024, 
    'cls_hidden': 1024,
    'n_layers': 3,
    'max_degree': 5
}

train_test 最后结果

In [18]:
main(args, train_loader, val_loader)

Model(
  (conv1): MFConv(29, 1024)
  (conv2): MFConv(1024, 1024)
  (conv3): MFConv(1024, 1024)
  (conv4): MFConv(1024, 1024)
  (linear1): Linear(in_features=1024, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=2, bias=True)
  (relu): ReLU()
  (drop1): Dropout(p=0.5, inplace=False)
)
Train Epoch: 1, Ave Loss: 0.6047556184038714 ACC: 0.8018507918336195 Top2: 0.512987012987013 AUC: 0.719470594755429 MCC: 0.19359717191821119
Val Epoch: 1, Ave Loss: 0.49343662072972555 ACC: 0.8546731496488384 Top2: 0.6463414634146342 AUC: 0.7889227948563392 MCC: 0.29700077985815454
Saving model (epoch =    1, top2acc = 0.6463)
Train Epoch: 2, Ave Loss: 0.47702263553679247 ACC: 0.8499332188513642 Top2: 0.6601731601731602 AUC: 0.7875943621025987 MCC: 0.3004896050531058
Val Epoch: 2, Ave Loss: 0.460543882192635 ACC: 0.8606158833063209 Top2: 0.6707317073170732 AUC: 0.8243796463733172 MCC: 0.34544465863283574
Saving model (epoch =    2, top2acc = 0.6707)
Train Epoch: 3, Ave Loss

Train Epoch: 27, Ave Loss: 0.23655522608361257 ACC: 0.9144247281053234 Top2: 0.9155844155844156 AUC: 0.9510454486143076 MCC: 0.6229372418383361
Val Epoch: 27, Ave Loss: 0.20929131631898443 ACC: 0.9232847109670448 Top2: 0.9024390243902439 AUC: 0.9657235784609203 MCC: 0.6199767095577966
Train Epoch: 28, Ave Loss: 0.22592669940553353 ACC: 0.9178591871780195 Top2: 0.935064935064935 AUC: 0.9551595684727102 MCC: 0.6313431934870689
Val Epoch: 28, Ave Loss: 0.2045769411531027 ACC: 0.9211237169097785 Top2: 0.9146341463414634 AUC: 0.9645808217801889 MCC: 0.6367493583383844
Train Epoch: 29, Ave Loss: 0.22538492520648715 ACC: 0.9198626216370922 Top2: 0.9372294372294372 AUC: 0.9557653192702368 MCC: 0.6475568113948126
Val Epoch: 29, Ave Loss: 0.20484822510639433 ACC: 0.9227444624527282 Top2: 0.9512195121951219 AUC: 0.96485709262608 MCC: 0.6512927392154875
Saving model (epoch =   29, top2acc = 0.9512)
Train Epoch: 30, Ave Loss: 0.22115892062436898 ACC: 0.9203396298416333 Top2: 0.9415584415584416 AUC:

In [19]:
model = Model(args).to("cuda")
model.load_state_dict(torch.load(args['save_path']))

<All keys matched successfully>

In [20]:
Imbalanced_ratio(train_loader)

('SOM vs no_SOM:', 1145, 9337, '1:8.154585152838427')

In [21]:
Imbalanced_ratio(val_loader)

('SOM vs no_SOM:', 192, 1659, '1:8.640625')

In [22]:
(9337+1659)/(1145+192)

8.224382946896036

In [23]:
Imbalanced_ratio(test_loader)

('SOM vs no_SOM:', 325, 2611, '1:8.033846153846154')

In [24]:
test(model, "cuda", test_loader)

ACC: 0.8998637602179836         Top2: 0.8823529411764706         AUC: 0.9188627993989925        MCC: 0.573963336635243 selectivity 0.9199540405974722 recall 0.7384615384615385         g_mean 0.8242273206667673 balanced acc 0.8292077895295054 f1score 0.6201550387596899         precision score 0.534521158129176 jaccard score 0.449438202247191


In [24]:
import statistics
data = [0.665,
0.714,
0.668,
0.720,
0.726]
variance = statistics.variance(data)
standard_deviation = statistics.stdev(data)
print("方差为:", variance)

print("标准差为:", standard_deviation)

方差为: 0.000877799999999998
标准差为: 0.029627689751311997
