In [1]:
from My_Pka_Model import Pka_basic,Pka_acidic
import torch

import dgl
import dgllife
from torch.utils.data import DataLoader
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
import torch.optim as optim
import numpy as np
import random 
import pandas as pd

from torch.nn.utils import clip_grad_norm

Using backend: pytorch


In [2]:
# Run the `nvidia-smi` shell command and show its GPU status table in the cell output.
!nvidia-smi

Fri Nov 27 08:09:24 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:06:00.0 Off |                    0 |
| N/A   37C    P0    34W / 250W |   2022MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:2F:00.0 Off |                    0 |
| N/A   36C    P0    34W / 250W |   1395MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   

In [3]:
# Pick the compute device.
# The second GPU (index 1) is preferred, but that index only exists when at
# least two GPUs are visible. On a single-GPU host "cuda:1" would be an invalid
# device and the later `.to(device)` calls would fail, so fall back to GPU 0
# there, and to the CPU when CUDA is not available at all.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device("cuda:{}".format(gpu_index))
else:
    device = torch.device("cpu")

# Seed every random number generator used in this notebook with the same value.
seed = 0
random.seed(seed)                  # Python's `random` (used to shuffle the dataset)
np.random.seed(seed)               # NumPy global generator
torch.manual_seed(seed)            # PyTorch generator
torch.cuda.manual_seed(seed)       # current CUDA device
torch.cuda.manual_seed_all(seed)   # every CUDA device

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
def collate(samples):
    """Combine individual ``(graph, label)`` samples into one mini-batch.

    Args:
        samples: sequence of 2-tuples ``(graph, label)``.

    Returns:
        A pair ``(batched_graph, labels)``: the graphs merged with
        ``dgl.batch`` and the labels gathered into a single ``torch.tensor``
        in the same order as ``samples``.
    """
    # Transpose the list of pairs into one column of graphs and one of labels.
    graph_column, label_column = zip(*samples)
    merged_graph = dgl.batch(list(graph_column))
    label_tensor = torch.tensor(list(label_column))
    return merged_graph, label_tensor

# Featurizers handed to `smiles_to_bigraph` in `load_data`.
# Both are configured with the field name 'h', which is the key the training
# loops read back through `bg.ndata['h']` and `bg.edata['h']`.
node_featurizer = CanonicalAtomFeaturizer(
    atom_data_field='h',
)
edge_featurizer = CanonicalBondFeaturizer(
    bond_data_field='h',
)

def _read_labelled_graphs(file_name):
    """Parse a tab-separated ``SMILES<TAB>value`` file into ``(graph, value)`` pairs."""
    dataset = []
    with open(file_name) as f:
        # Iterate the file object directly instead of loading every line with
        # readlines() first.
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip('\n')
            if not line.strip():
                # Blank lines (typically a trailing newline at the end of the
                # file) carry no sample; without this guard they crash on the
                # missing label column.
                continue
            fields = line.split('\t')
            g = smiles_to_bigraph(smiles=fields[0],
                                  node_featurizer=node_featurizer,
                                  edge_featurizer=edge_featurizer,
                                  canonical_atom_order=False)
            if g is None:
                # NOTE(review): smiles_to_bigraph is expected to return None for
                # a SMILES string it cannot parse - confirm for the installed
                # dgllife version. A None entry would only fail later, inside
                # dgl.batch, with no hint about the offending line.
                print('load_data: skipping unparsable SMILES at {}:{}: {!r}'.format(
                    file_name, line_no, fields[0]))
                continue
            dataset.append((g, float(fields[1])))
    return dataset


def load_data(file_name, batch_size=128, shuffle=True, split_ratio=False):
    """Build DataLoader(s) of molecular graphs from a ``SMILES<TAB>value`` file.

    Each non-blank line holds a SMILES string and a numeric label separated by
    a tab. The SMILES is converted to a graph with ``smiles_to_bigraph`` using
    the module-level ``node_featurizer`` / ``edge_featurizer``.

    Args:
        file_name: path of the text file to read.
        batch_size: batch size used for every returned DataLoader.
        shuffle: forwarded as ``shuffle`` to every returned DataLoader.
        split_ratio: ``False`` (default), ``0`` or ``None`` for no split;
            otherwise an iterable of fractions of the full dataset, one per
            leading part.

    Returns:
        Without a split: a single DataLoader over all samples.
        With a split: a list of ``len(split_ratio) + 1`` DataLoaders. The
        samples are shuffled once with ``random.shuffle``, each fraction takes
        ``round(total * fraction)`` samples in order, and the final loader
        receives whatever is left.
    """
    dataset = _read_labelled_graphs(file_name)

    # `== 0` also matches False, so this keeps the default "no split"
    # behaviour; None is accepted as an additional spelling of it.
    if split_ratio is None or split_ratio == 0:
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)

    random.shuffle(dataset)
    length = len(dataset)  # fractions always refer to the full dataset size
    dataloader_list = []
    for fraction in split_ratio:
        num = round(length * fraction)
        dataset_part = dataset[:num]
        dataset = dataset[num:]
        dataloader_list.append(
            DataLoader(dataset_part, batch_size=batch_size, shuffle=shuffle, collate_fn=collate))
    # Everything not claimed by the fractions forms the last part.
    dataloader_list.append(
        DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate))
    return dataloader_list

In [5]:
# Hyper-parameters shared by the cells below.
batch_size = 1024      # mini-batch size passed to load_data for the training loader
epoch_num = 1000       # number of epochs of the first training loop
layer_num = 6          # forwarded to the model as `num_layers`
learning_rate = 3e-4   # Adam learning rate of the first training stage
weight_decay = 3e-4    # Adam weight decay

In [6]:
# Build the acidic-pKa network and move it to the selected device.
# NOTE(review): 74 and 12 are presumably the feature widths produced by
# CanonicalAtomFeaturizer / CanonicalBondFeaturizer - confirm against dgllife.
model = Pka_acidic(
    node_feat_size=74,
    edge_feat_size=12,
    output_size=1,
    num_layers=layer_num,
    graph_feat_size=200,
    dropout=0.2,
)
model = model.to(device)

In [7]:
# Mean-squared-error objective used by the training loops.
loss_func = torch.nn.MSELoss()

# Adam over all model parameters, using the notebook-level learning rate and
# weight decay.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [8]:
# Training loader: uses the notebook-level batch size.
train_loader = load_data(
    './Dataset/acidic_non_B_smiles_BOB.txt',
    batch_size=batch_size,
)

# Loader evaluated as the validation set in the training loops.
val_loader = load_data(
    './Dataset/BOB_smiles.txt',
    batch_size=512,
)


In [9]:
# Print the number of samples behind each loader: training first, then validation.
for loader in (train_loader, val_loader):
    print(len(loader.dataset))

9015
8


In [10]:
# Per-epoch metric histories; one value is appended to each list per epoch.
train_RMSE_lis = []
train_MAE_lis = []
val_RMSE_lis = []
val_MAE_lis = []
# NOTE(review): the containers below are initialised here but never filled in
# this cell; they are kept in case other cells rely on them.
test_RMSE_lis = []
test_MAE_lis = []
test_2_RMSE_lis = []
test_2_MAE_lis = []
cur_rmse_lis = []
min_val_rmse = 100  # only read by the disabled checkpointing code below

file_name = './Logger/try_B.txt'  # only used by the disabled file logging below

header = 'epoch:\ttrain_RMSD:\ttrain_MAE:\tval_RMSD:\tval_MAE:'
print(header)


for epoch in range(epoch_num):
    # ---- optimisation pass over the training set ----
    model.train()
    for bg, label in train_loader:
        bg = bg.to(device)
        # Labels arrive as a flat tensor; give them a trailing axis of size 1
        # before they are compared with the model output.
        label = label.reshape(-1, 1).to(device)
        prediction = model(bg, bg.ndata['h'], bg.edata['h'])
        loss = loss_func(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        # In-place gradient clipping; `clip_grad_norm` without the trailing
        # underscore is the deprecated spelling of this function.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
        optimizer.step()

    # ---- evaluation pass: same metric code for the training and validation loaders ----
    epoch_metrics = []
    with torch.no_grad():
        model.eval()
        for loader in (train_loader, val_loader):
            SSE = 0.0  # running sum of squared errors
            SAE = 0.0  # running sum of absolute errors
            for bg, label in loader:
                bg = bg.to(device)
                prediction = model(bg, bg.ndata['h'], bg.edata['h']).reshape(-1)
                error = prediction - label.to(device).reshape(-1)
                # torch.sum reduces in one call; the Python builtin `sum`
                # would iterate the tensor element by element.
                SSE += torch.sum(error ** 2).item()
                SAE += torch.sum(torch.abs(error)).item()
            N = len(loader.dataset)
            epoch_metrics.append(((SSE / N) ** 0.5, SAE / N))
    (train_RMSE, train_MAE), (val_RMSE, val_MAE) = epoch_metrics

    log = '{}\t{}\t{}\t{}\t{}'.format(epoch,round(train_RMSE,4),round(train_MAE,4),round(val_RMSE,4),round(val_MAE,4))
    print(log)

    train_RMSE_lis.append(train_RMSE)
    train_MAE_lis.append(train_MAE)
    val_RMSE_lis.append(val_RMSE)
    val_MAE_lis.append(val_MAE)

    # Disabled: checkpoint the model with the best validation RMSE (after epoch 300).
    # if val_RMSE <= min_val_rmse:
    #     min_val_rmse = val_RMSE
    #     if epoch >= 300:
    #         torch.save(model.state_dict(), './Trained_model/ramdom_split_acidic_4.pkl')
    #         print('saved')

    # Disabled: append the epoch log line to the log file.
    # with open(file_name,'a+') as f:
    #     f.write(log)
    #     f.write('\n')

epoch:	train_RMSD:	train_MAE:	val_RMSD:	val_MAE:




0	6.987	6.0806	6.9559	6.9181
1	6.405	5.4108	6.2875	6.2456
2	4.6184	3.361	3.985	3.9186


KeyboardInterrupt: 

In [11]:
# Second training stage: replace the optimizer with a new Adam instance whose
# learning rate is one tenth of the initial one (same weight decay). Being a
# new object, it does not carry over the state of the previous optimizer.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=learning_rate / 10,
    weight_decay=weight_decay,
)

In [12]:
# Second training stage: 500 more epochs, numbered directly after the
# `epoch_num` epochs of the first loop (1000..1499 with epoch_num = 1000).
# NOTE(review): the numbering assumes the first loop ran to completion.
for epoch in range(epoch_num, epoch_num + 500):
    # ---- optimisation pass over the training set ----
    model.train()
    for bg, label in train_loader:
        bg = bg.to(device)
        # Labels arrive as a flat tensor; give them a trailing axis of size 1
        # before they are compared with the model output.
        label = label.reshape(-1, 1).to(device)
        prediction = model(bg, bg.ndata['h'], bg.edata['h'])
        loss = loss_func(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        # In-place gradient clipping; `clip_grad_norm` without the trailing
        # underscore is the deprecated spelling of this function.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
        optimizer.step()

    # ---- evaluation pass: same metric code for the training and validation loaders ----
    epoch_metrics = []
    with torch.no_grad():
        model.eval()
        for loader in (train_loader, val_loader):
            SSE = 0.0  # running sum of squared errors
            SAE = 0.0  # running sum of absolute errors
            for bg, label in loader:
                bg = bg.to(device)
                prediction = model(bg, bg.ndata['h'], bg.edata['h']).reshape(-1)
                error = prediction - label.to(device).reshape(-1)
                # torch.sum reduces in one call; the Python builtin `sum`
                # would iterate the tensor element by element.
                SSE += torch.sum(error ** 2).item()
                SAE += torch.sum(torch.abs(error)).item()
            N = len(loader.dataset)
            epoch_metrics.append(((SSE / N) ** 0.5, SAE / N))
    (train_RMSE, train_MAE), (val_RMSE, val_MAE) = epoch_metrics

    log = '{}\t{}\t{}\t{}\t{}'.format(epoch,round(train_RMSE,4),round(train_MAE,4),round(val_RMSE,4),round(val_MAE,4))
    print(log)

    train_RMSE_lis.append(train_RMSE)
    train_MAE_lis.append(train_MAE)
    val_RMSE_lis.append(val_RMSE)
    val_MAE_lis.append(val_MAE)

  # Remove the CWD from sys.path while we load stuff.


1000	0.5646	0.3911	0.628	0.5514
1001	0.5626	0.3887	0.6603	0.5763
1002	0.562	0.3889	0.6765	0.5866
1003	0.559	0.3851	0.6616	0.5767
1004	0.5613	0.3883	0.656	0.5725
1005	0.5585	0.3846	0.6229	0.5457
1006	0.5568	0.3829	0.6251	0.5336
1007	0.5567	0.3827	0.6246	0.5274
1008	0.5568	0.3834	0.6237	0.5331
1009	0.5588	0.3859	0.6465	0.5635
1010	0.557	0.3828	0.6336	0.5481
1011	0.5563	0.3822	0.6062	0.5191
1012	0.5558	0.3818	0.6096	0.5283
1013	0.5545	0.3803	0.6314	0.5509
1014	0.556	0.3809	0.655	0.5588
1015	0.5541	0.3803	0.6158	0.5353
1016	0.5592	0.3859	0.6157	0.5229
1017	0.5543	0.3802	0.6242	0.5369
1018	0.5541	0.3804	0.6126	0.5258
1019	0.557	0.3829	0.6197	0.525
1020	0.5561	0.3823	0.64	0.5483
1021	0.5556	0.3824	0.6233	0.5361
1022	0.5538	0.3811	0.6336	0.5559
1023	0.5533	0.3798	0.6488	0.5663
1024	0.5547	0.3823	0.6794	0.5924
1025	0.5542	0.3815	0.6741	0.588
1026	0.554	0.3804	0.6278	0.5407
1027	0.5541	0.381	0.6203	0.545
1028	0.5541	0.3807	0.672	0.5821
1029	0.5541	0.3811	0.6341	0.5543
1030	0.5531	0.3793	0.6285	

1252	0.5439	0.3755	0.6639	0.5754
1253	0.5412	0.3725	0.626	0.543
1254	0.5467	0.3796	0.6497	0.5679
1255	0.5415	0.3725	0.6488	0.5667
1256	0.5378	0.3679	0.6256	0.5315
1257	0.5381	0.3685	0.6179	0.5373
1258	0.5409	0.3713	0.633	0.5551
1259	0.5401	0.3705	0.6491	0.5699
1260	0.54	0.3697	0.617	0.5366
1261	0.5416	0.3721	0.6334	0.5521
1262	0.5397	0.3697	0.6352	0.5558
1263	0.5392	0.3697	0.6371	0.5553
1264	0.5402	0.3711	0.628	0.5483
1265	0.54	0.3701	0.6224	0.5403
1266	0.5382	0.369	0.6422	0.5588
1267	0.5414	0.3723	0.6267	0.5484
1268	0.5396	0.3699	0.6338	0.5361
1269	0.5404	0.3709	0.6234	0.5414
1270	0.5403	0.3716	0.6409	0.5563
1271	0.5424	0.3732	0.6228	0.5362
1272	0.5378	0.3688	0.6439	0.559
1273	0.5392	0.3703	0.6391	0.5559
1274	0.538	0.3686	0.629	0.5378
1275	0.5388	0.3697	0.6449	0.559
1276	0.5392	0.3708	0.6375	0.5573
1277	0.5382	0.3696	0.6453	0.5633
1278	0.5379	0.3693	0.6433	0.5641
1279	0.5386	0.3687	0.6394	0.5533
1280	0.5375	0.3685	0.6472	0.5604
1281	0.5438	0.3752	0.674	0.5833
1282	0.5447	0.377	0.6681	

In [13]:
# Persist the trained weights (the model's state_dict) to disk.
torch.save(
    model.state_dict(),
    './Trained_model/non_B_try_BOB.pkl',
)

In [None]:
# Does the model usually only start to fit SAMPL6 (written "smalp6" in the original note) late in training?

# This run is very stable, but the error values seem somewhat high?
# Can the error not be brought down to 0.5?


# This one seems reasonably stable.


# First check how well the model generalizes, then decide whether to proactively do clustering.

In [None]:
# Tests that seem unstable are best avoided where possible.