# Dataset 불러오기

In [2]:
import torch
print(torch.__version__)

1.13.0


In [34]:
import numpy as np
import pandas as pd
import copy
from tqdm import tqdm
from torch_geometric.data import Data, InMemoryDataset, DataLoader
import torch.nn.functional as F

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
train = pd.read_csv("C:/mole/train.csv", index_col=0)  
test = pd.read_csv("C:/mole/test.csv", index_col=0)

#index_col=0(false) : do not use first column as index

In [25]:
print(train)

                                                        SMILES      mu
Id                                                                    
train_0      [H]N1C(=O)O[C@@](C([H])([H])[H])(C([H])([H])C(...  4.7864
train_1      [H]O/C1=C2C(=C(/[H])N1[H])/C([H])([H])C([H])([...  0.7142
train_2      [H][N-]C1=C([H])[C@H](N([H])C([H])([H])[H])C(=...  1.8529
train_3      [H]OC([H])([H])[C@]1(C([H])([H])[H])O[C@@]2([H...  1.9303
train_4      [H]C1=C([H])[C@@]2([H])C([H])([H])[C@@]2([H])[...  0.3210
...                                                        ...     ...
train_10979  [H]N(C(=O)[C@]1([H])N([H])C1([H])[H])C([H])([H...  3.5551
train_10980  [H]O[C@@]([H])(C#N)C([H])([H])C([H])([H])C(=O)...  5.6835
train_10981  [H]O[C@]([H])(C([H])([H])[H])[C@@]1([H])N([H])...  4.3993
train_10982  [H]C(=O)[C@@]([H])(C#CC([H])([H])[H])C([H])([H...  3.8385
train_10983  [H]C1=NC(=O)C(C([H])([H])[H])=C([H])N1C([H])([...  7.4259

[10984 rows x 2 columns]


csv 파일 내에는 해당 molecule의 smiles string과 y(real dipole moment)값이 담겨있음

In [26]:
d = torch.load(f"C:/mole/mol/train/train_1.pt")

In [27]:
print(d)

Data(x=[18, 1], edge_index=[2, 38], edge_attr=[38, 1], y=[1], pos=[18, 3], smiles='[H]O/C1=C2C(=C(/[H])N1[H])/C([H])([H])C([H])([H])C/2([H])[H]')


In [55]:
train_list = list()
test_list = list()

for idx in tqdm(train.index):
    d = torch.load(f"C:/mole/mol/train/{idx}.pt")
    train_list.append(d)
    
for idx in tqdm(test.index):
    d = torch.load(f"C:/mole/mol/test/{idx}.pt")
    test_list.append(d)
    

100%|██████████████████████████████████████████████████████████████████████████| 10984/10984 [00:02<00:00, 4581.02it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1217/1217 [00:00<00:00, 5263.23it/s]


# GNN Model

In [44]:
from torch_geometric.nn import GATConv

class single_layer(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        
        super(single_layer, self).__init__()
        
        self.convs = torch.nn.ModuleList(
            [GATConv(in_channels = input_dim, out_channels = output_dim, dropout = dropout)] +
            [GATConv(in_channels = input_dim, out_channels = output_dim, dropout = dropout) for i in range(num_layers-2)] +
            [GATConv(in_channels = input_dim, out_channels = output_dim, dropout = dropout)]
        )
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm1d(num_features=hidden_dim) for i in range(num_layers-1)
        ])
    
    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()
    
    def forward(self, x, adj_t):
        out = None
        for conv, bn in zip(self.convs[:-1], self.bns):
            x1 = F.relu(bn(conv(x, adj_t)))
            if self.training:     #nn.module class method : true = train, false = evaluation
                x1 = F.dropout(x1, p=self.dropout) # there's no need to drop out when evaluation
            x = x1
        out = self.convs[-1](x, adj_t)
        
        return out

In [53]:
print(f"Training Data Size : {len(train_list)}")

Training Data Size : 10984


In [58]:
from torch.utils.data import random_split
train2_list, valid_list = random_split(train_list, [9000, 1984])

print(f"Training Data Size : {len(train2_list)}")
print(f"Validation Data Size : {len(valid_list)}")
print(f"Test Data Size : {len(test_list)}")

Training Data Size : 9000
Validation Data Size : 1984
Test Data Size : 1217


In [59]:
from torch_geometric.nn import global_mean_pool
from ogb.graphproppred.mol_encoder import AtomEncoder

class totalnet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(totalnet, self).__init__()
        self.layer = single_layer(input_dim, hidden_dim, output_dim, dropout)
        self.header = global_mean_pool
        self.linear = torch.nn.Linear(output_dim, 1)
        self.node_encoder = AtomEncoder(input_dim)
        
        def reset_parameters(self):
            self.layer.reset_parameter()
            self.linear.reset_parameter()
            
        def forward(self, batched_data):
            x, edge_index, pos, batch = batched_data.x, batched_data.edge_index, batched_data.pos, batched_data.batch
            #batch : where data is batched(labelling)
            embed = self.node_encoder(x)
            embed = self.layer(embed, edge_index)
            embed = embed * pos
            features = self.header(embed, batch)
            out = self.linear(features)
            return out

In [60]:
train_loader = DataLoader(train2_list, batch_size=32, shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_list, batch_size=128, shuffle=False, num_workers=0)
test_loader = DataLoader(test_list, batch_size=128, shuffle=False, num_workers=0)

In [62]:
def train(model, device, data_loader, optimizer, loss_fn):
    model.train() #torch.nn.module class mothod
    
    y_true = []
    y_pred = []
    loss = 0
    
    for step, batch in enumerate(tqdm(data_loader, desc="iteration")):
        batch = batch.to(device)
        optimizer.zero_grad()
        loss = loss_fn(out, batch.y.float())
        loss.backward()
        optimizer.step()
        
        if batch.x.shape[0] == 1:
            pass
        else:
            with torch.no_grad():
                pred = model(batch)
            y_true.append(batch.y.detach().cpu())
            y_pred.append(pred.detach().cpu())
    
    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()
    
    #문서작성용 dictionary 변환
    input_dict = {"y_true":y_true,"y_pred":y_pred}
    
    return loss.item()

In [63]:
def eval(model, device, DataLoader):
    model.eval()     #torch.nn.module class mothod
    y_true = []
    y_pred = []
    
    for step, batch in enumerate(loader):
        batch = batch.to(device)
        
        if batch.x.shape[0] == 1:
            pass
        else:
            with torch.no_grad():
                pred = model(batch)
            y_true.append(batch.y.view(pred.shape).detach().cpu())
            y_pred.append(pred.detach().cpu())
    
    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()
    
    input_dict = {"y_true":y_true,"y_pred":y_pred}
    
    print(f"ValidError = {evaluator(y_true, y_pred)}")
    


In [64]:
def evaluator(y_true, y_pred):
    acc = np.mean(np.sqrt((y_true - y_pred)**2))
    
    return acc

In [65]:
import copy

model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=0.001)
loss_fn = torch.nn.MSELoss()
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma= 0.99) 

best_model = None
best_valid_acc = 0

for epoch in range(1, 1 + args["epochs"]):
    print('Training...')
    loss = train_test(model, device, train_test_loader, optimizer, loss_fn)   
    train_result = eval(model, device, valid_loader)
    print(f"epoch = {epoch}")
    scheduler.step()

NameError: name 'model' is not defined