In [1]:
# from google.colab import drive,output,files
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
import torch
import torch.nn as nn
from catboost import CatBoostRegressor
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torchinfo import summary

# np.set_printoptions(threshold=sys.maxsize)
%matplotlib inline
from torch.utils import tensorboard
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard
torch.backends.cudnn.benchmark = True
print(torch.backends.cudnn.version())
print(torch.version.cuda)

8200
11.3


In [2]:
class CNN_1D(nn.Module):
    """CNN1D class from the original paper implemented in Python with PyTorch"""
    def __init__(self):
        super(CNN_1D, self).__init__()
        self.enc0 = nn.Sequential(
            nn.Conv1d(33, 300, 6, 1),
            nn.ReLU(),
            nn.Conv1d(300, 300, 6, 1),
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(337, 600),
            nn.ReLU(),
            nn.Linear(600, 1),
            nn.Identity(),
        )

    def forward(self, x, col):
        x = self.enc0(x)
        # dimensions of all training tensors are 33*256 so using 
        # sum instead of average pooling for each leayer makes
        # much more sense from performance standpoint without any 
        # meaningful sacrifices in overall architecture
        x = x.sum(dim=2,keepdim=True).squeeze()
        x = torch.cat([x, col], dim=1)
        return self.fc(x)

In [3]:
class CNN2D(nn.Module):
    """CNN2D class from the original paper implemented in Python with PyTorch"""
    def __init__(self):
        super(CNN2D, self).__init__()
        self.conv0=nn.Sequential(
            nn.Conv2d(15,50,4,1),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            nn.Conv2d(50,300,4,1),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            nn.Conv2d(300,300,4,1),
            nn.ReLU(),
        )
        self.fc0=nn.Sequential(
            nn.Linear(337,600),
            nn.ReLU(),
        )
        self.fc=nn.Sequential(
            nn.Linear(600,1),
            nn.Identity()
        )
    def forward(self,x,col):
        x=self.conv0(x)
        # dimensions of all training tensors are 15*64*64 or 15*65*65
        # so using sum instead of average pooling for each leayer makes
        # much more sense from performance standpoint without any 
        # meaningful sacrifices in overall architecture
        x=x.sum(dim=[2,3],keepdim=True).squeeze()
        print(x.size())
        x=torch.cat([x,col],dim=1)
        x=self.fc0(x)
        return self.fc(x)

In [4]:
class MLP(nn.Module):
    """MLP class from the original paper implemented in Python with PyTorch"""
    def __init__(self):
        super(MLP, self).__init__()
        self.fc_ds_0=nn.Sequential(
            nn.Linear(1604+167+37,300),
            nn.Tanh(),
            nn.Linear(300,300),
            nn.ReLU()
        )
        self.fc_fp_0=nn.Sequential(
            nn.Linear(1024,1200),
            nn.ReLU(),
        )
        self.fc_res_1=nn.Sequential(
            nn.Linear(1200,1200),
            nn.Dropout(0.05),
            nn.ReLU(),
            nn.Linear(1200,1200),
            nn.Dropout(0.05),
            nn.ReLU()
        )
        self.fc_res_2=nn.Sequential(
            nn.Linear(1200,1200),
            nn.Dropout(0.05),
            nn.ReLU(),
            nn.Linear(1200,1200),
            nn.Dropout(0.05),
            nn.ReLU()
        )
        self.fc_3=nn.Sequential(
            nn.Linear(1500,600),
            nn.ReLU(),
            nn.Linear(600,1),
            nn.Identity()
        )
    
    def forward(self,md,fp,col,maccs):
        x=self.fc_ds_0(torch.cat([md,col,maccs],dim=1))
        y=self.fc_fp_0(fp)
        z=self.fc_res_1(y)+y
        y=self.fc_res_2(z)+z
        x=torch.cat([x,y],dim=1)
        return self.fc_3(x)

In [37]:
class MultiModal(nn.Module):
    """The whole prediction model that contains four blocks: CNN1D, CNN2D, MLP, Gradient Boosting."""

    def __init__(self, cnn1d=CNN_1D(), cnn2d=CNN2D(), mlp=MLP()):
        super(MultiModal, self).__init__()
        self.cnn1d = cnn1d
        self.cnn2d = cnn2d
        self.mlp = mlp
        self.fc = nn.Sequential(
            nn.Linear(4, 1),
            nn.ReLU(),
        )

    def forward(self, smiles, twod, md, maccs, fp, col,boost_pred):
        with torch.no_grad():
            # NN blocks are pre-trained, so they can be freezed
            x1 = self.cnn1d(smiles, col)
            x2 = self.cnn2d(twod, col)
            x3 = self.mlp(md, fp, col,maccs)
            x4 = boost_pred
        x = torch.cat([x1, x2, x3, x4], dim=1)
        return self.fc(x)

In [33]:
device=torch.device("cuda")
cnn1d=CNN_1D()
cnn1d.load_state_dict(torch.load("./cnn1d.pth"))
cnn2d=CNN2D()
cnn2d.load_state_dict(torch.load("./cnn2d.pth"))
mlp=MLP()
mlp.load_state_dict(torch.load("./mlp.pth"))
boost=CatBoostRegressor()
boost.load_model("../models/catbst_42_2000_015_10_clean.model")

<catboost.core.CatBoostRegressor at 0x1848d6a9f10>

In [38]:
mm=MultiModal(cnn1d,cnn2d,mlp)

In [39]:
summary(mm,input_data=(torch.zeros((16,33,256)),
torch.zeros((16,15,65,65)),
torch.zeros((16,1604)),
torch.zeros((16,167)),
torch.zeros((16,1024)),
torch.zeros((16,37)),
torch.zeros((16,1))))

torch.Size([16, 300])


Layer (type:depth-idx)                   Output Shape              Param #
MultiModal                               --                        --
├─CNN_1D: 1-1                            [16, 1]                   --
│    └─Sequential: 2-1                   [16, 300, 246]            --
│    │    └─Conv1d: 3-1                  [16, 300, 251]            59,700
│    │    └─ReLU: 3-2                    [16, 300, 251]            --
│    │    └─Conv1d: 3-3                  [16, 300, 246]            540,300
│    │    └─ReLU: 3-4                    [16, 300, 246]            --
│    └─Sequential: 2-2                   [16, 1]                   --
│    │    └─Linear: 3-5                  [16, 600]                 202,800
│    │    └─ReLU: 3-6                    [16, 600]                 --
│    │    └─Linear: 3-7                  [16, 1]                   601
│    │    └─Identity: 3-8                [16, 1]                   --
├─CNN2D: 1-2                             [16, 1]                   --


In [4]:
class CNN_1D_Dataset(Dataset):
    def __init__(self, dataframe,mask=None):
        def _split_formula(x: str):
            if "i" in x or "l" in x or "r" in x:
                tmp = list(x)
                NML = set(("Si", "Cl", "Br"))
                i = 0
                while i < len(tmp) - 1:
                    t_str = tmp[i] + tmp[i + 1]
                    if t_str in NML:
                        tmp[i] = t_str
                        tmp.pop(i + 1)
                    i += 1
                return np.array(tmp)
            else:
                return np.array(list(x))
        
        def _make_index(unique_formulas):
            d0=dict(zip(unique_formulas,range(len(unique_formulas))))
            d=np.zeros((len(self.str_formulas)))
            for i,val in enumerate(self.str_formulas):
                idx=d0[val]
                d[i]=idx
            return d.astype(int)
        
        def _encode_smiles(line):
            arr = np.zeros((len(self.SYMS), self.max_len))
            pad = (self.max_len-len(line))//2
            for i, sym in enumerate(line):
                arr[self.D_SYM.get(sym), i + pad] = 1
            return arr

        def _encode_column(col_type):
            res = np.zeros((37))
            res[col_type] = 1
            if col_type > 14:
                res[-1] = 1
            return res
        
        if mask is not None:
            df=dataframe.iloc[mask].copy()
        else:
            df=dataframe
        self.SYMS = [
            '#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8',
            '9', '=', 'Br', 'B', 'Cl', 'C', 'F', 'H', 'I', 'N', 'O', 'P', 'Si',
            'S', '[', ']', 'c', 'n', 'o', 's'
        ]
        self.D_SYM = dict(zip(self.SYMS, range(len(self.SYMS))))
        unique_formulas=pd.unique(df["Formula"]).tolist()
        self.str_formulas=df["Formula"].values
        self.index=_make_index(unique_formulas)
        self.max_len = 256
        self.ris = df["RI"].values/1000
        self.cols = df["ColType"].values
        split_formulas=list(map(_split_formula,tqdm(unique_formulas)))
        self.enc_formulas=list(map(_encode_smiles,tqdm(split_formulas)))
        self.enc_columns=np.vstack(list(map(_encode_column,range(37))))
        

    def __getitem__(self, index):
        smiles_encoded = torch.FloatTensor(self.enc_formulas[self.index[index]])
        col_encoded = torch.FloatTensor(self.enc_columns[self.cols[index]])
        return (smiles_encoded, col_encoded, self.ris[index])

    def __len__(self):
        return len(self.ris)
    
def get_train_test_datasets(df_name):
    df = pd.read_csv(df_name)
    df.columns = ["Formula", "RI", "ColType"]
    _,tst_unique=train_test_split(pd.unique(df["Formula"]),test_size=0.2,random_state=42)
    trn_mask,tst_mask=[],[]
    tst_unique=set(tst_unique)
    for i,val in enumerate(df["Formula"].values):
        if val in tst_unique:
            tst_mask.append(i)
        else:
            trn_mask.append(i)
    trn_ds=CNN_1D_Dataset(df,trn_mask)
    tst_ds=CNN_1D_Dataset(df,tst_mask)
    return (trn_ds,tst_ds)

def get_val_dataset(df_name):
    df = pd.read_csv(df_name)
    df.columns = ["Formula", "RI", "ColType"]
    return CNN_1D_Dataset(df)

In [5]:
def train(device,model,optim,crit,epoch_start,epoch_end,train_dl,test_dl,name,lr):
    torch.cuda.empty_cache()
    pbar=tqdm(range(epoch_start,epoch_end))
    b_trn=10
    b_tst=10
    for epoch in pbar:
        model.train()
        train_loss=[]
        for data,col,ris in train_dl:
            optim.zero_grad()
            preds=model(data.to(device),col.to(device))
            loss=crit(preds.squeeze(),ris.to(device))
            loss.backward()
            optim.step()
            train_loss.append(loss.detach().cpu().mean())
        train_loss=np.average(train_loss)
        b_trn=min(b_trn,train_loss)

        model.eval()
        test_loss=[]
        for data,col,ris in test_dl:
            with torch.no_grad():
                preds=model(data.to(device),col.to(device))
                loss=crit(preds.squeeze(),ris.to(device))
                test_loss.append(loss.detach().cpu().mean())
        test_loss=np.average(test_loss)
        b_tst=min(b_tst,test_loss)

        # val_loss=[]
        # with torch.no_grad():
        #     for data,col,ris in val_dl:
        #         res=model(data.to(device),col.to(device))
        #         loss=crit(res.squeeze(),ris.to(device))
        #         val_loss.append(loss.detach().cpu().mean())
        # val_loss=np.average(val_loss)

        # scheduler.step(test_loss)        
        pbar.set_postfix_str(f"{train_loss:.4f}/{b_trn:.4f}\t{test_loss:.4f}/{b_tst:.4f}")
        # tqdm.write(f"[{epoch+1}/{epoch_end}]:  trn: {train_loss:.4f}\ttst: {test_loss:.4f}\tval: {val_loss:.4f}")
        logging.info(f"[{epoch+1}/{epoch_end}]:  trn: {train_loss:.4f}\ttst: {test_loss:.4f}")
        writer.add_scalar("loss/trn",train_loss,epoch)
        writer.add_scalar("loss/tst",test_loss,epoch)
        # writer.add_scalar("loss/val",val_loss,epoch)
        if test_loss<=0.032 and test_loss<=b_tst or (epoch+1)%25==0:
            torch.save(model.state_dict(),f"../Models/CNN1D/{name}/{lr:.2e}_{epoch+1}_model.pth")
            torch.save(optim.state_dict(),f"../Models/CNN1D/{name}/{lr:.2e}_{epoch+1}_optim.pth")

In [6]:
it=0
lr=1e-4
batch_size=128
name=f"cnn1d_deep_v1.1_{batch_size}_center"

device=torch.device("cuda")
model=CNN1D().to(device)
optim=torch.optim.AdamW(model.parameters(),lr=lr)
crit=nn.L1Loss()

if it>0:
    model.load_state_dict(torch.load(f"../Models/CNN1D/{name}/{lr:.2e}_{it}.pth"))
    optim.load_state_dict(torch.load(f"../Models/CNN1D/{name}/{lr:.2e}_{it}.pth"))

if not(os.path.isdir(f"../Models/CNN1D/{name}/")):
        os.mkdir(f"../Models/CNN1D/{name}/")

writer=SummaryWriter(log_dir=f"./logs/CNN1D/{name}_{lr:.2e}",flush_secs=15)
logging.basicConfig(filename=f'./logs/CNN1D/{name}_{lr:.2e}.log', encoding='utf-8', level=logging.DEBUG)
logging.info(f"Model: {model}")
logging.info(f"Params: {optim}")



trn_ds,tst_ds=get_train_test_datasets("../Data/valid_nist.csv")
val_ds=get_val_dataset("../Data/valid_other.csv")

trn_dl=DataLoader(trn_ds, batch_size,pin_memory=True,shuffle=True,num_workers=8)
tst_dl=DataLoader(tst_ds, batch_size,pin_memory=True,num_workers=8)
val_dl=DataLoader(val_ds, batch_size,pin_memory=True,num_workers=8)

train(device,model,optim,crit,0,350,trn_dl,tst_dl,name,lr)

  0%|          | 0/70927 [00:00<?, ?it/s]

  0%|          | 0/70927 [00:00<?, ?it/s]

  0%|          | 0/17732 [00:00<?, ?it/s]

  0%|          | 0/17732 [00:00<?, ?it/s]

  0%|          | 0/3588 [00:00<?, ?it/s]

  0%|          | 0/3588 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

KeyboardInterrupt: 