In [1]:
import os
import re
import math
from pathlib import Path
import multiprocessing as mp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [2]:
DATA_PATH='../data/'
SKEMPI_CSV=DATA_PATH + 'skempi_v2_cleaned.csv'
df = pd.read_csv(SKEMPI_CSV, sep=';')
df.head()
name_wt  = df.iloc[1, :][0]
name_mut = df.iloc[1, :][0] + '_' + df.iloc[1, :][2].replace(',', '_')
# print(name_wt, '\t', name_mut)

WT_FEATURE_PATH=DATA_PATH + 'openmm/'
MUT_FEATURE_PATH=DATA_PATH + 'openmm_mutated/'
MLP_OUTPUT_PATH=DATA_PATH + 'mlp_features.csv'
d_mat_wt  = np.load(WT_FEATURE_PATH +'D_mat/'+name_wt +'.npy')
u_lj_wt   = np.load(WT_FEATURE_PATH +'U_LJ/'+name_wt +'.npy')
u_el_wt   = np.load(WT_FEATURE_PATH +'U_el/'+name_wt +'.npy')
test = np.stack([d_mat_wt, u_lj_wt, u_el_wt])
test_ii = np.stack([test, test])
# test_ii.shape

In [3]:
# check if we are in a conda virtual env
try:
   os.environ["CONDA_DEFAULT_ENV"]
except KeyError:
   print("\tPlease init the conda environment!\n")
   exit(1)

def standardize(arr):
    return (arr - np.mean(arr)) / np.std(arr)

DATA_PATH='../data/'
SKEMPI_CSV=DATA_PATH + 'skempi_v2_cleaned.csv'
WT_FEATURE_PATH=DATA_PATH + 'openmm/'
MUT_FEATURE_PATH=DATA_PATH + 'openmm_mutated/'
MLP_OUTPUT_PATH=DATA_PATH + 'mlp_features.csv'

R = (8.314/4184)  # kcal mol^-1 K^-1

def siamese_preprocessing(pandas_row):
    name_wt  = pandas_row[1].iloc[0]
    name_mut = pandas_row[1].iloc[0] + '_' + pandas_row[1].iloc[2].replace(',', '_')

    # matrix_features = ['D_mat', 'U_LJ', 'U_el']
    # for feature in matrix_features:
    #     pass

    if not Path(MUT_FEATURE_PATH + 'D_mat/' + name_mut + '.npy').exists():
        print(f'ERROR: {name_mut} does not exist.', '\n')
        return None
    if not Path(WT_FEATURE_PATH + 'D_mat/' + name_wt + '.npy').exists():
        print(f'ERROR: {name_wt} does not exist.', '\n')
        return None

    d_mat_wt  = standardize(np.load(WT_FEATURE_PATH + 'D_mat/' + name_wt +'.npy'))
    u_lj_wt   = standardize(np.load(WT_FEATURE_PATH + 'U_LJ/' + name_wt +'.npy'))
    u_el_wt   = standardize(np.load(WT_FEATURE_PATH + 'U_el/' + name_wt +'.npy'))
    
    wt_arr = np.stack([d_mat_wt, u_lj_wt, u_el_wt])
    
    d_mat_mut = standardize(np.load(MUT_FEATURE_PATH + 'D_mat/' + name_mut + '.npy'))
    u_lj_mut  = standardize(np.load(MUT_FEATURE_PATH + 'U_LJ/' + name_mut + '.npy'))
    u_el_mut  = standardize(np.load(MUT_FEATURE_PATH + 'U_el/' + name_mut + '.npy'))
    
    mut_arr = np.stack([d_mat_mut, u_lj_mut, u_el_mut])
    
    # calculate DDG
    A_wt  = pandas_row[1]['Affinity_wt_parsed']
    A_mut = pandas_row[1]['Affinity_mut_parsed']

    temp = float(re.match("[0-9]*", pandas_row[1]['Temperature'])[0])
    if math.isnan(temp):
        raise ValueError('temperature should not be NaN.')
    
    DG_wt = R * temp * np.log(A_wt)
    DG_mut = R * temp * np.log(A_mut)
    DDG = DG_mut - DG_wt
    
    # debug print
    print(f'parsed {name_mut}', '\n')

    return (np.stack([wt_arr, mut_arr]), DDG)

if __name__ == '__main__':
    df = pd.read_csv(SKEMPI_CSV, sep=';')
    df = df.iloc[:1000, :]
    input_list = []
    target_list = []
    
    n_non_existant = 0
    for data in mp.Pool(5).imap_unordered(siamese_preprocessing, df.iterrows()):
        if data is None:
            n_non_existant += 1
        else:
            input_list.append(data[0])
            target_list.append(data[1])

    print(f'{n_non_existant} PDBs do not have features.')
#     print(df_out)
#     df_out.to_csv(MLP_OUTPUT_PATH)

parsed 1CSE_E_I_LI38Dparsed 1CSE_E_I_LI38P  parsed 1CSE_E_I_LI38S


 parsed 1CSE_E_I_LI38Gparsed 1CSE_E_I_LI38I

  




parsed 1CSE_E_I_LI38E 

parsed 1ACB_E_I_LI38G 
parsed 1ACB_E_I_LI38S 

parsed 1ACB_E_I_LI38P
 
parsed 1ACB_E_I_LI38I 


parsed 1ACB_E_I_LI38D 

parsed 1ACB_E_I_LI38Eparsed 1SBN_E_I_RI38K  
parsed 1SIB_E_I_KI46R

 



parsed 1TM1_E_I_YI42A parsed 1TM1_E_I_YI42G 


parsed 1TM1_E_I_RI46A 
parsed 1TM1_E_I_RI48A 


parsed 1TM1_E_I_RI48C 

parsed 1TM1_E_I_RI48A_RI46A 

parsed 1TM1_E_I_TI39Dparsed 1TM1_E_I_TI39A  

parsed 1TM1_E_I_EI41A 



parsed 1TM1_E_I_TI39A_EI41A 
parsed 1TM1_E_I_TI39D_EI41A
 
parsed 1TM1_E_I_VI51A
 parsed 1Y1K_E_I_AI39T 

parsed 1Y33_E_I_PI39T 



parsed 1Y34_E_I_AI41E parsed 1Y3B_E_I_SI41E
 


parsed 1Y4A_E_I_SI40E_RI39M 

parsed 1Y3C_E_I_AI43R 

parsed 1Y48_E_I_AI46R 

parsed 1Y3D_E_I_AI48R parsed 1TM1_E_I_TI39A 



parsed 1TM1_E_I_TI39P 

 parsed 1TM1_E_I_EI41A

parsed 1TM1_E_I_EI41S 

parsed 1TM1_E_I_EI41S_MI40R parsed 1TM1_E_I_RI43A
 

parsed 1TM1

parsed 1DVF_AB_CD_WB52A  

parsed 1DVF_AB_CD_DB54A
 


parsed 1DVF_AB_CD_NB56A 


parsed 1DVF_AB_CD_DB58A 
parsed 1DVF_AB_CD_EB98AERROR: 1DVF_AB_CD_YB101A does not exist.parsed 1DVF_AB_CD_RB99A  


 


parsed 1DVF_AB_CD_DB100A 

parsed 1DVF_AB_CD_YB101F 
parsed 1DVF_AB_CD_EB98A
 

parsed 1DVF_AB_CD_DB54A 

parsed 1DVF_AB_CD_YA49A parsed 1DVF_AB_CD_DB58A

 

parsed 1DVF_AB_CD_YA32A 
parsed 1DVF_AB_CD_NB56A
 parsed 1DVF_AB_CD_WB52A

parsed 1DVF_AB_CD_YD102Aparsed 1DVF_AB_CD_DB100A 
 
 


parsed 1DVF_AB_CD_YC49A
 
parsed 1DVF_AB_CD_QD104A 


parsed 1DVF_AB_CD_ND55A 

parsed 1DVF_AB_CD_RD106A 

parsed 1DVF_AB_CD_KD30A 

parsed 1DVF_AB_CD_HD33A 
parsed 1DVF_AB_CD_DD52A 


parsed 1DVF_AB_CD_ID101A 
parsed 1DVF_AB_CD_EB98A_YD102A
 


parsed 1DVF_AB_CD_DB54A_YC49A 
parsed 1DVF_AB_CD_DB58A_QD104A 

parsed 1DVF_AB_CD_YA49A_ND55A 

parsed 1DVF_AB_CD_YA32A_RD106A 

parsed 1DVF_AB_CD_NB56A_QD104A 

parsed 1DVF_AB_CD_WB52A_QD104A 
parsed 1DVF_AB_CD_DB100A_HD33A
 parsed 1DVF_AB_CD_DB100A_DD52A parsed

parsed 2JEL_LH_P_VP6F 

 parsed 2JEL_LH_P_TP7S

 

parsed 2JEL_LH_P_PP11E 

parsed 2JEL_LH_P_NP12D parsed 2JEL_LH_P_RP17G

 parsed 2JEL_LH_P_KP24E
 

parsed 2JEL_LH_P_RP17K 

parsed 2JEL_LH_P_KP27E
 parsed 2JEL_LH_P_TP34N 

parsed 2JEL_LH_P_TP34Q 



parsed 2JEL_LH_P_TP36Q parsed 2JEL_LH_P_NP38T 



parsed 2JEL_LH_P_SP43Cparsed 2JEL_LH_P_SP41C  



parsed 2JEL_LH_P_SP46C 

parsed 2JEL_LH_P_QP57E 
parsed 2JEL_LH_P_TP62N
 

parsed 2JEL_LH_P_TP62A 

parsed 2JEL_LH_P_SP64T 

parsed 2JEL_LH_P_EP66K 

parsed 2JEL_LH_P_EP68A 

parsed 2JEL_LH_P_DP69E 

parsed 2JEL_LH_P_EP70K 

parsed 2JEL_LH_P_EP70A parsed 2JEL_LH_P_QP71E 



parsed 2JEL_LH_P_KP72R 

parsed 2JEL_LH_P_KP72E 
parsed 2JEL_LH_P_EP75R
 parsed 2JEL_LH_P_HP76D 



parsed 2JEL_LH_P_KP79Eparsed 2JEL_LH_P_HP76A 
parsed 2JEL_LH_P_AP82S
 

parsed 2JEL_LH_P_EP83A 
 

parsed 2JEL_LH_P_EP85Q
 

parsed 2JEL_LH_P_EP85D 

parsed 2JEL_LH_P_EP85A 

parsed 2JEL_LH_P_EP85K parsed 2OOB_A_B_KA7E 

 parsed 2OOB_A_B_AA6S_KA7E



parsed 2OOB_A_B_AA9E pa

parsed 1JTG_A_B_SA209A_DB49Aparsed 1JTG_A_B_SA105A_DB49A  

parsed 1JTG_A_B_SA209A_SA105A_DB49A 



parsed 1JTG_A_B_RA217A_DB49A parsed 1JTG_A_B_KA208A_DB49A 


parsed 1JTG_A_B_RA217A_SA209A_DB49A 


parsed 1JTG_A_B_RA217A_SA105A_DB49A parsed 1JTG_A_B_SA209A_KA208A_DB49A 

parsed 1JTG_A_B_KA208A_SA105A_DB49A 

parsed 1JTG_A_B_RA217A_SA209A_SA105A_DB49A

 parsed 1JTG_A_B_SA209A_SA105A_KA208A_DB49A 

parsed 1JTG_A_B_RA217A_KA208A_DB49A 



parsed 1JTG_A_B_RA217A_SA209A_KA208A_DB49A 
parsed 1JTG_A_B_KA208A_SA105A_RA217A_DB49A 


parsed 1JTG_A_B_SA209A 
parsed 1JTG_A_B_KA208A_SA209A_SA105A_RA217A_DB49A
parsed 1JTG_A_B_SA105Aparsed 1JTG_A_B_SA209A_SA105A 
 


parsed 1JTG_A_B_RA217A  



parsed 1JTG_A_B_KA208A 

parsed 1JTG_A_B_RA217A_SA209A 
parsed 1JTG_A_B_RA217A_SA105A 


parsed 1JTG_A_B_SA209A_KA208A parsed 1JTG_A_B_KA208A_SA105A
 


parsed 1JTG_A_B_RA217A_SA209A_SA105A parsed 1JTG_A_B_SA209A_SA105A_KA208A
parsed 1JTG_A_B_RA217A_KA208A 

parsed 1JTG_A_B_RA217A_SA209A_KA208A
  



 parsed

In [4]:
def gen_random_data(n_samples, channels, nx, ny):
    data = np.random.randn(n_samples, 2, channels, nx, ny).astype(np.float32)
    outputs = np.random.uniform(0, 100, n_samples).astype(np.float32)
    
    x_train, x_test, y_train, y_test = train_test_split(data, outputs, test_size=0.2, random_state=1)
    
    return x_train, x_test, y_train, y_test
    
def gen_loaders(x_tr, x_te, y_tr, y_te, batch_size):    
    x_train_tensor, x_test_tensor = torch.from_numpy(x_tr), torch.from_numpy(x_te)
    y_train_tensor, y_test_tensor = torch.from_numpy(y_tr), torch.from_numpy(y_te)
    
    train_data = TensorDataset(x_train_tensor, y_train_tensor)
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=False)
    
    test_data = TensorDataset(x_test_tensor, y_test_tensor)
    test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, test_loader

In [5]:
input_arr = np.array(input_list).astype(np.float32)
target_arr = np.array(target_list).astype(np.float32)[...,np.newaxis]
x_tr, x_te, y_tr, y_te = train_test_split(input_arr, target_arr, test_size=0.2, random_state=1)
train_data, test_data = gen_loaders(x_tr, x_te, y_tr, y_te, 32)

# x_train, x_test, y_train, y_test = gen_random_data(1000, 3, 256, 256)

# train_set, test_set = gen_loaders(x_train, x_test, y_train, y_test, 100)

# print('train_batches:')
# for batch_x_tr, batch_y_tr in train_set:
#     print(batch_x_tr[:,0].shape, batch_x_tr[:,1].shape, batch_y_tr.shape)

# print('\n', 'test_batches:')
# for batch_x_te, batch_y_te in test_set:
#     print(batch_x_te[:,0].shape, batch_x_te[:,1].shape, batch_y_te.shape)

In [6]:
class HydraNet(nn.Module):
    def __init__(self):
        super().__init__()
        # feature map output: [(W - K + 2P) / S] + 1
        # include batch norm, dropout (remember model.train() and model.eval() !!!)
        self.cnn1 = nn.Sequential(nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, stride=2), #output: (256-3)/2 + 1 = 
                                 nn.ReLU(),
                                 nn.MaxPool2d(3, stride=2),
                                 
                                 nn.Conv2d(in_channels=8, out_channels=64, kernel_size=3, stride=2),
                                 nn.ReLU(),
                                 nn.MaxPool2d(3, stride=2),
                                 
                                 nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3),
                                 nn.ReLU(),
                                 nn.MaxPool2d(3, stride=2),
                                 nn.Dropout2d(p=0.5),
                                 
                                 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=2),
                                 nn.ReLU(),
                                 nn.MaxPool2d(2)
                                )
        
        self.cnn2 = nn.Sequential(nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, stride=2), #output: (256-3)/2 + 1 = 
                                 nn.ReLU(),
                                 nn.MaxPool2d(3, stride=2),
                                 
                                 nn.Conv2d(in_channels=8, out_channels=64, kernel_size=3, stride=2),
                                 nn.ReLU(),
                                 nn.MaxPool2d(3, stride=2),
                                 
                                 nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3),
                                 nn.ReLU(),
                                 nn.MaxPool2d(3, stride=2),
                                 nn.Dropout2d(p=0.5),
                                 
                                 nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=2),
                                 nn.ReLU(),
                                 nn.MaxPool2d(2)
                                )
        
        # each output of self.cnn will have dimension 1024, so when concatenated we have 2048
        self.fc = nn.Sequential(nn.Linear(2048, 512),
                                nn.ReLU(),
                                nn.Linear(512, 128),
                                nn.ReLU(),
                                nn.Linear(128, 64),
                                nn.ReLU(),
                                nn.Linear(64, 32),
                                nn.ReLU(),
                                nn.Linear(32, 1))
        
    def forward(self, x1):
        output1 = self.cnn1(x1[:, 0])
        output2 = output1.view(output1.size()[0], -1)
        
        output3 = self.cnn2(x1[:, 1])
        output4 = output3.view(output3.size()[0], -1)
        
        output5 = torch.cat((output2, output4), 1)
        
        return self.fc(output5)

In [7]:
model = HydraNet()

# for x_batch, _ in test:
#     out = model.forward(x_batch)
#     print(out.shape)

In [8]:
def train(model, criterion, dataset_train, dataset_test, optimizer, num_epochs):
    """
    @param model: torch.nn.Module
    @param criterion: torch.nn.modules.loss._Loss
    @param dataset_train: torch.utils.data.DataLoader
    @param dataset_test: torch.utils.data.DataLoader
    @param optimizer: torch.optim.Optimizer
    @param num_epochs: int
    """
    print("Starting training")
    for epoch in range(num_epochs):
        # Train an epoch
        model.train()
        for batch_x, batch_y in dataset_train:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Evaluate the network (forward pass)
            # TODO: insert your code here
            logits = model.forward(batch_x)
            loss = criterion(logits, batch_y)

            # Compute the gradient
            # TODO: insert your code here
            optimizer.zero_grad()
            loss.backward()

            # Update the parameters of the model with a gradient step
            # TODO: insert your code here
            optimizer.step()

        # Test the quality on the test set
        model.eval()
        mse_test = []
        for batch_x, batch_y in dataset_test:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Evaluate the network (forward pass)
            prediction = model(batch_x)
            mse_test.append(criterion(prediction, batch_y))

        print("Epoch {} | Test loss: {:.5f}".format(epoch, sum(mse_test).item()/len(mse_test)))

In [9]:
num_epochs = 100
learning_rate = 1e-3

# If a GPU is available (should be on Colab, we will use it)
if not torch.cuda.is_available():
  raise Exception("Things will go much quicker if you enable a GPU in Colab under 'Runtime / Change Runtime Type'")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train the logistic regression model with the Adam optimizer
criterion = torch.nn.MSELoss() # MSE loss for regression
model_hydra = HydraNet().to(device)

optimizer = torch.optim.Adam(model_hydra.parameters(), lr=learning_rate)
train(model_hydra, criterion, train_data, test_data, optimizer, num_epochs)

Starting training
Epoch 0 | Test loss: 4.23765
Epoch 1 | Test loss: 3.80282
Epoch 2 | Test loss: 3.50356
Epoch 3 | Test loss: 3.21136
Epoch 4 | Test loss: 3.12800
Epoch 5 | Test loss: 3.05322
Epoch 6 | Test loss: 3.01200
Epoch 7 | Test loss: 2.96583
Epoch 8 | Test loss: 2.86648
Epoch 9 | Test loss: 2.84083
Epoch 10 | Test loss: 2.95896
Epoch 11 | Test loss: 3.05970
Epoch 12 | Test loss: 2.97026
Epoch 13 | Test loss: 3.57032
Epoch 14 | Test loss: 2.92239
Epoch 15 | Test loss: 2.89883
Epoch 16 | Test loss: 3.34669
Epoch 17 | Test loss: 2.78910
Epoch 18 | Test loss: 2.51531
Epoch 19 | Test loss: 2.33255
Epoch 20 | Test loss: 2.49157
Epoch 21 | Test loss: 2.91964
Epoch 22 | Test loss: 2.72885
Epoch 23 | Test loss: 2.81129
Epoch 24 | Test loss: 3.81342
Epoch 25 | Test loss: 3.19559
Epoch 26 | Test loss: 2.88820
Epoch 27 | Test loss: 2.53962
Epoch 28 | Test loss: 2.41861
Epoch 29 | Test loss: 4.18107
Epoch 30 | Test loss: 3.17623
Epoch 31 | Test loss: 2.41013
Epoch 32 | Test loss: 2.80116
Ep

In [10]:
model_hydra(torch.from_numpy(x_test).cuda())

NameError: name 'x_test' is not defined