## Instructions to setup a demo run

1. Prepare an input file similar to csv files given in ./data/demo/
2. Define the path to the input file in below cell
3. Define the parameter for prediction
4. Run the cell to get output file
5. The last five columns of the output csv file saved will have the prediction results

In [3]:
parameter = 'kcat' # allowed values: ["kcat", "Km", "Ki"] 
parameter = parameter.lower()

use_cpu = 1 # set to 0 if you have GPU enabled

input_file_path = './demo/batch_kcat.csv'

## Navigate to below cell and click "Run->Run Selected Cell" to get prediction

The result will be printed on the right column

In [3]:
parameter = 'kcat' # allowed values: ["kcat", "Km", "Ki"] 
parameter = parameter.lower()

use_cpu = 1 # set to 0 if you have GPU enabled

input_file_path = './demo/batch_kcat.csv'

In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import Image, display
from rdkit import Chem
from IPython.display import display, Latex, Math

def create_csv_sh(parameter, input_file_path):
    df = pd.read_csv(input_file_path)
    smiles_list = df.SMILES
    seq_list = df.sequence
    smiles_list_new = []
    i=0
    for smi in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smi)
            smi = Chem.MolToSmiles(mol)
        except:
            print(f'Invalid SMILES input in input row {i}')
            print('Correct your input! Exiting..')
            return
        if parameter=='kcat':
            if '.' in smi:
              x = smi.split('.')
              y = sorted(x)
              smi = '.'.join(y)
        smiles_list_new.append(smi)
        i+=1
  
    i=0
    valid_aas = list('ACDEFGHIKLMNPQRSTVWY')
    for seq in seq_list:
      for aa in seq:
        if not aa in valid_aas:
          print(f'Invalid Enzyme sequence input in row {i}!')
          print('Correct your input! Exiting..')
          return
      i+=1

    input_file_new_path = f'{input_file_path[:-4]}_input.csv'
    df['SMILES'] = smiles_list_new
    df.to_csv(input_file_new_path)
    
    f = open(f'predict.sh', 'w')
    f.write(f'''
    TEST_FILE_PREFIX={input_file_new_path[:-4]}
    RECORDS_FILE=${{TEST_FILE_PREFIX}}.json
    CHECKPOINT_DIR=../data/pretrained/production/{parameter}/
    
    python ./scripts/create_pdbrecords.py --data_file ${{TEST_FILE_PREFIX}}.csv --out_file ${{RECORDS_FILE}}
    gzip ${{RECORDS_FILE}}
    python predict.py --test_path ${{TEST_FILE_PREFIX}}.csv --preds_path ${{TEST_FILE_PREFIX}}_output.csv --checkpoint_dir $CHECKPOINT_DIR --uncertainty_method mve --smiles_column SMILES --individual_ensemble_predictions --protein_records_path ${{RECORDS_FILE}}.gz
    ''')
    f.close()
    
    return input_file_new_path[:-4]+'_output.csv'

outfile = create_csv_sh(parameter, input_file_path)

print('Predicting.. This will take a while..\n')

if use_cpu:
    os.system("export PROTEIN_EMBED_USE_CPU=1;./predict.sh")
else:
    os.system("export PROTEIN_EMBED_USE_CPU=0;./predict.sh") #>/dev/null 2>&1

def get_predictions(parameter, outfile):
    df = pd.read_csv(outfile)
    pred_col = []
    pred_logcol = []
    pred_sd_totcol = []
    pred_sd_aleacol = []
    pred_sd_epicol = []
    
    for ii in  range(df.shape[0]):
        unit = 'mM'
        if parameter=='kcat':
            parameter_print = 'k_{cat}'
            parameter_print_log = 'log_{10}(k_{cat})'
            target_col = 'log10kcat_max'
            unit = 's^(-1)'
        elif parameter=='km':
            target_col = 'log10km_mean'
            parameter_print = 'K_{m}'
            parameter_print_log = 'log_{10}(K_{m})'
        else:
            target_col = 'log10ki_mean'
            parameter_print = 'K_{i}'
            parameter_print_log = 'log_{10}(K_{i})'
    
        unc_col = f'{target_col}_mve_uncal_var'
        model_cols = [col for col in df.columns if col.startswith(target_col) and 'model_' in col]
    
        unc = df[unc_col].iloc[ii]
    
        prediction = df[target_col].iloc[ii]
        prediction_linear = np.power(10, prediction)
    
        model_out = df[target_col].iloc[ii]
        model_outs = np.array([df[col].iloc[ii] for col in model_cols])
        # print(model_outs)
        epi_unc = np.var(model_outs)#np.sum(np.power(2, model_outs))/10. - np.power(2, model_out)
        alea_unc = unc - epi_unc
        epi_unc = np.sqrt(epi_unc)
        alea_unc = np.sqrt(alea_unc)
        unc = np.sqrt(unc)
        
        # print(unc-epi_unc-alea_unc)
        # def display_outs(prediction_type, out, alea_output, epi_output, unit):
        pred_col.append(prediction_linear)
        pred_logcol.append(prediction)
        pred_sd_totcol.append(unc)
        pred_sd_aleacol.append(alea_unc)
        pred_sd_epicol.append(epi_unc)

    df[f'Prediction_({unit})'] = pred_col
    df['Prediction_log10'] = pred_logcol
    df['SD_total'] = pred_sd_totcol
    df['SD_aleatoric'] = pred_sd_aleacol
    df['SD_epistemic'] = pred_sd_epicol

    return df

output_final = get_predictions(parameter, outfile)
output_final.to_csv(f'{outfile}')
print('Output saved to', outfile)

Predicting.. This will take a while..



gzip: ./demo/batch_kcat_input.json.gz already exists;	not overwritten
  @autocast(enabled = False)
  @autocast(enabled = False)
  vars(torch.load(path, map_location=lambda storage, loc: storage)["args"]),
0it [00:00, ?it/s]

calculating protein embed only on cpu
Loading training args
Loading models
Setting molecule featurization parameters to default.
Loading data


  return torch.load(str(entry_path))
14it [01:31,  6.55s/it]
100%|██████████| 14/14 [00:00<00:00, 58428.12it/s]
100%|██████████| 14/14 [00:00<00:00, 1643.49it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Validating SMILES
Test size = 14


  state = torch.load(path, map_location=lambda storage, loc: storage)
  state = torch.load(path, map_location=lambda storage, loc: storage)

  0%|          | 0/1 [00:00<?, ?it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


100%|██████████| 1/1 [00:01<00:00,  1.61s/it][A
  state = torch.load(path, map_location=lambda storage, loc: storage)
  state = torch.load(path, map_location=lambda storage, loc: storage)

  0%|          | 0/1 [00:00<?, ?it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


100%|██████████| 1/1 [00:00<00:00,  7.27it/s][A
 20%|██        | 2/10 [00:04<00:15,  1.97s/it][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.55it/s][A
 30%|███       | 3/10 [00:05<00:08,  1.25s/it][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.57it/s][A
 40%|████      | 4/10 [00:05<00:05,  1.17it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.53it/s][A
 50%|█████     | 5/10 [00:05<00:03,  1.55it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.57it/s][A
 60%|██████    | 6/10 [00:05<00:02,  1.95it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.56it/s][A
 70%|███████   | 7/10 [00:06<00:01,  2.32it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.66it/s][A
 80%|████████  | 8/10 [00:06<00:00,  2.66it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.65it/s][A
 90%|█████████ | 9/10 [00:06<00:00,  2.95it/s][A

Creating protein model
MoleculeModel(
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
      )
    )
  )
  (seq_embedder): Embedding(21, 36, padding_idx=20)
  (rotary_embedder): RotaryEmbedding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
  )
  (attentive_pooler): AttentivePooling(
    (linear1): Linear(in_features=1316, out_features=1316, bias=True)
    (tanh): Tanh()
    (linear2): Linear(in_features=1316, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=False)
    (1): Linear(in_featu


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  7.65it/s][A
100%|██████████| 10/10 [00:06<00:00,  1.46it/s]A


Output saved to ./demo/batch_kcat_input_output.csv


  loaded_model = torch.load(trained_model_fp, map_location=device)


  embeds = torch.load('./demo/structures_embeds.pt')
