# molecule docking using Gnina

#### Training data:
- Perturbagens with known structures and protein targets
- Gene expression changes of cell lines when treated with certain perturbagen

#### Output:
- 2D graph structures of new molecules

```Can we compare the 

In [1]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import os
from collections import defaultdict
import re

In [2]:
with open("/data/ongh0068/l1000/l1000_biaae/protein_target_to_l1000_smiles.pkl", 'rb') as f:
    meta_data = pickle.load(f)

inv_meta_data = {}
for k,v in meta_data.items():
    for x in v:
        inv_meta_data.setdefault(x, []).append(k)
        
inv_meta_data

{'CS(=O)c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc3)[nH]2)cc1': ['AKT1',
  'AKT2',
  'AURKB',
  'EGFR',
  'MTOR'],
 'Cn1cc(C2=C(c3cn(CCCSC(=N)N)c4ccccc34)C(=O)NC2=O)c2ccccc21': ['AKT1',
  'SMAD3'],
 'CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1': ['AKT1', 'AKT2', 'AURKB'],
 'O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12': ['AKT1',
  'AKT2',
  'AURKB',
  'MTOR',
  'PIK3CA',
  'TP53'],
 'O=C1NC(=O)C(c2c[nH]c3ccccc23)=C1c1c[nH]c2ccccc12': ['AKT1',
  'AKT2',
  'AURKB',
  'MTOR'],
 'COc1cc2ncnc(Nc3ccc(Oc4ccccc4)cc3)c2cc1OC': ['AKT1', 'AKT2', 'MTOR'],
 'O=C(c1ccccc1)c1ccc(N2CCOCC2)cc1O': ['AKT1', 'AKT2', 'AURKB', 'MTOR'],
 'C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1': ['AKT1',
  'AKT2',
  'AURKB',
  'EGFR'],
 'O=C(Nc1cccc(Cl)c1)Nc1ncc(CCNc2ncnc3ccsc23)s1': ['AKT1', 'AKT2', 'EGFR'],
 'CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(O)C1': ['AKT1',
  'AKT2',
  'AURKB',
  'EGFR'],
 'O=[N+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc3)[nH]2)cc1': ['AKT1',
  'AKT2',
  'AURKB',
  'MTOR',
  'TP53'],
 'Cn1cc(C=C2C(=O)Nc3c

In [4]:
# models = ['aae', 'vae', 'wae']
models = ['aae']

# proteins = ['AKT1', 'AKT2', 'AURKB', 'EGFR', 'PIK3CA', 'SMAD3', 'HDAC1', 'TP53', 'MTOR']
proteins = ['AKT1']
for model in models:
    for pr in proteins:
        log_path = os.path.join('logs/' + model, pr)
        out_path = os.path.join('poses/' + model, pr)
        os.makedirs(log_path, exist_ok=True)
        os.makedirs(out_path, exist_ok=True)

In [8]:
# test_data = {k: inv_meta_data[k] for k in list(inv_meta_data.keys())[:5]}
# test_data

{'CS(=O)c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc3)[nH]2)cc1': ['AKT1',
  'AKT2',
  'AURKB',
  'EGFR',
  'MTOR'],
 'Cn1cc(C2=C(c3cn(CCCSC(=N)N)c4ccccc34)C(=O)NC2=O)c2ccccc21': ['AKT1',
  'SMAD3'],
 'CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1': ['AKT1', 'AKT2', 'AURKB'],
 'O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12': ['AKT1',
  'AKT2',
  'AURKB',
  'MTOR',
  'PIK3CA',
  'TP53'],
 'O=C1NC(=O)C(c2c[nH]c3ccccc23)=C1c1c[nH]c2ccccc12': ['AKT1',
  'AKT2',
  'AURKB',
  'MTOR']}

In [13]:
# for smi in test_data.keys():
#     escape_smi = re.sub(r'([()])', r'\\\1', smi)
#     print(escape_smi)

CS\(=O\)c1ccc\(-c2nc\(-c3ccc\(F\)cc3\)c\(-c3ccncc3\)[nH]2\)cc1
Cn1cc\(C2=C\(c3cn\(CCCSC\(=N\)N\)c4ccccc34\)C\(=O\)NC2=O\)c2ccccc21
CCC\(CO\)Nc1nc\(NCc2ccccc2\)c2ncn\(C\(C\)C\)c2n1
O=c1cc\(N2CCOCC2\)oc2c\(-c3ccccc3\)cccc12
O=C1NC\(=O\)C\(c2c[nH]c3ccccc23\)=C1c1c[nH]c2ccccc12


In [5]:
for model in models:
    for ref_smi, proteins in tqdm(inv_meta_data.items()):
        escape_smi = re.sub(r'([()])', r'\\\1', ref_smi)
        for pr in proteins:
            protein_file = 'protein/' + pr + '.pdb'
            # ligand_file = os.path.join('ligand/' + model, escape_smi + '.sdf')
            ligand_path = os.path.join('ligand/' + model, ref_smi)
            for filename in os.listdir(ligand_path):
                ligand_file = os.path.join(ligand_path, filename)
            log_file = os.path.join('logs/' + model, pr, escape_smi + '.log')
            out_file = os.path.join('poses/' + model, pr, escape_smi + '.sdf.gz')
            !gnina -r {protein_file} -l {ligand_file} --autobox_ligand {protein_file} -o {out_file} --exhaustiveness 16 --log {log_file} -q

  0%|                                                                                                                                                                              | 0/238 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'ligand/aae/CS(=O)c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc3)[nH]2)cc1'

In [11]:
# !gnina -r protein/AKT1.pdb -l ligand/wae/CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1.sdf --autobox_ligand protein/AKT1.pdb -o poses/wae/AKT1/CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1.sdf.gz --exhaustiveness 16 --log logs/wae/AKT1/CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1.log

/bin/bash: -c: line 0: syntax error near unexpected token `('
/bin/bash: -c: line 0: `gnina -r protein/AKT1.pdb -l ligand/wae/CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1.sdf --autobox_ligand protein/AKT1.pdb -o poses/wae/AKT1/CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1.sdf.gz --exhaustiveness 16 --log logs/wae/AKT1/CCC(CO)Nc1nc(NCc2ccccc2)c2ncn(C(C)C)c2n1.log'
