Libraries

In [1]:
import psycopg2
from psycopg2.extensions import AsIs

import glob
import pandas as pd
import io

import time

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import warnings
warnings.filterwarnings('ignore')

from secret import db_info

DB Connection

In [2]:
# Schema that holds the tables this notebook reads and writes.
schema_name = 'sc'

# Connect with the settings kept in `db_info`. Autocommit is switched on, so
# every statement takes effect immediately without an explicit commit.
conn = psycopg2.connect(**db_info)
conn.autocommit = True

# A single cursor is reused by the cells below; unqualified table names are
# resolved against `schema_name` from here on.
search_path_sql = f'set search_path to {schema_name}'
curs = conn.cursor()
curs.execute(search_path_sql)

SureChEMBL Files

In [3]:
# Folder that holds the downloaded data files (matched by '*.txt.gz').
data_folder = './sure_chembl_data/'

# glob returns paths in arbitrary order. Sort them so that positional slices of
# `files` (e.g. files[2:4]) select the same files on every run and machine.
files = sorted(glob.glob(data_folder + '*.txt.gz'))

# Column labels applied, in order, to every loaded file (the files are read
# without a header row).
colnames = ['id', 'smiles', 'inchi_key', 'corpus_freq', 'patent_id',
            'pub_date', 'field_type', 'field_freq']

# Names of the database tables that are populated.
compound = 'compound'
patent = 'patent'
field_freq = 'field_freq'

Populating DB

In [16]:
# Load a batch of data files into the database. For every file:
#   1. read it and keep only rows that belong to US patents,
#   2. upsert each distinct compound (canonical SMILES + Morgan fingerprint),
#   3. upsert the patents and translate patent numbers into their DB ids,
#   4. upsert the per-compound / per-patent field frequencies.
# NOTE: only the files at positions 2 and 3 of `files` are processed here;
# change the slice to ingest a different batch.
for f_idx, f in enumerate(files[2:4]):
    # read one file (tab-separated, no header row) and label its columns
    df = pd.read_csv(f, header=None, sep='\t')
    df.columns = colnames

    # filter the file: these two columns are not loaded into the DB
    df = df.drop(['inchi_key', 'corpus_freq'], axis=1)

    # keep only the part of the compound id that follows the 'SCHEMBL' prefix
    df['id'] = df.id.apply(lambda x: x.split('SCHEMBL')[1])

    # remove the dashes from the patent number and keep US patents only
    df['patent_id'] = df.patent_id.apply(lambda x: ''.join(x.split('-')))
    df = df[df.patent_id.str[0:2] == 'US']

    # upserting compounds
    start = time.time()

    # ids of the compounds that were actually written to the DB
    valid_compounds = set()
    unique_compounds = df[['id', 'smiles']].drop_duplicates('id').values
    for i_idx, (comp_id, smi) in enumerate(unique_compounds):
        if i_idx % 10000 == 0 and i_idx > 0:
            print(f'Processed {i_idx} compounds.')

        # a missing SMILES is read by pandas as NaN (a float); skip it like any
        # other unparsable structure instead of crashing inside RDKit
        if not isinstance(smi, str): continue

        mol = Chem.MolFromSmiles(smi)
        if not mol: continue

        can_smi = Chem.MolToSmiles(mol, canonical=True)
        if not can_smi: continue

        # 1024-bit Morgan fingerprint (radius 2), serialised as binary text
        b_mfp = DataStructs.BitVectToBinaryText( AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) )

        try:
            curs.execute('insert into %(tbl)s values (%(id)s, %(smi)s::mol, bfp_from_binary_text(%(mfp)s)) \
                on conflict (id) do update set smiles = excluded.smiles, mfp = excluded.mfp',
                {'tbl': AsIs(compound), 'id': comp_id, 'smi': can_smi, 'mfp': b_mfp})

            valid_compounds.add(comp_id)
        except Exception as e:
            # best effort: report a compound the database rejects and carry on
            print(e)

        # release per-compound objects before the next iteration
        mol = can_smi = b_mfp = None

    unique_compounds = None

    # rows of compounds that could not be stored must not reach field_freq
    df = df[df.id.isin(valid_compounds)]
    valid_compounds = None

    print(f'Compounds took {time.time() - start} seconds.')
    # end upserting compounds

    # upserting patents and field frequencies
    start = time.time()

    # patents
    # The staging table is a regular table: remove any copy left behind by an
    # earlier failed run, and always drop it again, even when the load fails.
    curs.execute(f'drop table if exists tmp_{patent}')
    curs.execute(f'create table tmp_{patent} (like {patent} including defaults)')
    try:
        # write the CSV straight into the buffer (no intermediate string copy)
        buf = io.StringIO()
        df[['patent_id', 'pub_date']].drop_duplicates('patent_id').to_csv(buf, index=False, header=True, sep=',')
        buf.seek(0)

        curs.copy_expert(sql=f"copy tmp_{patent} (num, pub_date) from stdin with csv header delimiter as ','", file=buf)
        buf = None

        curs.execute(f'insert into {patent} select * from tmp_{patent} on conflict (num) \
                 do update set pub_date = excluded.pub_date returning num, id')
        # patent number -> patent id, for inserted and updated rows alike
        patent_map = dict(curs.fetchall())
    finally:
        curs.execute(f'drop table if exists tmp_{patent}')

    # replace the patent number by the patent's database id
    df['patent_id'] = df.patent_id.map(patent_map)
    patent_map = None

    # field freqs
    curs.execute(f'drop table if exists tmp_{field_freq}')
    curs.execute(f'create table tmp_{field_freq} (like {field_freq} including defaults)')
    try:
        buf = io.StringIO()
        df[['id', 'patent_id', 'field_type', 'field_freq']].drop_duplicates(['id', 'patent_id', 'field_type']).to_csv(buf, index=False, header=True, sep=',')
        buf.seek(0)

        # no column list: the CSV columns are matched to the table by position
        curs.copy_expert(sql=f"copy tmp_{field_freq} from stdin with csv header delimiter as ','", file=buf)
        buf = None

        curs.execute(f'insert into {field_freq} select * from tmp_{field_freq} on conflict (compound_id, patent_id, field_id) \
                 do update set freq = excluded.freq')
    finally:
        curs.execute(f'drop table if exists tmp_{field_freq}')

    print(f'Patents, frequencies took {time.time() - start} seconds.')
    # end upserting patents and field frequencies

    df = None

    print(f'Processed file number {f_idx + 1}.')

Processed 10000 compounds.
Processed 20000 compounds.
could not create molecule from SMILES 'CCCCCC(=O)[O-].CCCCCC[Mg+2]'
LINE 1: insert into compound values ('154273', 'CCCCCC(=O)[O-].CCCCC...
                                               ^

could not create molecule from SMILES 'CCCCCC(=O)[O-].[Mg+2]c1ccccc1'
LINE 1: insert into compound values ('149901', 'CCCCCC(=O)[O-].[Mg+2...
                                               ^

Processed 30000 compounds.
Processed 40000 compounds.
Processed 50000 compounds.
Processed 60000 compounds.
Processed 70000 compounds.
Processed 80000 compounds.
Processed 90000 compounds.
Processed 100000 compounds.
Processed 110000 compounds.
Processed 120000 compounds.
Processed 130000 compounds.
Processed 140000 compounds.
Processed 150000 compounds.
Processed 160000 compounds.
Processed 170000 compounds.
Processed 180000 compounds.
Processed 190000 compounds.
Processed 200000 compounds.
Processed 210000 compounds.
Processed 220000 compounds.
Processed 2

Testing Similarity and Substructure Searching

In [6]:
# Substructure search: list every (compound, patent) pair whose stored molecule
# matches the query SMILES `s` through the `@>` operator.
s = 'CC(=O)CCC=C(C)C'

substructure_sql = 'select c.id, mol_to_smiles(c.smiles), p.num from compound c, patent p, field_freq f \
             where c.id = f.compound_id and p.id = f.patent_id \
             and c.smiles@>%s'
curs.execute(substructure_sql, (s, ))

# rows of (compound id, SMILES, patent number)
r = curs.fetchall()

In [8]:
# Show five randomly chosen hits (columns: compound id, SMILES, patent number).
pd.DataFrame(data=r).sample(n=5)

Unnamed: 0,0,1,2
5753,8740,CN(C)c1ccc([C@H]2C[C@]3(C)[C@@H](CC[C@]3(O)CCC...,US20150210758A1
458,132821,CC1=C/C(O)CC(=O)Cc2nc(co2)C(=O)N2CCC=C2C(=O)OC...,US20150159148A1
1095,4530,COC[C@H]1OC(=O)c2coc3c2[C@@]1(C)C1=C(C3=O)C2CC...,US20150133661A1
1181,112855,CC(=O)CC/C=C(\C)CCC=C(C)C,US20150133431A1
5252,61769,CC1=CC2=C(C)C3(CC3)C(C)(O)C(=O)C2=C1,US9127012B2


In [4]:
# Similarity search: fingerprint the query SMILES the same way the stored
# compounds were fingerprinted (Morgan, radius 2, 1024 bits) and list every
# (compound, patent) pair whose fingerprint matches it through the `%` operator.
s = 'CC(=O)CCC=C(C)C'
m = Chem.MolFromSmiles(s)
query_bits = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)
b_mfp = DataStructs.BitVectToBinaryText(query_bits)

# threshold setting applied to the `%` comparison in this session
curs.execute('set rdkit.tanimoto_threshold=0.50')

# `%%` is an escaped literal `%`, because the driver uses `%s` for parameters
similarity_sql = 'select c.id, mol_to_smiles(c.smiles), p.num from compound c, patent p, field_freq f \
             where c.id = f.compound_id and p.id = f.patent_id \
             and c.mfp%%bfp_from_binary_text(%s)'
curs.execute(similarity_sql, (b_mfp, ))

# rows of (compound id, SMILES, patent number)
r = curs.fetchall()

In [5]:
# Show five randomly chosen hits (columns: compound id, SMILES, patent number).
pd.DataFrame(data=r).sample(n=5)

Unnamed: 0,0,1,2
127,23074,CC(C)=CCCC(C)=CC=O,US20150099810A1
2324,19826,CC(C)=CCCC(C)=CCO,US9105373B2
184,23074,CC(C)=CCCC(C)=CC=O,US9023382B2
2646,157509,CC(=O)CC/C=C(\C)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,US20150190356A1
542,57142,CC(=O)O.CC(C)=CCCC(C)=CCO,US20150164963A1


Close DB Connection

In [9]:
# Release the cursor first, then the connection it was created from.
for handle in (curs, conn):
    handle.close()