In [1]:
import polars as pl
import numpy as np

In [2]:
# Shared column name for the normalized activity value (-log10 of the molar
# concentration) that the data-loading cells below compute.
ACTIVITY = 'Activity [-logP]'

# ChEMBL Data

In [3]:
# Keep a ChEMBL row when it either carries a pchembl_value, or has a non-null
# standard_value reported in nM from which the activity can be derived.
# (Only ~600 rows use other units, and some of those are probably fixable.)
#
# ACTIVITY is -log10(molar concentration): the pchembl_value when present,
# otherwise computed from standard_value.
# standard_value is in nM, so divide by 1e9 to get molar. The previous
# divisor `10e9` is 1e10, which shifted every derived activity up by exactly
# one log unit relative to pchembl_value and to the DTC data.
chemblData = (
    pl.scan_csv(
        'chembl_export.csv',
        dtypes={'standard_value': pl.Float64}
    ).filter(
        (pl.col('standard_value').is_not_null() &
         pl.col('standard_units').is_not_null() &
         pl.col('standard_units').eq('nM')
        ) | pl.col('pchembl_value').is_not_null()
    ).with_columns([
        pl.when(pl.col('pchembl_value').is_null())
            .then(-((pl.col('standard_value') / 1e9).log10()))
            .otherwise(pl.col('pchembl_value'))
            .alias(ACTIVITY)
    ])
)

In [4]:
# Lookup table from ChEMBL target id ('From') to UniProt accession and
# organism, read from a pre-generated TSV.
chemUniIds = (
    pl.scan_csv('uniprot-from-chembltarget.tsv', separator='\t')
    .select(['From', 'Entry', 'Organism'])
    .rename({'Entry': 'uniprot', 'Organism': 'organism'})
)

# Attach UniProt accession / organism to every ChEMBL activity row and keep
# only the columns shared with the DTC data set. standard_type is upper-cased
# here; the DTC cell filters on upper-case type names.
qChemblComplete = (
    chemblData
    .join(
        chemUniIds,
        left_on='target_chembl_id',
        right_on='From',
        how='inner',
    )
    .select([
        'compound_chembl_id',
        'canonical_smiles',
        'standard_inchi',
        'uniprot',
        'organism',
        pl.col('standard_type').str.to_uppercase(),
        ACTIVITY,
    ])
)

# DTC Data

In [5]:
# Drug Target Commons (DTC) activities, narrowed with one combined predicate:
#   * exact measurements only (standard_relation '=')
#   * KD / KI / EC50 / IC50 assay types
#   * target name contains 'KINASE' or 'kinase'
#   * compound id present and target id non-empty
#   * values reported in nM ('NM' or 'NMOL/L')
# ACTIVITY is then -log10 of the value converted from nM to molar.
qDtc = (
    pl.scan_csv('DTC_data.csv')
    .filter(
        pl.col('standard_relation').eq('=')
        & pl.col('standard_type').is_in(['KD', 'KI', 'EC50', 'IC50'])
        & pl.col('target_pref_name').str.contains('KINASE|kinase')
        & pl.col('compound_id').is_not_null()
        & pl.col('target_id').ne('')
        & pl.col('standard_units').is_in(['NM', 'NMOL/L'])
    )
    .with_columns([
        (-(pl.col('standard_value') / 1e9).log10()).alias(ACTIVITY),
    ])
)

In [6]:
# Organisms whose DTC targets are kept. Other values seen in the UniProt
# export that could be added here:
#   'Rattus norvegicus (Rat)'
#   'Mus musculus (Mouse)'
DTC_SPECIES = ['Homo sapiens (Human)']

# UniProt accession + organism for the DTC targets, limited to DTC_SPECIES.
dtcUniprot = (
    pl.scan_csv('uniprot-DTC-target-info.tsv', separator='\t')
    .filter(pl.col('Organism').is_in(DTC_SPECIES))
    .select(['Entry', 'Organism'])
    .rename({'Entry': 'uniprot', 'Organism': 'organism'})
)

In [7]:
# Structure lookup for the DTC compounds; every column except pref_name is kept.
qDtcSmiles = pl.scan_csv('dtc-smiles.csv').select(pl.exclude('pref_name'))

In [8]:
# DTC activities joined to their UniProt record (inner join, so only targets
# present in dtcUniprot survive) and to their structures, then projected onto
# the same column layout as qChemblComplete.
qDtcComplete = (
    qDtc
    .join(dtcUniprot, left_on='target_id', right_on='uniprot', how='inner')
    .join(qDtcSmiles, left_on='compound_id', right_on='chembl_id', how='inner')
    .select([
        pl.col('compound_id').alias('compound_chembl_id'),
        'canonical_smiles',
        'standard_inchi',
        pl.col('target_id').alias('uniprot'),
        'organism',
        'standard_type',
        ACTIVITY,
    ])
)

In [9]:
# Stack the ChEMBL and DTC activity tables (identical column layout).
qFull = pl.concat([qChemblComplete, qDtcComplete])

# One row per (compound, target, assay type) holding the median activity.
qMedian = (
    qFull
    .groupby([
        'compound_chembl_id',
        'canonical_smiles',
        'standard_inchi',
        'uniprot',
        'standard_type',
    ])
    .agg(pl.col(ACTIVITY).median().alias('Median ' + ACTIVITY))
)

In [11]:
# Execute the lazy query and preview the first rows of the result.
dfMedian = qMedian.collect()
dfMedian.head(n=5)

compound_chembl_id,canonical_smiles,standard_inchi,uniprot,standard_type,Median Activity [-logP]
str,str,str,str,str,f64
"""CHEMBL1090479""","""CCn1cc(-c2ccnc…","""InChI=1S/C30H3…","""Q9NQS7""","""IC50""",8.495
"""CHEMBL1945565""","""COCCOc1ccc2cc1…","""InChI=1S/C26H3…","""P36888""","""IC50""",7.60103
"""CHEMBL3263642""","""Cc1c(NC(=O)c2c…","""InChI=1S/C42H4…","""Q06187""","""IC50""",6.851936
"""CHEMBL231209""","""Nc1nccc2scc(-c…","""InChI=1S/C20H1…","""Q16620""","""KI""",5.4
"""CHEMBL1204014""","""CN1CCN(C(=O)c2…","""InChI=1S/C30H3…","""P48736""","""IC50""",8.045757


In [12]:
# Persist the median-activity table as Parquet.
DF_OUTPUT = 'full-median.parquet'
dfMedian.write_parquet(DF_OUTPUT)

## Pivot Table

In [None]:
# Wide layout: one row per (compound, target) and one column per assay type,
# each cell holding the median activity for that combination.
dfPivoted = (
    qFull
    .collect()
    .pivot(
        index=['compound_chembl_id', 'canonical_smiles', 'standard_inchi', 'uniprot'],
        columns=['standard_type'],
        values=[ACTIVITY],
        aggregate_function='median',
    )
)

# Uniprot Sequences
Map from Uniprot ID to Full Sequence as TSV with no header

In [None]:
# Build the UniProt-accession -> sequence map from both UniProt exports and
# write it as a header-less TSV.
OUTPUT = 'map-uniprot-seq.tsv'
dfSeq = (
    pl.concat([
        # Project each export down to the two needed columns *before*
        # concatenating: a vertical concat requires identical schemas, and
        # the two TSVs are not guaranteed to share all their other columns.
        pl.scan_csv('uniprot-from-chembltarget.tsv', separator='\t')
            .select(['Entry', 'Sequence']),
        pl.scan_csv('uniprot-DTC-target-info.tsv', separator='\t')
            .select(['Entry', 'Sequence']),
    ])
    # The same accession can appear several times (within one export or in
    # both); keep a single row per accession so the file is a true map.
    .unique(subset=['Entry'], keep='first', maintain_order=True)
)

dfSeq.collect().write_csv(OUTPUT, has_header=False, separator='\t')

# Fixed Queries
These create files for querying in external tools

## Target Chembl IDs for Uniprot

In [None]:
# Distinct ChEMBL target ids, written one per line so they can be submitted
# to an external id-mapping tool for conversion to UniProt accessions.
target_ids = (
    pl.scan_csv('chembl_export.csv')
    .select('target_chembl_id')
    .unique()
    .collect()
    .to_series()
)

OUTPUT = 'chembl_target_ids.txt'
with open(OUTPUT, 'wt') as f:
    f.writelines(f'{target}\n' for target in target_ids)
print(f'output {len(target_ids)} targets to:', OUTPUT)

## DTC Uniprot IDs

In [None]:
# The DTC dump carries neither species nor sequence for its kinase targets,
# so export the distinct target ids (one per line) for a UniProt lookup.
OUTPUT = 'DTC-targets.txt'
targetsDtc = (
    qDtc
    .select('target_id')
    .unique()
    .collect()
    .to_series()
)
with open(OUTPUT, 'wt') as f:
    f.writelines(f'{target}\n' for target in targetsDtc)
print(f'wrote {len(targetsDtc)} targets to {OUTPUT}')

In [None]:
# The DTC data dump did not include SMILES data, so create a query for
# an external SQL DB to run

PREAMBLE = '''SELECT md.chembl_id,
md.pref_name,
cs.canonical_smiles,
cs.standard_inchi
FROM molecule_dictionary md
    JOIN compound_structures cs on cs.molregno = md.molregno
    WHERE
        md.chembl_id in  ('''
def create_chembl_sql(ids: 'pl.Series', output: str):
    """Write a SQL query selecting structures for the given ChEMBL ids.

    The query is PREAMBLE followed by one quoted id per line and a closing
    ``);``.

    Args:
        ids: Sized iterable of compound ids (a polars Series in this
            notebook). Each id is converted with ``str``.
        output: Path of the SQL file to write. Missing parent directories
            are created.

    Note:
        An empty ``ids`` still produces an empty ``IN ( )`` list, which most
        SQL dialects reject; callers should pass at least one id.
    """
    import os

    # Double embedded single quotes so an unusual id cannot terminate the
    # string literal early.
    quoted = [
        "          '{}'".format(str(cid).replace("'", "''"))
        for cid in ids
    ]

    parent = os.path.dirname(output)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output, 'wt') as f:
        print(PREAMBLE, file=f)
        if quoted:
            # Join with commas *between* ids: a trailing comma after the last
            # id (as the previous per-line print produced) is a syntax error.
            print(',\n'.join(quoted), file=f)
        print('        );', file=f)

    print(f'wrote SQL query for {len(ids)} ChemBL ids to: {output}')

# NOTE(review): this previously read from `qDtcProt`, which no cell in this
# notebook defines, so it raised NameError on a fresh kernel. `qDtc` is the
# filtered DTC frame that the SMILES join is built from, so its compound ids
# cover every compound that join can need. Confirm this matches the intent.
dtcCmpds = qDtc.select('compound_id').unique().collect().to_series()
create_chembl_sql(dtcCmpds, 'queries/dtc-cmpds.sql')