In [None]:
import sqlite3

# Open the local ChEMBL 35 SQLite dump; `conn` is the connection used by the queries below.
# NOTE(review): sqlite3.connect silently creates an empty database if the file is missing — confirm the path exists.
conn = sqlite3.connect('/tmp/chembl_35/chembl_35_sqlite/chembl_35.db')
# Cursor for ad-hoc statements (not used in the cells visible here).
cursor = conn.cursor()

In [None]:
import pandas as pd

from boilerplate import list_tables, get_table_schema, sample, print_size

In [None]:
# Collect the table names through the project helper
# (presumably an iterable of name strings — verify in `boilerplate`).
tables = list_tables(conn)
print(tables)

In [None]:
# Print the table names that contain each keyword, one keyword group after another.
for keyword in ('bind', 'assay', 'act', 'target'):
    for table in tables:
        if keyword in table:
            print(table)

In [None]:
# Peek at the assay_type lookup table via the project `sample` helper.
sample(conn, 'assay_type')

In `assay_type`, the code `B` denotes binding assays.

In [None]:
# Peek at the assay_classification table via the project `sample` helper.
sample(conn, 'assay_classification')

In [None]:
# Peek at the activities table via the project `sample` helper.
sample(conn, 'activities')

In [None]:
# Rough frequency of each standard_type, computed on what the helper returns for n=10_000
# (an estimate from a sample, not the full table — presumably 10,000 rows; verify in `boilerplate`).
sample(conn, 'activities', n=10_000).standard_type.value_counts().to_dict()

`Potency` is a catch-all type, but it is likely a KD when the assay confidence is good (a score of 7 or higher?).

In [None]:
# Activity `standard_type` labels treated as Kd-like for this analysis.
kd_like_types = ["Kd", "KD", "Potency", "Binding affinity", "pKD", "Log K", "K", "Kb", "Log Ki", "log1/Ki", "Ka"]
# One `?` per value: SQLite binds the values itself, so a label containing a quote
# cannot break (or inject into) the statement.
placeholders = ','.join('?' * len(kd_like_types))
query = f"SELECT * FROM activities WHERE standard_type IN ({placeholders})"
activity = pd.read_sql_query(query, conn, params=kd_like_types)

In [None]:
# Number of rows in the Kd-like selection.
print(f'{len(activity):,} entries')

#activity.sample(5) #molregno assay_id	standard_value	standard_units standard_type

# Rows per standard_type; as the last expression it is what the notebook displays.
activity.standard_type.value_counts()
#print(activity.standard_type.value_counts().to_markdown())

In [None]:
# Count the distinct molecules (molregno) that have at least one row typed exactly 'Kd'.
kd_rows = activity.loc[activity.standard_type == 'Kd']
len(kd_rows.molregno.drop_duplicates())

In [None]:
# Peek at the assays table via the project `sample` helper.
sample(conn, 'assays')

In [None]:
# Fetch the assay records referenced by the selected activities.
# The ids are written into the statement as quoted literals, one per distinct assay_id.
distinct_assay_ids = activity.assay_id.drop_duplicates()
quoted_ids = [f"'{assay_id}'" for assay_id in distinct_assay_ids]
placeholders = ','.join(quoted_ids)
query = "SELECT * FROM assays WHERE assay_id IN (" + placeholders + ")"
assay = pd.read_sql_query(query, conn)

print(f'{len(assay):,} entries')

In [None]:
# Heading for the breakdown displayed below.
print('Misclassified if not `B`?')

# Assay-type counts among the assays behind the selected activities.
assay.assay_type.value_counts()

In [None]:
# assay_ids that have at least one activity row typed exactly 'Kd'.
clean_type_ids = activity.loc[activity.standard_type == 'Kd'].assay_id.to_list()
# Flag those assays, then compare assay_type counts between flagged and unflagged assays.
assay['is_KD'] = assay.assay_id.isin(clean_type_ids)
assay.groupby('is_KD').assay_type.value_counts()

The majority of the KD entries are of type `B`.
However, when going via assay_id the number of non-KD assays goes down,
so many of those entries were multiple measurements within a single assay.

## TaxID ontology

Get pathogens. I will deal with women's health category later.

In [None]:
from pathlib import Path

def load_taxonomy_nodes(nodes_path: Path) -> dict[int, int]:
    """
    Load the taxid → parent_taxid mapping from a ``nodes.dmp`` taxonomy file.

    Each record is a ``|``-separated line whose first two fields are the node's
    taxid and its parent's taxid; the remaining fields are ignored. Blank lines
    (e.g. a trailing newline at the end of the file) are skipped.

    Args:
        nodes_path: path of the ``nodes.dmp`` file to read (decoded as UTF-8).

    Returns:
        Dict mapping every taxid found in the file to its parent taxid.

    Raises:
        ValueError: if one of the first two fields of a record is not an integer.
        IndexError: if a non-blank record has fewer than two fields.
    """
    taxid_to_parent = {}
    with nodes_path.open(encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank/trailing lines instead of failing on int('')
            # Only the first two fields are needed; int() ignores the surrounding tabs/spaces.
            fields = line.split("|", 2)
            taxid_to_parent[int(fields[0])] = int(fields[1])
    return taxid_to_parent

# Build the child → parent taxid map used by the ancestor lookups below
# (the dump location is machine-specific).
taxid_to_parent = load_taxonomy_nodes(Path("/tmp/nodes.dmp"))

In [None]:
import json

# Persist the map for reuse. The `with` block closes (and therefore flushes) the
# file deterministically instead of leaving the handle open.
# Note: JSON object keys are strings, so the integer taxids become str keys in the file.
with Path('taxid_to_parent.json').open('w') as json_file:
    json.dump(taxid_to_parent, json_file)

In [None]:
from typing import Set

def has_ancestor_in_set(taxid: int, target_ids: Set[int], parent_map=None) -> bool:
    """
    Tell whether a taxid, or any of its ancestors, belongs to ``target_ids``.

    Args:
        taxid: taxid to classify; missing values (None/NaN) are accepted.
        target_ids: taxids of the clades of interest (any iterable of ints).
        parent_map: optional taxid → parent taxid mapping; defaults to the
            notebook-level ``taxid_to_parent``.

    Returns:
        True when ``get_ancestor_in_set`` finds a match, False otherwise.
    """
    return get_ancestor_in_set(taxid, target_ids, parent_map) != -1

def get_ancestor_in_set(taxid: int, target_ids: Set[int], parent_map=None) -> int:
    """
    Walk up the taxonomy from ``taxid`` and return the first taxid found in ``target_ids``.

    The starting taxid itself is checked before its ancestors.

    Args:
        taxid: taxid to start from. None, NaN and pandas' NA are treated as missing.
        target_ids: taxids of the clades of interest (any iterable of ints).
        parent_map: optional taxid → parent taxid mapping; defaults to the
            notebook-level ``taxid_to_parent``.

    Returns:
        The matching taxid, or -1 when the value is missing, the walk reaches the
        root (taxid 1), hits a taxid absent from the mapping, or runs into a cycle.
    """
    # Missing organism: nothing to look up (int() would raise on these values).
    if taxid is None or str(taxid) in ('nan', '<NA>'):
        return -1
    if parent_map is None:
        parent_map = taxid_to_parent
    # Callers pass dict views; a frozenset gives constant-time membership on every step.
    targets = frozenset(target_ids)
    visited = set()
    taxid = int(taxid)
    # Stop at the root (1 in NCBI taxonomy), at an unknown taxid (-1) or on a cycle.
    while taxid != 1 and taxid != -1 and taxid not in visited:
        if taxid in targets:
            return taxid
        visited.add(taxid)
        taxid = parent_map.get(taxid, -1)
    return -1



# Clade name → taxid of the clade's root node; an organism counts as potentially
# pathogenic when its lineage passes through one of these taxids.
# "Protists" is not a valid single taxon, so the protist clades are listed individually
# (clade names were suggested by ChatGPT; the IDs were looked up by hand).
pathogenic_parent_ids = {'bacteria': 2,
                         'virus': 10239,
                         'fungi': 4751,
                         'Apicomplexa': 5794, # Apicomplexa (e.g. Plasmodium)
                         'Euglenozoa': 33682, # Euglenozoa (e.g. Trypanosoma, Leishmania)
                         'Amoebozoa': 554915, # Amoebozoa (e.g. Entamoeba)
                         'Metamonada': 2611341 # Metamonada (e.g. Giardia)
                        }

In [None]:
# Sanity check on two well-known organisms: human should be False, E. coli True.
known_organisms = {'human': 9606, 'E. coli': 562}
for name, tax_id in known_organisms.items():
    verdict = has_ancestor_in_set(tax_id, pathogenic_parent_ids.values())
    print(f'Is {name} a parasite?', verdict)

In [None]:
def is_pathogen(tax_id):
    """Return True when the taxid falls under one of the clades in ``pathogenic_parent_ids``."""
    return has_ancestor_in_set(tax_id, pathogenic_parent_ids.values())

# Flag every assay according to the taxonomy of its assay organism.
pathogen_flags = assay.assay_tax_id.apply(is_pathogen)
assay['pathogenic'] = pathogen_flags.astype(bool)

def get_pathogen_type(tax_id):
    """
    Name the pathogen group a taxid belongs to.

    Returns the key of ``pathogenic_parent_ids`` whose taxid is found in the
    lineage, with the four protist clades merged into 'protist', or
    'not-pathogenic' when no clade matches.
    """
    protist_clades = {'Apicomplexa', 'Euglenozoa', 'Amoebozoa', 'Metamonada'}
    matched_id = get_ancestor_in_set(tax_id, pathogenic_parent_ids.values())
    group = 'not-pathogenic'
    for clade_name, clade_id in pathogenic_parent_ids.items():
        if clade_id == matched_id:
            group = clade_name
    return 'protist' if group in protist_clades else group

# Pathogen group label for each assay ('not-pathogenic' when no clade matches).
assay['pathogenic_parent'] = assay.assay_tax_id.apply(get_pathogen_type)

In [None]:
# Pathogen-group counts, split by whether the assay has a 'Kd'-typed activity.
assay.groupby('is_KD')['pathogenic_parent'].value_counts()

Let's go back to activity

In [None]:
# Copy the assay-level annotations onto each activity row through its assay_id.
assay_lookup = assay.set_index('assay_id')
activity['assay_organism'] = activity.assay_id.map(assay_lookup.assay_organism).fillna('NA')
activity['assay_tax_id'] = activity.assay_id.map(assay_lookup.assay_tax_id).fillna(-1)
# Rows without a taxid now carry -1 and are classified from that placeholder.
activity_tax_ids = activity.assay_tax_id
activity['pathogenic'] = activity_tax_ids.apply(is_pathogen).astype(bool)
activity['pathogenic_parent'] = activity_tax_ids.apply(get_pathogen_type)

In [None]:
def molar_scale(unit):
    """
    Return the factor that converts a value expressed in ``unit`` to mol/L.

    Args:
        unit: unit label, e.g. 'nM', "10'-6M" or 'nmol/L'. Non-string values
            (None, NaN) are treated as "no unit".

    Returns:
        The multiplicative factor (1 for 'M', 10**-9 for 'nM', ...), or NaN when
        the unit is missing, empty, or not a recognised molar concentration.
    """
    not_molar = float('nan')
    if not isinstance(unit, str):
        return not_molar  # None, or the float NaN pandas uses for missing values
    # Any unit containing 's', 'kcal' or 'g' (e.g. rates, energies, mass
    # concentrations) is rejected up front as non-molar.
    if 's' in unit or 'kcal' in unit or 'g' in unit:
        return not_molar
    # Normalise spelling variants: "10'-6M" -> "10^-6M", "nmol/L" -> "nM", drop spaces.
    # NOTE(review): "/M" is also rewritten to "M", which makes an inverse-molar unit
    # look molar — confirm this is intended.
    unit = unit.replace("'", "^").replace("mol/L", "M").replace("/M", "M").replace(" ", "")
    # Exact-match table: an empty string no longer passes as 'M' (the old test was a
    # substring check), and 10^3 nM is filed under micromolar, which is what it equals.
    scales = {
        'M': 1,
        'mM': 10**-3, '10^-3M': 10**-3,
        'uM': 10**-6, '10^-6M': 10**-6, '10^3nM': 10**-6,
        'nM': 10**-9, '10^-9M': 10**-9,
        'pM': 10**-12, '10^-12M': 10**-12,
        'fM': 10**-15, '10^-15M': 10**-15,
    }
    return scales.get(unit, not_molar)
    
# Factor converting each row's standard_units to mol/L (NaN when the unit is not molar).
activity['unit_scale']=activity.standard_units.apply(molar_scale)

In [None]:
# 'Kd' measurements from pathogen assays that have a recognised molar unit,
# keeping one row per (pathogen group, molecule).
has_molar_unit = activity.unit_scale.notna()
is_kd = activity.standard_type == 'Kd'
subset = (
    activity
    .loc[has_molar_unit & activity.pathogenic & is_kd]
    .drop_duplicates(['pathogenic_parent', 'molregno'])
)

In [None]:
# Which assay organisms contribute the retained Kd rows, and how many rows each.
subset.assay_organism.value_counts()

In [None]:
import plotly.express as px
import numpy as np

# pKd = -log10(Kd in mol/L); unit_scale rescales standard_value to molar first.
molar_kd = subset.unit_scale * subset.standard_value
subset['pkD'] = -np.log10(molar_kd)

# One histogram panel per pathogen group.
px.histogram(
    subset,
    x='pkD',
    facet_col='pathogenic_parent',
    title='Distribution of KD activities of possibly pathogenic clades',
    template='plotly_white',
)

In [None]:
# `assays` is not a variable in this notebook (the DataFrame is named `assay`), so the
# bare name raised NameError on Run All; preview the table of that name instead.
sample(conn, 'assays')

In [None]:
# `component_sequences` is never defined in this notebook, so the bare name raised
# NameError on Run All; preview the table of that name instead.
sample(conn, 'component_sequences')

In [None]:
import pandas as pd

# Load the assays whose assay_type is 'F' (presumably functional assays — confirm
# against the assay_type table), in random order and capped at 10,000,000 rows.
sql_query = " ".join([
    "SELECT * FROM assays",
    "WHERE assay_type = 'F'",
    "ORDER BY RANDOM() LIMIT 10000000",
])
df = pd.read_sql_query(sql_query + ";", conn)

print(f'{len(df):,} entries')
df.sample(5)

In [None]:
# Show every column of a single record (the row at position 1).
df.iloc[1]