In [3]:
import pandas as pd
import re
import numpy as np
import yaml

In [4]:
def load_config(yaml_path="P1-config.yaml"):
    with open(yaml_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

config = load_config()

In [5]:
def get_target_name(df):
    target_names = df["TARGNAME"].unique().tolist()
    target_names = [name.split(" (")[0] for name in target_names]
    return target_names


def normalise_name(name: str) -> str:
    """
    Normalises target / display names so they can be joined consistently.
    
    Steps:
    1. Lowercase the name
    2. Remove any content in parentheses e.g. "FGFR1 (Receptor)" -> "fgfr1"
    3. Remove punctuation like hyphens, commas, semicolons
    4. Collapse multiple spaces
    """
    if not isinstance(name, str):
        return ""

    # lowercase
    name = name.lower()

    # remove text in parentheses
    name = re.sub(r"\(.*?\)", "", name)

    # remove punctuation
    name = re.sub(r"[^a-z0-9\s]", " ", name)

    # collapse multiple spaces
    name = re.sub(r"\s+", " ", name)

    # strip leading/trailing whitespace
    return name.strip()


def _keynorm(s: str) -> str:
    return normalise_name(s).casefold()  # case-insensitive + unicode-safe

In [6]:
drug_target_final_path = config["processed_paths"]["drug_target_final"]
polymer_final_path = config["processed_paths"]["polymer_final"]


drug_target_final = pd.read_csv(drug_target_final_path)
polymer_data_filtered = pd.read_csv(polymer_final_path)

In [7]:
# map target names from drug_target to polymer_data
target_names = get_target_name(drug_target_final)
polymer_display_names = polymer_data_filtered['display_name'].unique().tolist()

# for target in target names, append matched polymer display names
mapped_targets = {}
for target in target_names:
    norm_target = _keynorm(target)
    matched_polymers = [
        display_name
        for display_name in polymer_display_names
        if _keynorm(display_name) == norm_target
    ]
    if matched_polymers:
        mapped_targets[target] = matched_polymers
print(f"Mapped {len(mapped_targets)} targets to polymer display names.")


Mapped 220 targets to polymer display names.


In [8]:
print(mapped_targets)

{'Fibroblast growth factor receptor 1': ['Fibroblast growth factor receptor 1'], 'Epidermal growth factor receptor': ['Epidermal growth factor receptor'], 'Tyrosine-protein kinase ABL1': ['Tyrosine-protein kinase ABL1'], 'Estrogen receptor': ['Estrogen receptor'], 'Insulin receptor': ['Insulin receptor'], 'Serine/threonine-protein kinase B-raf': ['Serine/threonine-protein kinase B-raf'], 'Platelet-derived growth factor receptor alpha': ['Platelet-derived growth factor receptor alpha'], 'Fibroblast growth factor receptor 2': ['Fibroblast growth factor receptor 2'], 'Acetylcholinesterase': ['Acetylcholinesterase'], 'Vascular endothelial growth factor receptor 2': ['Vascular endothelial growth factor receptor 2'], 'Serine/threonine-protein kinase mTOR': ['Serine/threonine-protein kinase mTOR'], 'Tyrosine-protein kinase BTK': ['Tyrosine-protein kinase BTK'], 'Histone deacetylase 1': ['Histone deacetylase 1'], 'Squalene monooxygenase': ['Squalene monooxygenase'], 'Acyl-CoA desaturase': ['Ac

### check unmapped targets


In [9]:
# check targets that were not mapped
unmapped_targets = [
    target
    for target in target_names
    if target not in mapped_targets
]
print(f"Unmapped targets ({len(unmapped_targets)}): {unmapped_targets}")

# save unmapped targets to a text file
with open('unmapped_targets.txt', 'w') as f:
    for target in unmapped_targets:
        f.write(f"{target}\n")

Unmapped targets (585): ['Polypeptide deformylase', 'Janus kinase 2', 'Phosphodiesterase 5A', 'Carbonic anhydrase I', 'Carbonic anhydrase II', 'Matrix metalloproteinase-1', 'HMG-CoA reductase', 'Erbb2 tyrosine kinase receptor', 'Tropomyosin-related kinase A', 'LCK tyrosine protein kinase', 'Proto-oncogene c-Ret', 'Matrix metalloproteinase-2', 'Arachidonate 5-lipoxygenase', 'Tyrosine-protein kinase Kit', 'Proto-oncogene c-Src', 'Adrenergic receptor beta-3', 'Aldose reductase', 'Monoamine oxidase type A', 'Carbonic anhydrase IV', 'Fms-like tyrosine kinase 3', 'Plasmodium Dihydroorotate dehydrogenase', 'Erbb4 tyrosine kinase receptor', 'PI3-kinase delta', 'Carbonic anhydrase XII', 'Glutathione-dependent PGD synthase', 'Coagulation factor IIa', 'Coagulation factor Xa', 'Angiotensinogenase renin', 'Human immunodeficiency virus Protease', 'Influenza Neuraminidase', 'Thymidine kinase 1', 'Steroid 17-alpha-monooxygenase', 'Protein kinase C gamma', 'Fyn tyrosine protein kinase', 'Cationic tryps

### manually add unmapped targets

In [10]:
## manually add some mappings that were not captured

# Serotin Transporter [Sodium-dependent serotonin transporter and Serotonin transporter (SERT)]
# normalise names
mapped_targets['Serotonin transporter'] = ['Sodium-dependent serotonin transporter', 'Serotonin transporter (SERT)']

# drug list: Catechol-O-methyl-transferase (COMT)
# polymer list: Catechol O-methyltransferase
mapped_targets['Catechol-O-methyl-transferase'] = ['Catechol O-methyltransferase']

# drug list: Dopamine D2 receptor
# polymer list: D(2) dopamine receptor
mapped_targets['Dopamine D2 receptor'] = ['D(2) dopamine receptor', 'D2 dopamine receptor']

# drug list: Aromatic-L-amino-acid decarboxylase (DDC)
# polymer list: Aromatic-L-amino-acid decarboxylase
mapped_targets['Aromatic-L-amino-acid decarboxylase'] = ['Aromatic-L-amino-acid decarboxylase']

# drug list: Monoamine oxidase type B (MAO-B)
# polymer list: Amine oxidase [flavin-containing] B
mapped_targets['Monoamine oxidase type B'] = ['Amine oxidase [flavin-containing] B']

### create new df

In [11]:
# using drug_target_final and polymer_final, create new dataframe that maps drug names to pdb ids via target names / display names
# create new df
drug_polymer_mapping = []

for target, display_names in mapped_targets.items():
    # get all drugs for this target
    drugs = drug_target_final[drug_target_final['TARGNAME'].str.contains(target, na=False)]['DRUGNAME'].unique().tolist()
    
    for display_name in display_names:
        # get all pdb ids for this display name
        pdb_ids = polymer_data_filtered[polymer_data_filtered['display_name'] == display_name]['pdb_ids'].unique().tolist()
        
        for drug in drugs:
            for pdb_id in pdb_ids:
                drug_polymer_mapping.append({
                    'DRUGNAME': drug,
                    'TARGNAME': target,
                    'DISPLAY_NAME': display_name,
                    'PDB_ID': pdb_id
                })
drug_polymer_df = pd.DataFrame(drug_polymer_mapping)
print(np.shape(drug_polymer_df))


(1017, 4)


drug_polymer_df.head()

In [12]:
all_drugs_path = config["processed_paths"]["all_drugs"]
drug_polymer_df.to_csv(all_drugs_path, index=False)