In [None]:
import pandas as pd
import re
import numpy as np

In [None]:
def get_target_name(df):
    """
    Return the base target names found in the ``TARGNAME`` column of ``df``.

    Each name is cut at the first " (" so that a trailing parenthesised
    suffix is dropped, e.g. "Catechol-O-methyl-transferase (COMT)" ->
    "Catechol-O-methyl-transferase".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``TARGNAME`` column.

    Returns
    -------
    list of str
        Base names in order of first appearance, without duplicates.
        Missing / non-string cells are skipped.
    """
    base_names = (
        raw.split(" (")[0]
        for raw in df["TARGNAME"].unique().tolist()
        # NaN / None cells have no .split(); skip them instead of crashing
        if isinstance(raw, str)
    )
    # Two raw names that differ only in their suffix give the same base name;
    # dict.fromkeys removes such repeats while keeping first-seen order.
    return list(dict.fromkeys(base_names))


def normalise_name(name: str) -> str:
    """
    Reduce a target / display name to a canonical form for joining.

    The input is lowercased and then rewritten by three substitutions,
    applied in this order:

    1. every "(...)" group is deleted, e.g. "FGFR1 (Receptor)" -> "fgfr1"
    2. every character that is not a-z, 0-9 or whitespace becomes a space
       (this is what removes hyphens, commas, semicolons, ...)
    3. each run of whitespace becomes a single space

    Leading and trailing whitespace is stripped from the result.
    Anything that is not a ``str`` yields the empty string.
    """
    if not isinstance(name, str):
        return ""

    # (pattern, replacement) pairs; the order matters because parentheses
    # must be removed as groups before step 2 would turn them into spaces.
    substitutions = (
        (r"\(.*?\)", ""),
        (r"[^a-z0-9\s]", " "),
        (r"\s+", " "),
    )

    cleaned = name.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def _keynorm(s: str) -> str:
    """
    Build the key used to compare two names for equality.

    The value is passed through ``normalise_name`` and the result is then
    case-folded, so names that differ only in case, punctuation, spacing or
    parenthesised text produce the same key.
    """
    normalised = normalise_name(s)
    return normalised.casefold()

In [None]:
# Input tables, read from CSV.
# NOTE(review): both paths look like placeholders — point them at the real
# filtered drug-target and polymer files before running.
DRUG_TARGET_CSV = "path/to/filtered/drugtarget"
POLYMER_CSV = "path/to/filtered/polymer"

drug_target_final = pd.read_csv(DRUG_TARGET_CSV)
polymer_data_filtered = pd.read_csv(POLYMER_CSV)

In [None]:
# map target names from drug_target to polymer_data
target_names = get_target_name(drug_target_final)
polymer_display_names = polymer_data_filtered['display_name'].unique().tolist()

# Index the polymer display names by their normalised key once, so each
# display name is normalised a single time instead of once per target.
polymers_by_key = {}
for display_name in polymer_display_names:
    key = _keynorm(display_name)
    # An empty key means the value was not a string or had no a-z / 0-9 text
    # left after normalisation; matching on it would pair unrelated entries.
    if key:
        polymers_by_key.setdefault(key, []).append(display_name)

# for each target, collect the polymer display names that share its key
# (display names keep the order they have in polymer_display_names)
mapped_targets = {}
for target in target_names:
    norm_target = _keynorm(target)
    matched_polymers = polymers_by_key.get(norm_target, [])
    if matched_polymers:
        # copy so that targets sharing a key do not share one list object
        mapped_targets[target] = list(matched_polymers)
print(f"Mapped {len(mapped_targets)} targets to polymer display names.")


In [None]:
# show the full target -> matched display names mapping for inspection
print(mapped_targets)

### check unmapped targets


In [None]:
# check targets that were not mapped
unmapped_targets = [
    target
    for target in target_names
    if target not in mapped_targets
]
print(f"Unmapped targets ({len(unmapped_targets)}): {unmapped_targets}")

# save unmapped targets to a text file, one name per line.
# The encoding is given explicitly so that names containing non-ASCII
# characters are written the same way whatever the platform default is.
with open('unmapped_targets.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{target}\n" for target in unmapped_targets)

### manually add unmapped targets

In [None]:
## manually add some mappings that were not captured
## key   = target name as used in the drug list (parenthesised suffix removed)
## value = polymer display names to associate with that target
## Each entry replaces whatever mapped_targets already holds under that key.
mapped_targets.update({
    # drug list: Serotonin transporter
    # polymer list: Sodium-dependent serotonin transporter, Serotonin transporter (SERT)
    'Serotonin transporter': ['Sodium-dependent serotonin transporter', 'Serotonin transporter (SERT)'],

    # drug list: Catechol-O-methyl-transferase (COMT)
    # polymer list: Catechol O-methyltransferase
    'Catechol-O-methyl-transferase': ['Catechol O-methyltransferase'],

    # drug list: Dopamine D2 receptor
    # polymer list: D(2) dopamine receptor, D2 dopamine receptor
    'Dopamine D2 receptor': ['D(2) dopamine receptor', 'D2 dopamine receptor'],

    # drug list: Aromatic-L-amino-acid decarboxylase (DDC)
    # polymer list: Aromatic-L-amino-acid decarboxylase
    'Aromatic-L-amino-acid decarboxylase': ['Aromatic-L-amino-acid decarboxylase'],

    # drug list: Monoamine oxidase type B (MAO-B)
    # polymer list: Amine oxidase [flavin-containing] B
    'Monoamine oxidase type B': ['Amine oxidase [flavin-containing] B'],
})

### create new df

In [None]:
# Using drug_target_final and polymer_data_filtered, build a dataframe with
# one row per (drug, target, display name, pdb id) combination, linked
# through the target -> display names entries of mapped_targets.
drug_polymer_mapping = []

for target, display_names in mapped_targets.items():
    # get all drugs for this target.
    # regex=False makes this a literal substring test: target names can
    # contain regex metacharacters such as "(", "[" or "+", which would
    # otherwise be read as a pattern (wrong matches or an re.error).
    # NOTE(review): a substring test also selects TARGNAME values that merely
    # contain this target name inside a longer name — confirm that is intended.
    is_target = drug_target_final['TARGNAME'].str.contains(target, na=False, regex=False)
    drugs = drug_target_final.loc[is_target, 'DRUGNAME'].unique().tolist()

    for display_name in display_names:
        # get all pdb ids for this display name
        is_display = polymer_data_filtered['display_name'] == display_name
        pdb_ids = polymer_data_filtered.loc[is_display, 'pdb_ids'].unique().tolist()

        # every drug of the target is paired with every pdb id of the polymer
        for drug in drugs:
            for pdb_id in pdb_ids:
                drug_polymer_mapping.append({
                    'DRUGNAME': drug,
                    'TARGNAME': target,
                    'DISPLAY_NAME': display_name,
                    'PDB_ID': pdb_id
                })
drug_polymer_df = pd.DataFrame(drug_polymer_mapping)
print(np.shape(drug_polymer_df))


drug_polymer_df.head()

In [None]:
# write the drug -> PDB id mapping table to CSV, leaving out the index column
# NOTE(review): the output path looks like a placeholder — confirm before running
drug_polymer_df.to_csv('path/to/alldrugs.csv', index=False)