# <ins>Project Off-Target Fingerprinting</ins>

# Jupyter Notebook for **Ligand Processing**

This Jupyter Notebook was used to extract the SMILES codes of an existing list of ligands, based on their CHEMBL_IDs.

Additionally, the molecule names from the existing list were matched against all synonyms linked on ChEMBL. This helped to correct errors in the original list.

After that, ligand preparation was conducted via a KNIME workflow.

## Ligand Processing

### SMILES Extractor via CHEMBL_ID

In [2]:
import csv
import requests
import time

# Reads a semicolon-separated CSV, looks up every CHEMBL_ID on the ChEMBL REST
# API and writes the rows back out with an extra 'canonical_smiles' column.
input_filename = 'input.csv'
output_filename = 'output.csv'

# Retry policy for the ChEMBL REST API.
max_attempts = 7
retry_delay_s = 0.2       # pause between attempts to prevent rate limiting
request_timeout_s = 15    # without a timeout a stalled connection blocks forever

# 'utf-8-sig' drops a leading byte-order mark; with plain 'utf-8' the BOM would
# become part of the first column name and the 'CHEMBL_ID' lookup would miss.
with open(input_filename, newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    data = list(reader)
    # Grab the header while the file is still open; it is None for an empty file.
    input_fieldnames = list(reader.fieldnames or [])

fieldnames = input_fieldnames + ['canonical_smiles']

with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()

    for row in data:
        # Short rows yield None for missing cells; only the lookup key is
        # stripped, the row itself is written back unchanged.
        chembl_id = (row.get('CHEMBL_ID') or '').strip()
        smiles = ''
        if chembl_id:
            url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{chembl_id}?format=json"
            for attempt in range(max_attempts):
                if attempt:
                    # Only wait before a retry, never after the last attempt.
                    time.sleep(retry_delay_s)
                try:
                    response = requests.get(url, timeout=request_timeout_s)
                    if response.status_code != 200:
                        continue
                    structures = response.json().get("molecule_structures")
                    if structures:
                        smiles = structures.get("canonical_smiles") or ''
                except (requests.RequestException, ValueError, AttributeError):
                    # Network failure, invalid JSON or an unexpected payload
                    # shape: treat as a failed attempt and retry.
                    continue
                if smiles:
                    break
            if not smiles:
                print(f"Error loading CHEMBL_ID {chembl_id}")
                smiles = "ERROR"
        row["canonical_smiles"] = smiles
        writer.writerow(row)


### Molecule Name and Synonyms Comparison

For each CHEMBL_ID in the input list, the pref_name and all synonyms of the compound are retrieved from ChEMBL and exported to a CSV file.

In [None]:
import csv
from chembl_webresource_client.new_client import new_client
import time

def safe_strip(value):
    """Return *value* without surrounding whitespace.

    Anything that is not a string (e.g. None) and the empty string both
    yield '', so callers always get a string back.
    """
    if not isinstance(value, str):
        return ''
    return value.strip()

# Reads the input list and collects, per CHEMBL_ID, the pref_name and all
# synonyms reported by the ChEMBL web resource client into results_list.
input_filename = 'input.csv'
output_filename = 'synonyms.csv'

# Retry policy for the ChEMBL lookups.
max_attempts = 5
retry_delay_s = 0.2

molecule_client = new_client.molecule
results_list = []

with open(input_filename, newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    headers = reader.fieldnames
    print("Header:", headers)
    for row in reader:
        chembl_id = safe_strip(row.get('CHEMBL_ID', ''))

        # Prefer the exact column name; otherwise fall back to the first column
        # whose name matches case-insensitively. DictReader stores surplus
        # cells under the key None, hence the isinstance guard.
        active_substance = row.get('active substance')
        if active_substance is None:
            active_substance = next(
                (
                    value
                    for key, value in row.items()
                    if isinstance(key, str) and key.lower() == 'active substance'
                ),
                '',
            )
        active_substance = safe_strip(active_substance)

        entry = {
            'CHEMBL_ID': chembl_id,
            'active substance': active_substance,
            'pref_name': '',
            'synonyms': []
        }

        if chembl_id:
            for attempt in range(max_attempts):
                if attempt:
                    # Only wait before a retry, never after the last attempt.
                    time.sleep(retry_delay_s)
                try:
                    results = molecule_client.filter(molecule_chembl_id=chembl_id)
                    if not results:
                        continue
                    mol_data = results[0]
                    candidate_pref_name = safe_strip(mol_data.get('pref_name'))
                    # 'molecule_synonyms' may be present but null; treat that
                    # as "no synonyms" instead of failing the whole lookup.
                    candidate_synonyms = [
                        candidate
                        for candidate in (
                            safe_strip(syn.get('molecule_synonym'))
                            for syn in (mol_data.get('molecule_synonyms') or [])
                        )
                        if candidate
                    ]
                except Exception as e:
                    # Best effort: the client's exception types are not visible
                    # here, so any failure is reported and counted as an attempt.
                    print(f"Error loading CHEMBL_ID {chembl_id}, retry attempt {attempt+1}: {e}")
                    continue
                # Stop retrying as soon as a pref_name or any synonym was found.
                if candidate_pref_name or candidate_synonyms:
                    entry['pref_name'] = candidate_pref_name
                    entry['synonyms'] = candidate_synonyms
                    break
            else:
                # Every attempt failed or returned no usable data.
                print(f"Error loading CHEMBL_ID {chembl_id}")
        results_list.append(entry)

# Largest number of synonyms of any entry; determines how many synonym columns
# are written. default=0 keeps an empty results_list from raising ValueError.
max_syn = max((len(entry['synonyms']) for entry in results_list), default=0)

# Columns: CHEMBL_ID, active substance, pref_name and synonym1 ... synonymN
synonym_columns = [f'synonym{i+1}' for i in range(max_syn)]
fieldnames = ['CHEMBL_ID', 'active substance', 'pref_name'] + synonym_columns

with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    # restval='' pads entries that have fewer than max_syn synonyms.
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', restval='')
    writer.writeheader()
    for entry in results_list:
        out_row = {
            'CHEMBL_ID': entry['CHEMBL_ID'],
            'active substance': entry['active substance'],
            'pref_name': entry['pref_name'],
        }
        # zip stops at the shorter sequence, so only existing synonyms are set.
        out_row.update(zip(synonym_columns, entry['synonyms']))
        writer.writerow(out_row)


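The existing names from the original list are matched (case-insensitively) against the pref_name and all synonyms; the result is written to a new column `matching_passed`.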

In [None]:
import csv

synonyms_filename = 'synonyms.csv'
original_filename = 'input.csv'
# NOTE(review): 'output.csv' is also the output name of the SMILES extractor
# cell - confirm that overwriting that file here is intended.
output_filename = 'output.csv'


def _normalize(value):
    """Return *value* stripped and lower-cased, or '' if it is not a string."""
    # DictReader yields None for missing cells and a list for surplus cells.
    return value.strip().lower() if isinstance(value, str) else ''


# read synonyms file
with open(synonyms_filename, newline='', encoding='utf-8-sig', errors='replace') as csvfile:
    synonym_rows = list(csv.DictReader(csvfile, delimiter=';'))

# For every row, check whether the "active substance" value equals the value of
# any other column (pref_name, synonym1 ... synonymN), ignoring case and
# surrounding whitespace. CHEMBL_ID is not a name and is excluded.
probe_statuses = []
for row in synonym_rows:
    active_value = _normalize(row.get('active substance'))
    known_names = {
        _normalize(value)
        for key, value in row.items()
        if key not in ('CHEMBL_ID', 'active substance')
    }
    # An empty active substance never counts as a match.
    match_found = bool(active_value) and active_value in known_names
    probe_statuses.append("yes" if match_found else "no")

# read original csv
with open(original_filename, newline='', encoding='utf-8-sig', errors='replace') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    original_rows = list(reader)
    # Take the header from the reader rather than from the first row, so an
    # input without data rows no longer raises IndexError. dict.fromkeys
    # drops repeated column names while keeping their order.
    original_fieldnames = list(dict.fromkeys(reader.fieldnames or []))

# Statuses are paired with the original rows purely by position.
if len(original_rows) != len(probe_statuses):
    print("Warning: The number of rows in the original file and the synonym file do not match!")

# add column "matching_passed"
fieldnames = original_fieldnames + ['matching_passed']

with open(output_filename, 'w', newline='', encoding='utf-8-sig', errors='replace') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()
    for i, row in enumerate(original_rows):
        # Rows beyond the end of probe_statuses get an empty status instead of
        # crashing right after the warning above.
        row['matching_passed'] = probe_statuses[i] if i < len(probe_statuses) else ''
        writer.writerow(row)
