# <ins>Project Off-Target Fingerprinting</ins>

# Jupyter Notebook for **Protein Data Retrieval**

This Jupyter Notebook was used to extract data from specific protein structures into a CSV file. Basically, each cell reads an existing CSV, gathers new information, adds it to the existing data, and outputs an updated version of the CSV.

Starting from each protein's CHEMBL_ID or UNIPROT_ID, the PDB_IDs of all linked 3D structures were first extracted. Based on those PDB_IDs, relevant data of each PDB_ID was extracted and added to the CSV file.


## Data Retrieval from UniProt and CHEMBL

### CHEMBL_ID to UNIPROT_ID Converter

In [1]:
import csv
from chembl_webresource_client.new_client import new_client

# Cell purpose: for every row of the input table, look up the UniProt
# accession(s) belonging to the row's CHEMBL_ID and write one output row per
# accession (or a single row with an empty UNIPROT_ID if nothing was found).
input_filename = 'input.csv'
output_filename = 'output.csv'

# read input (semicolon-separated)
with open(input_filename, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    rows = list(reader)
    # copy, so appending below does not modify the reader's own header list
    fieldnames = list(reader.fieldnames) if reader.fieldnames else []

# add UNIPROT_ID column
if 'UNIPROT_ID' not in fieldnames:
    fieldnames.append('UNIPROT_ID')

# load uniprot_ids via the ChEMBL web resource client, add them and save everything to a csv file
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()
    target_client = new_client.target

    for row in rows:
        # `or ''` guards against short rows, for which DictReader stores None
        chembl_id = (row.get('CHEMBL_ID') or '').strip()
        uniprot_ids = []

        # Never query with an empty id: the first hit of such a query would
        # not belong to this row.
        if chembl_id:
            try:
                results = target_client.filter(target_chembl_id=chembl_id)
                if results:
                    # collect all uniprot_ids of the first matching target
                    components = results[0].get('target_components') or []
                    uniprot_ids = [
                        comp['accession'] for comp in components
                        if comp.get('accession')
                    ]
            except Exception as exc:
                # best effort: report the failure and keep the row with an
                # empty UNIPROT_ID instead of silently hiding the problem
                print(f"UniProt lookup failed for {chembl_id}: {exc}")
                uniprot_ids = []

        # one output row per accession; a single row with an empty value if none was found
        for uid in uniprot_ids or ['']:
            new_row = row.copy()
            new_row['UNIPROT_ID'] = uid
            writer.writerow(new_row)


## Extract PDB_IDs via UniProt

In [3]:
import requests
import csv

# Cell purpose: for every row of the input table, query the UniProt REST API
# for the row's UNIPROT_ID and write one output row per PDB cross-reference.
# Rows for which no PDB cross-reference is obtained are not written.
input_filename = 'input.csv'
output_filename = 'output.csv'

# read input (semicolon-separated)
with open(input_filename, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    data = list(reader)

# output columns: the expected input columns plus the new PDB_ID column
# (DictWriter raises ValueError if a row carries a column not listed here)
fieldnames = ['CHEMBL_ID', 'target_nr', 'UNIPROT_ID', 'pref_name', 'gene_symbol', 'target_type', 'PDB_ID']

# load PDB_IDs from UniProt, add the data and save it to a csv file
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()

    for row in data:
        uniprot_id = (row.get('UNIPROT_ID') or '').strip()
        if not uniprot_id:
            # without an accession there is no entry to look up
            continue

        url = "https://rest.uniprot.org/uniprotkb/" + uniprot_id
        try:
            # timeout so a stalled connection cannot hang the whole run
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            # separate name: `data` is the row list this loop iterates over
            entry = response.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON
            print(f"Failed to load UniProt entry {uniprot_id}: {exc}")
            continue

        cross_references = entry.get("uniProtKBCrossReferences") or []
        pdb_references = [ref for ref in cross_references if ref.get("database") == "PDB"]

        # export all PDB_IDs, one output row each
        for ref in pdb_references:
            new_row = row.copy()
            new_row["PDB_ID"] = ref.get("id")
            writer.writerow(new_row)

## Visualize number of extracted PDB_IDs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the semicolon-separated table written by the previous cells.
data = pd.read_csv('input.csv', sep=';')

# For each gene symbol, count the distinct PDB_IDs linked to it.
structure_counts = data.groupby('gene_symbol')['PDB_ID'].nunique()
ranked_counts = structure_counts.sort_values(ascending=False)

# Bar chart with the largest count first; labels are set on the axes object
# that the pandas plot call returns.
plt.figure(figsize=(17, 10))
axes = ranked_counts.plot(kind='bar', width=0.6)
axes.set_title('Number of Structures per Off-Target')
axes.set_xlabel('gene symbol')
axes.set_ylabel('number of structures (PDB_IDs)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Use PDBs to extract relevant data

We use the PDB_IDs to extract the following data:
- Method
- Resolution & R-Values
- Certain information from the title
- Mutation information
- Release Date

## Method Extractor

Retrieves the experimental method of each structure, e.g. X-Ray Diffraction, Electron Microscopy or Solution NMR.

In [None]:
import pandas as pd
import asyncio
import nest_asyncio
from aiohttp import ClientSession
from asyncio import Semaphore

# Patch asyncio via nest_asyncio (presumably so coroutines can be driven from
# within the notebook's already running event loop).
nest_asyncio.apply()

# Input table and the distinct PDB_IDs it contains (NaN values are ignored).
input_df = pd.read_csv('input.csv', sep=';')
pdb_id_column = input_df['PDB_ID'].dropna()
pdb_ids = pdb_id_column.unique()

# Data for different PDBs is requested concurrently through asyncio
# coroutines; the semaphore caps the number of simultaneous requests at 5
# to prevent overloading the server.
MAX_CONCURRENT_REQUESTS = 5
semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)

# async method executed by aiohttp
async def fetch_method(pdb_id, session):
    """Fetch the experimental method(s) of one PDB entry from the RCSB data API.

    Args:
        pdb_id: PDB identifier inserted into the entry URL.
        session: open aiohttp ClientSession used for the request.

    Returns:
        Tuple ``(pdb_id, method)``. ``method`` is the comma-joined list of
        methods found under ``exptl``, ``'Unknown'`` if the entry lists none,
        or ``'Error'`` if the entry could not be loaded.
    """
    url = f'https://data.rcsb.org/rest/v1/core/entry/{pdb_id}'
    max_attempts = 5
    # the module-level semaphore limits how many entries are processed at once
    async with semaphore:
        for attempt in range(max_attempts):  # retry for max 5 tries
            # exponential backoff unless the server tells us how long to wait
            delay = 2 ** attempt
            try:
                async with session.get(url) as response:
                    if response.status == 200:
                        data = await response.json()
                        # skip experiments without a method so join() never sees None
                        methods = [
                            expt['method'] for expt in (data.get('exptl') or [])
                            if expt.get('method')
                        ]
                        method = ', '.join(methods)
                        return pdb_id, method if method else 'Unknown'
                    elif response.status == 429:
                        # retry (e.g. when executing too many request at the same time)
                        header = response.headers.get('Retry-After', '')
                        # Retry-After may also be an HTTP date; fall back to 5 seconds then
                        retry_after = int(header) if header.isdigit() else 5
                        print(f"Rate limit exceeded {pdb_id}, wait for {retry_after} seconds.")
                        delay = retry_after
                    else:
                        # any other status is treated as final, no retry
                        print(f"Failed to load data for PDB_ID {pdb_id}: Status {response.status}")
                        break
            except Exception as e:
                print(f"Failed to connect to server {pdb_id}, retry {attempt + 1}: {e}")

            # wait and retry; no point in waiting after the last attempt
            if attempt < max_attempts - 1:
                await asyncio.sleep(delay)

        # In case of any error, write it to the output file
        return pdb_id, 'Error'

# main method
async def get_structure_methods(pdb_ids):
    """Run fetch_method for every PDB_ID and return a {pdb_id: method} dict.

    All lookups share one ClientSession and are awaited together.
    """
    async with ClientSession() as session:
        pending = []
        for pdb_id in pdb_ids:
            pending.append(fetch_method(pdb_id, session))
        pairs = await asyncio.gather(*pending)
    # each result is a (pdb_id, method) tuple
    return dict(pairs)

# execute
structure_methods = await get_structure_methods(pdb_ids)

# convert results to data frame
structure_methods_df = pd.DataFrame(list(structure_methods.items()), columns=['PDB_ID', 'Method'])

# update and save input_df with new data
input_updated_df = input_df.merge(structure_methods_df, on='PDB_ID', how='left')
input_updated_df.to_csv('output.csv', index=False, sep=';')

### Resolution and R-Values Extractor

In [None]:
import pandas as pd
import aiohttp
import asyncio
import nest_asyncio
from aiohttp import ClientTimeout

# Patch asyncio via nest_asyncio (presumably so coroutines can be driven from
# within the notebook's already running event loop).
nest_asyncio.apply()

# Data for different PDBs is requested concurrently through asyncio coroutines
# (not threads); the number of concurrent requests is limited to 10 to
# prevent overloading the server.
MAX_CONCURRENT_REQUESTS = 10

# async method executed by aiohttp
async def fetch_pdb_data(session, pdb_id, semaphore, max_retries=3):
    """Fetch resolution and R-values of one PDB entry from the RCSB data API.

    Args:
        session: open aiohttp ClientSession used for the request.
        pdb_id: PDB identifier inserted into the entry URL.
        semaphore: asyncio semaphore limiting concurrent requests.
        max_retries: maximum number of request attempts.

    Returns:
        Dict with the keys ``resolution``, ``r_free``, ``r_work`` and
        ``r_observed``. Values are strings with fixed decimals, or None when
        the value is missing or the entry could not be loaded.
    """
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"

    def _fmt(value, digits):
        # fixed-decimal string for a numeric field; None stays None
        return f"{float(value):.{digits}f}" if value is not None else None

    attempts_made = 0
    for attempt in range(max_retries):
        attempts_made = attempt + 1
        try:
            async with semaphore:
                async with session.get(url, timeout=ClientTimeout(total=60)) as response:
                    status = response.status
                    if status == 200:
                        data = await response.json()
                        # `or` fallbacks also cover sections that are present but null
                        entry_info = data.get('rcsb_entry_info') or {}
                        resolution_list = entry_info.get('resolution_combined') or []
                        resolution = resolution_list[0] if resolution_list else None

                        # extract r-values from the first refinement record, if any
                        refine_list = data.get('refine') or []
                        refine_data = refine_list[0] if refine_list else {}

                        return {
                            "resolution": _fmt(resolution, 2),
                            "r_free": _fmt(refine_data.get('ls_rfactor_rfree'), 3),
                            "r_work": _fmt(refine_data.get('ls_rfactor_rwork'), 3),
                            "r_observed": _fmt(refine_data.get('ls_rfactor_obs'), 3),
                        }
            print(f"Failed to fetch data for {pdb_id}, status code: {status}")
            # only rate limiting (429) and server errors (5xx) can succeed on a retry
            if status != 429 and status < 500:
                break
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            print(f"Attempt {attempt+1}/{max_retries} - Error fetching data for {pdb_id}: {e}")
        except Exception as e:
            print(f"Unexpected error fetching data for {pdb_id}: {e}")
            break
        # pause before the next attempt (semaphore is released here)
        if attempt < max_retries - 1:
            await asyncio.sleep(1)
    print(f"Failed to fetch data for {pdb_id} after {attempts_made} attempt(s)")
    return {
        "resolution": None,
        "r_free": None,
        "r_work": None,
        "r_observed": None,
    }

async def fetch_all_pdb_data(pdb_ids):
    """Fetch the data of all given PDB_IDs concurrently.

    Returns a dict mapping each PDB_ID to the dict produced by fetch_pdb_data.
    """
    # one semaphore per call, shared by all requests of this batch
    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        payloads = await asyncio.gather(
            *(fetch_pdb_data(session, pdb_id, limiter) for pdb_id in pdb_ids)
        )
    # gather keeps the input order, so ids and payloads line up
    return {pdb_id: payload for pdb_id, payload in zip(pdb_ids, payloads)}

async def main():
    """Add resolution and R-value columns to input.csv and write output.csv.

    The distinct PDB_IDs are fetched in batches of 1000; the fetched values
    are then mapped back onto every row via its PDB_ID.
    """
    # read input
    csv_data = pd.read_csv("input.csv", delimiter=';')

    pdb_ids = csv_data['PDB_ID'].dropna().unique()
    print(f"Unique PDB-IDs: {len(pdb_ids)}")

    # split up into batches
    batch_size = 1000
    all_data = {}

    batch_starts = range(0, len(pdb_ids), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        batch_pdb_ids = pdb_ids[start:start + batch_size]
        print(f"Processing batch {batch_no}: {len(batch_pdb_ids)} PDB IDs")
        all_data.update(await fetch_all_pdb_data(batch_pdb_ids))
        print(f"Completed batch {batch_no}")

    # one new column per fetched value; rows whose PDB_ID was not fetched get None
    for column in ('resolution', 'r_free', 'r_work', 'r_observed'):
        csv_data[column] = csv_data['PDB_ID'].map(
            lambda x, column=column: all_data.get(x, {}).get(column)
        )

    # save output file
    csv_data.to_csv("output.csv", index=False, sep=';')

# Top-level await: only valid where the cell itself runs inside an event loop
# (e.g. IPython/Jupyter), not in a plain Python script.
await main()


### Title Analyzer

The title of each structure is analyzed to obtain clues about potentially complexed ligands.

In [13]:
import pandas as pd
import asyncio
import nest_asyncio
from aiohttp import ClientSession
from asyncio import Semaphore

# Patch asyncio via nest_asyncio (presumably so asyncio.run can be used from
# within the notebook's already running event loop).
nest_asyncio.apply()

# Input table and the distinct PDB_IDs it contains (NaN values are ignored).
input_df = pd.read_csv('input.csv', sep=';')
pdb_id_column = input_df['PDB_ID'].dropna()
pdb_ids = pdb_id_column.unique()

# at most 5 title lookups at the same time
MAX_CONCURRENT_REQUESTS = 5
semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)

# The title of each PDB_ID is searched for the following text passages.
# List order is the priority: the first passage that occurs in the title wins,
# so more specific passages are listed before the general ones they contain.
search_terms = [
    # "in complex with ..." variants
    "in complex with an ",
    "in complex with a ",
    "in complex with the ",
    "in complex with ",
    "complexed with ",
    # "bound ..." variants
    "bound to an ",
    "bound to a ",
    "bound to the ",
    "bound to ",
    "bound with ",
    # "with ..." variants
    "with small molecule ",
    "with bound ",
    "with an ",
    "with "
]

async def fetch_title_info(pdb_id, session):
    """Fetch the title of one PDB entry and derive ligand hints from it.

    Args:
        pdb_id: PDB identifier inserted into the entry URL.
        session: open aiohttp ClientSession used for the request.

    Returns:
        Tuple ``(pdb_id, ligand_info, says_ago_act, says_anta_inhi,
        says_complex)``. ``ligand_info`` is the title text following the first
        matching entry of ``search_terms`` (or "no hint"); the three flags are
        "Yes"/"No". If the entry cannot be loaded, the defaults
        ``"no hint", "No", "No", "No"`` are returned.
    """
    url = f'https://data.rcsb.org/rest/v1/core/entry/{pdb_id}'
    max_attempts = 5
    # the module-level semaphore limits how many entries are processed at once
    async with semaphore:
        for attempt in range(max_attempts):
            # exponential backoff unless the server tells us how long to wait
            delay = 2 ** attempt
            try:
                async with session.get(url) as response:
                    if response.status == 200:
                        data = await response.json()
                        # `or` fallbacks also cover a missing or null struct/title
                        title = ((data.get('struct') or {}).get('title') or '').strip()
                        lower_title = title.lower()

                        # check for title_ligand_info
                        ligand_info = "no hint"
                        for term in search_terms:
                            position = lower_title.find(term)
                            if position != -1:
                                # extract the text after the matched term
                                after_text = title[position + len(term):].strip()
                                ligand_info = after_text if after_text else "no hint"
                                break

                        # check for title_says_agonist_activator
                        activating_words = ["agonist", "activator"]
                        says_ago_act = "Yes" if any(word in lower_title for word in activating_words) else "No"

                        # check for title_says_antagonist_inhibitor
                        inhibiting_words = ["antagonist", "inhibitor"]
                        says_anta_inhi = "Yes" if any(word in lower_title for word in inhibiting_words) else "No"

                        # check for title_says_complex_complexed
                        # ("complexed" contains "complex", so one test covers both)
                        says_complex = "Yes" if "complex" in lower_title else "No"

                        return pdb_id, ligand_info, says_ago_act, says_anta_inhi, says_complex
                    elif response.status == 429:
                        # rate limit, wait and retry
                        header = response.headers.get('Retry-After', '')
                        # Retry-After may also be an HTTP date; fall back to 5 seconds then
                        delay = int(header) if header.isdigit() else 5
                    else:
                        # any other status is treated as final, no retry
                        print(f"Failed to load data for PDB_ID {pdb_id}: Status {response.status}")
                        break
            except Exception as e:
                # report instead of silently swallowing; Exception (unlike a
                # bare except) lets task cancellation propagate
                print(f"Failed to load title for {pdb_id}, retry {attempt + 1}: {e}")
            # no point in waiting after the last attempt
            if attempt < max_attempts - 1:
                await asyncio.sleep(delay)
        # default in case everything fails
        return pdb_id, "no hint", "No", "No", "No"

async def process_all(pdb_ids):
    """Run fetch_title_info for every PDB_ID and return the list of results.

    All lookups share one ClientSession and are awaited together.
    """
    async with ClientSession() as session:
        pending = []
        for pdb_id in pdb_ids:
            pending.append(fetch_title_info(pdb_id, session))
        return await asyncio.gather(*pending)

# execute: run the coroutine to completion, one result tuple per PDB_ID
results = asyncio.run(process_all(pdb_ids))

# result dataframe; column order matches the tuples returned by fetch_title_info
columns = [
    'PDB_ID',
    'title_ligand_info',
    'title_says_agonist_activator',
    'title_says_antagonist_inhibitor',
    'title_says_complex_complexed',
]
results_df = pd.DataFrame(results, columns=columns)

# left-join the new columns onto the input table and save the result
updated_df = pd.merge(input_df, results_df, how='left', on='PDB_ID')
updated_df.to_csv('output.csv', sep=';', index=False)


### Mutation Information Extractor

The basic information "Mutation(s): Yes / No" is unfortunately not available via the API, so a workaround via browser is used.

In [15]:
import pandas as pd
import asyncio
import nest_asyncio
from aiohttp import ClientSession
from asyncio import Semaphore
from selenium import webdriver
from selenium.webdriver.common.by import By

# Patch asyncio via nest_asyncio (presumably so coroutines can be driven from
# within the notebook's already running event loop).
nest_asyncio.apply()

# The single webdriver is guarded by this lock: only one coroutine may use
# the browser at a time.
driver_lock = asyncio.Lock()

# load data: input table and its distinct PDB_IDs (NaN values are ignored)
input_df = pd.read_csv('input.csv', sep=';')
pdb_id_column = input_df['PDB_ID'].dropna()
pdb_ids = pdb_id_column.unique()

# concurrency limit; NOTE(review): the functions of this cell serialize on
# driver_lock and do not appear to use this semaphore - confirm
MAX_CONCURRENT_REQUESTS = 5
semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)

# start webdriver, opens a Chrome browser window
driver = webdriver.Chrome()

# load mutation info from header_mutation
async def fetch_method(pdb_id, session):
    """Read the mutation flag of one PDB entry from its RCSB structure page.

    Args:
        pdb_id: PDB identifier inserted into the page URL.
        session: unused here; the page is loaded through the shared webdriver.

    Returns:
        Tuple ``(pdb_id, status)`` where status is "Yes" if the text of the
        ``header_mutation`` element contains "Yes", "No" if it does not, and
        "Not found" if the element did not appear within 10 seconds.
    """
    # local imports: these names are not imported at the top of this cell
    from selenium.common.exceptions import NoSuchElementException, TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    url = f'https://www.rcsb.org/structure/{pdb_id}'
    # the browser calls below are blocking; the lock keeps page loads strictly sequential
    async with driver_lock:
        driver.get(url)
        try:
            # wait for the element instead of failing at once, so one missing
            # or late-rendered header does not abort the whole run
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//li[contains(@id,'header_mutation')]"))
            )
            mutation = element.text
        except (NoSuchElementException, TimeoutException):
            return pdb_id, "Not found"
    return pdb_id, "Yes" if "Yes" in mutation else "No"

async def get_mutation_status(pdb_ids):
    """Run fetch_method for every PDB_ID and return a {pdb_id: status} dict."""
    async with ClientSession() as session:
        pending = []
        for pdb_id in pdb_ids:
            pending.append(fetch_method(pdb_id, session))
        pairs = await asyncio.gather(*pending)
    # each result is a (pdb_id, mutation_status) tuple
    return dict(pairs)

mutation_status = await get_mutation_status(pdb_ids)

# merge and save results
mutation_status_df = pd.DataFrame(list(mutation_status.items()), columns=['PDB_ID', 'Mutations'])
output_df = input_df.merge(mutation_status_df, on='PDB_ID', how='left')

output_df.to_csv('output.csv', index=False, sep=';')

# close browser window
driver.close()


### Release Date Extractor

In [16]:
import pandas as pd
import asyncio
import nest_asyncio
from aiohttp import ClientSession
from asyncio import Semaphore
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Patch asyncio via nest_asyncio (presumably so coroutines can be driven from
# within the notebook's already running event loop).
nest_asyncio.apply()

# The single webdriver is guarded by this lock: only one coroutine may use
# the browser at a time.
driver_lock = asyncio.Lock()

# read input: table and its distinct PDB_IDs (NaN values are ignored)
mothersheet_df = pd.read_csv('input.csv', sep=';')
pdb_id_column = mothersheet_df['PDB_ID'].dropna()
pdb_ids = pdb_id_column.unique()

# concurrency limit; NOTE(review): the functions of this cell serialize on
# driver_lock and do not appear to use this semaphore - confirm
MAX_CONCURRENT_REQUESTS = 5
semaphore = Semaphore(MAX_CONCURRENT_REQUESTS)

# start webdriver, opens a Chrome browser window
driver = webdriver.Chrome()

async def fetch_method(pdb_id, session):
    """Read the release date of one PDB entry from its RCSB structure page.

    Args:
        pdb_id: PDB identifier inserted into the page URL.
        session: unused here; the page is loaded through the shared webdriver.

    Returns:
        Tuple ``(pdb_id, date)`` where date is the text following the last
        "Released:" marker of the ``header_deposited-released-dates`` element,
        or "Not found" if the element or the marker is missing.
    """
    url = f'https://www.rcsb.org/structure/{pdb_id}'
    # the browser calls below are blocking; the lock keeps page loads strictly sequential
    async with driver_lock:
        driver.get(url)
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//li[contains(@id,'header_deposited-released-dates')]"))
            )
            # rpartition yields an empty marker when "Released:" is absent, so
            # unrelated element text is never stored as a date
            _, marker, released = element.text.rpartition("Released:")
            date = released.strip() if marker else ""
            if not date:
                date = "Not found"
        except (NoSuchElementException, TimeoutException):
            date = "Not found"
    return pdb_id, date

async def get_release_dates(pdb_ids):
    """Run fetch_method for every PDB_ID and return a {pdb_id: date} dict."""
    async with ClientSession() as session:
        pending = []
        for pdb_id in pdb_ids:
            pending.append(fetch_method(pdb_id, session))
        pairs = await asyncio.gather(*pending)
    # each result is a (pdb_id, release_date) tuple
    return dict(pairs)

release_dates = await get_release_dates(pdb_ids)

# merge and save results
release_dates_df = pd.DataFrame(list(release_dates.items()), columns=['PDB_ID', 'Release Date'])
output_df = mothersheet_df.merge(release_dates_df, on='PDB_ID', how='left')

output_df.to_csv('output.csv', index=False, sep=';')

# close browser window
driver.close()

After successful execution of all data extraction cells, the resulting CSV file was converted to an .xlsx file and the data was separated into different sheets within this file. For this purpose, the Excel macro SplitByTarget was used to generate one sheet per target and to separate the data into the corresponding sheets.

## Download of selected 3D Structures (.cif files)

### Generate download links

In [1]:
import csv

# Cell purpose: turn the first column of input.csv into one .cif download
# URL per line of links.csv.
input_path = "input.csv"
output_path = "links.csv"
base_url = "https://files.rcsb.org/download/{}.cif"

with open(input_path, newline="", encoding="utf-8-sig") as infile:
    with open(output_path, "w", newline="") as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(infile):
            # first cell holds the PDB_ID; empty rows and blank cells are skipped
            first_cell = row[0].strip() if row else ""
            if first_cell:
                pdb_id = first_cell.upper()
                writer.writerow([base_url.format(pdb_id)])


### Download files

In [None]:
import os
import pandas as pd
import requests

# Cell purpose: download every URL listed in links.csv into the cifs/ folder,
# retrying failed downloads with an increasing pause between attempts.
import time

csv_path = "links.csv"
output_dir = "cifs"
max_retries = 5
timeout = 10

os.makedirs(output_dir, exist_ok=True)
# links.csv has no header row; column 0 holds the URLs
df = pd.read_csv(csv_path, header=None)

for index, url in enumerate(df[0], start=1):
    url = url.strip()
    # the file name is the last path segment of the URL
    filename = url.split("/")[-1]
    output_path = os.path.join(output_dir, filename)

    print(f"[{index}/{len(df)}] Loading {filename} ...", end=" ")

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            # the file is only written after a successful response
            with open(output_path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            if attempt == max_retries:
                print(f"Error ({e})")
            else:
                # back off (1, 2, 4, ... seconds) instead of retrying at once
                time.sleep(2 ** (attempt - 1))
        else:
            print("Done")
            break
