# <ins>Project Off-Target Fingerprinting</ins>

# Jupyter Notebook for **Results Visualisation**

This Jupyter Notebook was used to properly color the matrix, generate the final fingerprints and project the fingerprint data into a UMAP.

## Fingerprint Generation

### Staining

The lower/better the docking score, the darker red the cell color - with an exponent applied: only exceptionally good scores appear dark red. This makes the fingerprints easier to read and helps spot patterns more quickly. Very high/bad scores above zero and missing values are colored white.

In [None]:
import os
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
from openpyxl.utils import column_index_from_string

# --- Settings --------------------------------------------------------------
target_file    = 'input.xlsx'
base, ext      = 'output', '.xlsx'
min_col, max_col = 'D', 'EI'
min_row, max_row = 9, 788
exponent       = 0.7  # exponent for a moderate red-white-gradient
white          = "FFFFFF"


def _to_number(raw):
    """Convert a non-empty cell value to ``float``.

    Strings may use a decimal comma. Every non-numeric value ends in a
    ``ValueError``: ``float()`` raises ``TypeError`` for non-string objects
    (for example dates), and that is mapped to ``ValueError`` here so the
    callers need only one ``except`` clause and a stray cell cannot abort
    the whole run.
    """
    if isinstance(raw, str):
        return float(raw.replace(',', '.'))
    try:
        return float(raw)
    except TypeError as err:
        raise ValueError(f"not numeric: {raw!r}") from err


def _data_cells(sheet):
    """Yield every cell of the configured data range, row by row."""
    for row_cells in sheet.iter_rows(min_row=min_row, max_row=max_row,
                                     min_col=min_col_idx, max_col=max_col_idx):
        yield from row_cells


def _gradient_color(score, best):
    """Return the 6-digit hex fill colour for one docking score.

    ``best`` is the lowest non-positive score of the data range. ``best``
    maps to pure red (FF0000), zero maps to white, and the exponent bends
    the scale in between. Scores above zero are white.
    """
    if score <= 0:
        span = 0 - best
        rel_lin = (score - best) / span if span != 0 else 1
        rel = max(0.0, min(rel_lin ** exponent, 1.0))
        gb = int(255 * rel)  # shared green/blue channel; red stays at FF
        return f"FF{gb:02X}{gb:02X}"
    return white


wb = load_workbook(target_file)
ws = wb.active

min_col_idx = column_index_from_string(min_col)
max_col_idx = column_index_from_string(max_col)

# Pass 1: find the best (lowest) non-positive score; stays 0 if there is none
min_neg = 0
for cell in _data_cells(ws):
    raw = cell.value
    if raw is None or raw == "":
        continue
    try:
        score = _to_number(raw)
    except ValueError:
        print(f"Error converting cell {cell.coordinate}, value: '{raw}'")
        continue  # skip the problematic cell
    if score <= 0:
        min_neg = min(min_neg, score)

# Pass 2: colour cells based on an exponential red-white-gradient.
# Empty and unreadable cells are filled white.
for cell in _data_cells(ws):
    raw = cell.value
    if raw is None or raw == "":
        color = white
    else:
        try:
            score = _to_number(raw)
        except ValueError:
            print(f"Error coloring cell {cell.coordinate}, value: '{raw}'")
            color = white
        else:
            color = _gradient_color(score, min_neg)

    cell.fill = PatternFill(fill_type="solid", start_color=color, end_color=color)

# Save under the first file name that is not taken yet (output.xlsx, output_1.xlsx, ...)
new_name = base + ext
i = 1
while os.path.exists(new_name):
    new_name = f"{base}_{i}{ext}"
    i += 1

wb.save(new_name)


### Survival of the fittest color

For some targets, there are still two or more docking scores that are currently colored differently. This needs to be simplified: better docking score = more intense red.

The best score for each target determines the color. The lighter color is removed and replaced by the darker one. However, the individual values remain in the cells, only the color is changed. By inspecting the values, it is still possible to see in detail whether a molecule docks better into the agonistic or antagonistic conformation of a protein.


Either enter the two (or more) columns to which the code should be applied by hand, or load them from a prepared CSV file.

In [None]:
import os
import re
import csv
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
from openpyxl.utils import column_index_from_string

from copy import copy  # cell-local import: used to duplicate the winning fill

# --- Settings --------------------------------------------------------------
target_file = 'input.xlsx'
base, ext   = 'output', '.xlsx'
input_csv   = 'helper_csv_survival_of_the_fittest.csv'  # note: csv without header
first_row, last_row = 9, 788  # sheet rows that are recoloured (inclusive)

wb = load_workbook(target_file)
ws = wb.active

mode = input("Select a csv file or manual input? Enter 'manual' or 'csv':\n> ").strip().lower()

# Column groups read from the CSV (first ';'-separated field of each line).
# Stays empty in manual mode or when the file cannot be read.
eingaben = []
if mode == 'csv':
    try:
        # utf-8-sig also accepts plain UTF-8 but drops a leading byte-order
        # mark, which would otherwise make the first entry fail validation.
        with open(input_csv, 'r', encoding='utf-8-sig', newline='') as f:
            eingaben = [row[0].strip()
                        for row in csv.reader(f, delimiter=';')
                        if row and row[0].strip()]
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error reading CSV: {e}")

index = 0
while True:
    # Take the next column group: CSV entries first, then interactive input
    if index < len(eingaben):
        cols_input = eingaben[index]
        print(f"[CSV] Input {index + 1}: {cols_input}")
        index += 1
    elif mode == 'csv':
        print("All entries from the CSV processed.")
        break
    else:
        cols_input = input(
            "Which columns should be recoloured according to the lowest value in the row??\n"
            "(Format: Y, Z, AA, ...; '#' to exit)\n> "
        ).strip()
        if cols_input == '#':
            print("Done.")
            break

    # validation: at least two comma-separated upper-case column names
    if not re.fullmatch(r'[A-Z]+(?:\s*,\s*[A-Z]+)+', cols_input):
        print(f"Invalid Format: '{cols_input}' - expecting at least two valid column names.")
        continue

    cols = [c.strip() for c in cols_input.split(',')]
    try:
        col_idxs = [column_index_from_string(c) for c in cols]
    except ValueError as e:
        print(f"Failed to convert columns '{cols_input}': {e}")
        continue

    # colour cells
    for row in range(first_row, last_row + 1):
        cells = [ws.cell(row=row, column=ci) for ci in col_idxs]
        vals = []
        skip = False
        for cell in cells:
            v = cell.value
            if v is None or v == "":
                vals.append(10.0)  # empty cells rank as a bad score (+10)
                continue
            try:
                vals.append(float(v.replace(',', '.')) if isinstance(v, str) else float(v))
            except (TypeError, ValueError):
                print(f"Non-numerical value in column {cell.coordinate}: '{v}' - skipped.")
                skip = True
                break
        if skip:
            continue

        # The cell with the lowest score dictates the colour of the group.
        # The fill object is copied as a whole instead of being rebuilt from
        # `.rgb`: an unfilled cell reports '00000000', which the previous
        # `or "FFFFFFFF"` fallback never caught and which was then written
        # back as a solid fill.
        best_fill = cells[vals.index(min(vals))].fill
        for cell in cells:
            cell.fill = copy(best_fill)

    print(f"{', '.join(cols)} columns processed.\n")

# Save under the first file name that is not taken yet
new_name = base + ext
i = 1
while os.path.exists(new_name):
    new_name = f"{base}_{i}{ext}"
    i += 1

wb.save(new_name)
print(f"Saved as {new_name}")


### Insert blank row after each row

This separates the individual fingerprints from each other.

Row height and column width were then adjusted in Excel to achieve a readable format.

In [None]:
import os
from openpyxl import load_workbook, Workbook
from openpyxl.cell.cell import MergedCell
from copy import copy

# Input workbook, output naming, and the first row that is followed by a spacer
target_file = 'input.xlsx'
base = 'output'
ext = '.xlsx'
start_row = 8

wb_in = load_workbook(target_file)
ws_in = wb_in.active
wb_out = Workbook()
ws_out = wb_out.active

# Carry the column widths over unchanged
for letter, source_dim in ws_in.column_dimensions.items():
    ws_out.column_dimensions[letter].width = source_dim.width

# Re-create only those merged ranges that begin above start_row
header_merges = [rng for rng in ws_in.merged_cells.ranges if rng.min_row < start_row]
for rng in header_merges:
    ws_out.merge_cells(str(rng))

max_row = ws_in.max_row
max_col = ws_in.max_column
out_row = 1
copied_styles = ("fill", "font", "border", "alignment")

for source_row in range(1, max_row + 1):
    # Keep the source row height where one is set
    row_dim = ws_in.row_dimensions.get(source_row)
    if row_dim and row_dim.height:
        ws_out.row_dimensions[out_row].height = row_dim.height

    for column in range(1, max_col + 1):
        origin = ws_in.cell(row=source_row, column=column)
        target = ws_out.cell(row=out_row, column=column)

        # Merged placeholder cells carry nothing to copy
        if isinstance(origin, MergedCell):
            continue

        target.value = origin.value
        for style_name in copied_styles:
            setattr(target, style_name, copy(getattr(origin, style_name)))
        target.number_format = origin.number_format

    # From start_row onwards every copied row is followed by one empty row
    out_row += 2 if source_row >= start_row else 1

# Pick the first output name that does not exist yet
i = 1
new_name = f"{base}{ext}"
while os.path.exists(new_name):
    new_name, i = f"{base}_{i}{ext}", i + 1

wb_out.save(new_name)
print(f"Saved as {new_name}")


## UMAP Generation

### Survival of the fittest value

Similar to 'survival of the fittest color', except that here the worse values from two (or more) selected columns are replaced by the better ones.

The goal was to represent the similarities of the visualized fingerprints in a UMAP. This creates a temporary file, which is then used to generate the UMAP.

In [None]:
import os
import re
import csv
from openpyxl import load_workbook
from openpyxl.utils import column_index_from_string

# --- Settings --------------------------------------------------------------
target_file = 'input.xlsx'
base, ext   = 'output', '.xlsx'
input_csv   = 'helper_csv_survival_of_the_fittest.csv'
first_row, last_row = 9, 788  # sheet rows that are processed (inclusive)

wb = load_workbook(target_file)
ws = wb.active

mode = input("Select a csv file or manual input? Enter 'manual' or 'csv':\n> ").strip().lower()

# Column groups read from the CSV (first ';'-separated field of each line).
# Stays empty in manual mode or when the file cannot be read.
eingaben = []
if mode == 'csv':
    try:
        # utf-8-sig also accepts plain UTF-8 but drops a leading byte-order
        # mark, which would otherwise make the first entry fail validation.
        with open(input_csv, 'r', encoding='utf-8-sig', newline='') as f:
            eingaben = [row[0].strip()
                        for row in csv.reader(f, delimiter=';')
                        if row and row[0].strip()]
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error reading CSV: {e}")

index = 0
while True:
    # Take the next column group: CSV entries first, then interactive input
    if index < len(eingaben):
        cols_input = eingaben[index]
        print(f"[CSV] Enter {index + 1}: {cols_input}")
        index += 1
    elif mode == 'csv':
        break
    else:
        # The entries are column letters, so the prompt asks for columns
        cols_input = input("Columns (z.B. A, B; '#' to exit)\n> ").strip()
        if cols_input == '#':
            break

    # At least two comma-separated upper-case column names are required
    if not re.fullmatch(r'[A-Z]+(?:\s*,\s*[A-Z]+)+', cols_input):
        print("Invalid format.")
        continue

    cols = [c.strip() for c in cols_input.split(',')]
    try:
        col_idxs = [column_index_from_string(c) for c in cols]
    except ValueError as e:
        print(f"Error processing column: {e}")
        continue

    for row in range(first_row, last_row + 1):
        cells = [ws.cell(row=row, column=ci) for ci in col_idxs]
        numbers = []   # parsed scores of the non-empty cells in this row
        readable = True

        for cell in cells:
            val = cell.value
            if val is None or val == "":
                continue
            try:
                numbers.append(float(val.replace(',', '.')) if isinstance(val, str) else float(val))
            except (TypeError, ValueError):
                print(f"Non-numerical value in cell {cell.coordinate}: '{val}'")
                readable = False
                break

        # Rows with an unreadable cell or without any score stay untouched
        if not readable or not numbers:
            continue

        # The best (lowest) available score replaces every cell of the group.
        # This also covers groups of three or more columns in which only some
        # values are missing; such rows were previously skipped.
        best = min(numbers)
        for cell in cells:
            cell.value = best

    print(f"{', '.join(cols)} columns processed.\n")

# Save under the first file name that is not taken yet
new_name = base + ext
i = 1
while os.path.exists(new_name):
    new_name = f"{base}_{i}{ext}"
    i += 1

wb.save(new_name)
print(f"Saved as: {new_name}")
print(f"Path: {os.path.abspath(new_name)}")


### Bad-Fingerprint Identification

Empty cells get the value +10. This distorts the data in such a way that compounds with bad fingerprints (i.e., compounds with a large number of missing values) form a clearly visible cluster. These can then be removed or flagged.

In [None]:
import pandas as pd
import numpy as np
import umap.umap_ as umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Input workbook and the sheet region that is read for the UMAP
excel_file = 'input.xlsx'
# First and last sheet row of the data (1-based, inclusive)
data_range_rows = (9, 788)
# Excel column span of the score columns
data_range_cols = "D:EI"
# Column whose entries are used as point labels and for group matching
id_col = 'B'
# File the finished plot is written to
output_pdf = "BAD_FINGERPRINTS.pdf"

# Names highlighted as "Bad Fingerprints" - one name per line
bad_fingerprints_raw = """
ERYTHROMYCIN
TADALAFIL
PIMECROLIMUS
CICLOSPORIN
CLARITHROMYCIN
MIDOSTAURIN
TACROLIMUS
RIFAMPICIN
SIROLIMUS
ORITAVANCIN
TEMSIROLIMUS
ANIDULAFUNGIN
EVEROLIMUS
MICAFUNGIN
VANCOMYCIN
TERLIPRESSIN
MIVACURIUM
CASPOFUNGIN
CETRORELIX
BACITRACIN
TRIPTORELIN
NYSTATIN
AMPHOTERICIN B
ATOSIBAN
LEDIPASVIR
OMBITASVIR
ELBASVIR
CISATRACURIUM
ATRACURIUM
OXYTOCIN
FIDAXOMICIN
CARBETOCIN
ROCURONIUM
DESMOPRESSIN
POSACONAZOLE
CABAZITAXEL
RIFABUTIN
GLECAPREVIR
AZITHROMYCIN
RIFAXIMIN
DOCETAXEL
VINORELBINE
VINCRISTINE
VINBLASTINE
VINFLUNINE
TRABECTEDIN
VINDESINE
"""

# Further highlight groups in the same one-name-per-line format; all empty here
gruppe1_raw = """ """
gruppe2_raw = """ """
gruppe3_raw = """ """
gruppe4_raw = """ """
gruppe5_raw = """ """
gruppe6_raw = """ """
gruppe7_raw = """ """
gruppe8_raw = """ """
gruppe9_raw = """ """
gruppe10_raw = """ """

def parse_raw_list(raw_string):
    """Turn a multi-line block of names into a set of upper-case names.

    Each non-blank line becomes one entry, stripped of surrounding
    whitespace. Entries are upper-cased because the ids they are compared
    with are upper-cased before the lookup; a mixed-case entry could
    otherwise never match.

    Args:
        raw_string: Text with one name per line; blank lines are ignored.

    Returns:
        The set of normalized names (empty if there is no non-blank line).
    """
    return {line.strip().upper() for line in raw_string.splitlines() if line.strip()}

# One row per highlight group: (label, raw name list, point colour, label colour).
# The order matters: a compound is coloured by the first group that lists it.
_group_specs = [
    ("Bad Fingerprints", bad_fingerprints_raw, "#eeeeee", "dimgray"),
    ("Group 1", gruppe1_raw, "#ff3030", "black"),
    ("Group 2", gruppe2_raw, "#e67e22", "black"),
    ("Group 3", gruppe3_raw, "#f1c232", "black"),
    ("Group 4", gruppe4_raw, "hotpink", "black"),
    ("Group 5", gruppe5_raw, "#b57edc", "black"),
    ("Group 6", gruppe6_raw, "#5dade2", "black"),
    ("Group 7", gruppe7_raw, "#1f77b4", "black"),
    ("Group 8", gruppe8_raw, "#c0ff3e", "black"),
    ("Group 9", gruppe9_raw, "mediumseagreen", "black"),
    ("Group 10", gruppe10_raw, "#cd853f", "black"),
]

# Mapping: group label -> parsed member set plus the two plot colours
groups = {
    label: {"data": parse_raw_list(raw_names), "color": point_color, "text": label_color}
    for label, raw_names, point_color, label_color in _group_specs
}

# Translate the 1-based, inclusive sheet rows into pandas' skip/count arguments
skiprows = data_range_rows[0] - 1
nrows = data_range_rows[1] - data_range_rows[0] + 1


def _as_score(x):
    """Parse one cell value; decimal commas are accepted, '' counts as +10."""
    return float(str(x).replace(',', '.')) if x != "" else 10.0


# header=None is required: by default pandas takes the first row after the
# skipped ones as column labels, which would drop the first data row
# (data_range_rows[0]) from the matrix and shift the window down by one row.
df = pd.read_excel(excel_file, usecols=data_range_cols, skiprows=skiprows, nrows=nrows,
                   header=None, engine='openpyxl')
df.columns = df.columns.astype(str)
df = df.fillna(10)  # missing scores are set to +10
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated
# since pandas 2.1; the per-cell conversion is unchanged.
df = df.apply(lambda column: column.map(_as_score))

# The ids are read with the same row window so that they stay aligned with df
id_df = pd.read_excel(excel_file, usecols=id_col, skiprows=skiprows, nrows=nrows,
                      header=None, engine='openpyxl')
ids = id_df.iloc[:, 0].astype(str).tolist()
ids_upper = [name.upper() for name in ids]

# Standardize the columns, then project to two dimensions
scaler = StandardScaler()
X = scaler.fit_transform(df)

reducer = umap.UMAP(n_neighbors=15, min_dist=0.9, metric='euclidean', random_state=42) # these parameters can be adjusted to achieve different looks
embedding = reducer.fit_transform(X)

# Per-compound plot colours, in the same order as `ids`
point_colors = []
text_colors = []

# The loop variable is deliberately not called `id`: at notebook scope that
# name would replace the builtin id() for all following cells.
for compound in ids:
    key = compound.upper()
    # The first group (in definition order) that lists the compound wins
    style = next((g for g in groups.values() if key in g["data"]), None)
    if style is None:
        # Compounds outside every group: grey dot, black label
        point_colors.append("#999999")
        text_colors.append("black")
    else:
        point_colors.append(style["color"])
        text_colors.append(style["text"])

# Index the ids once by their upper-case form instead of scanning the list
# for every group entry; ids that share an upper-case form are all kept.
ids_by_key = {}
for original_id, key in zip(ids, ids_upper):
    ids_by_key.setdefault(key, []).append(original_id)

# Report for every group which entries were found among the ids
for group_name, group_info in groups.items():
    not_found = set()
    for entry in group_info["data"]:
        hits = ids_by_key.get(entry)
        if not hits:
            not_found.add(entry)
            continue
        for original_id in hits:
            print(f"{group_name}: {original_id} marked")
    if not not_found:
        print(f"All entries from {group_name} marked.")
    else:
        # Sorted so that the report reads the same on every run
        print(f"Not found in {group_name}: {', '.join(sorted(not_found))}")

# Draw every compound as one dot in its assigned colour
plt.figure(figsize=(34, 24), dpi=300)
umap_x, umap_y = embedding[:, 0], embedding[:, 1]
plt.scatter(umap_x, umap_y, s=180, alpha=0.8, color=point_colors)

def wrap_label(label, max_line_len=7):
    """Break a label into two lines of nearly equal length if it is too long.

    Surrounding whitespace is removed first. A label of at most
    ``max_line_len`` characters is returned as is; a longer one is cut in
    the middle, with the first line taking the extra character for odd
    lengths.
    """
    text = label.strip()
    if len(text) > max_line_len:
        midpoint = (len(text) + 1) // 2
        return f"{text[:midpoint]}\n{text[midpoint:]}"
    return text

# Write each compound name centred on its dot, wrapped onto two lines if long
label_style = {"fontsize": 3.5, "ha": 'center', "va": 'center'}
for position, (name, ink) in enumerate(zip(ids, text_colors)):
    plt.text(embedding[position, 0], embedding[position, 1],
             wrap_label(name, max_line_len=7), color=ink, **label_style)

# Axis decoration
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.title("UMAP of Fingerprints", fontsize=16)
plt.grid(True)
plt.tight_layout()

# Write the PDF first, then display the figure
plt.savefig(output_pdf, format='pdf')
plt.show()


### UMAP (without distortion)

Empty cells get the average value of the respective dimension.

In [None]:
import pandas as pd
import numpy as np
import umap.umap_ as umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# File and range settings
# Input workbook that is read for the UMAP
excel_file = 'input.xlsx'
# First and last sheet row of the data (1-based, inclusive)
data_range_rows = (9, 788)
# Excel column span of the score columns
data_range_cols = "D:EI"
# Column whose entries are used as point labels and for group matching
id_col = 'B'
# File the finished plot is written to
output_pdf = "UMAP.pdf"

# Group definitions
# Names highlighted as "Bad Fingerprints" - one name per line
bad_fingerprints_raw = """
ERYTHROMYCIN
TADALAFIL
PIMECROLIMUS
CICLOSPORIN
CLARITHROMYCIN
MIDOSTAURIN
TACROLIMUS
RIFAMPICIN
SIROLIMUS
ORITAVANCIN
TEMSIROLIMUS
ANIDULAFUNGIN
EVEROLIMUS
MICAFUNGIN
VANCOMYCIN
TERLIPRESSIN
MIVACURIUM
CASPOFUNGIN
CETRORELIX
BACITRACIN
TRIPTORELIN
NYSTATIN
AMPHOTERICIN B
ATOSIBAN
LEDIPASVIR
OMBITASVIR
ELBASVIR
CISATRACURIUM
ATRACURIUM
OXYTOCIN
FIDAXOMICIN
CARBETOCIN
ROCURONIUM
DESMOPRESSIN
POSACONAZOLE
CABAZITAXEL
RIFABUTIN
GLECAPREVIR
AZITHROMYCIN
RIFAXIMIN
DOCETAXEL
VINORELBINE
VINCRISTINE
VINBLASTINE
VINFLUNINE
TRABECTEDIN
VINDESINE
"""

# Further highlight groups in the same one-name-per-line format; all empty here
gruppe1_raw = """ """
gruppe2_raw = """ """
gruppe3_raw = """ """
gruppe4_raw = """ """
gruppe5_raw = """ """
gruppe6_raw = """ """
gruppe7_raw = """ """
gruppe8_raw = """ """
gruppe9_raw = """ """
gruppe10_raw = """ """

# Convert raw strings to sets
def parse_raw_list(raw_string):
    """Turn a multi-line block of names into a set of upper-case names.

    Each non-blank line becomes one entry, stripped of surrounding
    whitespace. Entries are upper-cased because the ids they are compared
    with are upper-cased before the lookup; a mixed-case entry could
    otherwise never match.

    Args:
        raw_string: Text with one name per line; blank lines are ignored.

    Returns:
        The set of normalized names (empty if there is no non-blank line).
    """
    return {line.strip().upper() for line in raw_string.splitlines() if line.strip()}

# One row per highlight group: (label, raw name list, point colour, label colour).
# The order matters: a compound is coloured by the first group that lists it.
_group_specs = [
    ("Bad Fingerprints", bad_fingerprints_raw, "#eeeeee", "dimgray"),
    ("Group 1", gruppe1_raw, "#ff3030", "black"),
    ("Group 2", gruppe2_raw, "#e67e22", "black"),
    ("Group 3", gruppe3_raw, "#f1c232", "black"),
    ("Group 4", gruppe4_raw, "hotpink", "black"),
    ("Group 5", gruppe5_raw, "#b57edc", "black"),
    ("Group 6", gruppe6_raw, "#5dade2", "black"),
    ("Group 7", gruppe7_raw, "#1f77b4", "black"),
    ("Group 8", gruppe8_raw, "#c0ff3e", "black"),
    ("Group 9", gruppe9_raw, "mediumseagreen", "black"),
    ("Group 10", gruppe10_raw, "#cd853f", "black"),
]

# Mapping: group label -> parsed member set plus the two plot colours
groups = {
    label: {"data": parse_raw_list(raw_names), "color": point_color, "text": label_color}
    for label, raw_names, point_color, label_color in _group_specs
}

# Read Excel data
# Translate the 1-based, inclusive sheet rows into pandas' skip/count arguments
skiprows = data_range_rows[0] - 1
nrows = data_range_rows[1] - data_range_rows[0] + 1


def _as_score(x):
    """Parse one cell value; decimal commas are accepted, missing stays NaN."""
    return float(str(x).replace(',', '.')) if pd.notnull(x) else np.nan


# header=None is required: by default pandas takes the first row after the
# skipped ones as column labels, which would drop the first data row
# (data_range_rows[0]) from the matrix and shift the window down by one row.
df = pd.read_excel(excel_file, usecols=data_range_cols, skiprows=skiprows, nrows=nrows,
                   header=None, engine='openpyxl')
df.columns = df.columns.astype(str)
df = df.replace("", np.nan)
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated
# since pandas 2.1; the per-cell conversion is unchanged.
df = df.apply(lambda column: column.map(_as_score))

# The ids are read with the same row window so that they stay aligned with df
id_df = pd.read_excel(excel_file, usecols=id_col, skiprows=skiprows, nrows=nrows,
                      header=None, engine='openpyxl')
ids = id_df.iloc[:, 0].astype(str).tolist()
ids_upper = [name.upper() for name in ids]

# Standardize and apply UMAP
# Missing scores are replaced by the mean of their column before scaling
scaler = StandardScaler()
X = scaler.fit_transform(df.fillna(df.mean()))

reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42) # these parameters can be adjusted to achieve different looks
embedding = reducer.fit_transform(X)

# Assign colors
# Per-compound plot colours, in the same order as `ids`
point_colors = []
text_colors = []

# The loop variable is deliberately not called `id`: at notebook scope that
# name would replace the builtin id() for all following cells.
for compound in ids:
    key = compound.upper()
    # The first group (in definition order) that lists the compound wins
    style = next((g for g in groups.values() if key in g["data"]), None)
    if style is None:
        # Compounds outside every group: grey dot, black label
        point_colors.append("#999999")
        text_colors.append("black")
    else:
        point_colors.append(style["color"])
        text_colors.append(style["text"])

# Group match logging
# Index the ids once by their upper-case form instead of scanning the list
# for every group entry; ids that share an upper-case form are all kept.
ids_by_key = {}
for original_id, key in zip(ids, ids_upper):
    ids_by_key.setdefault(key, []).append(original_id)

# Report for every group which entries were found among the ids
for group_name, group_info in groups.items():
    not_found = set()
    for entry in group_info["data"]:
        hits = ids_by_key.get(entry)
        if not hits:
            not_found.add(entry)
            continue
        for original_id in hits:
            print(f"{group_name}: {original_id} marked")
    if not not_found:
        print(f"All entries from {group_name} marked.")
    else:
        # Sorted so that the report reads the same on every run
        print(f"Not found in {group_name}: {', '.join(sorted(not_found))}")

# Plot UMAP: every compound becomes one dot in its assigned colour
plt.figure(figsize=(34, 24), dpi=300)
umap_x, umap_y = embedding[:, 0], embedding[:, 1]
plt.scatter(umap_x, umap_y, s=180, alpha=0.8, color=point_colors)

def wrap_label(label, max_line_len=7):
    """Break a label into two lines of nearly equal length if it is too long.

    Surrounding whitespace is removed first. A label of at most
    ``max_line_len`` characters is returned as is; a longer one is cut in
    the middle, with the first line taking the extra character for odd
    lengths.
    """
    text = label.strip()
    if len(text) > max_line_len:
        midpoint = (len(text) + 1) // 2
        return f"{text[:midpoint]}\n{text[midpoint:]}"
    return text

# Write each compound name centred on its dot, wrapped onto two lines if long
label_style = {"fontsize": 3.5, "ha": 'center', "va": 'center'}
for position, (name, ink) in enumerate(zip(ids, text_colors)):
    plt.text(embedding[position, 0], embedding[position, 1],
             wrap_label(name, max_line_len=7), color=ink, **label_style)

# Axis decoration
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.title("UMAP Fingerprints", fontsize=16)
plt.grid(True)
plt.tight_layout()

# Write the PDF first, then display the figure
plt.savefig(output_pdf, format='pdf')
plt.show()
