# Protein Visualizations

In [1]:
%pip install nglview

Collecting nglview
  Downloading nglview-3.1.4.tar.gz (21.9 MB)
     ---------------------------------------- 0.0/21.9 MB ? eta -:--:--
     ---------------------------------------- 0.2/21.9 MB 6.6 MB/s eta 0:00:04
     - -------------------------------------- 0.6/21.9 MB 7.5 MB/s eta 0:00:03
     - -------------------------------------- 1.0/21.9 MB 7.8 MB/s eta 0:00:03
     -- ------------------------------------- 1.4/21.9 MB 8.0 MB/s eta 0:00:03
     --- ------------------------------------ 1.8/21.9 MB 8.0 MB/s eta 0:00:03
     --- ------------------------------------ 2.2/21.9 MB 8.0 MB/s eta 0:00:03
     ---- ----------------------------------- 2.6/21.9 MB 8.1 MB/s eta 0:00:03
     ----- ---------------------------------- 2.9/21.9 MB 8.0 MB/s eta 0:00:03
     ----- ---------------------------------- 3.3/21.9 MB 8.0 MB/s eta 0:00:03
     ------ --------------------------------- 3.7/21.9 MB 8.0 MB/s eta 0:00:03
     ------- -------------------------------- 4.1/21.9 MB 8.1 MB/s eta 0:0


[notice] A new release of pip is available: 23.3.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
%pip install kinase-library

^C
Note: you may need to restart the kernel to use updated packages.


Collecting kinase-library
  Downloading kinase_library-1.1.0-py3-none-any.whl.metadata (10 kB)
Collecting adjustText~=1.3.0 (from kinase-library)
  Downloading adjustText-1.3.0-py3-none-any.whl.metadata (3.1 kB)
Collecting Bio~=1.7.1 (from kinase-library)
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting gseapy~=1.1.2 (from kinase-library)
  Downloading gseapy-1.1.7-cp311-cp311-win_amd64.whl.metadata (12 kB)
Collecting matplotlib~=3.8.3 (from kinase-library)
  Downloading matplotlib-3.8.4-cp311-cp311-win_amd64.whl.metadata (5.9 kB)
Collecting natsort~=8.3.1 (from kinase-library)
  Downloading natsort-8.3.1-py3-none-any.whl.metadata (22 kB)
Collecting numpy~=1.26.4 (from kinase-library)
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Collecting pandas~=2.2.3 (from kinase-library)
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pyarrow~=18.0.0 (from kinase-library)
  Downloading pyarrow-18.0.0-cp311-cp311-win_amd

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.14.0 requires tensorboard<2.15,>=2.14, but you have tensorboard 2.16.2 which is incompatible.

[notice] A new release of pip is available: 23.3.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nglview as nv
import ipywidgets as widgets
from IPython.display import display
from functools import partial
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import os


def coloring_func(id, data, cmap):
    """Look up the entry for ``id`` in ``data`` and map it through ``cmap``.

    Args:
        id: Key or index used to subscript ``data``.
        data: Subscriptable container holding one value per id.
        cmap: Callable applied to the looked-up value (e.g. a colormap).

    Returns:
        Whatever ``cmap`` returns for ``data[id]``.
    """
    value = data[id]
    return cmap(value)

def visualize_protein_predictions(pdb_file, binary_labels, predictions, attention_scores = None):
    """
    Build an NGLView widget whose protein surface is colored residue by residue.

    Up to three surface representations are added to the same widget, in this order:
    binary labels (green where the label equals 1, blue otherwise), prediction scores
    ('plasma' colormap) and, when supplied, attention scores ('plasma' colormap).
    In every view, entry ``i`` of the input colors residue number ``i + 1``.

    Args:
        pdb_file (str): Path to the PDB file of the protein structure.
        binary_labels (list or np.array): Binary labels (0 or 1), one per residue,
                                         in the same order as residues in the PDB.
        predictions (list or np.array): Prediction scores (0.0 to 1.0), one per residue,
                                        in the same order as residues in the PDB.
        attention_scores (list or np.array, optional): Attention scores, one per residue,
                                           in the same order as residues in the PDB.
                                           When None (default) the attention view is skipped.

    Returns:
        nglview.NGLWidget: The widget with the representations described above.
    """
    # Function-scope import so this block does not rely on a new top-level import.
    import matplotlib

    def _score_colors(scores, cmap):
        # One (hex color, residue number) pair per score; residue numbers start at 1.
        # NOTE(review): scores are handed to the colormap unchanged — presumably already
        # scaled to [0, 1]; confirm for attention scores.
        return [(colors.to_hex(cmap(score)), str(pos + 1)) for pos, score in enumerate(scores)]

    view = nv.show_structure_file(pdb_file)
    view.clear_representations() # Start with a clean view
    # The colormap registry replaces cm.get_cmap, which is deprecated and removed in matplotlib 3.9.
    cmap_pred = matplotlib.colormaps['plasma'] # Choose a colormap (e.g., 'viridis', 'plasma', 'RdBu')

    # --- View 1: Binary Labels ---
    print("Creating View 1: Binary Labels...")
    label_colors = [
        ["green" if label == 1 else "blue", str(pos + 1)]
        for pos, label in enumerate(binary_labels)
    ]
    label_scheme = nv.color._ColorScheme(label_colors, label='Binary labels')
    view.add_representation('surface','protein', color=label_scheme, label='Binary Labels')

    # --- View 2: Prediction Scores ---
    print("Creating View 2: Prediction Scores...")
    prediction_colors = _score_colors(predictions, cmap_pred)
    pred_scheme = nv.color._ColorScheme(prediction_colors, label='Prediction Scores')
    view.add_representation('surface', 'protein', color=pred_scheme, label='Prediction Scores')

    if attention_scores is not None:
        # --- View 3: Attention Scores ---
        print("Creating View 3: Attention Scores...")
        cmap_attn = matplotlib.colormaps['plasma'] # Choose a colormap (e.g., 'plasma', 'magma', 'YlOrRd')
        # Same 1-based residue numbering as the other views (previously 0-based here,
        # which shifted every color by one residue).
        attention_colors = _score_colors(attention_scores, cmap_attn)
        # Wrapped in a color scheme like the other two views rather than passing the raw list.
        attn_scheme = nv.color._ColorScheme(attention_colors, label='Attention Scores')
        view.add_representation('surface', 'protein', color=attn_scheme, label='Attention Scores')

    return view


In [None]:
import json
import pandas as pd

# Per-protein metadata indexed by protein id; the 'sequence' column gives each protein's length.
prot_info = pd.read_json('../data/phosphosite_sequences/phosphosite_df.json').set_index('id')
# NOTE(review): despite the .pt extension this file is parsed as JSON below — confirm the format.
pred_path = '../data/preds/encoder_S_60_focal_preds_annotated.pt'
with open(pred_path, 'r') as f:
    preds = json.load(f)

# Walk the predictions until the 6th protein that has a PDB file on disk.
# `view` ends up holding the widget for that protein (or for the last protein
# found, if fewer than 6 have a PDB file); the next cell displays it.
i = 0
for prot in preds:
    # os.path.join instead of hard-coded backslashes: the old Windows-only path never
    # existed on other platforms, so every protein was silently skipped there.
    path = os.path.join('..', 'data', 'pdbs', f'{prot}.pdb')
    if not os.path.exists(path):
        continue
    prot_preds = preds[prot]
    seq_len = len(prot_info.loc[prot, 'sequence'])
    residue_preds = prot_preds['preds']
    labels = prot_preds['labels']
    # Shifted by one before indexing — presumably 1-based residue positions; confirm.
    # A position of 0 would wrap around to the last residue.
    indices = np.asarray(prot_preds['pred_indices'])
    # Residues without a prediction/label stay at 0.
    pred_input = np.zeros(seq_len)
    pred_input[indices - 1] = residue_preds
    label_input = np.zeros(seq_len)
    label_input[indices - 1] = labels

    view = visualize_protein_predictions(path, label_input, pred_input)
    i += 1
    if i >= 6:
        print(prot_preds)
        print(labels)
        break

In [121]:
# Center the widget on the structure, then render it as this cell's output.
# NOTE(review): `view` is created by the prediction-loop cell; on a fresh kernel run
# that cell first (the recorded execution counts are out of order).
view.center()
# NOTE(review): the positional True presumably enables nglview's GUI panel — confirm
# against the nglview API.
view.display(True)

NGLWidget(gui_style='ngl')

In [82]:
from Bio.PDB import PDBParser

# Parse one structure with Biopython so its residues can be inspected directly.
parser = PDBParser()
# os.path.join keeps the path portable; the hard-coded backslashes only resolved on Windows.
# NOTE(review): the name `pdb` matches the stdlib debugger module; kept because the next cell reads it.
pdb = parser.get_structure('A0A024R4G9', os.path.join('..', 'data', 'pdbs', 'A0A024R4G9.pdb'))

In [83]:
# Number of residues in the parsed structure, counted without building a list.
sum(1 for _residue in pdb.get_residues())

117