# Sample From Mesh Files & Load Proteins

In this notebook, point clouds are created from mesh files using Poisson disk sampling, and candidate protein structures are queried from the RCSB PDB.

In [6]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import matplotlib.pyplot as plt
import open3d as o3d
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def load_mesh_and_sample_points(file_name, num_points=5000):
    """Read a triangle mesh from disk and sample a point cloud from it.

    Args:
        file_name: Path of the mesh file, passed to ``o3d.io.read_triangle_mesh``.
        num_points: Number of points requested from Poisson disk sampling.
            The default of 5000 is the value indicated in the thesis.

    Returns:
        The sampled points converted with ``np.asarray``
        (presumably of shape ``(num_points, 3)`` -- TODO confirm).

    Raises:
        ValueError: If the loaded mesh contains no triangles.
    """
    mesh = o3d.io.read_triangle_mesh(file_name)
    # Fail early and name the offending file; without this check an unreadable
    # or empty mesh only surfaces later as an error inside the sampler.
    if not mesh.has_triangles():
        raise ValueError(f"No triangles could be read from mesh file: {file_name!r}")
    pc = mesh.sample_points_poisson_disk(num_points)
    return np.asarray(pc.points)

In [None]:
# Load data and convert to point clouds
path = 'mesh_files_new'
# np.savez does not create missing directories, so make sure the target exists.
os.makedirs('point_clouds', exist_ok=True)
count = 0
# os.listdir returns entries in arbitrary order; sorting keeps the running
# counter in the output file names reproducible between runs. Iterating the
# list directly (instead of an enumerate object) lets tqdm infer the total.
for file in tqdm(sorted(os.listdir(path))):
    file_path = os.path.join(path, file)
    if os.path.isfile(file_path):
        # The class label consists of the letters of the file name without its
        # extension; splitext also handles extensions that are not 3 characters.
        stem = os.path.splitext(file)[0]
        label = "".join(s for s in stem if s.isalpha())
        point_cloud = load_mesh_and_sample_points(file_path)
        np.savez('point_clouds/' + label + str(count) + '.npz', objects=point_cloud, classes=label)
        count += 1

In [None]:
# This step was only needed once
# Can be ignored
# Gathers every saved point cloud and its class label into two parallel lists.
path = 'point_clouds'
objects = []
labels = []
for file in tqdm(sorted(os.listdir(path))):
    file_path = os.path.join(path, file)
    # Skip the aggregate archive written by the next cell: it lives in the same
    # directory but stores 'labels' instead of 'classes', so re-running this
    # cell would otherwise fail on it.
    if file == 'all_point_clouds.npz' or not os.path.isfile(file_path):
        continue
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading
    # untrusted files -- only use it on archives produced by this project.
    with np.load(file_path, allow_pickle=True) as load_table:
        # Read each array once; indexing the archive inside the loop below
        # would re-read the whole array from disk on every iteration.
        cloud_objects = load_table['objects']
        cloud_classes = load_table['classes']
    if cloud_objects.ndim > 2:
        # The archive holds several clouds stacked along the first axis.
        for i in range(cloud_objects.shape[0]):
            objects.append(cloud_objects[i])
            labels.append(cloud_classes[i])
    else:
        objects.append(cloud_objects)
        labels.append(cloud_classes)

In [None]:
# Persist the collected clouds and labels as one archive (np.savez appends '.npz')
aggregate_file = 'point_clouds/all_point_clouds'
np.savez(aggregate_file, objects=objects, labels=labels)

In [7]:
# Load the aggregated archive and keep its two arrays as [objects, labels]
load_clouds = np.load('point_clouds/all_point_clouds.npz', allow_pickle=True)
point_cloud_data = [load_clouds[key] for key in ('objects', 'labels')]

In [None]:
def _new_axes(projection=None):
    """Create a 5x5 inch figure with a single subplot and return that subplot."""
    figure = plt.figure(figsize=(5, 5))
    return figure.add_subplot(111, projection=projection)


def _scatter_by_class(ax, data, columns, use_row_index=False, **scatter_kwargs):
    """Draw one scatter series per distinct value of ``data['class']`` and add a legend.

    Args:
        ax: Axes object to draw on.
        data: Table indexed by column name that supports boolean-mask row
            selection (presumably a pandas DataFrame -- TODO confirm).
        columns: Names of the columns passed to ``ax.scatter`` as coordinates.
        use_row_index: If True, ``0..len(rows)-1`` of each class is prepended
            as the first coordinate.
        **scatter_kwargs: Extra keyword arguments forwarded to ``ax.scatter``.
    """
    # Map classes to colors: the i-th distinct class gets the i-th tab10 entry
    unique_classes = data['class'].unique()
    colors = plt.cm.tab10(range(len(unique_classes)))

    # Plot points by class
    for class_label, color in zip(unique_classes, colors):
        class_data = data[data['class'] == class_label]
        coordinates = [class_data[column] for column in columns]
        if use_row_index:
            coordinates.insert(0, np.arange(len(class_data)))
        ax.scatter(*coordinates, color=color, label=class_label, **scatter_kwargs)

    ax.legend()


def create_scatterplot_3d(data, descriptor, columns=('evrap_x', 'evrap_y', 'evrap_z')):
    """Plot three columns of ``data`` as a 3D scatterplot, colored by class.

    Args:
        data: Table with a 'class' column and the columns named in ``columns``.
        descriptor: Text used as the plot title.
        columns: Names of the x, y and z columns.
    """
    ax = _new_axes(projection='3d')
    _scatter_by_class(ax, data, columns, s=50)
    ax.view_init(80, 10)
    ax.set_title(descriptor)


def create_scatterplot_2d(data, descriptor, columns=('samp_x', 'samp_y')):
    """Plot two columns of ``data`` as a 2D scatterplot, colored by class.

    Args:
        data: Table with a 'class' column and the columns named in ``columns``.
        descriptor: Text used as the plot title.
        columns: Names of the x and y columns.
    """
    ax = _new_axes()
    _scatter_by_class(ax, data, columns, s=50)
    ax.set_title(descriptor)


def create_scatterplot_1d(data, descriptor):
    """Plot the column ``descriptor`` against the per-class row position, colored by class.

    Args:
        data: Table with a 'class' column and a column named ``descriptor``.
        descriptor: Name of the plotted column; also used as the plot title.
    """
    ax = _new_axes()
    _scatter_by_class(ax, data, [descriptor], use_row_index=True)
    ax.set_title(descriptor)

## Runtime Overview

The descriptors used have the following complexities. 

- EVARP: $O(nm)$
- SAMP: $O(nm)$
- SCOMP: $O(n \cdot m \log m)$
- SIRM: $O(nm)$
- Shell Model: $O(nm)$
- Sector Model: $O(nm)$
- Combined Model: $O(n(m + m)) = O(nm)$
- FPFH: $O(nmk)$

Notation:
- n: Number of point clouds
- m: Number of points per point cloud
- k: A hyperparameter of FPFH (relatively small), similar to k in k-NN

## Load Proteins

In [1]:
import requests

In [14]:
# RCSB Search API references:
#   https://search.rcsb.org/index.html#building-search-request
#   https://search.rcsb.org/index.html#search-example-4
url = 'https://search.rcsb.org/rcsbsearch/v2/query'

# Structure-similarity query (structural similarity is what we want) for
# assembly 1 of entry 1CLL, returning the first 20 matching entries.
query_dict = dict(
    query=dict(
        type="terminal",
        service="structure",
        parameters=dict(
            value=dict(entry_id="1CLL", assembly_id="1"),
        ),
    ),
    return_type="entry",
    request_options=dict(
        paginate=dict(start=0, rows=20),
    ),
)

In [15]:
# Send the search request. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status surfaces HTTP errors here instead
# of as a confusing JSON decoding failure on the next line.
response = requests.post(url, json=query_dict, timeout=30)
response.raise_for_status()
results = response.json()

In [16]:
# Display the parsed JSON body of the search response
results

{'query_id': '557f5d0b-56ab-4cf9-b0bc-ec4ae1aa2023',
 'result_type': 'entry',
 'total_count': 18,
 'result_set': [{'identifier': '1CLL', 'score': 1.0},
  {'identifier': '1CLM', 'score': 0.8264158267282838},
  {'identifier': '3CLN', 'score': 0.7895704167549658},
  {'identifier': '1OSA', 'score': 0.73525576468782},
  {'identifier': '5E1P', 'score': 0.7281498103877214},
  {'identifier': '5E1N', 'score': 0.7145077657697297},
  {'identifier': '5E1K', 'score': 0.7074197905830122},
  {'identifier': '1EXR', 'score': 0.7040225418508469},
  {'identifier': '2V02', 'score': 0.7028028258758835},
  {'identifier': '2V01', 'score': 0.6910993479091333},
  {'identifier': '4CLN', 'score': 0.6565741132447246},
  {'identifier': '1UP5', 'score': 0.5124798037022233},
  {'identifier': '4BW8', 'score': 0.4312863158696412},
  {'identifier': '1OOJ', 'score': 0.4233689857933641},
  {'identifier': '4BW7', 'score': 0.20946535079797327},
  {'identifier': '1RFJ', 'score': 0.045702729285794365},
  {'identifier': '5A2H

In [17]:
# Collect the identifier of every hit in the search result set
pdb_ids = []
for entry in results["result_set"]:
    pdb_ids.append(entry["identifier"])
print(pdb_ids)

['1CLL', '1CLM', '3CLN', '1OSA', '5E1P', '5E1N', '5E1K', '1EXR', '2V02', '2V01', '4CLN', '1UP5', '4BW8', '1OOJ', '4BW7', '1RFJ', '5A2H', '6DAH']


In [26]:
# Fetch the entry-level metadata of 1CLL from the RCSB Data API. The timeout
# avoids an indefinitely hanging cell; raise_for_status reports HTTP errors
# before the body is parsed as JSON.
response = requests.get("https://data.rcsb.org/rest/v1/core/entry/1CLL", timeout=30)
response.raise_for_status()
result = response.json()

In [30]:
# Display the 'struct' section of the entry metadata
result["struct"]

{'title': 'CALMODULIN STRUCTURE REFINED AT 1.7 ANGSTROMS RESOLUTION'}

In [28]:
# Download the mmCIF file of one entry into the current working directory
pdb_id = "1CLL"
# NOTE: this rebinds the notebook-level `url` that the search request cell
# above also reads; re-run the query definition cell before searching again.
url = f"https://files.rcsb.org/download/{pdb_id}.cif"
response = requests.get(url, timeout=30)
if response.status_code == 200:
    with open(f"{pdb_id}.cif", "wb") as f:
        f.write(response.content)
    print(f"Downloaded {pdb_id}.cif")
else:
    # Report the failure instead of silently writing nothing
    print(f"Failed to download {pdb_id}.cif (HTTP {response.status_code})")

Downloaded 1CLL.cif


In [27]:
# List the top-level sections available in the entry metadata
result.keys()

dict_keys(['audit_author', 'cell', 'citation', 'diffrn', 'diffrn_radiation', 'entry', 'exptl', 'exptl_crystal', 'pdbx_audit_revision_category', 'pdbx_audit_revision_details', 'pdbx_audit_revision_group', 'pdbx_audit_revision_history', 'pdbx_audit_revision_item', 'pdbx_database_status', 'rcsb_accession_info', 'rcsb_entry_container_identifiers', 'rcsb_entry_info', 'rcsb_primary_citation', 'refine', 'refine_hist', 'refine_ls_restr', 'software', 'struct', 'struct_keywords', 'symmetry', 'rcsb_id'])

In [24]:
# Display the 'cell' section of the entry metadata
result['cell']

{'angle_alpha': 93.62,
 'angle_beta': 97.3,
 'angle_gamma': 89.17,
 'length_a': 30.17,
 'length_b': 53.6,
 'length_c': 25.14,
 'zpdb': 1}

Problem: RCSB only records whether a protein is in its native conformation in some cases, so we cannot guarantee that the clusters we build contain similar items. AlphaFold DB, on the other hand, has no tool to query for similar proteins.

Generally, what do we want to cluster? Should we just retrieve proteins with similar structure from the database, since they commonly also have similar functions? This introduces a strong bias, but is probably the only way.

Next: How similar should proteins be based on their similarity score?

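Protein suggestions (TODO: verify each ID against its RCSB entry before use; at least some descriptions do not match, e.g. 4HHB is the deoxyhemoglobin entry, not myoglobin):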

- 1A4U - Hemoglobin, a protein responsible for oxygen transport in the blood.
- 1GZX - Lysozyme, an enzyme that breaks down bacterial cell walls.
- 1UBQ - Ubiquitin, a small regulatory protein involved in protein degradation.
- 3MHT - DNA polymerase I, an enzyme that synthesizes DNA molecules.
- 4HHB - Myoglobin, a protein that stores oxygen in muscle tissues.
- 6VXX - SARS-CoV-2 spike glycoprotein, involved in viral entry into host cells.
- 2RH1 - Beta-2 adrenergic receptor, a G-protein coupled receptor (GPCR).
- 5XTL - Insulin receptor, important for glucose metabolism regulation.
- 3KZ8 - Cytochrome c oxidase, involved in the electron transport chain.
- 2C9T - Glutamate receptor, a ligand-gated ion channel in the nervous system.