# Trajectory Length Analysis

This notebook analyzes the trajectory lengths and amino acid counts for molecular dynamics simulations in the run1 and run2 directories.

In [22]:
import os
import glob
import pandas as pd
import numpy as np
import mdtraj as md
from collections import defaultdict
import sys
import nglview as nv
%matplotlib inline
import sys, os
from math import ceil
import seaborn as sb
from scipy.spatial.transform import Rotation as R
from scipy import stats
import tables as tb
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from mpl_toolkits.mplot3d import Axes3D
import ipywidgets as widgets
import subprocess as sp
# import matplotlib.matlab.hist as hist

sys.path.append('../utils')
import mdtraj_upside as mu

In [2]:
def get_trajectory_files():
    """Get all trajectory files from run1 and run2 directories."""
    run1_files = glob.glob('../run1/*-out_*.h5')
    run2_files = glob.glob('../run2/*-out_*.h5')
    
    all_files = run1_files + run2_files
    print(f"Found {len(run1_files)} files in run1 and {len(run2_files)} files in run2")
    print(f"Total files: {len(all_files)}")
    
    return all_files
def extract_pdb_id(filename):
    """Extract PDB ID from filename by splitting on '-' and taking first element."""
    basename = os.path.basename(filename)
    return basename.split('-')[0]
def get_unique_pdb_files(all_files):
    """Get one representative file for each unique PDB ID."""
    pdb_to_file = {}
    
    for filepath in all_files:
        pdb_id = extract_pdb_id(filepath)
        if pdb_id not in pdb_to_file:
            pdb_to_file[pdb_id] = filepath
    
    print(f"Found {len(pdb_to_file)} unique PDB IDs:")
    for pdb_id in sorted(pdb_to_file.keys()):
        print(f"  {pdb_id}: {os.path.basename(pdb_to_file[pdb_id])}")
    
    return pdb_to_file

get_unique_pdb_files(get_trajectory_files())

Found 0 files in run1 and 0 files in run2
Total files: 0
Found 0 unique PDB IDs:


{}

In [3]:
up_indir = r"/persistent/inputs"
base_dir = r"/persistent/inputss"
pdb_id = "a3d"
traj_fn = os.path.join(up_indir,"{}.run.up".format(pdb_id))

init_fn = os.path.join(up_indir,"{}.initial.npy".format(pdb_id))
fasta_fn = os.path.join(up_indir,"{}.fasta".format(pdb_id))

traj = mu.load_upside_traj(traj_fn,stride=1)

In [4]:
def protein_length(traj):
    """
    Returns the number of amino acid residues in the protein(s) in the trajectory.

    Parameters
    ----------
    traj : md.Trajectory
        The MDTraj trajectory object.

    Returns
    -------
    int
        Number of amino acid residues in the protein(s).
    """
    # List of standard amino acid residue names
    standard_aa = set([
        'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY',
        'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER',
        'THR', 'TRP', 'TYR', 'VAL'
    ])
    
    # Count residues that are amino acids
    count = 0
    for residue in traj.topology.residues:
        if residue.name in standard_aa:
            count += 1
    return count

protein_length(traj)

73

In [5]:
def center_of_mass(traj):
    xyz = traj.xyz
    # Get masses (n_atoms,)
    if traj.topology is not None and traj.topology.n_atoms == traj.n_atoms:
        masses = np.array([atom.element.mass for atom in traj.topology.atoms])
    else:
        raise ValueError("Trajectory must have topology with elemental masses.")

    masses = masses.astype(np.float64)
    total_mass = np.sum(masses)

    # Compute center of mass for each frame
    com = np.sum(xyz * masses[None, :, None], axis=1) / total_mass  # shape: (n_frames, 3)

    return com

coms=center_of_mass(traj)

def radius_of_gyration(traj):
    # Get positions (n_frames, n_atoms, 3)
    xyz = traj.xyz
    # Get masses (n_atoms,)
    if traj.topology is not None and traj.topology.n_atoms == traj.n_atoms:
        masses = np.array([atom.element.mass for atom in traj.topology.atoms])
    else:
        raise ValueError("Trajectory must have topology with elemental masses.")

    masses = masses.astype(np.float64)
    total_mass = np.sum(masses)

    # Compute center of mass for each frame
    com = np.sum(xyz * masses[None, :, None], axis=1) / total_mass  # shape: (n_frames, 3)

    # Compute squared distances from COM for each atom and frame
    diff = xyz - com[:, None, :]  # shape: (n_frames, n_atoms, 3)
    sq_dist = np.sum(diff ** 2, axis=2)  # shape: (n_frames, n_atoms)

    # Weighted sum and sqrt for each frame
    rg_sq = np.sum(masses[None, :] * sq_dist, axis=1) / total_mass  # shape: (n_frames,)
    rg = np.sqrt(rg_sq)
    return rg

rgs=radius_of_gyration(traj)

In [6]:
view = nv.NGLWidget()
view.add_trajectory(traj)
view.clear_representations()
view.add_cartoon(color='gray')

view.camera = 'orthographic'
view.parameters = {
    "clipNear": -100, "clipFar": 1000,
    "fogNear": 0, "fogFar": 1000,
}
view

NGLWidget(max_frame=28308)

In [10]:
sphere_radii = 1 / np.array(rgs) * 10

positions = coms  # shape: (n_spheres, 3)
radii = sphere_radii

view = nv.NGLWidget()
view.add_trajectory(traj)
view.clear_representations()
view.add_cartoon(color='gray')

# Create a Shape object
shape = nv.Shape('spheres')
for pos, rad in zip(positions, radii):
    shape.add_sphere(list(pos), color=[0, 1, 0], radius=float(rad))  # green spheres

view.add_shape(shape)

view.camera = 'orthographic'
view.parameters = {
    "clipNear": -100, "clipFar": 1000,
    "fogNear": 0, "fogFar": 1000,
}

view

AttributeError: module 'nglview' has no attribute 'Shape'

In [16]:
def process_pdb_files(source_dir, command_script):
    """
    Finds all .pdb files in the source directory and executes a command on them.
    
    Args:
        source_dir (str): The path to the directory containing PDB files.
        command_script (str): The Python script to execute on each PDB file.
    """
    print("--- Starting PDB File Processing ---")
    for filename in os.listdir(source_dir):
        if filename.lower().endswith('.pdb'):
            source_path = os.path.join(source_dir, filename)
            if os.path.isfile(source_path):
                print(f"Processing {filename}...")
                try:
                    # Command: python PDB_to_initial_sturcture.py path/to/file.pdb
                    sp.run(['python', command_script, source_path], check=True, capture_output=True, text=True)
                except sp.CalledProcessError as e:
                    print(f"  -> Error processing {filename}.")
                    print(f"  -> Stderr: {e.stderr.strip()}")
                except FileNotFoundError:
                    print(f"  -> FATAL ERROR: Script '{command_script}' not found.")
                    return # Stop if the script is missing

def move_non_pdb_files(source_dir, dest_dir):
    """
    Moves all files that are not .pdb files from the source to the destination directory.
    
    Args:
        source_dir (str): The directory to move files from.
        dest_dir (str): The directory to move files to.
    """
    print("\n--- Moving Non-PDB Files ---")
    os.makedirs(dest_dir, exist_ok=True) # Ensure dest_dir exists
    
    for filename in os.listdir(source_dir):
        if not filename.lower().endswith('.pdb'):
            source_path = os.path.join(source_dir, filename)
            if os.path.isfile(source_path):
                print(f"Moving {filename}...")
                dest_path = os.path.join(dest_dir, filename)
                shutil.move(source_path, dest_path)



In [19]:
process_pdb_files("/persistent/data/native/","/upside2-md/py/PDB_to_initial_structure.py")

--- Starting PDB File Processing ---


FileNotFoundError: [Errno 2] No such file or directory: '/persistent/data/native/'

In [30]:
import os

# Use the relative path from the notebook's perspective
relative_path = '../data/native/'

print(f"Checking relative path: '{relative_path}'")
print(f"Does it exist? -> {os.path.exists(relative_path)}")
print(f"Is it a directory? -> {os.path.isdir(relative_path)}")

if os.path.exists(relative_path):
    print("\nSuccess! Contents:")
    print(os.listdir(relative_path)[:10]) # Print first 10 items

Checking relative path: '../data/native/'
Does it exist? -> False
Is it a directory? -> False


In [23]:
print("hello")

hello


NameError: name 'get_trajectory_files' is not defined