TITLE: example.ipynb

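PURPOSE: Load a protein structure by PDB ID, draw its Ramachandran plot, compare principal component analysis (PCA) and anisotropic network model (ANM) modes with ProDy, and run an Evol conservation/coevolution analysis on the matching Pfam alignment.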

PROJECT: <GOES HERE>

AUTHOR: Jacob Lloyd North

INSTITUTION: Oregon State University

PRECONDITIONS: 

INPUTS: 

OUTPUTS: 

SECTION 1: Include commonly used libraries

In [None]:
# Maths and visualization libraries
import numpy as np          # NumPy
import scipy        # Import all of SciPy
import networkx 
# import pathpy2
import matplotlib.pyplot as plt     # Matplotlib
import umap

In [None]:
%matplotlib inline

In [None]:
# Machine learning libraries
# import sklearn
# import torch             # Import all of PyTorch
# import fastai            # Import all of FastAi
# import pydbm            # boltzmann machines

In [None]:
# BIOLOGY-SPECIFIC LIBRARIES
# Bioinformatics
import biopandas
import Bio          # Biopython
import pdbtools     # Useful for dealing with pdbs

# Structural Biology
import RamachanDraw
# import biskit       # BUILD FAILS
# import aleph        # Molecular replacement library
from ensemblator.ensemblator_core import analyze, prepare_input     # Clark, Brereton, Karplus

# MD analysis
import mdtraj           # Import all of MDTraj
import MDAnalysis

# Protein Dynamics
import prody as pd            # Protein dynamics
# import pydtmc           # discrete-time markov chains
# Normal modes of motion
# import pydmd            # Dynamic mode decomposition
# import pynamical        # Dynamical systems 
import pyemma
# import pypcazip

In [None]:
# PHYSICAL CHEMISTRY LIBRARIES

# Quantum chemistry libraries
# import quantum_dynamics
# import qutip

# Statistical thermodynamics
# import curp         # energy (heat) flow analysis -- ONLY in Python2 currently!

In [None]:
# Cellular biology
# import pysces       # Will copy stuff to Pysces directory for model!

In [None]:
# UTILITY LIBRARIES
import wget         # to download pdb files

In [None]:
# DEBUG
# import mdbenchmark  # For optimizing core usage in low-resources machines

SECTION 2: MAIN

In [None]:
# Get PDB
# Ask which structure to analyse and how to label the results. Surrounding
# whitespace is stripped so that a stray space cannot end up in the PDB lookup
# or in the output file names built from these two values further down.
PDB_id = input("Please enter a PDB ID:").strip()
pro_name = input("Enter a name label:").strip()

# Print Ramachandran plot of the protein
from RamachanDraw import fetch, phi_psi, plot

# Fetch the structure once and reuse the result: the plot and the angle
# extraction both need it, and calling fetch() twice repeats the retrieval.
fetched_structure = fetch(PDB_id)
plot(fetched_structure)     # Draw the Ramachandran plot

# Generating a dictionary to store the phi and psi angles, also return the ignored AA
phi_psi_dict, ignored_res = phi_psi(fetched_structure, return_ignored=True)

# ProDy testing
# Full structure parsed with ProDy (imported as `pd` in this notebook); the
# statistics and PCA cells below read `prot`.
prot = pd.parsePDB(PDB_id)

In [None]:
# Summary statistics for the parsed structure, one "label value" line each.
# The values are wrapped in callables so each one is computed right before it
# is printed, in the order listed.
structure_summary = (
    ("Radius of gyration:", lambda: pd.calcGyradius(prot)),
    ("Number of atoms:", lambda: prot.numAtoms()),
    ("Number of Coordinate sets:", lambda: prot.numCoordsets()),
    ("Number of residues:", lambda: prot.numResidues()),
)
for summary_label, compute_value in structure_summary:
    print(summary_label, compute_value())

# Last expression of the cell, so its result is what the notebook displays.
pd.showProtein(prot)

PRINCIPAL COMPONENT ANALYSIS

In [None]:
# Build the C-alpha ensemble that the PCA cell operates on.
pro = pd.parsePDB(PDB_id, subset='calpha')

# Keep residues whose residue number is below the residue count of the full
# structure `prot`.
# NOTE(review): residue numbers need not start at 1 or be contiguous, and the
# count comes from `prot` rather than from the C-alpha subset, so this cutoff
# may not select what is intended -- confirm.
resnum_cutoff = prot.numResidues()
pro_selection = pro.select(f'resnum < {resnum_cutoff}')

pro_ensemble = pd.Ensemble(pro_selection)
# pro_ensemble.setCoords()
pro_ensemble.iterpose()     # superposition step applied to the ensemble before PCA

In [None]:
# Run PCA on the ensemble prepared in the previous cell.
# The user-supplied label `pro_name` is passed as the constructor argument,
# the covariance is built from `pro_ensemble`, and calcModes() is called with
# no arguments, so the library's default number of modes applies.
pca = pd.PCA(pro_name)
pca.buildCovariance(pro_ensemble)
pca.calcModes()

In [None]:
# Observe the top 6 ranked principal components: print the fraction of
# variance attributed to each one, rounded to two decimals.
for principal_mode in pca[:6]:
    variance_fraction = pd.calcFractVariance(principal_mode)
    print(variance_fraction.round(2))
# Save the principal modes
# pd.saveModel(pca)

ANISOTROPIC NETWORK MODEL


In [None]:
# Build an anisotropic network model from `pro_selection`, the same atom
# selection the PCA ensemble was created from, and label it with `pro_name`.
anm = pd.ANM(pro_name) # instantiate ANM object
anm.buildHessian(pro_selection) # build Hessian matrix for selected atoms
anm.calcModes() # calculate normal modes (no arguments: library defaults apply)
# saveModel(anm)

In [None]:
# Individual modes are reached by indexing the model; index 0 is kept as
# `slowest_mode`, then the mode and its eigenvalue (3 decimals) are printed.
slowest_mode = anm[0]
print(slowest_mode)
slowest_eigenvalue = slowest_mode.getEigval()
print(slowest_eigenvalue.round(3))

In [None]:
# Confirm mode orthogonality: print the dot product of the first mode with
# each of the next two modes, rounded to 10 decimals.
for other_index in (1, 2):
    print((anm[0] * anm[other_index]).round(10))

COMPARING EXPERIMENTAL AND THEORETICAL RESULTS

In [None]:
# Compare the first six PCA modes with the first six ANM modes, first as a
# printed overlap table and then as a plotted one.
pca_modes = pca[:6]
anm_modes = anm[:6]
pd.printOverlapTable(pca_modes, anm_modes)
pd.showOverlapTable(pca_modes, anm_modes)

DATA OUTPUT

In [None]:
# Write the first six PCA modes in NMD format (for the normal mode wizard).
# The file name is "<label>_<PDB id>_pca.nmd", built from the two user inputs.
pca_nmd_name = f"{pro_name}_{PDB_id}_pca.nmd"
pd.writeNMD(pca_nmd_name, pca[:6], pro_selection)
# writeArray('ubi_pca_modes.txt', pca.getArray(), format='%8.3f')     # text

In [None]:
# Write the first six ANM modes in NMD format (for the normal mode wizard).
# The file name is "<label>_<PDB id>_anm.nmd", built from the two user inputs.
anm_nmd_name = f"{pro_name}_{PDB_id}_anm.nmd"
pd.writeNMD(anm_nmd_name, anm[:6], pro_selection)

In [None]:
# pd.pathVMD('/Users/jacobnorth/Applications/VMD\ 1.9.4.app/Contents/MacOS/startup.command')
# pd.viewNMDinVMD('ubi_pca.nmd')

EVOLUTIONARY ANALYSIS - EVOL

In [None]:
# Download the full MSA file for protein family
# `pkey` is created here, before anything reads it, so the notebook runs top to
# bottom on a fresh kernel. The former stand-alone inspection of `pkey` ran
# before this definition and only worked with leftover kernel state.
pkey = pd.searchPfam(PDB_id).keys()      # obtain the key
pfam_keys = list(pkey)
if not pfam_keys:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No Pfam entry found for " + repr(PDB_id))
print("Pfam keys found:", pfam_keys)      # replaces the old debug display of the first key

full_MSA = pd.fetchPfamMSA(pfam_keys[0])        # Fetch the full MSA for the first key
msa = pd.parseMSA(full_MSA)        # Parse the MSA

# Refine MSA to remove gappy entries
# NOTE(review): `label` presumably has to match a sequence label that exists in
# the alignment -- confirm that `pro_name` is entered accordingly.
msa_refine = pd.refineMSA(msa, label=pro_name, rowocc=0.8, seqid=0.98)

In [None]:
# Occupancy calculation
# ProDy is imported as `pd` in this notebook, so every ProDy function in this
# cell is reached through that prefix; the bare names are not defined.
pd.showMSAOccupancy(msa_refine, occ='res')
min_occupancy = pd.calcMSAOccupancy(msa_refine, occ='res').min()   # Find the minimum
print("Minimum residue occupancy:", min_occupancy)     # a mid-cell expression is not displayed, so print it

# Shannon entropy
entropy = pd.calcShannonEntropy(msa_refine)
# No `indices` argument is passed: no variable of that name is defined in the
# visible notebook, so the function's default residue numbering is used.
pd.showShannonEntropy(entropy)

# Mutual information: raw matrix, a normalised variant and a corrected variant
mutinfo = pd.buildMutinfoMatrix(msa_refine)
mutinfo_norm = pd.applyMutinfoNorm(mutinfo, entropy, norm='minent')
mutinfo_corr = pd.applyMutinfoCorr(mutinfo, corr='apc')
pd.showMutinfoMatrix(mutinfo)
pd.showMutinfoMatrix(mutinfo_corr, clim=[0, mutinfo_corr.max()], xlabel=pro_name)

# Save the raw mutual-information matrix as text, named after the user label
pd.writeArray(pro_name + 'array.txt', mutinfo)
# Sequence-structure comparison - http://prody.csb.pitt.edu/tutorials/evol_tutorial/comparison.html