In [1]:
from Bio.PDB import *
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import nglview as nv
from IPython.display import display



In [2]:
# Parse the mmCIF file into a structure labelled "ABL1", then hand it to nglview.
# The bare `view` expression at the end makes the widget the cell's output.
cif_parser = MMCIFParser()
structure = cif_parser.get_structure(
    structure_id="ABL1",
    filename="data/3oxz.cif",
)
view = nv.show_biopython(structure)
view

NGLWidget()

In [3]:
# Read the same mmCIF file through MMCIF2Dict and report how many entries
# the resulting object holds.
mmcif_dict = MMCIF2Dict.MMCIF2Dict(filename="data/3oxz.cif")
len(mmcif_dict)

614

In [4]:
# Walk the structure one model at a time and print every residue that
# the model's get_residues() iterator yields.
for model in structure:
    model_residues = model.get_residues()
    for residue in model_residues:
        print(residue)

<Residue GLY het=  resseq=228 icode= >
<Residue SER het=  resseq=229 icode= >
<Residue PRO het=  resseq=230 icode= >
<Residue ASN het=  resseq=231 icode= >
<Residue TYR het=  resseq=232 icode= >
<Residue ASP het=  resseq=233 icode= >
<Residue LYS het=  resseq=234 icode= >
<Residue TRP het=  resseq=235 icode= >
<Residue GLU het=  resseq=236 icode= >
<Residue MET het=  resseq=237 icode= >
<Residue GLU het=  resseq=238 icode= >
<Residue ARG het=  resseq=239 icode= >
<Residue THR het=  resseq=240 icode= >
<Residue ASP het=  resseq=241 icode= >
<Residue ILE het=  resseq=242 icode= >
<Residue THR het=  resseq=243 icode= >
<Residue MET het=  resseq=244 icode= >
<Residue LYS het=  resseq=245 icode= >
<Residue HIS het=  resseq=246 icode= >
<Residue LYS het=  resseq=247 icode= >
<Residue LEU het=  resseq=248 icode= >
<Residue GLY het=  resseq=249 icode= >
<Residue GLY het=  resseq=250 icode= >
<Residue GLY het=  resseq=251 icode= >
<Residue GLN het=  resseq=252 icode= >
<Residue TYR het=  resseq

In [5]:
# Collect every residue of the structure into a list.
# get_residues() hands back a one-shot iterator (per the Biopython docs);
# materializing it immediately keeps `residues` reusable in later cells
# instead of leaving an exhausted iterator bound to that name.
residues = list(structure.get_residues())
residues

[<Residue GLY het=  resseq=228 icode= >,
 <Residue SER het=  resseq=229 icode= >,
 <Residue PRO het=  resseq=230 icode= >,
 <Residue ASN het=  resseq=231 icode= >,
 <Residue TYR het=  resseq=232 icode= >,
 <Residue ASP het=  resseq=233 icode= >,
 <Residue LYS het=  resseq=234 icode= >,
 <Residue TRP het=  resseq=235 icode= >,
 <Residue GLU het=  resseq=236 icode= >,
 <Residue MET het=  resseq=237 icode= >,
 <Residue GLU het=  resseq=238 icode= >,
 <Residue ARG het=  resseq=239 icode= >,
 <Residue THR het=  resseq=240 icode= >,
 <Residue ASP het=  resseq=241 icode= >,
 <Residue ILE het=  resseq=242 icode= >,
 <Residue THR het=  resseq=243 icode= >,
 <Residue MET het=  resseq=244 icode= >,
 <Residue LYS het=  resseq=245 icode= >,
 <Residue HIS het=  resseq=246 icode= >,
 <Residue LYS het=  resseq=247 icode= >,
 <Residue LEU het=  resseq=248 icode= >,
 <Residue GLY het=  resseq=249 icode= >,
 <Residue GLY het=  resseq=250 icode= >,
 <Residue GLY het=  resseq=251 icode= >,
 <Residue GLN he

In [6]:
# Ask the Selection helper to unfold the structure down to level "R";
# the result is displayed as the cell output.
Selection.unfold_entities(entity_list=structure, target_level="R")

[<Residue GLY het=  resseq=228 icode= >,
 <Residue SER het=  resseq=229 icode= >,
 <Residue PRO het=  resseq=230 icode= >,
 <Residue ASN het=  resseq=231 icode= >,
 <Residue TYR het=  resseq=232 icode= >,
 <Residue ASP het=  resseq=233 icode= >,
 <Residue LYS het=  resseq=234 icode= >,
 <Residue TRP het=  resseq=235 icode= >,
 <Residue GLU het=  resseq=236 icode= >,
 <Residue MET het=  resseq=237 icode= >,
 <Residue GLU het=  resseq=238 icode= >,
 <Residue ARG het=  resseq=239 icode= >,
 <Residue THR het=  resseq=240 icode= >,
 <Residue ASP het=  resseq=241 icode= >,
 <Residue ILE het=  resseq=242 icode= >,
 <Residue THR het=  resseq=243 icode= >,
 <Residue MET het=  resseq=244 icode= >,
 <Residue LYS het=  resseq=245 icode= >,
 <Residue HIS het=  resseq=246 icode= >,
 <Residue LYS het=  resseq=247 icode= >,
 <Residue LEU het=  resseq=248 icode= >,
 <Residue GLY het=  resseq=249 icode= >,
 <Residue GLY het=  resseq=250 icode= >,
 <Residue GLY het=  resseq=251 icode= >,
 <Residue GLN he

In [7]:
# Split the structure into polypeptide fragments and collect one summary
# dict of sequence statistics per fragment in `all_seqs`.
polypeptide_builder = CaPPBuilder()
all_seqs = []
# enumerate(..., start=1) supplies the 1-based fragment number, replacing a
# hand-maintained counter that had to be initialised and incremented separately.
for seq_number, pp in enumerate(polypeptide_builder.build_peptides(structure), start=1):
    seq = pp.get_sequence()
    # ProteinAnalysis is given the sequence as a plain string.
    analyzed_seq = ProteinAnalysis(str(seq))
    # Keys are kept in the original order so the displayed dicts look the same.
    seq_info = {
        'Sequence Number': seq_number,
        'Sequence': seq,
        'Sequence Length': len(seq),
        'Molecular Weight': analyzed_seq.molecular_weight(),
        'GRAVY': analyzed_seq.gravy(),
        'AA Count': analyzed_seq.count_amino_acids(),
        'AA Percent': analyzed_seq.get_amino_acids_percent(),
        'Secondary Structure': analyzed_seq.secondary_structure_fraction(),
    }
    all_seqs.append(seq_info)

In [8]:
# Render each per-fragment summary dict with IPython's rich display.
# The loop variable is named `seq_info` rather than `seq` so that the
# notebook-level `seq` (a sequence object set while building `all_seqs`)
# is not rebound to a dict by this cell.
for seq_info in all_seqs:
    display(seq_info)

{'Sequence Number': 1,
 'Sequence': Seq('GSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTL'),
 'Sequence Length': 46,
 'Molecular Weight': 5294.966800000002,
 'GRAVY': -0.78695652173913,
 'AA Count': {'A': 1,
  'C': 0,
  'D': 2,
  'E': 4,
  'F': 0,
  'G': 6,
  'H': 1,
  'I': 1,
  'K': 6,
  'L': 3,
  'M': 2,
  'N': 1,
  'P': 1,
  'Q': 1,
  'R': 1,
  'S': 2,
  'T': 4,
  'V': 4,
  'W': 2,
  'Y': 4},
 'AA Percent': {'A': 0.021739130434782608,
  'C': 0.0,
  'D': 0.043478260869565216,
  'E': 0.08695652173913043,
  'F': 0.0,
  'G': 0.13043478260869565,
  'H': 0.021739130434782608,
  'I': 0.021739130434782608,
  'K': 0.13043478260869565,
  'L': 0.06521739130434782,
  'M': 0.043478260869565216,
  'N': 0.021739130434782608,
  'P': 0.021739130434782608,
  'Q': 0.021739130434782608,
  'R': 0.021739130434782608,
  'S': 0.043478260869565216,
  'T': 0.08695652173913043,
  'V': 0.08695652173913043,
  'W': 0.043478260869565216,
  'Y': 0.08695652173913043},
 'Secondary Structure': (0.3043478260869565,
  0.

{'Sequence Number': 2,
 'Sequence': Seq('VEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQ...GLS'),
 'Sequence Length': 106,
 'Molecular Weight': 12182.06579999999,
 'GRAVY': 0.05943396226415092,
 'AA Count': {'A': 8,
  'C': 3,
  'D': 3,
  'E': 11,
  'F': 5,
  'G': 4,
  'H': 3,
  'I': 5,
  'K': 6,
  'L': 14,
  'M': 4,
  'N': 6,
  'P': 3,
  'Q': 3,
  'R': 5,
  'S': 4,
  'T': 4,
  'V': 10,
  'W': 0,
  'Y': 5},
 'AA Percent': {'A': 0.07547169811320754,
  'C': 0.02830188679245283,
  'D': 0.02830188679245283,
  'E': 0.10377358490566038,
  'F': 0.04716981132075472,
  'G': 0.03773584905660377,
  'H': 0.02830188679245283,
  'I': 0.04716981132075472,
  'K': 0.05660377358490566,
  'L': 0.1320754716981132,
  'M': 0.03773584905660377,
  'N': 0.05660377358490566,
  'P': 0.02830188679245283,
  'Q': 0.02830188679245283,
  'R': 0.04716981132075472,
  'S': 0.03773584905660377,
  'T': 0.03773584905660377,
  'V': 0.09433962264150944,
  'W': 0.0,
  'Y': 0.04716981132075472},
 'Secondary Structure': (

{'Sequence Number': 3,
 'Sequence': Seq('YT'),
 'Sequence Length': 2,
 'Molecular Weight': 282.2924,
 'GRAVY': -1.0,
 'AA Count': {'A': 0,
  'C': 0,
  'D': 0,
  'E': 0,
  'F': 0,
  'G': 0,
  'H': 0,
  'I': 0,
  'K': 0,
  'L': 0,
  'M': 0,
  'N': 0,
  'P': 0,
  'Q': 0,
  'R': 0,
  'S': 0,
  'T': 1,
  'V': 0,
  'W': 0,
  'Y': 1},
 'AA Percent': {'A': 0.0,
  'C': 0.0,
  'D': 0.0,
  'E': 0.0,
  'F': 0.0,
  'G': 0.0,
  'H': 0.0,
  'I': 0.0,
  'K': 0.0,
  'L': 0.0,
  'M': 0.0,
  'N': 0.0,
  'P': 0.0,
  'Q': 0.0,
  'R': 0.0,
  'S': 0.0,
  'T': 0.5,
  'V': 0.0,
  'W': 0.0,
  'Y': 0.5},
 'Secondary Structure': (0.5, 0.0, 0.0)}

{'Sequence Number': 4,
 'Sequence': Seq('GAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYEL...ELG'),
 'Sequence Length': 114,
 'Molecular Weight': 13266.858499999995,
 'GRAVY': -0.4508771929824562,
 'AA Count': {'A': 8,
  'C': 2,
  'D': 5,
  'E': 14,
  'F': 6,
  'G': 6,
  'H': 1,
  'I': 6,
  'K': 7,
  'L': 8,
  'M': 4,
  'N': 2,
  'P': 8,
  'Q': 4,
  'R': 4,
  'S': 10,
  'T': 3,
  'V': 5,
  'W': 5,
  'Y': 6},
 'AA Percent': {'A': 0.07017543859649122,
  'C': 0.017543859649122806,
  'D': 0.043859649122807015,
  'E': 0.12280701754385964,
  'F': 0.05263157894736842,
  'G': 0.05263157894736842,
  'H': 0.008771929824561403,
  'I': 0.05263157894736842,
  'K': 0.06140350877192982,
  'L': 0.07017543859649122,
  'M': 0.03508771929824561,
  'N': 0.017543859649122806,
  'P': 0.07017543859649122,
  'Q': 0.03508771929824561,
  'R': 0.03508771929824561,
  'S': 0.08771929824561403,
  'T': 0.02631578947368421,
  'V': 0.043859649122807015,
  'W': 0.043859649122807015,
  'Y': 0.05263157894736842},
 '