In [1]:
from Bio.PDB import *
import nglview as nv



In [2]:
# Parse Task/target.pdb into a Biopython structure; "PHA-L" is passed as the
# structure id argument.
pdb_parser = PDBParser()
structure = pdb_parser.get_structure("PHA-L", "Task/target.pdb")
# Build an nglview widget for the parsed structure.
# NOTE(review): `view` is only assigned here and nothing in this cell displays
# it -- confirm whether a trailing `view` expression was intended.
view = nv.show_biopython(structure)

In [6]:
# Iterating `structure` yields models (see the residue loop in the next cell),
# so this prints how many models the parsed file contains.
print(len(structure))

1


In [3]:
# Walk every model in the structure and print each residue, one per line.
all_residues = (res for mdl in structure for res in mdl.get_residues())
for residue in all_residues:
    print(residue)

<Residue GLU het=  resseq=2 icode= >
<Residue VAL het=  resseq=3 icode= >
<Residue ASN het=  resseq=4 icode= >
<Residue SER het=  resseq=5 icode= >
<Residue PHE het=  resseq=6 icode= >
<Residue SER het=  resseq=7 icode= >
<Residue GLY het=  resseq=8 icode= >
<Residue TYR het=  resseq=9 icode= >
<Residue LEU het=  resseq=10 icode= >
<Residue LYS het=  resseq=11 icode= >
<Residue LEU het=  resseq=12 icode= >
<Residue THR het=  resseq=13 icode= >
<Residue ASP het=  resseq=14 icode= >
<Residue ASN het=  resseq=15 icode= >
<Residue VAL het=  resseq=16 icode= >
<Residue TYR het=  resseq=17 icode= >
<Residue ILE het=  resseq=18 icode= >
<Residue LYS het=  resseq=19 icode= >
<Residue ASN het=  resseq=20 icode= >
<Residue ALA het=  resseq=21 icode= >
<Residue ASP het=  resseq=22 icode= >
<Residue ILE het=  resseq=23 icode= >
<Residue VAL het=  resseq=24 icode= >
<Residue GLU het=  resseq=25 icode= >
<Residue GLU het=  resseq=26 icode= >
<Residue ALA het=  resseq=27 icode= >
<Residue LYS het=  r

In [8]:
# Build the polypeptides of the parsed structure and report each sequence.
polypeptide_builder = CaPPBuilder()
peptides = polypeptide_builder.build_peptides(structure)

# `counter` numbers the sequences from 1; `seq` stays bound after the loop and
# is read by a later cell.
counter = 1
for polypeptide in peptides:
    seq = polypeptide.get_sequence()
    header = f"Sequence: {counter}, Length: {len(seq)}"
    print(header, seq, sep="\n")
    counter += 1

Sequence: 1, Length: 169
EVNSFSGYLKLTDNVYIKNADIVEEAKKVKPTVVVNAANVYLKHGGGVAGALNKATNNAMQVESDDYIATNGPLKVGGSCVLSGHNLAKHCLHVVGPNVNKGEDIQLLKSAYENFNQHEVLLAPLLSAGIFGADPIHSLRVCVDTVRTNVYLAVFDKNLYDKLVSSFLE


In [9]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [13]:
# `seq` is the variable left bound by the polypeptide loop above, i.e. the
# sequence of the last polypeptide that loop printed. ProteinAnalysis is given
# its plain-string form.
analyzed_seq = ProteinAnalysis(str(seq))
analyzed_seq

<Bio.SeqUtils.ProtParam.ProteinAnalysis at 0x7ff9d51f0940>

In [14]:
# Molecular weight of the analysed sequence.
analyzed_seq.molecular_weight()

18197.526100000006

In [15]:
# GRAVY score (a hydrophobicity measure) of the analysed sequence.
analyzed_seq.gravy()

0.015384615384615392

In [16]:
# Per-letter amino acid counts for the analysed sequence, returned as a dict.
analyzed_seq.count_amino_acids()

{'A': 15,
 'C': 3,
 'D': 9,
 'E': 8,
 'F': 5,
 'G': 13,
 'H': 6,
 'I': 6,
 'K': 13,
 'L': 19,
 'M': 1,
 'N': 16,
 'P': 5,
 'Q': 3,
 'R': 2,
 'S': 10,
 'T': 6,
 'V': 22,
 'W': 0,
 'Y': 7}

In [17]:
# Secondary-structure fractions, returned as a tuple of (helix, turn, sheet).
analyzed_seq.secondary_structure_fraction()

(0.34911242603550297, 0.2603550295857988, 0.25443786982248523)

In [20]:
def get_params(mol=None, pdb_path='PDB/converted.pdb'):
    """Convert a SMILES string to a PDB file and summarise its polypeptides.

    The molecule is written to ``pdb_path``, parsed back with ``PDBParser``,
    split into polypeptides with ``CaPPBuilder``, and every polypeptide
    sequence is analysed with ``ProteinAnalysis``.

    NOTE(review): the PDB text RDKit produces from a SMILES string elsewhere in
    this notebook consists of HETATM records with residue name UNL, so
    ``build_peptides`` may find no polypeptides and this function would then
    return an empty list -- confirm that a SMILES input is what is intended.

    Args:
        mol: SMILES string to convert. When omitted, a module-level variable
            named ``mol`` is used if one exists, which is what the original
            zero-argument version of this function read.
        pdb_path: Path the intermediate PDB file is written to. Its parent
            directory is created when missing.

    Returns:
        A list with one dict per polypeptide, with the keys
        'Sequence Number', 'Sequence', 'Sequence Length', 'Molecular Weight',
        'GRAVY', 'AA Count', 'AA Percent' and 'Secondary Structure'.

    Raises:
        ValueError: If no SMILES string is available, or RDKit cannot parse it.
    """
    import os

    if mol is None:
        # Backward compatibility: the zero-argument form used a global `mol`.
        mol = globals().get('mol')
    if mol is None:
        raise ValueError("get_params() needs a SMILES string: pass mol=...")

    # Imported here because the notebook only imports rdkit in a later cell
    # than the one defining this function.
    from rdkit import Chem

    m = Chem.MolFromSmiles(mol)
    if m is None:
        # MolFromSmiles signals an unparsable SMILES string by returning None.
        raise ValueError(f"could not parse SMILES string: {mol!r}")

    out_dir = os.path.dirname(pdb_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    Chem.MolToPDBFile(m, pdb_path)

    polypeptide_builder = CaPPBuilder()
    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure("PHA-L", pdb_path)

    # One summary dict per polypeptide, numbered from 1.
    all_seqs = []
    peptides = polypeptide_builder.build_peptides(structure)
    for counter, pp in enumerate(peptides, start=1):
        seq = pp.get_sequence()
        analyzed_seq = ProteinAnalysis(str(seq))  # needs to be a str
        all_seqs.append({
            'Sequence Number': counter,
            'Sequence': seq,  # stored as returned by get_sequence()
            'Sequence Length': len(seq),
            'Molecular Weight': analyzed_seq.molecular_weight(),
            'GRAVY': analyzed_seq.gravy(),  # hydrophobicity
            'AA Count': analyzed_seq.count_amino_acids(),
            'AA Percent': analyzed_seq.get_amino_acids_percent(),
            # tuple of (helix, turn, sheet)
            'Secondary Structure': analyzed_seq.secondary_structure_fraction(),
        })
    return all_seqs

In [21]:
# NOTE(review): no visible cell assigns `all_seqs` at notebook level -- the only
# assignment is the local variable inside get_params(), which returns it.
# Confirm where this value comes from (e.g. a missing `all_seqs = get_params(...)`).
all_seqs

[{'Sequence Number': 1,
  'Sequence': Seq('EVNSFSGYLKLTDNVYIKNADIVEEAKKVKPTVVVNAANVYLKHGGGVAGALNK...FLE'),
  'Sequence Length': 169,
  'Molecular Weight': 18197.526100000006,
  'GRAVY': 0.015384615384615392,
  'AA Count': {'A': 15,
   'C': 3,
   'D': 9,
   'E': 8,
   'F': 5,
   'G': 13,
   'H': 6,
   'I': 6,
   'K': 13,
   'L': 19,
   'M': 1,
   'N': 16,
   'P': 5,
   'Q': 3,
   'R': 2,
   'S': 10,
   'T': 6,
   'V': 22,
   'W': 0,
   'Y': 7},
  'AA Percent': {'A': 0.08875739644970414,
   'C': 0.01775147928994083,
   'D': 0.05325443786982249,
   'E': 0.047337278106508875,
   'F': 0.029585798816568046,
   'G': 0.07692307692307693,
   'H': 0.03550295857988166,
   'I': 0.03550295857988166,
   'K': 0.07692307692307693,
   'L': 0.11242603550295859,
   'M': 0.005917159763313609,
   'N': 0.09467455621301775,
   'P': 0.029585798816568046,
   'Q': 0.01775147928994083,
   'R': 0.011834319526627219,
   'S': 0.05917159763313609,
   'T': 0.03550295857988166,
   'V': 0.1301775147928994,
   'W': 0.0,

In [25]:
import pandas as pd
# Load the training table; later cells read its "Smiles" column.
data = pd.read_csv("Task/train.csv")

In [26]:
# Keep every row, and only the columns from "Smiles" through the last column
# (label-based slice, so "Smiles" itself is included).
data = data.loc[:, "Smiles":]

In [31]:
from rdkit import Chem
# Build an RDKit molecule from the "Smiles" entry with index label 0.
m = Chem.MolFromSmiles(data["Smiles"][0])
# PDB-format text for the molecule. In the output displayed for `pdb`, every
# atom's coordinates are 0.000 -- presumably because no 3D conformer is
# generated from the SMILES string; TODO confirm.
pdb = Chem.rdmolfiles.MolToPDBBlock(m)
# Also write the same molecule to converted.pdb in the working directory.
Chem.MolToPDBFile(m, 'converted.pdb')

In [30]:
# Show the PDB text that MolToPDBBlock produced for the first SMILES entry.
pdb

'HETATM    1  C1  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM    2  O1  UNL     1       0.000   0.000   0.000  1.00  0.00           O  \nHETATM    3  C2  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM    4  C3  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM    5  C4  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM    6  C5  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM    7  N1  UNL     1       0.000   0.000   0.000  1.00  0.00           N  \nHETATM    8  C6  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM    9  C7  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM   10  C8  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM   11  C9  UNL     1       0.000   0.000   0.000  1.00  0.00           C  \nHETATM   12  N2  UNL     1       0.000   0.000   0.000  1.00  0.00           N  \nHETATM   13  C1