In [1]:
from Bio.PDB import *
import urllib.request
import numpy as np
import pandas as pd
from math import sqrt
import time

<h1><center>Loading basic data</center></h1>

In [2]:
# Read the peptidase annotation table (this path is the one used on the "big machine").
peptidasesList = pd.read_csv("Y:/Yuan/temp/MCSA_EC3.4_peptidases.csv")
# Equivalent location when working on the mac:
#peptidasesList = pd.read_csv("/Volumes/Lab_Public/Yuan/temp/MCSA_EC3.4_peptidases.csv")

# Keep only the rows whose fifth column equals "residue", then renumber the
# remaining rows from 0 so they can be addressed by position later on.
isResidueRow = peptidasesList.iloc[:, 4] == "residue"
peptidasesList = peptidasesList[isResidueRow].reset_index(drop=True)

In [3]:
# Preview the first three filtered rows.
peptidasesList.head(3)

Unnamed: 0,M-CSA ID,Uniprot IDs,PDB,EC,residue/reactant/product/cofactor,PDB code,chain/kegg compound,resid/chebi id,function location/name,role,role type,role group
0,M0587,P00727,1lam,3.4.11.1,residue,Lys,A,262.0,side_chain,electrostatic stabiliser,spectator,electrostatic interaction
1,M0587,P00727,1lam,3.4.11.1,residue,Arg,A,336.0,side_chain,electrostatic stabiliser,spectator,electrostatic interaction
2,M0167,Q01693,1lok,3.4.11.10,residue,His,A,97.0,side_chain,metal ligand,interaction,


In [4]:
# Build {PDB code: {chain: [residue numbers]}} from the filtered annotation table.
# Rows are addressed by label 0..n-1, which relies on the index having been reset.
bindingSiteDic = {}
for i in range(len(peptidasesList)):
    pdbCode = peptidasesList.loc[i, "PDB"]
    chainName = peptidasesList.loc[i, "chain/kegg compound"]
    residueNumber = peptidasesList.loc[i, "resid/chebi id"]
    # setdefault adds a new chain to an existing PDB entry instead of replacing
    # the whole entry, so chains recorded earlier for the same PDB are kept.
    bindingSiteDic.setdefault(pdbCode, {}).setdefault(chainName, []).append(residueNumber)

# Remove duplicate residue numbers within each chain (ordering is not preserved).
for protein in bindingSiteDic:
    for chain in bindingSiteDic[protein]:
        bindingSiteDic[protein][chain] = list(set(bindingSiteDic[protein][chain]))

In [5]:
# One row per distinct (PDB code, chain) pair; the original row labels are kept.
uniqueList = peptidasesList.loc[:, ["PDB", "chain/kegg compound"]].drop_duplicates()
# Show the first ten pairs with a fresh 0-based index (display only).
uniqueList.reset_index(drop=True).head(10)

Unnamed: 0,PDB,chain/kegg compound
0,1lam,A
1,1lok,A
2,1xgm,A
3,1b65,A
4,1ei5,A
5,1azw,A
6,1a16,A
7,1itq,A
8,1fy2,A
9,1r44,A


In [6]:
# Atom names counted as backbone; atoms with any other name are treated as side chain.
backbone = ["N", "CA", "C", "O"]
# Three-letter residue names that are processed; residues with any other name are ignored.
# ("TRP" used to be listed twice; the redundant entry has been removed.)
aminoAcidCodes = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLY", "GLU", "HIS", "ILE", "LEU", "LYS",
                  "MET", "PHE", "PRO", "PYL", "SER", "SEC", "THR", "TRP", "TYR", "VAL"]

<h2><center>Single-chain example</center></h2>

In [20]:
# Use the first (PDB code, chain) pair as the worked example.
pdbID, chainOrder = uniqueList.iloc[0, 0], uniqueList.iloc[0, 1]

# Fetch the entry in PDB format into the shared temp folder.
PDB = PDBList()
PDB.retrieve_pdb_file(
    pdb_code=pdbID,
    pdir="/Volumes/Lab_Public/Yuan/temp",
    file_format="pdb",
)

# Parse the downloaded file, which is stored as pdb<code>.ent.
p = PDBParser()
structure = p.get_structure(
    "X",
    "/Volumes/Lab_Public/Yuan/temp/pdb" + pdbID + ".ent",
)

Downloading PDB structure '1lam'...


In [24]:
# Per-residue geometry of the selected chain.
#   Seq         residue sequence number
#   Residue     three-letter residue name
#   Center      centroid of the side-chain (R group) atoms; the CA atom for glycine
#   Direction   CA position minus Center; for glycine, CA minus the mean of C, N and O
#   CA_position position of the CA atom
# NOTE(review): for non-glycine residues Direction points from the side-chain
# centroid towards CA, while for glycine it points from the backbone mean
# towards CA -- confirm that this sign convention is intended.
oneChain = pd.DataFrame(columns = ["Seq","Residue","Center","Direction","CA_position"])

# The header resolution can be None; such entries are treated as not passing
# the 3.0 cutoff instead of raising a TypeError.
resolution = structure.header["resolution"]
if (resolution is not None and resolution <= 3.0
        and chainOrder in [x.id for x in structure[0].get_chains()]):  # chain may be absent from the file
    for residue in structure[0][chainOrder]:
        if residue.get_resname() not in aminoAcidCodes:  # Only treat common amino acids
            continue
        if residue.get_id()[2] != " ":  # ignore insertions, as the database-loading cell does
            continue
        if len(list(residue.get_atoms())) <= 3:
            continue
        if not residue.has_id("CA"):  # CA is needed for every row
            continue

        if residue.get_resname() != "GLY":  # Glycine is a special case
            sideChainCoords = [atom.get_coord() for atom in residue
                               if atom.get_name() not in backbone]
            if not sideChainCoords:
                # Only backbone atoms are present: there is no side chain to average.
                continue
            # Average over the atoms actually summed rather than assuming that
            # exactly four backbone atoms are present.
            center = vectors.Vector(np.mean(sideChainCoords, axis=0, dtype=np.float64))
            cToRGroup = residue["CA"].get_vector() - center
        else:
            if not all(residue.has_id(name) for name in ("C", "N", "O")):
                continue
            center = residue["CA"].get_vector()
            backboneMean = vectors.Vector(
                np.mean([residue[name].get_coord() for name in ("C", "N", "O")],
                        axis=0, dtype=np.float64))
            cToRGroup = center - backboneMean

        oneChain.loc[len(oneChain)] = [residue.get_id()[1], residue.get_resname(),
                                       center, cToRGroup, residue["CA"].get_vector()]

In [30]:
# Preview the first five residues (first five columns).
oneChain.iloc[:5, :5]

Unnamed: 0,Seq,Residue,Center,Direction,CA_position
0,1,THR,"<Vector 18.17, 67.29, 13.91>","<Vector 1.44, -1.38, -0.22>","<Vector 19.60, 65.91, 13.69>"
1,2,LYS,"<Vector 21.21, 72.18, 14.21>","<Vector 0.50, -3.45, 0.96>","<Vector 21.71, 68.74, 15.17>"
2,3,GLY,"<Vector 23.52, 68.89, 18.50>","<Vector -0.48, -1.09, 0.59>","<Vector 23.52, 68.89, 18.50>"
3,4,LEU,"<Vector 23.93, 74.32, 20.98>","<Vector 0.91, -2.44, -0.51>","<Vector 24.84, 71.88, 20.47>"
4,5,VAL,"<Vector 29.62, 71.23, 22.52>","<Vector -1.71, 0.96, 0.19>","<Vector 27.92, 72.19, 22.71>"


In [26]:
# Pairwise matrices labelled by residue sequence number on both axes.
#   distanceMatrix[i, j]  distance between the centers of residues i and j
#   angleMatrix[i, j]     angle in degrees between residue i's Direction and the
#                         vector from residue i's CA to residue j's center
residueIds = list(oneChain.iloc[:,0])
distanceMatrix = pd.DataFrame(columns = residueIds, index = residueIds)
angleMatrix = pd.DataFrame(columns = residueIds, index = residueIds)
numResidue = len(oneChain)

# Only the first few rows are filled in this example cell; the bound is capped
# so that chains with fewer than three residues do not raise a KeyError.
sampleRows = min(3, numResidue)
for row in range(0, sampleRows):
    if row % 50 ==0:
        print(str(row)+"th row")
    # Row-dependent values do not change inside the column loop.
    rowCenter = oneChain.loc[row,"Center"]
    rowCA = oneChain.loc[row,"CA_position"]
    v1 = oneChain.loc[row,"Direction"]
    directionArray = v1.get_array()
    for column in range(0,numResidue):
        columnCenter = oneChain.loc[column,"Center"]
        pairDistance = float((rowCenter - columnCenter).norm())
        distanceMatrix.iloc[row,column] = pairDistance
        v2 = columnCenter - rowCA
        targetArray = v2.get_array()
        normProduct = np.linalg.norm(directionArray) * np.linalg.norm(targetArray)
        if pairDistance > 0.00001 and normProduct > 0:
            # Clamp against rounding so arccos never sees a value outside [-1, 1].
            cosine = np.clip(np.dot(directionArray, targetArray) / normProduct, -1.0, 1.0)
            pairAngle = 180 * np.arccos(cosine) / np.pi
        else:
            # Zero distance (e.g. a residue paired with itself) or a zero-length
            # vector: report 0, the same rule the database-loading cell uses.
            pairAngle = 0
        angleMatrix.iloc[row,column] = pairAngle
        

0th row


  if sys.path[0] == '':
  if sys.path[0] == '':


<h1><center>Insert data into MySQL</center></h1>

In [7]:
import getpass
from os.path import expanduser

import pandas as pd
import pymysql
from sshtunnel import SSHTunnelForwarder

# Connection settings are kept in a CSV outside the notebook: the first column
# is the setting name (used as the index) and the "value" column holds the setting.
mysql_configure = pd.read_csv("Y:/Yuan/temp/mysql_connection.csv",index_col=0)


def _setting(name):
    """Return the "value" entry stored for the setting called ``name``."""
    return mysql_configure.loc[name,]["value"]


# MySQL server settings.
sql_hostname = _setting("sql_hostname")
sql_username = _setting("sql_username")
sql_password = _setting("sql_password")
sql_main_database = _setting("sql_main_database")
sql_port = int(_setting("sql_port"))

# SSH host used to tunnel to the database.
ssh_host = _setting("ssh_host")
ssh_user = _setting("ssh_user")
ssh_password = _setting("ssh_password")
ssh_port = int(_setting("ssh_port"))

In [20]:
# Register numpy.float64 with pymysql so such values can be passed directly as
# query parameters, then rebuild the combined conversion table.
pymysql.converters.encoders[np.float64] = pymysql.converters.escape_float
pymysql.converters.conversions = pymysql.converters.encoders.copy()
pymysql.converters.conversions.update(pymysql.converters.decoders)


def _residueRow(residue):
    """Return the ``oneChain`` row for ``residue``, or None when it is skipped.

    The row is ``[sequence number, residue name, Center, Direction, CA position]``.
    Center is the centroid of the atoms whose names are not in ``backbone``
    (the CA atom itself for glycine). Direction is CA minus Center (for
    glycine, CA minus the mean of C, N and O).

    A residue is skipped when its name is not in ``aminoAcidCodes``, when it
    carries an insertion code, when it has three atoms or fewer, or when an
    atom needed for the calculation is missing. Skipping the single residue
    avoids a KeyError that would otherwise discard the whole structure.

    NOTE(review): for non-glycine residues Direction points from the
    side-chain centroid towards CA, while for glycine it points from the
    backbone mean towards CA -- confirm that this sign convention is intended.
    """
    resname = residue.get_resname()
    if resname not in aminoAcidCodes:  # Only treat common amino acids
        return None
    if residue.get_id()[2] != " ":  # ignore insertions
        return None
    if len(list(residue.get_atoms())) <= 3:
        return None
    if not residue.has_id("CA"):
        return None

    if resname != "GLY":  # Glycine is a special case
        sideChainCoords = [atom.get_coord() for atom in residue
                           if atom.get_name() not in backbone]
        if not sideChainCoords:
            # Only backbone atoms are present: there is no side chain to average.
            return None
        # Average over the atoms actually collected rather than assuming that
        # exactly four backbone atoms are present.
        center = vectors.Vector(np.mean(sideChainCoords, axis=0, dtype=np.float64))
        cToRGroup = residue["CA"].get_vector() - center
    else:
        if not all(residue.has_id(name) for name in ("C", "N", "O")):
            return None
        center = residue["CA"].get_vector()
        backboneMean = vectors.Vector(
            np.mean([residue[name].get_coord() for name in ("C", "N", "O")],
                    axis=0, dtype=np.float64))
        cToRGroup = center - backboneMean
    return [residue.get_id()[1], resname, center, cToRGroup, residue["CA"].get_vector()]


def _angleDegrees(direction, target):
    """Return the angle in degrees between two Bio.PDB vectors.

    Returns 0 when either vector has zero length. The cosine is clamped to
    [-1, 1] so rounding error cannot produce a NaN angle.
    """
    directionArray = direction.get_array()
    targetArray = target.get_array()
    normProduct = np.linalg.norm(directionArray) * np.linalg.norm(targetArray)
    if normProduct == 0:
        return 0
    cosine = np.clip(np.dot(directionArray, targetArray) / normProduct, -1.0, 1.0)
    return 180 * np.arccos(cosine) / np.pi


# Insert position, distance_angle

query = '''INSERT INTO `Distance_angle2.0` (`pdbID`, `chain`,`ID_1`,`Res_1`,`ID_2`,`Res_2`,`Distance`,`Angle`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s);'''

# The downloader and parser hold no per-structure state that this cell relies
# on, so one instance of each is shared by all structures.
PDB = PDBList()
p = PDBParser()

with SSHTunnelForwarder(
        (ssh_host, ssh_port),
        ssh_username=ssh_user,
        ssh_password=ssh_password,
        remote_bind_address=('127.0.0.1', sql_port)) as tunnel:
    conn = pymysql.connect(host='127.0.0.1', user=sql_username,
                           passwd=sql_password, db=sql_main_database,
                           port=tunnel.local_bind_port)
    print('SSH connected')
    cursor = conn.cursor()  # one cursor reused for every insert
    try:
        for index, oneRow in uniqueList.iterrows():
            pdbID = oneRow["PDB"]
            chainOrder = oneRow["chain/kegg compound"]
            try:
                PDB.retrieve_pdb_file(pdb_code=pdbID, pdir="Y:/Lab_Public/Yuan/temp", file_format="pdb")
                structure = p.get_structure("X", "Y:/Lab_Public/Yuan/temp/pdb" + pdbID + ".ent")
                oneChain = pd.DataFrame(columns = ["Seq","Residue","Center","Direction","CA_position"])#center here means center of R group
                # The header resolution can be None; such entries yield an empty table.
                resolution = structure.header["resolution"]
                if (resolution is not None and resolution <= 3.0
                        and chainOrder in [x.id for x in structure[0].get_chains()]):  # chain may be absent from the file
                    for residue in structure[0][chainOrder]:
                        residueRow = _residueRow(residue)
                        if residueRow is None:
                            continue
                        oneChain.loc[len(oneChain)] = residueRow
                        # Disabled: per-residue insert into `ResiduePosition` (center is residueRow[2]).
                        #query = '''INSERT INTO `ResiduePosition` (`pdbID`, `chain`,`seq`,`type`,`Coordinate1`,`Coordinate2`,`Coordinate3`) VALUES (%s, %s, %s,%s,%s,%s,%s);'''
                        #value = (pdbID, chainOrder, residue.get_id()[1], residue.get_resname(), float(center[0]), float(center[1]),float(center[2]))
                        #print(value)
                        #conn.cursor().execute(query, value)
                        #conn.commit()

                residueIds = list(oneChain.iloc[:, 0])
                distanceMatrix = pd.DataFrame(columns=residueIds, index=residueIds)
                angleMatrix = pd.DataFrame(columns=residueIds, index=residueIds)
                numResidue = len(oneChain)
                print(numResidue)
                for row in range(0, numResidue):
                    if row % 100 == 0:
                        print(str(row) + "th row")
                    # Row-dependent values do not change inside the column loop.
                    rowSeq = oneChain.iloc[row, 0]
                    rowName = oneChain.iloc[row, 1]
                    rowCenter = oneChain.loc[row, "Center"]
                    rowCA = oneChain.loc[row, "CA_position"]
                    v1 = oneChain.loc[row, "Direction"]
                    for column in range(0, numResidue):  # not symmetric: the angle depends on which residue is the row
                        columnCenter = oneChain.loc[column, "Center"]
                        pairDistance = float((rowCenter - columnCenter).norm())
                        distanceMatrix.iloc[row, column] = pairDistance
                        v2 = columnCenter - rowCA
                        if abs(pairDistance - 0) > 0.00001:  # Exclude 0 distance, e.g. self to self
                            pairAngle = _angleDegrees(v1, v2)
                        else:
                            pairAngle = 0
                        angleMatrix.iloc[row, column] = pairAngle
                        if not np.isnan(pairDistance):
                            value = (pdbID, chainOrder, rowSeq, rowName,
                                     oneChain.iloc[column, 0], oneChain.iloc[column, 1],
                                     float(pairDistance), float(pairAngle))
                            #print(value)
                            cursor.execute(query, value)
                    # Commit once per row so progress is kept if a later row fails.
                    conn.commit()
            except Exception as e:
                # Best effort: report the problem and move on to the next structure.
                print(e)
                print(pdbID, "skipped")
    finally:
        # Close the cursor and connection while the SSH tunnel is still open;
        # closing after the tunnel has been torn down can fail or hang.
        cursor.close()
        conn.close()

print("Finished")


SSH connected
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1lam.ent' 
484
0th row
100th row
200th row
300th row
400th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1lok.ent' 
291
0th row
100th row
200th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1xgm.ent' 
295
0th row
100th row
200th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1b65.ent' 
363
0th row
100th row
200th row
300th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1ei5.ent' 
518
0th row
100th row
200th row
300th row
400th row
500th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1azw.ent' 
313
0th row
100th row
200th row
300th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1a16.ent' 
441
0th row
100th row
200th row
300th row
400th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1itq.ent' 
369
0th row
100th row
200th row
300th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1fy2.ent' 
220
0th row
100th row
200th row
Structure exists: 'Y:/Lab_Public/Yuan/temp\pdb1r44.ent' 
202
0th row
100th row
200th 