# PINIR Data Preprocessing - Page-3

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Lift pandas' display limits so that every column and every row is rendered.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
# Restore the default column/row display limits.
# NOTE(review): when the notebook is run top to bottom this undoes the previous
# cell immediately, so the outputs below use the default (truncated) display.
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

In [5]:
# Widen the per-cell text limit to 2000 characters so long values are not cut off.
pd.set_option('display.max_colwidth',2000)

In [6]:
# Restore the default column width (long values are truncated with '...' again).
pd.reset_option('display.max_colwidth')

## Disulphide Bonds in Domains
File used: *PINIR_Domains_DiSuplhide_Bonds.csv*, created using the Python program *DSBondFinder.py*. Online utility: *http://disulfind.disi.unitn.it/*. This website predicts the probable disulphide bonds within the submitted amino acid sequences.

In [7]:
# Load the predicted disulphide bonds per domain
# (columns: domainID, sequence, dsBonds, bondCount, confidence).
domainDSBonds=pd.read_csv('data/PINIR_Domains_DiSuplhide_Bonds.csv')

In [8]:
domainDSBonds.head(2)

Unnamed: 0,domainID,sequence,dsBonds,bondCount,confidence
0,IRD-1,QICINCCAGKKGCNYFSADGTFICEGESEYVSEVNDNLEKHCPRNC...,"['bond(3,46)', 'bond(6,24)', 'bond(7,42)', 'bo...",4,0.644382
1,IRD-2,QICTNCCAGKKGCMYFSDDGTFICEGESEYVSEVPVDNKPCPRNCD...,"['bond(3,45)', 'bond(6,24)', 'bond(7,41)', 'bo...",4,0.715053


### To parse the DS bonds, the `dsBonds` field (stored as a string) must be converted into a Python list. For this we use the standard-library `ast` (Abstract Syntax Trees) module.

In [9]:
import ast # Abstract Syntax Trees

In [10]:
ast.literal_eval(domainDSBonds['dsBonds'][0])## Evaluating dsBonds field of first record to List

['bond(3,46)', 'bond(6,24)', 'bond(7,42)', 'bond(13,55)']

In [11]:
ast.literal_eval(domainDSBonds['dsBonds'][0])[0]

'bond(3,46)'

In [12]:
## Adding a new Column dsBonds_list which will contain the list format of dsBonds

In [13]:
# Each 'dsBonds' value is the text form of a Python list; ast.literal_eval turns
# it back into a real list (it only accepts literals, so no code is executed).
domainDSBonds['dsBonds_list']=domainDSBonds['dsBonds'].apply(ast.literal_eval)

In [14]:
domainDSBonds['dsBonds_list'].head()

0    [bond(3,46), bond(6,24), bond(7,42), bond(13,55)]
1    [bond(3,45), bond(6,24), bond(7,41), bond(13,54)]
2    [bond(3,45), bond(6,24), bond(7,41), bond(13,54)]
3    [bond(3,43), bond(6,24), bond(7,39), bond(13,52)]
4                            [bond(7,49), bond(24,53)]
Name: dsBonds_list, dtype: object

In [15]:
# Parsing the Start and End Position of DS Bonds of first Record

In [16]:
# Walk over the bonds of the first record; each entry looks like 'bond(3,46)'.
for bond_text in domainDSBonds['dsBonds_list'][0]:
    pieces = bond_text.split(',')      # e.g. ['bond(3', '46)']
    start_position = pieces[0][5:]     # drop the leading 'bond('
    end_position = pieces[1][:-1]      # drop the trailing ')'
    print(pieces)
    print('Start Position of DS Bond : '+ start_position)
    print('End Position of DS Bond : '+ end_position)
    print('----------------------------')

['bond(3', '46)']
Start Position of DS Bond : 3
End Position of DS Bond : 46
----------------------------
['bond(6', '24)']
Start Position of DS Bond : 6
End Position of DS Bond : 24
----------------------------
['bond(7', '42)']
Start Position of DS Bond : 7
End Position of DS Bond : 42
----------------------------
['bond(13', '55)']
Start Position of DS Bond : 13
End Position of DS Bond : 55
----------------------------


In [17]:
domainDSBonds.head(2)

Unnamed: 0,domainID,sequence,dsBonds,bondCount,confidence,dsBonds_list
0,IRD-1,QICINCCAGKKGCNYFSADGTFICEGESEYVSEVNDNLEKHCPRNC...,"['bond(3,46)', 'bond(6,24)', 'bond(7,42)', 'bo...",4,0.644382,"[bond(3,46), bond(6,24), bond(7,42), bond(13,55)]"
1,IRD-2,QICTNCCAGKKGCMYFSDDGTFICEGESEYVSEVPVDNKPCPRNCD...,"['bond(3,45)', 'bond(6,24)', 'bond(7,41)', 'bo...",4,0.715053,"[bond(3,45), bond(6,24), bond(7,41), bond(13,54)]"


In [18]:
domainDSBonds.columns

Index(['domainID', 'sequence', 'dsBonds', 'bondCount', 'confidence',
       'dsBonds_list'],
      dtype='object')

In [19]:
## Writing to File : 20_Domains_Disulphide_Bonds.csv

In [20]:
# One-off export of the parsed bonds (presumably left commented out so a re-run
# does not overwrite the file).
# NOTE(review): the next cell reads data/20_Domains_Disulphide_Bonds.csv, so on a
# fresh checkout this line has to be executed once before continuing.
# domainDSBonds.to_csv('data/20_Domains_Disulphide_Bonds.csv',index=False,columns=['domainID', 'sequence', 'dsBonds_list', 'bondCount', 'confidence'])

In [21]:
# Reload the exported table. Note that 'dsBonds_list' comes back as a string
# column (the CSV stores the list's text form), not as Python lists; apply
# ast.literal_eval again if list access is needed.
domainDSBonds=pd.read_csv('data/20_Domains_Disulphide_Bonds.csv')

In [22]:
domainDSBonds.head(2)

Unnamed: 0,domainID,sequence,dsBonds_list,bondCount,confidence
0,IRD-1,QICINCCAGKKGCNYFSADGTFICEGESEYVSEVNDNLEKHCPRNC...,"['bond(3,46)', 'bond(6,24)', 'bond(7,42)', 'bo...",4,0.644382
1,IRD-2,QICTNCCAGKKGCMYFSDDGTFICEGESEYVSEVPVDNKPCPRNCD...,"['bond(3,45)', 'bond(6,24)', 'bond(7,41)', 'bo...",4,0.715053


In [23]:
domainDSBonds.bondCount.value_counts()

4    647
3     28
2     15
0      7
1      1
Name: bondCount, dtype: int64

## Isoelectric Point of Sequences
The isoelectric point (pI) is the pH at which a particular molecule carries no net electrical charge. To calculate this property for the sequences we used the website [http://isoelectric.org/](http://isoelectric.org/). The sequences were submitted to the utility in FASTA format, and it returned the isoelectric point details for each sequence.

In [24]:
# Load the isoelectric-point results: one row per submitted sequence, keyed by
# its FASTA header, with one pI estimate per prediction method plus charges.
uniprot_isoElecPoint=pd.read_csv('data/IsoElectricPoint_Uniprot.csv')

In [25]:
uniprot_isoElecPoint.head(2)

Unnamed: 0,header,sequence,molecular_weight,average_pI,IPC_protein,IPC_peptide,Toseland,Thurlkill,Nozaki_Tanford,Dawson,...,Patrickios,Rodwell,Sillero,Solomon,Lehninger,Wikipedia,ProMoST,pH_5.5_charge,pH_7.4_charge,pH_8.0_charge
0,sp|P05119|IP21_SOLLC Wound-induced proteinase ...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,16292.75374,6.879,6.726,7.451,6.177,7.625,8.178,7.422,...,0.19,7.433,8.01,7.455,7.482,7.316,7.252,3.2,-4.8,-10.7
1,sp|P84813|POTM1_SOLTU Potamin-1 (Fragment) OS=...,DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,4833.46574,6.2,6.148,6.708,5.945,6.977,6.884,6.664,...,0.301,6.655,7.111,6.702,6.717,6.624,5.453,0.7,-3.0,-5.5


In [26]:
uniprot_isoElecPoint.header.count()

452

In [27]:
uniprot_isoElecPoint.header[0]

'sp|P05119|IP21_SOLLC Wound-induced proteinase inhibitor 2 OS=Solanum lycopersicum OX=4081 PE=1 SV=1'

In [28]:
uniprot_isoElecPoint.header[0].split('|')

['sp',
 'P05119',
 'IP21_SOLLC Wound-induced proteinase inhibitor 2 OS=Solanum lycopersicum OX=4081 PE=1 SV=1']

In [29]:
uniprot_isoElecPoint.header[0].split('|')[1]

'P05119'

In [30]:
# Headers look like 'sp|P05119|IP21_SOLLC ...'; the accession is the second
# '|'-separated field.
uniprot_isoElecPoint['UniprotID']=[fasta_header.split('|')[1] for fasta_header in uniprot_isoElecPoint.header]

In [31]:
uniprot_isoElecPoint['UniprotID'].head()

0    P05119
1    P84813
2    P01080
3    P83241
4    P01078
Name: UniprotID, dtype: object

In [32]:
uniprot_isoElecPoint.columns

Index(['header', 'sequence', 'molecular_weight', 'average_pI', 'IPC_protein',
       'IPC_peptide', 'Toseland', 'Thurlkill', 'Nozaki_Tanford', 'Dawson',
       'DTASelect', 'EMBOSS', 'Grimsley', 'Patrickios', 'Rodwell', 'Sillero',
       'Solomon', 'Lehninger', 'Wikipedia', 'ProMoST', 'pH_5.5_charge',
       'pH_7.4_charge', 'pH_8.0_charge', 'UniprotID'],
      dtype='object')

In [33]:
# Keep the accession, sequence, molecular weight, the averaged pI and the three
# net-charge columns; the per-method pI estimates and the raw header are dropped.
uniprot_isoElecPoint=uniprot_isoElecPoint[['UniprotID', 'sequence', 'molecular_weight', 'average_pI', 'pH_5.5_charge',
       'pH_7.4_charge', 'pH_8.0_charge']]

In [34]:
uniprot_isoElecPoint.head()

Unnamed: 0,UniprotID,sequence,molecular_weight,average_pI,pH_5.5_charge,pH_7.4_charge,pH_8.0_charge
0,P05119,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,16292.75374,6.879,3.2,-4.8,-10.7
1,P84813,DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,4833.46574,6.2,0.7,-3.0,-5.5
2,P01080,MDVHKEVNFVAYLLIVLGLLVLVSAMDVDAKACIRECGNLGFGICP...,16505.04074,6.271,1.8,-6.8,-12.8
3,P83241,KACPRNCDTDIAYMVCPSSGERIIRKVCTNCCAAQKGCKLFRSNGS...,5603.57774,8.223,6.1,2.6,-0.4
4,P01078,QICTNCCAGRKGCSYFSEDGTFICKGESNPENPKACPRNCDGRIAY...,5579.31984,6.841,1.2,-2.4,-5.2


In [35]:
uniprot_isoElecPoint.count()

UniprotID           452
sequence            452
molecular_weight    452
average_pI          452
pH_5.5_charge       452
pH_7.4_charge       452
pH_8.0_charge       452
dtype: int64

In [36]:
## Saving the data to file 21_Sequence_IsoElectricPoints.csv

In [37]:
## One-off export, left commented out.
## NOTE(review): the master-file section below reads
## data/21_Sequence_IsoElectricPoints.csv, so this must have been run once.
## uniprot_isoElecPoint.to_csv('data/21_Sequence_IsoElectricPoints.csv',index=False)

## Sequence / Domain Amino Acid Composition
##### To find the amino acid composition of a sequence we ran the Python program *SequenceAminoComposition.py*. It computes the amino acid composition of amino acid sequences, which can be PI sequences or domain sequences, and reports each amino acid's count and percentage in the sequence.

In [55]:
# Load the per-sequence amino-acid composition: for each residue letter a count
# column ('A') and a percentage column ('A_per').
seqAminoComp=pd.read_csv('data/PINIR_PI_SequenceAminoComposition.csv')

In [56]:
seqAminoComp.head()

Unnamed: 0.1,Unnamed: 0,UniprotID,Sequence,Length,A,A_per,C,C_per,D,D_per,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,1,P05119,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,148,7,4.72973,16,10.810811,5,3.378378,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,2,P84813,DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,47,6,12.765957,7,14.893617,3,6.382979,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0
2,3,P01080,MDVHKEVNFVAYLLIVLGLLVLVSAMDVDAKACIRECGNLGFGICP...,153,11,7.189542,16,10.457516,6,3.921569,...,8,5.228758,0,0.0,0,0.0,10,6.535948,0,0
3,4,P83241,KACPRNCDTDIAYMVCPSSGERIIRKVCTNCCAAQKGCKLFRSNGS...,52,4,7.692308,8,15.384615,2,3.846154,...,2,3.846154,0,0.0,0,0.0,1,1.923077,0,0
4,5,P01078,QICTNCCAGRKGCSYFSEDGTFICKGESNPENPKACPRNCDGRIAY...,52,3,5.769231,8,15.384615,2,3.846154,...,0,0.0,0,0.0,0,0.0,2,3.846154,0,0


In [57]:
seqAminoComp.columns

Index(['Unnamed: 0', 'UniprotID', 'Sequence', 'Length', 'A', 'A_per', 'C',
       'C_per', 'D', 'D_per', 'E', 'E_per', 'F', 'F_per', 'G', 'G_per', 'H',
       'H_per', 'I', 'I_per', 'K', 'K_per', 'L', 'L_per', 'M', 'M_per', 'N',
       'N_per', 'O', 'O_per', 'P', 'P_per', 'Q', 'Q_per', 'R', 'R_per', 'S',
       'S_per', 'T', 'T_per', 'U', 'U_per', 'V', 'V_per', 'W', 'W_per', 'X',
       'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [63]:
# Keep the accession, the sequence length and the raw residue counts only; the
# '<letter>_per' percentages, 'Sequence' and 'Unnamed: 0' are dropped.
# The letters are the residue columns present in the file (no 'B' or 'J').
seqAminoComp=seqAminoComp[['UniprotID','Length']+list('ACDEFGHIKLMNOPQRSTUVWXYZ')]

In [68]:
seqAminoComp.head()

Unnamed: 0,UniprotID,Length,A,C,D,E,F,G,H,I,...,Q,R,S,T,U,V,W,X,Y,Z
0,P05119,148,7,16,5,8,8,16,1,7,...,1,5,9,6,0,7,0,0,11,0
1,P84813,47,6,7,3,1,1,4,1,3,...,1,0,2,4,0,0,0,0,1,0
2,P01080,153,11,16,6,9,5,16,2,7,...,1,3,9,5,0,8,0,0,10,0
3,P83241,52,4,8,2,1,1,4,0,4,...,1,4,4,4,0,2,0,0,1,0
4,P01078,52,3,8,2,3,2,6,0,4,...,1,3,4,2,0,0,0,0,2,0


In [64]:
seqAminoComp.describe()

Unnamed: 0,Length,A,C,D,E,F,G,H,I,K,...,Q,R,S,T,U,V,W,X,Y,Z
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,...,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,165.044248,11.473451,17.39823,7.889381,9.732301,5.334071,13.561947,2.04646,8.20354,10.573009,...,2.643805,7.238938,11.287611,8.646018,0.0,7.349558,0.396018,0.006637,7.086283,0.0
std,139.169051,9.65694,12.081056,8.833828,9.452954,6.329209,10.915241,3.122774,7.206993,9.67574,...,4.665687,9.416478,10.60149,6.93354,0.0,8.523447,2.125817,0.081288,5.380352,0.0
min,38.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81.0,6.0,9.0,3.0,3.0,2.0,7.0,1.0,4.0,4.0,...,1.0,2.0,6.0,5.0,0.0,4.0,0.0,0.0,3.0,0.0
50%,140.5,9.0,13.0,5.0,8.0,4.0,10.0,1.0,7.0,8.0,...,2.0,4.0,9.0,7.0,0.0,6.0,0.0,0.0,6.0,0.0
75%,204.0,14.0,24.0,10.0,14.0,6.0,16.0,2.0,10.0,12.0,...,4.0,11.0,13.0,10.0,0.0,8.0,0.0,0.0,11.0,0.0
max,1643.0,121.0,88.0,84.0,80.0,74.0,133.0,42.0,79.0,79.0,...,61.0,102.0,140.0,79.0,0.0,130.0,39.0,1.0,43.0,0.0


In [44]:
## Saving the data to file 22_Sequence_Amino_Composition.csv

In [59]:
## One-off export, left commented out.
## NOTE(review): the cell that later reloads data/22_Sequence_Amino_Composition.csv
## lists 'Unnamed: 0', 'Sequence' and the '*_per' columns, which the count-only
## seqAminoComp frame built above does not contain. The file on disk was presumably
## written from a different frame; confirm before re-enabling this line, because
## the master-file section selects the '*_per' columns.
## seqAminoComp.to_csv('data/22_Sequence_Amino_Composition.csv',index=False)

### Domains (IRDs) Amino Composition 
File Used *PINIR_Domain_SequenceAminoComposition.csv* created using python program *SequenceAminoComposition.py*

In [47]:
# Load the amino-acid composition of the domains (IRDs); same layout as the
# sequence-level file but keyed by 'ID' (e.g. 'IRD-1').
domainAminoComp= pd.read_csv('data/PINIR_Domain_SequenceAminoComposition.csv')

In [48]:
domainAminoComp.head(2)

Unnamed: 0.1,Unnamed: 0,ID,Sequence,Length,A,A_per,C,C_per,D,D_per,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,1,IRD-1,QICINCCAGKKGCNYFSADGTFICEGESEYVSEVNDNLEKHCPRNC...,58,3,5.172414,8,13.793103,3,5.172414,...,3,5.172414,0,0.0,0,0.0,3,5.172414,0,0.0
1,2,IRD-2,QICTNCCAGKKGCMYFSDDGTFICEGESEYVSEVPVDNKPCPRNCD...,57,2,3.508772,8,14.035088,4,7.017544,...,3,5.263158,0,0.0,0,0.0,3,5.263158,0,0.0


In [49]:
domainAminoComp.columns

Index(['Unnamed: 0', 'ID', 'Sequence', 'Length', 'A', 'A_per', 'C', 'C_per',
       'D', 'D_per', 'E', 'E_per', 'F', 'F_per', 'G', 'G_per', 'H', 'H_per',
       'I', 'I_per', 'K', 'K_per', 'L', 'L_per', 'M', 'M_per', 'N', 'N_per',
       'O', 'O_per', 'P', 'P_per', 'Q', 'Q_per', 'R', 'R_per', 'S', 'S_per',
       'T', 'T_per', 'U', 'U_per', 'V', 'V_per', 'W', 'W_per', 'X', 'X_per',
       'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [65]:
# Keep the domain ID, its length and the raw residue counts; the percentage
# columns, 'Sequence' and 'Unnamed: 0' are dropped.
domainAminoComp=domainAminoComp[['ID','Length']+list('ACDEFGHIKLMNOPQRSTUVWXYZ')]

In [66]:
domainAminoComp.head()

Unnamed: 0,ID,Length,A,C,D,E,F,G,H,I,...,Q,R,S,T,U,V,W,X,Y,Z
0,IRD-1,58,3,8,3,5,2,4,1,5,...,1,3,4,1,0,3,0,0,3,0
1,IRD-2,57,2,8,4,4,2,6,0,4,...,1,2,4,2,0,3,0,0,3,0
2,IRD-3,57,2,8,3,4,2,6,0,4,...,1,2,4,2,0,3,0,0,3,0
3,IRD-4,55,3,8,1,3,1,6,0,4,...,1,2,3,6,0,1,0,0,3,0
4,IRD-5,60,2,7,1,8,1,6,0,5,...,0,2,3,5,0,3,0,0,3,0


In [67]:
domainAminoComp.describe()

Unnamed: 0,Length,A,C,D,E,F,G,H,I,K,...,Q,R,S,T,U,V,W,X,Y,Z
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,...,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,51.19914,2.911175,7.896848,2.548711,2.319484,1.648997,4.928367,0.302292,2.550143,3.575931,...,0.648997,2.747851,3.130372,3.234957,0.0,1.230659,0.022923,0.002865,2.802292,0.0
std,1.668047,1.17756,0.536438,1.110045,1.113215,0.877753,1.350072,0.58584,1.083822,1.406317,...,0.869542,1.535623,1.173069,1.250232,0.0,1.15503,0.149764,0.05349,1.13552,0.0
min,43.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50.0,2.0,8.0,2.0,2.0,1.0,4.0,0.0,2.0,3.0,...,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0
50%,51.0,3.0,8.0,3.0,2.0,2.0,5.0,0.0,3.0,3.0,...,0.0,3.0,3.0,3.0,0.0,1.0,0.0,0.0,3.0,0.0
75%,52.0,4.0,8.0,3.0,3.0,2.0,6.0,0.0,3.0,4.0,...,1.0,4.0,4.0,4.0,0.0,2.0,0.0,0.0,4.0,0.0
max,60.0,8.0,9.0,6.0,8.0,4.0,9.0,4.0,6.0,8.0,...,4.0,7.0,7.0,8.0,0.0,6.0,1.0,1.0,6.0,0.0


In [51]:
## Saving the data to file 23_Domain_Amino_Composition.csv

In [52]:
## domainAminoComp.to_csv('data/23_Domain_Amino_Composition.csv',index=False)

## Creating the PI Master File
This file combines the following information about Pin-II PIs:
**PI General Features**, **Isoelectric Points**, **Taxonomy Details**, and **Sequence Amino Composition**.

In [69]:
# Load the PI general-features table (one row per UniprotID).
sequence=pd.read_csv('data/1_pi_general_features.csv')

In [70]:
sequence.head(1)

Unnamed: 0,UniprotID,EntryName,Status,ProteinNames,Sequence,SignalPeptide,GeneNames,OrganismID,Organism,TaxonomicLineage(ALL),...,Function[CC],GeneOntologyIDs,GeneOntology(GO),GeneOntology(cellularcomponent),GeneOntology(molecularfunction),PubMedID,Cross-reference(InterPro),Cross-reference(Pfam),Cross-reference(MEROPS),Cross-reference(EMBL)
0,P05119,IP21_SOLLC,reviewed,Wound-induced proteinase inhibitor 2 (Wound-in...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,SIGNAL 1..25,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,"cellular organisms, Eukaryota, Viridiplantae, ...",...,FUNCTION: Potent inhibitor of both trypsin and...,GO:0004867; GO:0005576,extracellular region [GO:0005576]; serine-type...,extracellular region [GO:0005576],serine-type endopeptidase inhibitor activity [...,3838986; 12684499,IPR003465;,PF02428;,I20.003;,K03291;


In [71]:
sequence.columns

Index(['UniprotID', 'EntryName', 'Status', 'ProteinNames', 'Sequence',
       'SignalPeptide', 'GeneNames', 'OrganismID', 'Organism',
       'TaxonomicLineage(ALL)', 'GeneOntology(biologicalprocess)', 'Length',
       'Mass', 'Function[CC]', 'GeneOntologyIDs', 'GeneOntology(GO)',
       'GeneOntology(cellularcomponent)', 'GeneOntology(molecularfunction)',
       'PubMedID', 'Cross-reference(InterPro)', 'Cross-reference(Pfam)',
       'Cross-reference(MEROPS)', 'Cross-reference(EMBL)'],
      dtype='object')

In [73]:
# Keep only the descriptive fields needed for the master table; the remaining
# columns (e.g. gene-ontology terms, cross-references, PubMed IDs) are dropped.
sequence=sequence[['UniprotID', 'ProteinNames', 'Sequence', 'GeneNames', 'OrganismID',
       'Organism', 'Length', 'Mass', 'Function[CC]']]

In [74]:
sequence.head(1)

Unnamed: 0,UniprotID,ProteinNames,Sequence,GeneNames,OrganismID,Organism,Length,Mass,Function[CC]
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,148,16293,FUNCTION: Potent inhibitor of both trypsin and...


In [75]:
# Reload the isoelectric-point table exported earlier in this notebook.
isoElecPts=pd.read_csv('data/21_Sequence_IsoElectricPoints.csv')

In [76]:
isoElecPts.head(1)

Unnamed: 0,UniprotID,sequence,molecular_weight,average_pI,pH_5.5_charge,pH_7.4_charge,pH_8.0_charge
0,P05119,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,16292.75374,6.879,3.2,-4.8,-10.7


In [77]:
isoElecPts.columns

Index(['UniprotID', 'sequence', 'molecular_weight', 'average_pI',
       'pH_5.5_charge', 'pH_7.4_charge', 'pH_8.0_charge'],
      dtype='object')

In [78]:
# Drop 'sequence' here, presumably because the general-features table already
# supplies 'Sequence'; UniprotID stays as the join key.
isoElecPts=isoElecPts[['UniprotID','molecular_weight', 'average_pI',
       'pH_5.5_charge', 'pH_7.4_charge', 'pH_8.0_charge']]

In [79]:
isoElecPts.head(1)

Unnamed: 0,UniprotID,molecular_weight,average_pI,pH_5.5_charge,pH_7.4_charge,pH_8.0_charge
0,P05119,16292.75374,6.879,3.2,-4.8,-10.7


In [80]:
# Left-join the isoelectric-point columns onto every PI record by UniprotID.
# validate='many_to_one' makes pandas raise MergeError if a UniprotID occurs more
# than once in isoElecPts, instead of silently duplicating PI rows.
seq_isoElecPts=pd.merge(sequence,isoElecPts,on='UniprotID',how='left',validate='many_to_one')## Creating sequence data frame with Iso Electric Points information

In [81]:
seq_isoElecPts.head(2)

Unnamed: 0,UniprotID,ProteinNames,Sequence,GeneNames,OrganismID,Organism,Length,Mass,Function[CC],molecular_weight,average_pI,pH_5.5_charge,pH_7.4_charge,pH_8.0_charge
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,148,16293,FUNCTION: Potent inhibitor of both trypsin and...,16292.75374,6.879,3.2,-4.8,-10.7
1,P84813,Potamin-1 (PT-1) (Fragment),DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,,4113,Solanum tuberosum (Potato),47,4833,FUNCTION: Inhibitor of serine proteases chymot...,4833.46574,6.2,0.7,-3.0,-5.5


In [82]:
seq_isoElecPts.UniprotID.count()

452

In [83]:
# Reload the sequence-level amino-acid composition.
# NOTE(review): the columns shown below include 'Unnamed: 0', 'Sequence' and the
# '*_per' fields, i.e. more than the count-only seqAminoComp frame built earlier
# in this notebook; confirm which frame this file was exported from.
aminoComp=pd.read_csv('data/22_Sequence_Amino_Composition.csv')

In [84]:
aminoComp.head(2)

Unnamed: 0.1,Unnamed: 0,UniprotID,Sequence,Length,A,A_per,C,C_per,D,D_per,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,1,P05119,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,148,7,4.72973,16,10.810811,5,3.378378,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,2,P84813,DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,47,6,12.765957,7,14.893617,3,6.382979,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0


In [85]:
aminoComp.columns

Index(['Unnamed: 0', 'UniprotID', 'Sequence', 'Length', 'A', 'A_per', 'C',
       'C_per', 'D', 'D_per', 'E', 'E_per', 'F', 'F_per', 'G', 'G_per', 'H',
       'H_per', 'I', 'I_per', 'K', 'K_per', 'L', 'L_per', 'M', 'M_per', 'N',
       'N_per', 'O', 'O_per', 'P', 'P_per', 'Q', 'Q_per', 'R', 'R_per', 'S',
       'S_per', 'T', 'T_per', 'U', 'U_per', 'V', 'V_per', 'W', 'W_per', 'X',
       'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [86]:
# Keep the accession, the length and, for every residue letter, its count
# column followed by its percentage column ('A', 'A_per', 'C', 'C_per', ...).
# 'Unnamed: 0' and 'Sequence' are dropped.
aminoComp=aminoComp[
    ['UniprotID','Length']
    + [column
       for residue in 'ACDEFGHIKLMNOPQRSTUVWXYZ'
       for column in (residue, residue+'_per')]
]

In [87]:
aminoComp.head(2)

Unnamed: 0,UniprotID,Length,A,A_per,C,C_per,D,D_per,E,E_per,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,P05119,148,7,4.72973,16,10.810811,5,3.378378,8,5.405405,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,P84813,47,6,12.765957,7,14.893617,3,6.382979,1,2.12766,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0


In [88]:
## Creating sequence data frame with Iso Electric Points and amino composition 
## information

In [89]:
# Left-join the amino-acid composition onto the PI table by UniprotID.
# Both frames carry a 'Length' column, so pandas suffixes them as Length_x
# (left, general features) and Length_y (right, composition).
# validate='many_to_one' raises MergeError if a UniprotID repeats in aminoComp,
# which would otherwise silently duplicate PI rows.
seq_isoElecPts_aminoComp=pd.merge(seq_isoElecPts,aminoComp,on='UniprotID',how='left',validate='many_to_one')

In [90]:
seq_isoElecPts_aminoComp.head(2)

Unnamed: 0,UniprotID,ProteinNames,Sequence,GeneNames,OrganismID,Organism,Length_x,Mass,Function[CC],molecular_weight,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,148,16293,FUNCTION: Potent inhibitor of both trypsin and...,16292.75374,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,P84813,Potamin-1 (PT-1) (Fragment),DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,,4113,Solanum tuberosum (Potato),47,4833,FUNCTION: Inhibitor of serine proteases chymot...,4833.46574,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0


In [91]:
seq_isoElecPts_aminoComp.columns

Index(['UniprotID', 'ProteinNames', 'Sequence', 'GeneNames', 'OrganismID',
       'Organism', 'Length_x', 'Mass', 'Function[CC]', 'molecular_weight',
       'average_pI', 'pH_5.5_charge', 'pH_7.4_charge', 'pH_8.0_charge',
       'Length_y', 'A', 'A_per', 'C', 'C_per', 'D', 'D_per', 'E', 'E_per', 'F',
       'F_per', 'G', 'G_per', 'H', 'H_per', 'I', 'I_per', 'K', 'K_per', 'L',
       'L_per', 'M', 'M_per', 'N', 'N_per', 'O', 'O_per', 'P', 'P_per', 'Q',
       'Q_per', 'R', 'R_per', 'S', 'S_per', 'T', 'T_per', 'U', 'U_per', 'V',
       'V_per', 'W', 'W_per', 'X', 'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [93]:
# Reorder the columns so the two length columns and the two mass columns sit
# next to each other for comparison; the residue count/percentage pairs follow.
seq_isoElecPts_aminoComp=seq_isoElecPts_aminoComp[
    ['UniprotID','ProteinNames','Sequence','GeneNames','OrganismID','Organism',
     'Length_x','Length_y','Mass','molecular_weight','Function[CC]',
     'average_pI','pH_5.5_charge','pH_7.4_charge','pH_8.0_charge']
    + [column
       for residue in 'ACDEFGHIKLMNOPQRSTUVWXYZ'
       for column in (residue, residue+'_per')]
]

In [94]:
seq_isoElecPts_aminoComp.head(2)

Unnamed: 0,UniprotID,ProteinNames,Sequence,GeneNames,OrganismID,Organism,Length_x,Length_y,Mass,molecular_weight,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,148,148,16293,16292.75374,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,P84813,Potamin-1 (PT-1) (Fragment),DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,,4113,Solanum tuberosum (Potato),47,47,4833,4833.46574,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0


In [95]:
# Load the organism table: OrganismID, Organism, Genus, Species and a 0/1
# 'Solanaceae' flag.
taxonomy=pd.read_csv('data/5_Organisms.csv')

In [96]:
taxonomy.head(2)

Unnamed: 0,OrganismID,Organism,Genus,Species,Solanaceae
0,3476,Parasponia andersonii (Sponia andersonii),Parasponia,Parasponia andersonii,0
1,3562,Spinacia oleracea (Spinach),Spinacia,Spinacia oleracea,0


In [97]:
# Drop 'Organism' before merging, presumably to avoid a duplicate
# Organism_x/Organism_y pair since the PI table already has that column.
taxonomy=taxonomy[['OrganismID', 'Genus', 'Species', 'Solanaceae']]

In [98]:
## Creating sequence data frame with Organism, Iso Electric Points and 
## amino composition information

In [99]:
# Left-join Genus, Species and the Solanaceae flag onto every PI by OrganismID.
# Many PIs share one organism, so the right-hand key must be unique;
# validate='many_to_one' raises MergeError if an OrganismID repeats in taxonomy
# rather than silently duplicating PI rows.
seq_organism_isoElecPts_aminoComp=pd.merge(seq_isoElecPts_aminoComp,taxonomy,on='OrganismID',how='left',validate='many_to_one')

In [101]:
seq_organism_isoElecPts_aminoComp.head(2)

Unnamed: 0,UniprotID,ProteinNames,Sequence,GeneNames,OrganismID,Organism,Length_x,Length_y,Mass,molecular_weight,...,W_per,X,X_per,Y,Y_per,Z,Z_per,Genus,Species,Solanaceae
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,148,148,16293,16292.75374,...,0.0,0,0.0,11,7.432432,0,0,Solanum,Solanum lycopersicum,1
1,P84813,Potamin-1 (PT-1) (Fragment),DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,,4113,Solanum tuberosum (Potato),47,47,4833,4833.46574,...,0.0,0,0.0,1,2.12766,0,0,Solanum,Solanum tuberosum,1


In [102]:
seq_organism_isoElecPts_aminoComp.columns

Index(['UniprotID', 'ProteinNames', 'Sequence', 'GeneNames', 'OrganismID',
       'Organism', 'Length_x', 'Length_y', 'Mass', 'molecular_weight',
       'Function[CC]', 'average_pI', 'pH_5.5_charge', 'pH_7.4_charge',
       'pH_8.0_charge', 'A', 'A_per', 'C', 'C_per', 'D', 'D_per', 'E', 'E_per',
       'F', 'F_per', 'G', 'G_per', 'H', 'H_per', 'I', 'I_per', 'K', 'K_per',
       'L', 'L_per', 'M', 'M_per', 'N', 'N_per', 'O', 'O_per', 'P', 'P_per',
       'Q', 'Q_per', 'R', 'R_per', 'S', 'S_per', 'T', 'T_per', 'U', 'U_per',
       'V', 'V_per', 'W', 'W_per', 'X', 'X_per', 'Y', 'Y_per', 'Z', 'Z_per',
       'Genus', 'Species', 'Solanaceae'],
      dtype='object')

In [103]:
# Final column order: identifiers and taxonomy first, then function and
# sequence, then the physico-chemical values, then the residue
# count/percentage pairs.
seq_organism_isoElecPts_aminoComp=seq_organism_isoElecPts_aminoComp[
    ['UniprotID','ProteinNames','GeneNames','OrganismID',
     'Organism','Genus','Species','Solanaceae',
     'Function[CC]','Sequence','Length_x','Length_y','Mass','molecular_weight',
     'average_pI','pH_5.5_charge','pH_7.4_charge','pH_8.0_charge']
    + [column
       for residue in 'ACDEFGHIKLMNOPQRSTUVWXYZ'
       for column in (residue, residue+'_per')]
]

In [104]:
seq_organism_isoElecPts_aminoComp.head(2)

Unnamed: 0,UniprotID,ProteinNames,GeneNames,OrganismID,Organism,Genus,Species,Solanaceae,Function[CC],Sequence,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,Solanum,Solanum lycopersicum,1,FUNCTION: Potent inhibitor of both trypsin and...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,P84813,Potamin-1 (PT-1) (Fragment),,4113,Solanum tuberosum (Potato),Solanum,Solanum tuberosum,1,FUNCTION: Inhibitor of serine proteases chymot...,DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0


In [105]:
# Sanity check before dropping one of the two length columns: count the rows
# where the length from the general-features table (Length_x) differs from the
# length in the amino-composition table (Length_y). Expected result: 0.
seq_organism_isoElecPts_aminoComp[
    seq_organism_isoElecPts_aminoComp['Length_x'].ne(
        seq_organism_isoElecPts_aminoComp['Length_y'])
]['UniprotID'].count()

0

In [107]:
# Length_y duplicates Length_x (checked in the previous cell), so remove it.
seq_organism_isoElecPts_aminoComp=seq_organism_isoElecPts_aminoComp.drop(columns='Length_y')

In [108]:
# Remove 'molecular_weight'; the 'Mass' column from the general-features table is kept.
seq_organism_isoElecPts_aminoComp=seq_organism_isoElecPts_aminoComp.drop(columns='molecular_weight')

In [110]:
seq_organism_isoElecPts_aminoComp.columns

Index(['UniprotID', 'ProteinNames', 'GeneNames', 'OrganismID', 'Organism',
       'Genus', 'Species', 'Solanaceae', 'Function[CC]', 'Sequence',
       'Length_x', 'Mass', 'average_pI', 'pH_5.5_charge', 'pH_7.4_charge',
       'pH_8.0_charge', 'A', 'A_per', 'C', 'C_per', 'D', 'D_per', 'E', 'E_per',
       'F', 'F_per', 'G', 'G_per', 'H', 'H_per', 'I', 'I_per', 'K', 'K_per',
       'L', 'L_per', 'M', 'M_per', 'N', 'N_per', 'O', 'O_per', 'P', 'P_per',
       'Q', 'Q_per', 'R', 'R_per', 'S', 'S_per', 'T', 'T_per', 'U', 'U_per',
       'V', 'V_per', 'W', 'W_per', 'X', 'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [111]:
# With Length_y gone, give the surviving length column its plain name back.
seq_organism_isoElecPts_aminoComp=seq_organism_isoElecPts_aminoComp.rename(columns={'Length_x':'Length'})

In [112]:
seq_organism_isoElecPts_aminoComp.UniprotID.count()

452

In [113]:
## Saving this Information: Sequence-Organism-IsoelectricPoint-AminoCOmposition 
## in 24_Sequence_Organism_IsoelectricPoint_Amino_Composition.csv

In [114]:
## One-off export of the combined master table, left commented out.
## NOTE(review): the next cell reads this file, so on a fresh checkout this
## line has to be executed once before continuing.
## seq_organism_isoElecPts_aminoComp.to_csv('data/24_Sequence_Organism_IsoelectricPoint_Amino_Composition.csv',index=False)

In [147]:
# Reload the combined master table (general features, taxonomy, isoelectric
# point and amino-acid composition) for exploration.
pin2PI_details=pd.read_csv('data/24_Sequence_Organism_IsoelectricPoint_Amino_Composition.csv')

In [148]:
pin2PI_details.head(2)

Unnamed: 0,UniprotID,ProteinNames,GeneNames,OrganismID,Organism,Genus,Species,Solanaceae,Function[CC],Sequence,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
0,P05119,Wound-induced proteinase inhibitor 2 (Wound-in...,,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,Solanum,Solanum lycopersicum,1,FUNCTION: Potent inhibitor of both trypsin and...,MAVHKEVNFVAYLLIVLGMFLYVDAKACTRECGNLGFGICPRSEGS...,...,7,4.72973,0,0.0,0,0.0,11,7.432432,0,0
1,P84813,Potamin-1 (PT-1) (Fragment),,4113,Solanum tuberosum (Potato),Solanum,Solanum tuberosum,1,FUNCTION: Inhibitor of serine proteases chymot...,DICTNCCAGTKGCNTTSANGAFICEGQSDPKKPKACPLNCDPHIAYA,...,0,0.0,0,0.0,0,0.0,1,2.12766,0,0


In [149]:
pin2PI_details.columns

Index(['UniprotID', 'ProteinNames', 'GeneNames', 'OrganismID', 'Organism',
       'Genus', 'Species', 'Solanaceae', 'Function[CC]', 'Sequence', 'Length',
       'Mass', 'average_pI', 'pH_5.5_charge', 'pH_7.4_charge', 'pH_8.0_charge',
       'A', 'A_per', 'C', 'C_per', 'D', 'D_per', 'E', 'E_per', 'F', 'F_per',
       'G', 'G_per', 'H', 'H_per', 'I', 'I_per', 'K', 'K_per', 'L', 'L_per',
       'M', 'M_per', 'N', 'N_per', 'O', 'O_per', 'P', 'P_per', 'Q', 'Q_per',
       'R', 'R_per', 'S', 'S_per', 'T', 'T_per', 'U', 'U_per', 'V', 'V_per',
       'W', 'W_per', 'X', 'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [150]:
pin2PI_details.UniprotID.count()

452

In [151]:
pin2PI_details.GeneNames.nunique()

265

In [152]:
pin2PI_details.OrganismID.nunique()

125

In [153]:
pin2PI_details.Organism.value_counts().head(20)

Capsicum annuum (Capsicum pepper)                                   99
Solanum tuberosum (Potato)                                          52
Nicotiana tabacum (Common tobacco)                                  22
Solanum lycopersicum (Tomato) (Lycopersicon esculentum)             21
Capsicum chinense (Scotch bonnet) (Bonnet pepper)                   13
Capsicum baccatum (Peruvian pepper)                                 11
Solanum chilense (Tomato) (Lycopersicon chilense)                   11
Nicotiana attenuata (Coyote tobacco)                                11
Nicotiana sylvestris (Wood tobacco) (South American tobacco)        10
Selaginella moellendorffii (Spikemoss)                               7
Coffea canephora (Robusta coffee)                                    6
Solanum chacoense (Chaco potato)                                     5
Rosa chinensis (China rose)                                          5
Arachis hypogaea (Peanut)                                            4
Noccae

In [154]:
pin2PI_details.Solanaceae.value_counts()

1    288
0    164
Name: Solanaceae, dtype: int64

In [155]:
pin2PI_details.columns

Index(['UniprotID', 'ProteinNames', 'GeneNames', 'OrganismID', 'Organism',
       'Genus', 'Species', 'Solanaceae', 'Function[CC]', 'Sequence', 'Length',
       'Mass', 'average_pI', 'pH_5.5_charge', 'pH_7.4_charge', 'pH_8.0_charge',
       'A', 'A_per', 'C', 'C_per', 'D', 'D_per', 'E', 'E_per', 'F', 'F_per',
       'G', 'G_per', 'H', 'H_per', 'I', 'I_per', 'K', 'K_per', 'L', 'L_per',
       'M', 'M_per', 'N', 'N_per', 'O', 'O_per', 'P', 'P_per', 'Q', 'Q_per',
       'R', 'R_per', 'S', 'S_per', 'T', 'T_per', 'U', 'U_per', 'V', 'V_per',
       'W', 'W_per', 'X', 'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [156]:
# Summary statistics for the numeric descriptors: length, mass, pI, the net
# charges, and the count/percentage pair of every residue letter.
pin2PI_details[
    ['Length','Mass','average_pI','pH_5.5_charge','pH_7.4_charge','pH_8.0_charge']
    + [column
       for residue in 'ACDEFGHIKLMNOPQRSTUVWXYZ'
       for column in (residue, residue+'_per')]
].describe()

Unnamed: 0,Length,Mass,average_pI,pH_5.5_charge,pH_7.4_charge,pH_8.0_charge,A,A_per,C,C_per,...,V,V_per,W,W_per,X,X_per,Y,Y_per,Z,Z_per
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,...,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,165.044248,17974.261062,6.075051,2.137168,-7.177434,-13.548894,11.473451,7.220966,17.39823,10.958341,...,7.349558,4.769175,0.396018,0.173246,0.006637,0.002832,7.086283,4.243356,0.0,0.0
std,139.169051,15478.092057,1.204624,4.824259,6.13344,9.931322,9.65694,2.141542,12.081056,2.023361,...,8.523447,2.243472,2.125817,0.456647,0.081288,0.035422,5.380352,1.53761,0.0,0.0
min,38.0,4263.0,3.456,-8.9,-38.5,-70.7,1.0,0.970874,3.0,2.24359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81.0,8637.0,5.1515,-0.3,-11.1,-19.9,6.0,5.658326,9.0,10.25641,...,4.0,3.271137,0.0,0.0,0.0,0.0,3.0,3.398478,0.0,0.0
50%,140.5,15367.0,6.16,1.35,-6.75,-11.05,9.0,7.228916,13.0,11.25,...,6.0,4.597701,0.0,0.0,0.0,0.0,6.0,3.924904,0.0,0.0
75%,204.0,22300.5,6.95225,3.825,-3.3,-6.4,14.0,8.75,24.0,12.195122,...,8.0,6.17284,0.0,0.0,0.0,0.0,11.0,5.392157,0.0,0.0
max,1643.0,181670.0,8.752,48.9,5.1,1.7,121.0,19.178082,88.0,17.391304,...,130.0,13.333333,39.0,3.053435,1.0,0.490196,43.0,8.737864,0.0,0.0
