# PINIR Data Analysis Page-2

In [1]:
import pandas as pd
import numpy as np

In [73]:
# Lift pandas' display truncation: None means "show every column / every row".
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Put the column/row display limits back to pandas' defaults.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.reset_option(option_name)

In [4]:
# Allow up to 2000 characters per cell so long text values are not cut off.
pd.options.display.max_colwidth = 2000

In [5]:
# Restore pandas' default per-cell display width.
pd.reset_option('display.max_colwidth')

## Linker Diversity and Distribution in Pin-II PIs

In [6]:
# Load the combined sequence / domain / RCL / linker / organism table.
pinir = pd.read_csv(
    'data/19_Sequences_Domains_RCL_Linker_Organisms.csv'
)

In [7]:
pinir.columns

Index(['UniprotID', 'Sequence', 'OrganismID', 'Organism', 'Genus', 'Species',
       'Solanaceae', 'DomainID', 'Domain', 'DomainStartPosition',
       'DomainEndPosition', 'DomainType', 'RclID', 'RCL', 'P2Residue',
       'P1Residue', 'P1primeResidue', 'targetProtease', 'tpID',
       'RCL_startPosition', 'RCL_endPosition', 'linkerID', 'Linker',
       'LinkerType', 'Linker_startPosition', 'Linker_endPosition'],
      dtype='object')

In [8]:
# Keep the identifier, organism, domain, RCL and linker annotation columns;
# the raw Sequence/Domain strings and all start/end positions are left out.
seq_domain = pinir[[
    'UniprotID',
    'OrganismID', 'Organism', 'Genus', 'Species', 'Solanaceae',
    'DomainID', 'DomainType',
    'RclID', 'RCL', 'P2Residue', 'P1Residue', 'P1primeResidue',
    'targetProtease', 'tpID',
    'linkerID', 'Linker', 'LinkerType',
]]

In [9]:
seq_domain.head(2)

Unnamed: 0,UniprotID,OrganismID,Organism,Genus,Species,Solanaceae,DomainID,DomainType,RclID,RCL,P2Residue,P1Residue,P1primeResidue,targetProtease,tpID,linkerID,Linker,LinkerType
0,P05119,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,Solanum,Solanum lycopersicum,1,IRD-62,1.0,RL-7,CTFNC,T,F,N,Chymotrypsin,TP-2,L-21,DPKRP,1.0
1,P05119,4081,Solanum lycopersicum (Tomato) (Lycopersicon es...,Solanum,Solanum lycopersicum,1,IRD-405,3.0,RL-7,CTFNC,T,F,N,Chymotrypsin,TP-2,,,


In [10]:
# Load the linker reference table.
linkers = pd.read_csv(
    'data/8_Linkers.csv'
)

In [11]:
linkers.shape

(78, 3)

In [12]:
linkers.count()

linkerID    78
Linker      78
Type        78
dtype: int64

In [14]:
linkers.Type.value_counts()

2    54
1    24
Name: Type, dtype: int64

In [15]:
seq_domain.shape

(1309, 18)

In [18]:
# Number of rows that have no linker type assigned.
seq_domain['LinkerType'].isna().sum()

293

In [19]:
# Rows that do carry a linker type; count their (non-null) UniprotID entries.
filt = seq_domain['LinkerType'].notna()
seq_domain.loc[filt, 'UniprotID'].count()

1016

In [17]:
seq_domain.LinkerType.value_counts() ## Distribution by Linker Type

2.0    511
1.0    505
Name: LinkerType, dtype: int64

In [20]:
# Load the per-domain table with its RCL and linker annotations.
domainDetails = pd.read_csv(
    'data/11_Domain_RCL_Linker.csv'
)

In [21]:
domainDetails.shape

(695, 18)

In [22]:
domainDetails.columns

Index(['id', 'domainID', 'domain', 'DomainType', 'RclID', 'RCL', 'P2Residue',
       'P1Residue', 'P1primeResidue', 'targetProtease', 'tpID',
       'RCL_startPosition', 'RCL_endPosition', 'linkerID', 'Linker',
       'LinkerType', 'Linker_startPosition', 'Linker_endPosition'],
      dtype='object')

### Domains distribution by Domain Type

In [23]:
domainDetails.DomainType.value_counts()

2.0    256
1.0    238
3.0    201
Name: DomainType, dtype: int64

In [24]:
seq_domain.columns

Index(['UniprotID', 'OrganismID', 'Organism', 'Genus', 'Species', 'Solanaceae',
       'DomainID', 'DomainType', 'RclID', 'RCL', 'P2Residue', 'P1Residue',
       'P1primeResidue', 'targetProtease', 'tpID', 'linkerID', 'Linker',
       'LinkerType'],
      dtype='object')

In [25]:
seq_domain.shape

(1309, 18)

In [28]:
seq_domain.duplicated().sum()

59

In [29]:
## Count of records after removing exact duplicates, which arise when the same domain occurs multiple times in a sequence

In [30]:
# UniprotID entries remaining once fully identical rows are collapsed (first kept).
seq_domain.drop_duplicates()['UniprotID'].count()

1250

In [31]:
# Keep the first occurrence of every fully identical row.
seq_domain_uniqueRecords = seq_domain.drop_duplicates()

In [32]:
seq_domain_uniqueRecords.duplicated().sum()

0

In [None]:
## After dropping UniprotID, this many records are duplicates, because different sequences
## belonging to the same organism share the same domain details

In [35]:
# Count rows that repeat an earlier row when only the organism, domain, RCL
# and linker columns are compared (UniprotID is deliberately not in the list).
organism_level_columns = [
    'OrganismID', 'Organism', 'Genus', 'Species', 'Solanaceae',
    'DomainID', 'DomainType',
    'RclID', 'RCL', 'P2Residue', 'P1Residue', 'P1primeResidue',
    'targetProtease', 'tpID',
    'linkerID', 'Linker', 'LinkerType',
]
seq_domain_uniqueRecords.duplicated(subset=organism_level_columns).sum()

448

### Finding unique relations between organisms and the domains found in them

In [36]:
# Organism-level view: every column of the de-duplicated table except UniprotID.
org_domain = seq_domain_uniqueRecords.loc[:, [
    'OrganismID', 'Organism', 'Genus', 'Species', 'Solanaceae',
    'DomainID', 'DomainType',
    'RclID', 'RCL', 'P2Residue', 'P1Residue', 'P1primeResidue',
    'targetProtease', 'tpID',
    'linkerID', 'Linker', 'LinkerType',
]]

In [38]:
# Shape of the rows that repeat an earlier organism/domain record.
org_domain[org_domain.duplicated()].shape

(448, 17)

In [39]:
# Shape of what is left after collapsing repeated organism/domain records.
org_domain.drop_duplicates().shape

(802, 17)

In [40]:
# One row per distinct organism/domain/RCL/linker combination (first occurrence kept).
org_domain_uniqueRecords = org_domain.drop_duplicates()

In [41]:
org_domain_uniqueRecords.OrganismID.count()

802

### Distribution of organisms by domain type

In [42]:
# For each domain type, how many unique records each organism contributes.
org_domain_uniqueRecords.groupby('DomainType')['Organism'].value_counts()

DomainType  Organism                                                     
1.0         Capsicum annuum (Capsicum pepper)                                68
            Nicotiana tabacum (Common tobacco)                               24
            Solanum tuberosum (Potato)                                       21
            Nicotiana sylvestris (Wood tobacco) (South American tobacco)     14
            Nicotiana attenuata (Coyote tobacco)                             13
                                                                             ..
3.0         Solanum phureja                                                   1
            Spinacia oleracea (Spinach)                                       1
            Triticum turgidum subsp. durum (Durum wheat) (Triticum durum)     1
            Triticum urartu (Red wild einkorn) (Crithodium urartu)            1
            Vitis vinifera (Grape)                                            1
Name: Organism, Length: 147, dtype: int64

In [43]:
## Optional export: uncomment the next line to write the table above to CSV.
## org_domain_uniqueRecords.groupby('DomainType').Organism.value_counts().to_csv('data/33_Distribution_of_Organism_by_domainType.csv')

## RCL distribution and specificity

In [44]:
seq_domain.shape

(1309, 18)

In [45]:
seq_domain.columns

Index(['UniprotID', 'OrganismID', 'Organism', 'Genus', 'Species', 'Solanaceae',
       'DomainID', 'DomainType', 'RclID', 'RCL', 'P2Residue', 'P1Residue',
       'P1primeResidue', 'targetProtease', 'tpID', 'linkerID', 'Linker',
       'LinkerType'],
      dtype='object')

### RCL Distribution in PIN-II PI Sequences

In [46]:
seq_domain.RCL.value_counts().head()

CPRNC    578
CTLNC    113
CPRYC     94
CPLNC     55
CTLEC     52
Name: RCL, dtype: int64

### RCL Distribution in IRDs Sequences

In [47]:
domainDetails.RCL.value_counts().head()

CPRNC    291
CTLNC     36
CPLNC     35
CTLEC     25
CPLYC     23
Name: RCL, dtype: int64

### RCL distribution Organism wise in all PI sequences

In [48]:
# RCL motif counts per organism, over every sequence/domain row.
seq_domain.groupby('Organism')['RCL'].value_counts()

Organism                                                       RCL  
Aegilops tauschii (Tausch's goatgrass) (Aegilops squarrosa)    CHQYC    1
Aegilops tauschii subsp. strangulata (Goatgrass)               CHQYC    1
                                                               CPQYC    1
Ananas comosus (Pineapple) (Ananas ananas)                     CPQYC    2
Apostasia shenzhenica                                          CPLYC    1
                                                                       ..
Triticum aestivum (Wheat)                                      CHQYC    1
Triticum turgidum subsp. durum (Durum wheat) (Triticum durum)  CPQYC    1
Triticum urartu (Red wild einkorn) (Crithodium urartu)         CPQYC    1
Vitis vinifera (Grape)                                         CPLYC    1
Zea mays (Maize)                                               CPQFC    2
Name: RCL, Length: 273, dtype: int64

### RCL distribution Organism wise in Unique-Organism occurrences

In [49]:
org_domain_uniqueRecords.OrganismID.count()

802

In [50]:
# RCL motif counts per organism, over the de-duplicated organism-level records.
org_domain_uniqueRecords.groupby('Organism')['RCL'].value_counts()

Organism                                                       RCL  
Aegilops tauschii (Tausch's goatgrass) (Aegilops squarrosa)    CHQYC    1
Aegilops tauschii subsp. strangulata (Goatgrass)               CHQYC    1
                                                               CPQYC    1
Ananas comosus (Pineapple) (Ananas ananas)                     CPQYC    2
Apostasia shenzhenica                                          CPLYC    1
                                                                       ..
Triticum aestivum (Wheat)                                      CHQYC    1
Triticum turgidum subsp. durum (Durum wheat) (Triticum durum)  CPQYC    1
Triticum urartu (Red wild einkorn) (Crithodium urartu)         CPQYC    1
Vitis vinifera (Grape)                                         CPLYC    1
Zea mays (Maize)                                               CPQFC    2
Name: RCL, Length: 273, dtype: int64

In [51]:
# Load the RCL reference table with its target-protease assignments.
rcl = pd.read_csv(
    'data/7_RCL_TP.csv'
)

In [52]:
rcl.count()

RclID             63
Rcl               63
targetProtease    63
tpID              63
dtype: int64

In [53]:
rcl.targetProtease.value_counts()

Unknown         25
Chymotrypsin    20
Trypsin         15
Elastase         3
Name: targetProtease, dtype: int64

### Residue Distribution in IRDs

In [54]:
domainDetails.columns

Index(['id', 'domainID', 'domain', 'DomainType', 'RclID', 'RCL', 'P2Residue',
       'P1Residue', 'P1primeResidue', 'targetProtease', 'tpID',
       'RCL_startPosition', 'RCL_endPosition', 'linkerID', 'Linker',
       'LinkerType', 'Linker_startPosition', 'Linker_endPosition'],
      dtype='object')

In [55]:
domainDetails.P1Residue.value_counts()

R    352
L    137
Q     85
K     44
F     36
A     10
M      9
P      6
T      4
S      3
G      2
I      2
E      2
Y      2
D      1
Name: P1Residue, dtype: int64

In [56]:
domainDetails.P2Residue.value_counts()

P    479
T    165
S     19
L     13
I      6
H      3
R      3
A      3
N      2
Y      2
Name: P2Residue, dtype: int64

In [57]:
domainDetails.P1primeResidue.value_counts()

N    421
E    115
Y     73
F     30
I     24
V     10
Q      8
D      7
H      4
K      2
T      1
Name: P1primeResidue, dtype: int64

### P1 Residue Distribution for Solanaceae and Non-Solanaceae

In [58]:
org_domain_uniqueRecords.shape

(802, 17)

In [59]:
org_domain_uniqueRecords.OrganismID.count()

802

In [60]:
org_domain_uniqueRecords.columns

Index(['OrganismID', 'Organism', 'Genus', 'Species', 'Solanaceae', 'DomainID',
       'DomainType', 'RclID', 'RCL', 'P2Residue', 'P1Residue',
       'P1primeResidue', 'targetProtease', 'tpID', 'linkerID', 'Linker',
       'LinkerType'],
      dtype='object')

In [61]:
# P1 residue counts split by the Solanaceae flag (0 / 1).
org_domain_uniqueRecords.groupby('Solanaceae')['P1Residue'].value_counts()

Solanaceae  P1Residue
0           Q             49
            L             39
            M              6
            K              5
            R              4
            F              3
            E              2
1           R            395
            L            115
            Q             58
            K             45
            F             37
            A             10
            M              6
            P              6
            T              6
            S              5
            Y              4
            G              2
            I              2
Name: P1Residue, dtype: int64

In [63]:
## Optional export: uncomment the next line to write the table above to CSV.
## org_domain_uniqueRecords.groupby('Solanaceae').P1Residue.value_counts().to_csv('data/34_P1Residue_distribution_for_Solanaceae.csv')

### RCL distribution Solanaceae wise

In [64]:
# RCL motif counts split by the Solanaceae flag (0 / 1).
org_domain_uniqueRecords.groupby('Solanaceae')['RCL'].value_counts()

Solanaceae  RCL  
0           CPLYC    23
            CPQYC    22
            CPQFC    20
            CPLIC     7
            CAMYC     5
                     ..
1           CTPNC     2
            CIREC     1
            CLQKC     1
            CSFNC     1
            CTQDC     1
Name: RCL, Length: 71, dtype: int64

## Disulphide Bond variations in the Pin-II family

In [65]:
# Load the per-sequence table with organism, isoelectric point and residue composition.
seq = pd.read_csv(
    'data/24_Sequence_Organism_IsoelectricPoint_Amino_Composition.csv'
)

In [66]:
seq.shape

(452, 64)

In [68]:
seq.columns

Index(['UniprotID', 'ProteinNames', 'GeneNames', 'OrganismID', 'Organism',
       'Genus', 'Species', 'Solanaceae', 'Function[CC]', 'Sequence', 'Length',
       'Mass', 'average_pI', 'pH_5.5_charge', 'pH_7.4_charge', 'pH_8.0_charge',
       'A', 'A_per', 'C', 'C_per', 'D', 'D_per', 'E', 'E_per', 'F', 'F_per',
       'G', 'G_per', 'H', 'H_per', 'I', 'I_per', 'K', 'K_per', 'L', 'L_per',
       'M', 'M_per', 'N', 'N_per', 'O', 'O_per', 'P', 'P_per', 'Q', 'Q_per',
       'R', 'R_per', 'S', 'S_per', 'T', 'T_per', 'U', 'U_per', 'V', 'V_per',
       'W', 'W_per', 'X', 'X_per', 'Y', 'Y_per', 'Z', 'Z_per'],
      dtype='object')

In [69]:
# Keep UniprotID plus the raw count column for each one-letter residue code
# (the *_per percentage columns are not selected).
residue_columns = list('ACDEFGHIKLMNOPQRSTUVWXYZ')
seq_amino = seq[['UniprotID', *residue_columns]]

In [70]:
seq_amino.head()

Unnamed: 0,UniprotID,A,C,D,E,F,G,H,I,K,...,Q,R,S,T,U,V,W,X,Y,Z
0,P05119,7,16,5,8,8,16,1,7,10,...,1,5,9,6,0,7,0,0,11,0
1,P84813,6,7,3,1,1,4,1,3,4,...,1,0,2,4,0,0,0,0,1,0
2,P01080,11,16,6,9,5,16,2,7,12,...,1,3,9,5,0,8,0,0,10,0
3,P83241,4,8,2,1,1,4,0,4,5,...,1,4,4,4,0,2,0,0,1,0
4,P01078,3,8,2,3,2,6,0,4,3,...,1,3,4,2,0,0,0,0,2,0


In [71]:
# Index the residue-count table by UniprotID.
# Reassigning the result (instead of inplace=True) avoids mutating a column
# subset of `seq`, which can raise SettingWithCopyWarning. The guard makes the
# cell safe to re-run: once UniprotID is the index it is no longer a column,
# and calling set_index again would raise KeyError.
if 'UniprotID' in seq_amino.columns:
    seq_amino = seq_amino.set_index('UniprotID')

In [72]:
seq_amino.head()

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,...,Q,R,S,T,U,V,W,X,Y,Z
UniprotID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P05119,7,16,5,8,8,16,1,7,10,7,...,1,5,9,6,0,7,0,0,11,0
P84813,6,7,3,1,1,4,1,3,4,1,...,1,0,2,4,0,0,0,0,1,0
P01080,11,16,6,9,5,16,2,7,12,9,...,1,3,9,5,0,8,0,0,10,0
P83241,4,8,2,1,1,4,0,4,5,1,...,1,4,4,4,0,2,0,0,1,0
P01078,3,8,2,3,2,6,0,4,3,1,...,1,3,4,2,0,0,0,0,2,0


### Display the max, min and mean value of each amino acid