In [1]:
import pandas as pd
import numpy as np
import re

# Reading Data

- #### Pfam keyword search data

In [2]:
# Pfam proteins associated with nucleus terms and membrane terms in keyword search
df_pfam_locations = pd.read_csv('../input/pfam_nucleus_membrane_data.csv')

In [3]:
# The searched terms
nucleus_terms = ['chromossome', 'chromatin', 'nucleus']
membrane_terms = ['membrane', 'cytoplasm', 'cytoskeleton', 'cytosol']

# Add variable to identify if is a nucleus or membrane term
df_pfam_locations['flag_nucleus'] = np.where(df_pfam_locations['keyword'].isin(nucleus_terms), 1, 0)
df_pfam_locations['flag_membrane'] = np.where(df_pfam_locations['keyword'].isin(membrane_terms), 1, 0)
df_pfam_locations.shape

(2888, 6)

In [4]:
# The Pfam keyword search find the same accession in different terms
# so I will group these values
df_pfam_locations = df_pfam_locations.groupby('accession')[['flag_nucleus', 'flag_membrane']].max().reset_index()
df_pfam_locations.head(3)

Unnamed: 0,accession,flag_nucleus,flag_membrane
0,PF00001,0,1
1,PF00002,0,1
2,PF00003,0,1


In [5]:
df_pfam_locations.accession.unique().shape

(2574,)

- #### Pfam HMM Output

In [6]:
df_hmm = pd.read_csv('../output/df_t_cruzi_hmm.csv', low_memory=False)
df_hmm.shape

(61583, 19)

In [7]:
df_hmm.head(3)

Unnamed: 0,target_name,accession,query_name,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
0,Hus1,PF04005.12,TcCLB.505051.20,-,4.7e-75,252.4,0.0,5.3e-75,252.2,0.0,1.0,1,0,0,1,1,1,1,Hus1-like protein
1,BNR_3,PF13859.6,TcCLB.504593.10,-,2.9e-54,184.5,0.3,2.9e-54,184.5,0.3,2.8,3,1,0,3,3,3,1,BNR repeat-like domain
2,Tr-sialidase_C,PF11052.8,TcCLB.504593.10,-,9.5e-08,32.0,5.5,1.9e-07,31.1,5.5,1.5,1,0,0,1,1,1,1,Trans-sialidase of Trypanosoma hydrophobic C-t...


In [8]:
df_hmm.accession.unique().shape

(7690,)

In [9]:
# Remove any character after dot in accession ('.')
df_hmm.accession = df_hmm.accession.apply(lambda x: re.sub(r'\..*', '', x.strip()))
# Strip whitespace from target and query name
df_hmm.target_name = df_hmm.target_name.apply(lambda x: x.strip())
df_hmm.query_name = df_hmm.query_name.apply(lambda x: x.strip())
df_hmm.head(2)

Unnamed: 0,target_name,accession,query_name,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
0,Hus1,PF04005,TcCLB.505051.20,-,4.7e-75,252.4,0.0,5.3e-75,252.2,0.0,1.0,1,0,0,1,1,1,1,Hus1-like protein
1,BNR_3,PF13859,TcCLB.504593.10,-,2.9e-54,184.5,0.3,2.9e-54,184.5,0.3,2.8,3,1,0,3,3,3,1,BNR repeat-like domain


In [10]:
print("Unique target_name: " + str(df_hmm.target_name.unique().shape))
print("Unique accession: " + str(df_hmm.accession.unique().shape))
print("Unique query_name: " + str(df_hmm.query_name.unique().shape))
print("Unique query_name + accession: " + str((df_hmm.query_name + df_hmm.accession).unique().shape))

Unique target_name: (7690,)
Unique accession: (7690,)
Unique query_name: (17544,)
Unique query_name + accession: (61583,)


- #### Swissprot blast data

In [11]:
df_swissprot = pd.read_csv('../output/df_swiss_prot_t_cruzi.csv')
df_swissprot.drop('Unnamed: 0', axis=1, inplace=True)
df_swissprot.shape

(103, 4)

In [12]:
# Transforming variables
df_swissprot['accession_2'] = df_swissprot.iteration_query_def.str.split('|', expand=True).iloc[:,1]
df_swissprot['query_name'] = df_swissprot.query_id.str.split('|', expand=True).iloc[:,0]
df_swissprot['query_name'] = df_swissprot.query_name.apply(lambda x: str(x).strip())
df_swissprot.head(3)

Unnamed: 0,iteration_query_def,aln_length,flag_hit,query_id,accession_2,query_name
0,sp|Q9GT49|TRYS_TRYCC Trypanothione synthetase ...,647,1,TcCLB.509319.90 | organism=Trypanosoma_cruzi_C...,Q9GT49,TcCLB.509319.90
1,sp|P28593|TYTR_TRYCR Trypanothione reductase O...,492,1,TcCLB.503555.30 | organism=Trypanosoma_cruzi_C...,P28593,TcCLB.503555.30
2,sp|Q9U6Z1|KM11_TRYCR Kinetoplastid membrane pr...,92,1,TcCLB.510755.89 | organism=Trypanosoma_cruzi_C...,Q9U6Z1,TcCLB.510755.89


In [13]:
print("Unique accession_2: " + str(df_swissprot.accession_2.unique().shape))
print("Unique query_name: " + str(df_swissprot.query_name.unique().shape))

Unique accession_2: (103,)
Unique query_name: (94,)


In [14]:
df_swissprot.flag_hit.value_counts()

1     92
0      8
10     1
6      1
2      1
Name: flag_hit, dtype: int64

# Join Data

- ### Pfam (hmm) and Swissprot

In [60]:
df_join_inner = df_swissprot[['query_name', 'accession_2']].merge(df_hmm, how='inner')
df_join_inner.shape

(418, 20)

In [47]:
df_join = df_hmm[['query_name', 'accession']].merge(df_swissprot[['query_name', 'accession_2']], how='left')
df_join.shape

(61587, 3)

### There are duplicated query_name on swissprot data

In [50]:
# Query names duplicados no swissprot
df_swissprot.query_name.value_counts()[df_swissprot.query_name.value_counts()>1]

nan                8
TcCLB.506679.70    2
TcCLB.506795.80    2
Name: query_name, dtype: int64

- #### Searching these query_name

In [61]:
df_join_inner[df_join_inner.query_name=="TcCLB.506679.70"]

Unnamed: 0,query_name,accession_2,target_name,accession,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
319,TcCLB.506679.70,Q4E097,eIF-6,PF01912,-,1.7e-78,262.6,0.3,2.6e-78,261.9,0.3,1.3,1,0,0,1,1,1,1,eIF-6 family
320,TcCLB.506679.70,Q4E097,2_5_RNA_ligase2,PF13563,-,0.03,14.2,0.0,0.095,12.6,0.0,1.9,2,1,0,2,2,2,0,2'-5' RNA ligase superfamily
321,TcCLB.506679.70,Q9XYP3,eIF-6,PF01912,-,1.7e-78,262.6,0.3,2.6e-78,261.9,0.3,1.3,1,0,0,1,1,1,1,eIF-6 family
322,TcCLB.506679.70,Q9XYP3,2_5_RNA_ligase2,PF13563,-,0.03,14.2,0.0,0.095,12.6,0.0,1.9,2,1,0,2,2,2,0,2'-5' RNA ligase superfamily


Links para análises:
- Uniprot Q4E097: https://www.uniprot.org/uniprot/Q4E097
    - Faz referência para: http://pfam.xfam.org/protein/Q4E097 e http://pfam.xfam.org/family/PF01912
    
- Uniprot Q9XYP3: https://www.uniprot.org/uniprot/Q9XYP3
    - Faz referência para: http://pfam.xfam.org/protein/Q9XYP3 e http://pfam.xfam.org/family/PF01912
    
___
    
- Pfam PF01912: https://pfam.xfam.org/family/PF01912
- Pfam PF13563: https://pfam.xfam.org/family/2_5_RNA_ligase2

In [62]:
df_join_inner[df_join_inner.query_name=="TcCLB.506795.80"]

Unnamed: 0,query_name,accession_2,target_name,accession,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
93,TcCLB.506795.80,Q4DA80,Pro_racemase,PF05544,-,2.5e-137,457.3,0.1 2,9e-138,457.1,0.1,1.0,1,0,0,1,1,1,1,Proline racemase
94,TcCLB.506795.80,Q4DA80,PhzC-PhzF,PF02567,-,0.013,15.0,0.0,0.026,14.0,0.0,1.4,1,0,0,1,1,1,0,Phenazine biosynthesis-like protein
95,TcCLB.506795.80,Q868H8,Pro_racemase,PF05544,-,2.5e-137,457.3,0.1 2,9e-138,457.1,0.1,1.0,1,0,0,1,1,1,1,Proline racemase
96,TcCLB.506795.80,Q868H8,PhzC-PhzF,PF02567,-,0.013,15.0,0.0,0.026,14.0,0.0,1.4,1,0,0,1,1,1,0,Phenazine biosynthesis-like protein


Links para análises:
- Uniprot Q4DA80: https://www.uniprot.org/uniprot/Q4DA80
    - Faz referência para: http://pfam.xfam.org/protein/Q4DA80 e http://pfam.xfam.org/family/PF05544
    
- Uniprot Q868H8: https://www.uniprot.org/uniprot/Q868H8
    - Faz referência para: http://pfam.xfam.org/protein/Q868H8 e http://pfam.xfam.org/family/PF05544
    
___
    
- Pfam PF02567: http://pfam.xfam.org/family/PhzC-PhzF
- Pfam PF05544: https://pfam.xfam.org/family/Pro_racemase

In [64]:
df_join_inner.head(10)

Unnamed: 0,query_name,accession_2,target_name,accession,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
0,TcCLB.509319.90,Q9GT49,GSP_synth,PF03738,-,1.5e-75,254.8,0.0,1.6999999999999998e-75,254.5,0.0,1.1,1,0,0,1,1,1,1,Glutathionylspermidine synthase preATP-grasp
1,TcCLB.509319.90,Q9GT49,CHAP,PF05257,-,5.5e-10,39.7,0.0,1.6e-09,38.2,0.0,1.9,1,0,0,1,1,1,1,CHAP domain
2,TcCLB.503555.30,P28593,Pyr_redox_2,PF07992,-,6.5e-61,206.1,0.0,8.2e-61,205.8,0.0,1.1,1,0,0,1,1,1,1,Pyridine nucleotide-disulphide oxidoreductase
3,TcCLB.503555.30,P28593,Pyr_redox,PF00070,-,4.9e-23,81.5,1.1,1.5e-19,70.4,0.0,2.7,2,0,0,2,2,2,2,Pyridine nucleotide-disulphide oxidoreductase
4,TcCLB.503555.30,P28593,Pyr_redox_dim,PF02852,-,1.5e-21,76.7,0.0,3.7e-21,75.5,0.0,1.7,1,0,0,1,1,1,1,"Pyridine nucleotide-disulphide oxidoreductase,..."
5,TcCLB.503555.30,P28593,Pyr_redox_3,PF13738,-,1.8e-11,43.8,0.2,9.9e-09,34.9,0.0,2.2,2,0,0,2,2,2,2,Pyridine nucleotide-disulphide oxidoreductase
6,TcCLB.503555.30,P28593,FAD_binding_2,PF00890,-,9.1e-07,28.3,2.0,0.0041,16.3,0.7,2.5,3,0,0,3,3,3,2,FAD binding domain
7,TcCLB.503555.30,P28593,HI0933_like,PF03486,-,3.3e-05,22.8,0.3,0.015,14.0,0.1,2.5,2,0,0,2,2,2,2,HI0933-like protein
8,TcCLB.503555.30,P28593,GIDA,PF01134,-,0.0014,17.8,1.3,0.0019,17.4,0.3,1.7,2,0,0,2,2,2,1,Glucose inhibited division protein A
9,TcCLB.503555.30,P28593,Lycopene_cycl,PF05834,-,0.004,16.3,0.1,0.52,9.3,0.0,2.2,2,0,0,2,2,2,2,Lycopene cyclase protein


In [65]:
df_join_inner[df_join_inner.accession_2=='P28593']

Unnamed: 0,query_name,accession_2,target_name,accession,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
2,TcCLB.503555.30,P28593,Pyr_redox_2,PF07992,-,6.5e-61,206.1,0.0,8.2e-61,205.8,0.0,1.1,1,0,0,1,1,1,1,Pyridine nucleotide-disulphide oxidoreductase
3,TcCLB.503555.30,P28593,Pyr_redox,PF00070,-,4.9e-23,81.5,1.1,1.5e-19,70.4,0.0,2.7,2,0,0,2,2,2,2,Pyridine nucleotide-disulphide oxidoreductase
4,TcCLB.503555.30,P28593,Pyr_redox_dim,PF02852,-,1.5e-21,76.7,0.0,3.7e-21,75.5,0.0,1.7,1,0,0,1,1,1,1,"Pyridine nucleotide-disulphide oxidoreductase,..."
5,TcCLB.503555.30,P28593,Pyr_redox_3,PF13738,-,1.8e-11,43.8,0.2,9.9e-09,34.9,0.0,2.2,2,0,0,2,2,2,2,Pyridine nucleotide-disulphide oxidoreductase
6,TcCLB.503555.30,P28593,FAD_binding_2,PF00890,-,9.1e-07,28.3,2.0,0.0041,16.3,0.7,2.5,3,0,0,3,3,3,2,FAD binding domain
7,TcCLB.503555.30,P28593,HI0933_like,PF03486,-,3.3e-05,22.8,0.3,0.015,14.0,0.1,2.5,2,0,0,2,2,2,2,HI0933-like protein
8,TcCLB.503555.30,P28593,GIDA,PF01134,-,0.0014,17.8,1.3,0.0019,17.4,0.3,1.7,2,0,0,2,2,2,1,Glucose inhibited division protein A
9,TcCLB.503555.30,P28593,Lycopene_cycl,PF05834,-,0.004,16.3,0.1,0.52,9.3,0.0,2.2,2,0,0,2,2,2,2,Lycopene cyclase protein
10,TcCLB.503555.30,P28593,FAD_oxidored,PF12831,-,0.013,14.9,0.0,0.21,10.9,0.0,2.1,2,0,0,2,2,2,0,FAD dependent oxidoreductase
11,TcCLB.503555.30,P28593,Thi4,PF01946,-,0.015,14.6,0.1,0.027,13.7,0.1,1.4,1,0,0,1,1,1,0,Thi4 family


- Fazer um crawler?