In [1]:
import pandas as pd
import numpy as np
import re

# 1. Reading Data
This first part of the notebook reads and preprocesses all the input data:
- Pfam keyword search data: Pfam entries found by searching for nucleus- and membrane-related keywords
- T. cruzi HMM output: T. cruzi proteins aligned against the Pfam database
- Swissprot data: 103 validated T. cruzi proteins

## 1.1 Pfam keyword search data

In [2]:
# Load the Pfam entries returned by the nucleus / membrane keyword search
pfam_keyword_csv = '../input/pfam_nucleus_membrane_data.csv'
df_pfam_locations = pd.read_csv(pfam_keyword_csv)

In [3]:
# Keywords used in the Pfam search, grouped by the cellular location they indicate.
# NOTE(review): 'chromossome' looks like a misspelling of 'chromosome'; it is kept
# as-is because it must match the values stored in the 'keyword' column — confirm
# against the CSV.
nucleus_terms = ['chromossome', 'chromatin', 'nucleus', 'nucleic']
membrane_terms = ['membrane', 'cytoplasm', 'cytoskeleton', 'cytosol']

# 1/0 indicator columns telling which group the row's keyword belongs to
searched_keyword = df_pfam_locations['keyword']
df_pfam_locations['flag_nucleus'] = searched_keyword.isin(nucleus_terms).astype(int)
df_pfam_locations['flag_membrane'] = searched_keyword.isin(membrane_terms).astype(int)
df_pfam_locations.shape

(3001, 10)

In [4]:
# The keyword search can return the same accession under several terms, so
# collapse to one row per accession, keeping a flag set if any of its rows had it.
flag_columns = ['flag_nucleus', 'flag_membrane']
flags_per_accession = df_pfam_locations.groupby('accession')[flag_columns].agg('max')
df_pfam_locations = flags_per_accession.reset_index()
df_pfam_locations.head(3)

Unnamed: 0,accession,flag_nucleus,flag_membrane
0,PF00001,0,1
1,PF00002,0,1
2,PF00003,0,1


In [5]:
# Number of distinct Pfam accessions in the keyword data
df_pfam_locations['accession'].unique().shape

(2682,)

## 1.2 Pfam HMM Output

In [6]:
# Load the T. cruzi HMM output; low_memory=False makes pandas read the whole
# file before inferring column dtypes.
hmm_csv = '../output/df_t_cruzi_hmm.csv'
df_hmm = pd.read_csv(hmm_csv, low_memory=False)
df_hmm.shape

(61583, 19)

In [7]:
# Peek at the first three HMM rows
df_hmm.head(n=3)

Unnamed: 0,target_name,accession,query_name,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
0,Hus1,PF04005.12,TcCLB.505051.20,-,4.7e-75,252.4,0.0,5.3e-75,252.2,0.0,1.0,1,0,0,1,1,1,1,Hus1-like protein
1,BNR_3,PF13859.6,TcCLB.504593.10,-,2.9e-54,184.5,0.3,2.9e-54,184.5,0.3,2.8,3,1,0,3,3,3,1,BNR repeat-like domain
2,Tr-sialidase_C,PF11052.8,TcCLB.504593.10,-,9.5e-08,32.0,5.5,1.9e-07,31.1,5.5,1.5,1,0,0,1,1,1,1,Trans-sialidase of Trypanosoma hydrophobic C-t...


In [8]:
# Number of distinct (still versioned) accessions in the HMM output
df_hmm['accession'].unique().shape

(7690,)

In [9]:
# Normalise the identifier columns of the HMM output.
# Accessions carry a version suffix (e.g. 'PF04005.12'); drop everything from the
# first dot onwards so they match the unversioned accessions of the keyword data.
# Vectorised .str methods replace the row-wise apply/lambda and propagate missing
# values as NaN instead of raising on them.
df_hmm['accession'] = df_hmm['accession'].str.strip().str.replace(r'\..*', '', regex=True)
# Strip surrounding whitespace from the target and query names
df_hmm['target_name'] = df_hmm['target_name'].str.strip()
df_hmm['query_name'] = df_hmm['query_name'].str.strip()
df_hmm.head(2)

Unnamed: 0,target_name,accession,query_name,remove,e_value,score,bias,e_value2,score_2,bias_2,exp,reg,clu,ov,env,dom,rep,inc,description_of_target
0,Hus1,PF04005,TcCLB.505051.20,-,4.7e-75,252.4,0.0,5.3e-75,252.2,0.0,1.0,1,0,0,1,1,1,1,Hus1-like protein
1,BNR_3,PF13859,TcCLB.504593.10,-,2.9e-54,184.5,0.3,2.9e-54,184.5,0.3,2.8,3,1,0,3,3,3,1,BNR repeat-like domain


In [10]:
# Cardinality of each identifier column and of the query_name + accession pair
n_target_names = len(df_hmm['target_name'].unique())
n_accessions = len(df_hmm['accession'].unique())
n_query_names = len(df_hmm['query_name'].unique())
n_query_accession_pairs = len((df_hmm['query_name'] + df_hmm['accession']).unique())

print("Unique target_name: {}".format(n_target_names))
print("Unique accession: {}".format(n_accessions))
print("Unique query_name: {}".format(n_query_names))
print("Unique query_name + accession: {}".format(n_query_accession_pairs))

Unique target_name: 7690
Unique accession: 7690
Unique query_name: 17544
Unique query_name + accession: 61583


## 1.3 Swissprot blast data

In [11]:
# Load the Swissprot BLAST results and discard the index column that was saved
# into the CSV as 'Unnamed: 0'.
swissprot_csv = '../output/df_swiss_prot_t_cruzi.csv'
df_swissprot = pd.read_csv(swissprot_csv).drop(columns='Unnamed: 0')
df_swissprot.shape

(103, 4)

In [12]:
# Derive the join keys from the pipe-delimited BLAST fields:
#   accession_2 <- second field of iteration_query_def (e.g. 'sp|Q9GT49|...' -> 'Q9GT49')
#   query_name  <- first field of query_id, stringified and stripped
query_def_fields = df_swissprot['iteration_query_def'].str.split('|', expand=True)
query_id_fields = df_swissprot['query_id'].str.split('|', expand=True)

df_swissprot['accession_2'] = query_def_fields[1]
# str() turns a missing query_id into the literal text 'nan'
df_swissprot['query_name'] = query_id_fields[0].map(lambda value: str(value).strip())
df_swissprot.head(3)

Unnamed: 0,iteration_query_def,aln_length,flag_hit,query_id,accession_2,query_name
0,sp|Q9GT49|TRYS_TRYCC Trypanothione synthetase ...,647,1,TcCLB.509319.90 | organism=Trypanosoma_cruzi_C...,Q9GT49,TcCLB.509319.90
1,sp|P28593|TYTR_TRYCR Trypanothione reductase O...,492,1,TcCLB.503555.30 | organism=Trypanosoma_cruzi_C...,P28593,TcCLB.503555.30
2,sp|Q9U6Z1|KM11_TRYCR Kinetoplastid membrane pr...,92,1,TcCLB.510755.89 | organism=Trypanosoma_cruzi_C...,Q9U6Z1,TcCLB.510755.89


In [13]:
# Cardinality of the two Swissprot keys
n_swissprot_accessions = len(df_swissprot['accession_2'].unique())
n_swissprot_query_names = len(df_swissprot['query_name'].unique())

print("Unique accession_2: {}".format(n_swissprot_accessions))
print("Unique query_name: {}".format(n_swissprot_query_names))

Unique accession_2: 103
Unique query_name: 94


In [14]:
# Distribution of the flag_hit values across the Swissprot entries
df_swissprot['flag_hit'].value_counts()

1     92
0      8
10     1
6      1
2      1
Name: flag_hit, dtype: int64

In [15]:
# Keep only the entries whose query_name is not the literal text 'nan'
has_query_name = df_swissprot['query_name'].ne('nan')
df_swissprot = df_swissprot.loc[has_query_name]

# 2. Join Data
Next, the three datasets need to be joined.

The 95 Swissprot entries will be our training data. We join them with the T. cruzi HMM output on 'query_name' to get the 'accession' and 'score' information.

With the 'accession' column we can then match the Pfam keyword data and create our training set.

## 2.1 Join T. Cruzi hmm on Swissprot data

In [29]:
# Attach every HMM hit (accession + score) to its Swissprot entry.
# 'query_name' is the only column the two selections share, so it is the join key.
swissprot_keys = df_swissprot[['query_name', 'accession_2']]
hmm_hits = df_hmm[['accession', 'query_name', 'score']]
df_train = pd.merge(swissprot_keys, hmm_hits, how='left', on='query_name')
df_train.shape

(418, 4)

## 2.2 Add Pfam keywords

In [30]:
# Attach the nucleus / membrane flags to each hit by Pfam accession.
# The key is spelled out instead of relying on the columns the two frames happen
# to share, and validate='m:1' makes the merge raise if an accession appears more
# than once in the keyword data (which would silently duplicate training rows).
df_train = df_train.merge(df_pfam_locations, how='left', on='accession', validate='m:1')

In [31]:
# Weight each hit's score by its location flag. Hits with no flag (NaN after the
# left join) get weight 1, so their score counts toward both locations.
nucleus_weight = df_train['flag_nucleus'].fillna(1)
membrane_weight = df_train['flag_membrane'].fillna(1)
df_train['score_nucleus'] = nucleus_weight * df_train['score']
df_train['score_membrane'] = membrane_weight * df_train['score']

In [32]:
# Total nucleus and membrane score per Swissprot protein
location_score_cols = ['score_nucleus', 'score_membrane']
df_train = df_train.groupby('accession_2', as_index=False)[location_score_cols].sum()

In [33]:
# Label each protein: 1 when the nucleus score is higher, 0 when the membrane
# score is higher, and the string 'draw' when the two scores are equal.
nucleus_wins = df_train.score_nucleus > df_train.score_membrane
scores_tied = df_train.score_nucleus == df_train.score_membrane
# The column mixes ints with the 'draw' string, so create it as object dtype up
# front; writing a string into an int64 column relies on silent upcasting, which
# newer pandas versions deprecate.
df_train['classification'] = np.where(nucleus_wins, 1, 0).astype(object)
df_train.loc[scores_tied, 'classification'] = 'draw'

In [40]:
# First five labelled proteins
df_train.head(n=5)

Unnamed: 0,accession_2,score_nucleus,score_membrane,classification
0,O00822,98.0,87.0,1
1,O15885,245.0,245.0,draw
2,O15886,128.2,128.2,draw
3,O76240,297.3,297.3,draw
4,O96507,396.0,396.0,draw


In [38]:
# Class balance of the training labels, ties included
df_train['classification'].value_counts()

draw    65
0       24
1        6
Name: classification, dtype: int64

In [41]:
# Persist the training set, without the row index
train_csv = '../output/df_train.csv'
df_train.to_csv(train_csv, index=False)

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut

In [102]:
# Small random forest evaluated with leave-one-out cross-validation.
# random_state is fixed so the predictions printed below are the same on every
# Restart & Run All; without it each run trains a different forest.
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
loo = LeaveOneOut()

In [104]:
# Train only on proteins with a decided label (ties are left out).
is_decided = df_train['classification'] != 'draw'

# Single feature: nucleus score plus membrane score, re-indexed from 0
combined_score = (
    df_train.loc[is_decided, 'score_nucleus'] + df_train.loc[is_decided, 'score_membrane']
)
X = combined_score.rename('score').reset_index(drop=True)

# Target: the 0/1 classification of the same rows, re-indexed from 0
y = df_train.loc[is_decided, 'classification'].reset_index(drop=True)

In [105]:
# Number of leave-one-out folds that will be generated for X
loo.get_n_splits(X)

30

In [134]:
# Leave-one-out evaluation: refit the forest on all samples but one and print the
# prediction next to the true label of the held-out sample.
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # scikit-learn expects a 2-D feature matrix, hence the single-column reshape
    train_features = X_train.values.reshape(-1, 1)
    test_features = X_test.values.reshape(-1, 1)

    rf_model.fit(train_features, y_train.tolist())
    predicted = rf_model.predict(test_features)
    print("Pred: {}".format(predicted))
    print("True: {}\n".format(y_test.values))

Pred: [0]
True: [1]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [1]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [1]
True: [1]

Pred: [0]
True: [1]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [1]
True: [1]

Pred: [0]
True: [0]

Pred: [0]
True: [0]

Pred: [0]
True: [1]

Pred: [0]
True: [1]

Pred: [0]
True: [0]

Pred: [1]
True: [0]



### There are duplicated query_name values in the Swissprot data

In [None]:
# Query names that occur more than once in the Swissprot data
query_name_counts = df_swissprot['query_name'].value_counts()
query_name_counts[query_name_counts > 1]

- Looking up these query_name values

In [None]:
# Show the joined rows for the duplicated query name TcCLB.506679.70.
# NOTE(review): `df_join_inner` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel — confirm where it should come from.
df_join_inner[df_join_inner.query_name=="TcCLB.506679.70"]

Links for analysis:
- Uniprot Q4E097: https://www.uniprot.org/uniprot/Q4E097
    - Refers to: http://pfam.xfam.org/protein/Q4E097 and http://pfam.xfam.org/family/PF01912
    
- Uniprot Q9XYP3: https://www.uniprot.org/uniprot/Q9XYP3
    - Refers to: http://pfam.xfam.org/protein/Q9XYP3 and http://pfam.xfam.org/family/PF01912
    
___
    
- Pfam PF01912: https://pfam.xfam.org/family/PF01912
- Pfam PF13563: https://pfam.xfam.org/family/2_5_RNA_ligase2

In [None]:
# Show the joined rows for the duplicated query name TcCLB.506795.80.
# NOTE(review): `df_join_inner` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel — confirm where it should come from.
df_join_inner[df_join_inner.query_name=="TcCLB.506795.80"]

Links for analysis:
- Uniprot Q4DA80: https://www.uniprot.org/uniprot/Q4DA80
    - Refers to: http://pfam.xfam.org/protein/Q4DA80 and http://pfam.xfam.org/family/PF05544
    
- Uniprot Q868H8: https://www.uniprot.org/uniprot/Q868H8
    - Refers to: http://pfam.xfam.org/protein/Q868H8 and http://pfam.xfam.org/family/PF05544
    
___
    
- Pfam PF02567: http://pfam.xfam.org/family/PhzC-PhzF
- Pfam PF05544: https://pfam.xfam.org/family/Pro_racemase

In [None]:
# Show the joined rows for Swissprot accession P28593.
# NOTE(review): `df_join_inner` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel — confirm where it should come from.
df_join_inner[df_join_inner.accession_2=='P28593']

- Write a crawler?

In [None]:
# Column names for the fields of the HMM table output
col_names = ['target_name','accession','query_name','remove','e_value','score',
            'bias','e_value2','score_2','bias_2','exp','reg','clu','ov','env','dom',
            'rep','inc','description_of_target']


# Empty frame with the expected columns
df = pd.DataFrame(columns=col_names)
# Read every line without its trailing newline; the context manager closes the
# file handle, which the bare open() call left open.
with open('../output/output-file-tbl') as tbl_file:
    lines = [line.rstrip('\n') for line in tbl_file]




In [None]:
# Number of lines read from the table file
len(lines)

In [None]:
# Row offset used when previewing a single parsed line
i=0

In [None]:
# Preview the fixed-width split of one table line (line i+3 of the file).
row = lines[i+3]
[row[:21], row[21:32], row[32:51], row[51:65],
 row[65:75], row[75:80], row[80:89], row[89:98],
 row[98:105], row[105:112], row[112:118], row[118:122],
 row[122:126], row[126:130], row[130:134], row[134:138],
 row[138:142], row[142:144],
 # The last field is everything from offset 144 onwards. The previous negative
 # slice, [-(len(row)-144):], returned the whole line when the line was exactly
 # 144 characters long and a wrong suffix when it was shorter.
 row[144:]]

In [None]:
# NOTE(review): the loop body only evaluates `lines` and discards it, so this
# cell has no effect beyond rebinding `i`; looks like leftover scratch code.
for i in range(len(lines)-13):
    lines

In [None]:
import pandas as pd

# NOTE(review): `wd` hard-codes a local absolute path and is not used anywhere in
# the visible notebook; kept only in case another cell relies on it.
wd = '/home/cirofdo/Documents/Multiresolution-nuclear-protein-classifier/'

# Column names for the fields of the HMM table output
col_names = ['target_name','accession','query_name','remove','e_value','score',
            'bias','e_value2','score_2','bias_2','exp','reg','clu','ov','env','dom',
            'rep','inc','description_of_target']

# Character offsets delimiting the 18 fixed-width fields; the 19th field
# (description_of_target) runs from the last offset to the end of the line.
field_bounds = [0, 21, 32, 51, 65, 75, 80, 89, 98, 105, 112, 118, 122,
                126, 130, 134, 138, 142, 144]

# Read every line without its trailing newline; the context manager closes the
# file handle, which the bare open() call left open.
with open('../output/output-file-tbl') as tbl_file:
    lines = [line.rstrip('\n') for line in tbl_file]

# Parse lines 3 .. len(lines)-11, i.e. the first 3 and the last 10 lines of the
# file are skipped. Rows are collected in a list and the frame is built once,
# instead of growing it with df.loc[i] = ... (which copies data on every row).
rows = []
for i in range(len(lines)-13):
    line = lines[i+3]
    fields = [line[start:stop] for start, stop in zip(field_bounds, field_bounds[1:])]
    # Tail field: a plain [144:] slice. The previous negative slice returned the
    # whole line when the line was exactly 144 characters long.
    fields.append(line[field_bounds[-1]:])
    rows.append(fields)
    # Progress report every 5000 parsed rows (prints 4999, 9999, ...)
    if (i + 1) % 5000 == 0:
        print(i)

df = pd.DataFrame(rows, columns=col_names)

df.to_csv('../output/df_t_cruzi_hmm.csv', index=False)


In [None]:
# Display the parsed table.
# NOTE(review): this renders the whole frame; consider df.head() for the final notebook.
df

In [None]:
# Display the HMM frame loaded earlier in the notebook.
# NOTE(review): this renders the whole frame; consider df_hmm.head() for the final notebook.
df_hmm