In [55]:
!rm -r spaan
!rm -r NERVE
!git clone git://github.com/nicolagulmini/spaan
!git clone git://github.com/nicolagulmini/NERVE
!python -m pip install git+https://github.com/nicolagulmini/tmhmm.py
!pip install Bio

Cloning into 'spaan'...
remote: Enumerating objects: 195, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (155/155), done.[K
remote: Total 195 (delta 109), reused 91 (delta 37), pack-reused 0[K
Receiving objects: 100% (195/195), 5.63 MiB | 10.91 MiB/s, done.
Resolving deltas: 100% (109/109), done.
Cloning into 'NERVE'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 29 (delta 8), reused 20 (delta 4), pack-reused 0[K
Receiving objects: 100% (29/29), 9.98 MiB | 14.20 MiB/s, done.
Resolving deltas: 100% (8/8), done.
Collecting git+https://github.com/nicolagulmini/tmhmm.py
  Cloning https://github.com/nicolagulmini/tmhmm.py to /tmp/pip-req-build-uvm8pop4
  Running command git clone -q https://github.com/nicolagulmini/tmhmm.py /tmp/pip-req-build-uvm8pop4


In [56]:
import numpy as np                      # to build the input arrays for the spaan model
import pandas                           # to read mhcpep.csv and write the final report
import tmhmm                            # to predict transmembrane domains
from Bio import SeqIO                   # to handle .fasta files
from tensorflow import keras            # to use the spaan model (to predict the probability of a protein to be an adhesin)

from NERVE import protein               # to contain proteins information
from spaan.data_processing import *     # to extract proteins features for the spaan model

In [57]:
# Tunable parameters of the pipeline.
# NOTE(review): except for `verbose`, none of these is read by the cells visible here;
# the descriptions marked "presumably" are inferred from the names — confirm them.
p_ad_no_citoplasm_filter = 0.46  # presumably the adhesin-probability cutoff for non-cytoplasmic proteins
p_ad_extracellular_filter = 0.38  # presumably the adhesin-probability cutoff for extracellular proteins
transmemb_dom_limit_filter = 2  # presumably the maximum number of transmembrane domains allowed
e_value = 1e-10  # presumably the e-value threshold for the BLAST searches
similarity_function = 0.8  # presumably the similarity cutoff (80%) for the Function module
verbose = 1  # values > 0 make the report cells print every protein's information

In [58]:
# main program

# TODO: check parameters, license, etc.

# Input FASTA file; point this at the proteome to analyse.
path_to_fastas = "./NERVE/data/tre.fasta"
list_of_fasta_proteins = list(SeqIO.parse(path_to_fastas, "fasta"))

# Wrap each FASTA record in a NERVE protein object built from its id and sequence.
list_of_proteins = [
    protein.protein(record.id, record.seq)
    for record in list_of_fasta_proteins
]

# With verbose output enabled, print the initial state of every protein.
if verbose > 0:
    for prot in list_of_proteins:
        prot.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = None
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = None
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = None
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



In [59]:
# Subcelloc module
# TODO: figure out how to call psortb from here
# assuming its output is available, parse each final prediction
# (the final predictions are parsed from the output of the psortb program)

In [60]:
# Adhesin module
# Scores every protein with the pre-trained spaan network and stores the result in `p_ad`.

# Path to the trained model inside the cloned spaan repository; adjust it if the model lives elsewhere.
model = keras.models.load_model('./spaan/model_trained_on_original_dataset.h5')
for p in list_of_proteins:
    # One input array per feature family, each wrapped as a batch containing this single sequence.
    spaan_inputs = [
        np.array([aminoacids_frequencies(p.sequence)]),
        np.array([multiplet_frequencies(p.sequence, 3)]),
        np.array([multiplet_frequencies(p.sequence, 4)]),
        np.array([multiplet_frequencies(p.sequence, 5)]),
        np.array([dipeptide_frequencies(p.sequence)]),
        np.array([charge_composition(p.sequence)]),
        np.array([hydrophobic_composition(p.sequence)]),
    ]
    prediction = model.predict(spaan_inputs)
    # The prediction is expected to hold exactly one value (one sequence per call).
    # Extract it explicitly: calling float() directly on an array with ndim > 0 is
    # deprecated in recent NumPy releases. `.item()` still raises if the size is not 1.
    p.p_ad = float(np.asarray(prediction).item())

In [61]:
# With verbose output enabled, print every protein again to show the adhesin probabilities.
if verbose > 0:
    for prot in list_of_proteins:
        prot.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = None
   estimated probability to be an adhesin = 0.01151728630065918
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = None
   estimated probability to be an adhesin = 0.04193472862243652
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = None
   estimated probability to be an adhesin = 0.02877509593963623
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



In [62]:
# Tmhelices module
# Annotates every protein with its TMHMM topology string and the number of
# transmembrane domains found in it.

def count_transmembrane_domains(annotation):
    """Return the number of maximal runs of 'M' in a TMHMM annotation string.

    Each run of consecutive 'M' labels is one transmembrane domain. Counting the
    start of every run (instead of only 'i'/'o' -> 'M' transitions) also covers a
    helix that begins at the first residue or follows any other non-'M' label.
    An empty annotation yields 0.
    """
    domains = 0
    previous_label = None
    for label in annotation:
        if label == 'M' and previous_label != 'M':
            domains += 1
        previous_label = label
    return domains


for p in list_of_proteins:
    # predict returns the per-residue annotation plus a second value that is not needed here;
    # it is bound to a named throwaway so IPython's `_` (last output) is not overwritten.
    annotation, _unused = tmhmm.predict(p.sequence)
    p.tmhmm_seq = annotation
    p.transmembrane_doms = count_transmembrane_domains(annotation)

  _, path = viterbi(sequence, *model)


In [63]:
# With verbose output enabled, print every protein again to show the transmembrane domain counts.
if verbose > 0:
    for prot in list_of_proteins:
        prot.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = None
   estimated probability to be an adhesin = 0.01151728630065918
   number of transmembrane domains = 0
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = None
   estimated probability to be an adhesin = 0.04193472862243652
   number of transmembrane domains = 7
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = None
   estimated probability to be an adhesin = 0.02877509593963623
   number of transmembrane domains = 14
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



In [64]:
# Autoimmunity module
!makeblastdb -in "./NERVE/UP000005640_9606.fasta" -dbtype prot -parse_seqids -out "sapiens"
!blastp -query path_to_fastas -out sapiens_output.txt -db ./sapiens
# chiedere al prof cosa significa la finestra di 9 aminoacidi 



Building a new DB, current time: 09/02/2021 15:44:39
New DB name:   /content/sapiens
New DB title:  ./NERVE/UP000005640_9606.fasta
Sequence type: Protein
Deleted existing Protein BLAST database named /content/sapiens
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 20600 sequences in 0.691802 seconds.
Command line argument error: Argument "query". File is not accessible:  `path_to_fastas'


In [65]:
# (optional) Conservation module

In [66]:
# Function module
# make blastp on sprot (all the uniprot database) and take only the ones with more than 80% similarity (positives or identities?)
# maybe try DeepGO and compare the results

# PROSITE is used here, but it only reports domains that it is not yet clear how to use
from Bio.ExPASy.ScanProsite import scan, read

# Submit the first protein's sequence to ScanProsite and parse the response.
handler = scan(list_of_proteins[0].sequence)
try:
    prosite_hits = read(handler)
finally:
    # Close the response handle even when parsing fails, so it is not leaked.
    handler.close()
prosite_hits

[{'level': '0',
  'score': '25.873',
  'sequence_ac': 'USERSEQ1',
  'signature_ac': 'PS50932',
  'start': 2,
  'stop': 57},
 {'level_tag': '(0)',
  'sequence_ac': 'USERSEQ1',
  'signature_ac': 'PS00356',
  'start': 4,
  'stop': 22}]

In [67]:
# Select module (filters)