In [1]:
!rm -r spaan
!rm -r NERVE
!git clone git://github.com/nicolagulmini/spaan
!git clone git://github.com/nicolagulmini/NERVE
!python -m pip install git+https://github.com/nicolagulmini/tmhmm.py
!pip install Bio
!apt-get install ncbi-blast+ # for the autoimmunity module if we meant to use blastp to make the comparisons

Cloning into 'spaan'...
remote: Enumerating objects: 195, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (155/155), done.[K
remote: Total 195 (delta 109), reused 91 (delta 37), pack-reused 0[K
Receiving objects: 100% (195/195), 5.63 MiB | 28.99 MiB/s, done.
Resolving deltas: 100% (109/109), done.
Cloning into 'NERVE'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 45 (delta 12), reused 33 (delta 5), pack-reused 0[K
Receiving objects: 100% (45/45), 17.21 MiB | 23.25 MiB/s, done.
Resolving deltas: 100% (12/12), done.
Collecting git+https://github.com/nicolagulmini/tmhmm.py
  Cloning https://github.com/nicolagulmini/tmhmm.py to /tmp/pip-req-build-tn0hynmh
  Running command git clone -q https://github.com/nicolagulmini/tmhmm.py /tmp/pip-req-build-tn0hynmh


In [2]:
import numpy as np                      # arrays fed to the spaan model; imported explicitly instead of relying on a star import
import pandas                           # to read mhcpep.csv and write the final report
import tmhmm                            # to predict transmembrane domains
from Bio import SeqIO                   # to handle .fasta files
from tensorflow import keras            # to use the spaan model (to predict the probability of a protein to be an adhesin)

from NERVE import protein               # to contain proteins information
from spaan.data_processing import *     # to extract proteins features for the spaan model

In [3]:
# Tunable parameters of the pipeline.

# Cells print the per-protein report only when verbose > 0.
verbose = 1

# Thresholds that are not consumed by any cell visible in this notebook yet;
# presumably they are meant for the Select module (filters) -- TODO confirm.
p_ad_no_citoplasm_filter = 0.46
p_ad_extracellular_filter = 0.38
transmemb_dom_limit_filter = 2

# Presumably the BLAST e-value cutoff and the similarity cutoff mentioned in the
# Function module notes -- TODO confirm where they are applied.
e_value = 1e-10
similarity_function = 0.8

In [4]:
# main program

# TODO: check parameters, license etc.

# Location of the input proteome (put the right path here).
path_to_fastas = "./NERVE/data/tre.fasta"
list_of_fasta_proteins = list(SeqIO.parse(path_to_fastas, "fasta"))

# Wrap every FASTA record (id + sequence) in a NERVE protein object.
list_of_proteins = [
    protein.protein(record.id, record.seq)
    for record in list_of_fasta_proteins
]

# Show the freshly created proteins when running verbosely.
if verbose > 0:
    for p in list_of_proteins:
        p.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = None
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = None
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = None
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



In [5]:
# Subcelloc module
# TODO: figure out how to call psortb from here.
print("Warning: the PSORTB output has to be 'terse'!")

# Parse the final predictions from the output of the psortb program.
filename = "20210903075318_psortb_gramneg.txt" # put the right filename from the previous (and still absent) lines of code

# `with` guarantees the file is closed even if reading fails.
with open("./NERVE/data/"+filename, 'r') as fp:
    lines = fp.readlines()

# Map protein id -> predicted localization. The first line is skipped
# (presumably the column header of the terse output).
localization_by_id = {}
for line in lines[1:]:
    attributes = line.split(' ')
    # The last space-separated token carries the tab-separated final fields,
    # example: ['SV=2', 'Cytoplasmic', '9.97\n']
    last_attributes = attributes[-1].split('\t')
    if len(last_attributes) < 2:
        # Blank or malformed line: nothing to extract.
        continue
    # If this picks the wrong field, try last_attributes[len(last_attributes)-2].
    localization_by_id[attributes[0]] = last_attributes[1]

# Match by id rather than by position, so extra/trailing lines or a different
# ordering in the psortb file neither raise IndexError nor drop predictions.
for p in list_of_proteins:
    if p.id in localization_by_id:
        p.localization = localization_by_id[p.id]

# Make proteins without a prediction visible instead of leaving them silently unset.
proteins_without_localization = [str(p.id) for p in list_of_proteins if p.id not in localization_by_id]
if proteins_without_localization:
    print("Warning: no PSORTB localization found for: " + ", ".join(proteins_without_localization))



In [6]:
# Print the per-protein report again to inspect the result of the Subcelloc module.
if verbose > 0:
    for p in list_of_proteins:
        p.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = Cytoplasmic
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = CytoplasmicMembrane
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = CytoplasmicMembrane
   estimated probability to be an adhesin = 0
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



In [7]:
# Adhesin module

model = keras.models.load_model('./spaan/model_trained_on_original_dataset.h5') # insert the path to location!
for p in list_of_proteins:
    # One single-sample batch per feature family, passed to the model in this order.
    model_inputs = [
        np.array([aminoacids_frequencies(p.sequence)]),
        np.array([multiplet_frequencies(p.sequence, 3)]),
        np.array([multiplet_frequencies(p.sequence, 4)]),
        np.array([multiplet_frequencies(p.sequence, 5)]),
        np.array([dipeptide_frequencies(p.sequence)]),
        np.array([charge_composition(p.sequence)]),
        np.array([hydrophobic_composition(p.sequence)]),
    ]
    prediction = model.predict(model_inputs)
    # Extract the single predicted value explicitly: calling float() directly on
    # an array with ndim > 0 is deprecated in recent NumPy releases.
    p.p_ad = float(np.asarray(prediction).item())

In [8]:
# Print the per-protein report again to inspect the adhesin probabilities just computed.
if verbose > 0:
    for p in list_of_proteins:
        p.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = Cytoplasmic
   estimated probability to be an adhesin = 0.01151728630065918
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = CytoplasmicMembrane
   estimated probability to be an adhesin = 0.04193472862243652
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = CytoplasmicMembrane
   estimated probability to be an adhesin = 0.02877509593963623
   number of transmembrane domains = None
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



In [9]:
# Tmhelices module

for p in list_of_proteins:
    # tmhmm.predict returns the per-residue annotation first; the second value is not used here.
    annotation, _ = tmhmm.predict(p.sequence)
    p.tmhmm_seq = annotation

    # Count maximal runs of 'M' (membrane) labels. Counting only 'i'/'o' -> 'M'
    # transitions would miss a helix that starts at the first residue or that
    # follows any other label (tmhmm.py may also emit 'O' -- TODO confirm).
    transmembrane_domains = 0
    previous_state = None
    for state in annotation:
        if state == 'M' and previous_state != 'M':
            transmembrane_domains += 1
        previous_state = state
    p.transmembrane_doms = transmembrane_domains

  _, path = viterbi(sequence, *model)


In [10]:
# Print the per-protein report again to inspect the transmembrane-domain counts.
if verbose > 0:
    for p in list_of_proteins:
        p.print_information()

Information about protein sp|P06846|EBGR_ECOLI:
   accession number = P06846
   length = 327
   localization = Cytoplasmic
   estimated probability to be an adhesin = 0.01151728630065918
   number of transmembrane domains = 0
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AAC4|YBHL_ECOLI:
   accession number = P0AAC4
   length = 234
   localization = CytoplasmicMembrane
   estimated probability to be an adhesin = 0.04193472862243652
   number of transmembrane domains = 7
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None

Information about protein sp|P0AE16|AMPG_ECOLI:
   accession number = P0AE16
   length = 491
   localization = CytoplasmicMembrane
   estimated probability to be an adhesin = 0.02877509593963623
   number of transmembrane domains = 14
   number of peptides shared with sapiens = None
   number of peptides shared with mhcpep = None



# From here on

In [15]:
# Autoimmunity module

#(!makeblastdb -in "./NERVE/UP000005640_9606.fasta" -dbtype prot -parse_seqids -out "sapiens")
# NON SERVE FARE IL DATABASE PERCHE L'HO GIA CARICATO SU GITHUB ALL'INDIRIZZO ./NERVE/sapiens_database/sapiens
!blastp -query ./NERVE/data/tre.fasta -out sapiens_output.txt -db ./NERVE/sapiens_database/sapiens
# chiedere al prof cosa significa la finestra di 9 aminoacidi 

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ncbi-blast+ is already the newest version (2.6.0-1).
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.


In [12]:
# (optional) Conservation module

In [13]:
# Function module
# make blastp on sprot (all the uniprot database) and take only the ones with more than 80% similarity (positives or identities?)
# maybe try DeepGO and compare the results

# here PROSITE is used, but it only finds domains that I do not know how to use yet
from Bio.ExPASy.ScanProsite import scan, read

# scan() queries the remote ScanProsite service for the first protein only.
handler = scan(list_of_proteins[0].sequence)
try:
    prosite_hits = read(handler)
finally:
    # Release the network handle even if parsing fails.
    handler.close()
prosite_hits

# waiting to hear back from the professor

[{'level': '0',
  'score': '25.873',
  'sequence_ac': 'USERSEQ1',
  'signature_ac': 'PS50932',
  'start': 2,
  'stop': 57},
 {'level_tag': '(0)',
  'sequence_ac': 'USERSEQ1',
  'signature_ac': 'PS00356',
  'start': 4,
  'stop': 22}]

In [14]:
# Select module (filters)