<a href="https://colab.research.google.com/github/pcddb/pepss-ed/blob/main/pepss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PepSS

PepSS is a sequence based secondary structure predictor for peptides.

It takes a list of peptides in FASTA format and provides 3-state secondary structure predictions for each residue: Helix (H), Strand (E) and Other (O).

## Instructions

1.   On your computer, put all the peptide sequences you want to predict into a single text file and name the file "peptides.fasta".
2.   On the right hand side of the screen, click the small folder icon.
3.   In the panel that opens up, right click and choose "Upload".
4.   Navigate to the peptides.fasta file you created and upload this file.
5.   Now click the "Runtime" menu at the top of the screen and choose "Run all"
6.   The results file will be automatically created and a prompt will appear asking you to download and save the file (result.fasta) to your computer.
7.   If there are any errors, these will be indicated and an attempt at an informative error message will be made. If you can't make it work feel free to email e.drew@qmul.ac.uk.



In [None]:
#@title Choose "Run all" in the Runtime menu or click the play button to the left to make prediction { display-mode: "form" }

# get the model from github repo
# add this later
!wget -O model.h5 https://github.com/elliot-drew/pepss-ed/raw/main/model.h5
from google.colab import files

import os
import numpy as np 
from sklearn import metrics
import pickle
import tensorflow as tf 
import json

# Reset Keras' global state (see the tf.keras.backend.clear_session docs)
# before the model is loaded further down in this cell.
tf.keras.backend.clear_session()

# required aa indices for making features

# Upper-case one-letter codes accepted in an input sequence: the 20 standard
# amino acids plus "X".
allowed = set("ARNDCQEGHILKMFPSTWYVX")


def check(test_str):
    """Return True when every character of test_str is in `allowed`.

    An empty string passes, since it contains no disallowed characters.
    Lower-case letters are not in `allowed` and therefore fail.
    """
    return allowed.issuperset(test_str)

# Residue letter -> column of its one-hot slot (0..20). "X" takes the last
# column; it is also the letter getWindow() uses to pad windows.
aa_ind = {
    letter: column
    for column, letter in enumerate("ARNDCQEGHILKMFPSTWYVX")
}

# Per-residue hydropathy values (they match the Kyte-Doolittle scale).
# The feature builder divides these by 4.5, the largest magnitude in the
# table. "X" is given a neutral 0.
kyledoo = {
    "A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
    "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
    "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6,
    "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2,
    "X": 0,
}

# Per-residue isoelectric point values; the feature builder divides them by
# 9.74 (the lysine entry). "X" is given 0.
# NOTE(review): the textbook pI of arginine is 10.76, so "R": 0.76 looks like
# a dropped digit. It is kept unchanged because the trained model presumably
# saw this exact value - confirm against the training code before editing.
# NOTE(review): "U" has an entry here but is not in `allowed`, so it is never
# looked up for an accepted sequence.
isoelec = {
    "A": 6.00, "R": 0.76, "N": 5.41, "D": 2.77, "C": 5.07,
    "E": 3.22, "Q": 5.65, "G": 5.97, "H": 7.59, "I": 6.02,
    "L": 5.98, "K": 9.74, "M": 5.74, "F": 5.48, "P": 6.30,
    "U": 5.68, "S": 5.68, "T": 5.60, "W": 5.89, "Y": 5.66,
    "V": 5.96,
    "X": 0,
}

# Reverse lookup of aa_ind: one-hot column (0..20) -> residue letter.
aa_ind_r = dict(enumerate("ARNDCQEGHILKMFPSTWYVX"))

# Per-residue disorder-propensity values (they appear to be the TOP-IDP
# scale). Used as a feature without rescaling; "X" is given 0.
topidp = {
    "W": -0.884, "F": -0.697, "Y": -0.510, "I": -0.486, "M": -0.397,
    "L": -0.326, "V": -0.121, "N": 0.007, "C": 0.02, "T": 0.059,
    "A": 0.06, "G": 0.166, "R": 0.18, "D": 0.192, "H": 0.303,
    "Q": 0.318, "K": 0.586, "S": 0.341, "E": 0.736, "P": 0.987,
    "X": 0,
}


# function to get window for data

def getWindow(i, resnums, resnames, width):
	
	half_window=(width-1)/2
	window=["X"]*width
	for num, name in zip(resnums, resnames):
		offset=(int(num)-int(i))+half_window
		if(offset>=0 and offset <width):
			window[int(offset)]=name
	return(window)

# get sequences from the uploaded FASTA file
try:
    with open("/content/peptides.fasta", "r") as fin:
        peptide_sequences = fin.readlines()
except FileNotFoundError as err:
    # The most likely failure is a missing upload: say so explicitly rather
    # than leaving the user with a bare traceback.
    raise FileNotFoundError(
        'Could not find "/content/peptides.fasta". Upload a file named '
        '"peptides.fasta" with the folder icon on the left, then run again.'
    ) from err

# Parallel lists: names[k] is the header of seqs[k]. Records whose sequence
# fails check() are collected in bad_seq as (name, seq) tuples instead.
bad_seq = []
names = []
seqs = []


def _store_record(record_name, record_seq):
    """File one FASTA record under names/seqs, or under bad_seq if invalid.

    Records with an empty sequence are dropped without being recorded.
    """
    if not record_seq:
        return
    if check(record_seq):
        names.append(record_name)
        seqs.append(record_seq)
    else:
        bad_seq.append((record_name, record_seq))


name = ""
seq_parts = []

for l in peptide_sequences:
    if l.startswith(">"):
        # A new header closes the record that was being accumulated.
        _store_record(name, "".join(seq_parts))
        name = l.strip()[1:]
        seq_parts = []
    elif l.strip():
        # Sequences may be wrapped over several lines; blank lines are skipped.
        seq_parts.append(l.strip())

# The last record has no following header to close it.
_store_record(name, "".join(seq_parts))

# Tell the user which records were skipped instead of dropping them silently.
if bad_seq:
    print("Skipped {0} sequence(s) containing characters other than the "
          "upper-case one-letter amino acid codes or X:".format(len(bad_seq)))
    for bad_name, bad in bad_seq:
        print("  >{0}: {1}".format(bad_name, bad))

# make the windowed input for model

# seq_windows[s][p] holds the encoded window centred on residue p of sequence
# s: 17 slots (the 19-wide window without its first and last slot), each a
# 24-value vector = 21-way one-hot + isoelectric + hydropathy + topidp.
seq_windows = []

for seq in seqs:
    # Residues are numbered 1..len(seq); the list is the same for every centre.
    resnums = list(range(1, len(seq) + 1))
    inwindows = []
    for centre in resnums:
        window = getWindow(centre, resnums, seq, 19)
        encoded_window = []
        for r2 in window[1:-1]:
            features = [1 if column == aa_ind[r2] else 0 for column in range(21)]
            features += [
                isoelec[r2] / 9.74,  # normalise
                kyledoo[r2] / 4.5,   # normalise
                topidp[r2],
            ]
            encoded_window.append(features)
        inwindows.append(encoded_window)
    seq_windows.append(inwindows)
 
# load in model

model = tf.keras.models.load_model('/content/model.h5')

# Predict one sequence at a time: its windows become a single tensor, and the
# arg-max over the model's output columns picks a letter from ss_inds.

all_preds = {}  # name -> {"raw": model output, "ss": list of state letters}
ss_inds = ["H", "E", "O"]
fasta_records = []

for n, w in zip(names, seq_windows):
    pred = model.predict(tf.convert_to_tensor(w))
    ss = [ss_inds[ind] for ind in np.argmax(pred, axis=1)]
    all_preds[n] = {"raw": pred, "ss": ss}
    # Two lines per sequence: the header, then one state letter per residue.
    fasta_records.append(">{0}".format(n) + "\n" + "".join(ss) + "\n")

all_text = "".join(fasta_records)

# Save the predictions, then hand the file to the browser via Colab's
# files.download helper.
result_path = "/content/result.fasta"

with open(result_path, "w") as fout:
    fout.write(all_text)

files.download(result_path)
  





    



