In [None]:
# The original matrix search was written by H Nakano 20220908.
# The since-deprecated search algorithm leveraging base20 indexing was written by Soki Nakano.
# Further performance improvements, code cleanup and flexible use of any peptide size written by Maurizio Camagna

import pandas as pd
import numpy as np
import sys, csv, re

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

## Notes:
- The column names of the score table are arbitrary, except for the first column, which must be named **AA**
- The order of the rows in the score table is also arbitrary, as the script sorts the rows alphabetically by the **AA** column

# Inputs

In [None]:
# --- Input files ---
# Score table (CSV): first column "AA", one further column per peptide position.
score_table_file = "Input_file.csv"
# Multi-FASTA file whose sequences are searched.
fasta_in = "Human_proteins.fasta"

# --- Search threshold ---
# Only peptides scoring strictly above this value are reported.
minimum_score = 12

# --- Output ---
# Name of the full per-hit CSV; the dense output file name is derived from it.
csv_name = "output.csv"
# When True, the full per-hit CSV is skipped and only the dense CSV is written.
write_only_dense_output = True

<br><br><br><br>

In [None]:
# Load the score table: one row per amino acid, one score column per peptide position.
score_table = pd.read_csv(score_table_file)

# The following cells address the amino-acid column both by name ("AA") and by
# position (first column). Fail early if the two do not coincide; otherwise the
# scoring silently breaks and the search just reports zero hits.
if len(score_table.columns) == 0 or score_table.columns[0] != "AA":
    raise ValueError(
        f"The first column of '{score_table_file}' must be named 'AA', "
        f"found columns: {list(score_table.columns)}"
    )

score_table

Let's take a look at the scoring matrix to confirm that it is correct and to help us choose a `minimum_score`.

In [None]:
# Preview of the score range: for every peptide position (all columns after the
# amino-acid column) pick the amino acid sitting in the row with the highest /
# lowest score, and add up the per-position extremes.
position_columns = score_table.columns[1:]
amino_acid_column = score_table.iloc[:, 0]

best_peptide = "".join(
    amino_acid_column.iloc[score_table[position].argmax()] for position in position_columns
)
worst_peptide = "".join(
    amino_acid_column.iloc[score_table[position].argmin()] for position in position_columns
)

position_scores = score_table.iloc[:, 1:]
max_score = position_scores.max().sum()
min_score = position_scores.min().sum()

print("According to the score table, the min/max possible scores are:")
print(f"Max: {max_score}\t{best_peptide}")
print(f"Min: {min_score}\t{worst_peptide}")

In [None]:
# n: peptide length, i.e. the number of score columns (every column except "AA").
n = score_table.shape[1] - 1

# Score rows as plain lists ([AA, score at position 1, ..., score at position n]),
# ordered alphabetically by amino-acid letter.
score_table_list = score_table.sort_values(by="AA").values.tolist()

In [None]:
# Map each amino-acid letter to the index of its row in score_table_list.
# The mapping is read from the (alphabetically sorted) score table itself rather
# than hard-coded, so it stays aligned when the table contains additional
# letters (e.g. "X") or lacks some of the 20 standard ones. For a table holding
# exactly the 20 standard amino acids this yields A->0, C->1, ..., Y->19.
# Letters are upper-cased because the lookup key is the residue as it appears
# in the searched sequence.
pep_num_list = [
    (str(row[0]).strip().upper(), row_index)
    for row_index, row in enumerate(score_table_list)
]
pep_num_dict = dict(pep_num_list)

In [None]:
def pep2score(pep):
    """Return the summed position-specific score of the peptide `pep`.

    Every residue is translated to its row of `score_table_list` through
    `pep_num_dict`; the score is then read from the column matching the
    residue's position (column 0 holds the amino-acid letter, hence the
    offset of one). A residue missing from `pep_num_dict` raises KeyError.
    """
    total = 0
    for column, residue in enumerate(pep, start=1):
        row = score_table_list[pep_num_dict[residue]]
        total += row[column]
    return total

In [None]:
# Cache: peptide -> score. None marks a peptide that cannot be scored because it
# contains a residue that is not part of the score table (e.g. X, U, B, Z, *).
scores = {}

# One entry per hit: [id, description, 1-based position, score, peptide, full sequence]
out = []

for record in SeqIO.parse(fasta_in, 'fasta'):
    id_part = record.id
    desc_part = record.description
    seq = str(record.seq)

    # Slide a window of length n over the sequence
    # (the range is empty when the sequence is shorter than n).
    for i in range(len(seq) - n + 1):
        pep = seq[i:i+n]

        if pep in scores:
            score = scores[pep]
        else:
            try:
                score = pep2score(pep)
            except KeyError:
                # Unknown residue: this window cannot be scored. Only KeyError is
                # caught so that genuine problems (e.g. a malformed score table)
                # are not silently hidden.
                score = None
            # Cache failures too, so repeated unscorable windows are not retried.
            scores[pep] = score

        # Unscorable windows are skipped explicitly instead of via a numeric
        # sentinel, which would be reported as a hit for a negative minimum_score.
        # Note: the comparison is strict, a score equal to minimum_score is no hit.
        if score is not None and score > minimum_score:
            out.append([id_part, desc_part, i+1, score, pep, seq])


Let's write the peptide results to a CSV file (this file is skipped when `write_only_dense_output` is `True`)

In [None]:
if len(out)>0:
    # Convert the raw hit list into a DataFrame indexed by protein id.
    # The conversion is skipped when `out` already is a DataFrame, so this cell
    # can be re-run safely (re-assigning six column names to the already indexed,
    # five-column frame would raise a length-mismatch error).
    if not isinstance(out, pd.DataFrame):
        out = pd.DataFrame(out, columns=["id","desc","posi","score","target","sequence"])
        out = out.set_index('id')

    # Best-scoring peptides first
    out = out.sort_values(by='score', ascending=False)

    # The full per-hit table is only written when explicitly requested
    if not write_only_dense_output:
        out.to_csv(csv_name)
else:
    print("No output file written: 0 hits")

<br>Let's also write a more compact output file (one row per protein)

In [None]:
if len(out)>0:
    # Make sure the hits are sorted by peptide score, so that the peptide listing
    # of every protein starts with its best-scoring peptide
    out = out.sort_values(by='score', ascending=False)

    out_list = []
    out_list_columns = ["id", 'total score',"highest individual peptide score", 'peptide matches', 'unique peptide matches', 'sequence', 'peptides (peptide,score,count)']

    # One output row per protein: group the hits by protein id (the index of `out`)
    for protein_name, df in out.groupby(out.index):

        protein_score = df['score'].sum()  # total peptide score for this protein
        peptide_matches = df['target'].count()  # how many peptides matched
        unique_peptide_matches = df['target'].nunique()  # matched peptides, duplicates removed
        highest_peptide_score = df['score'].max()

        # Write the protein sequence in lower case, except for the positions
        # covered by a matching peptide. The recorded (1-based) hit positions are
        # used instead of a regex substitution, because a substitution only
        # replaces non-overlapping occurrences and therefore leaves parts of
        # overlapping hits (e.g. within repeats) in lower case.
        residues = list(df['sequence'].iloc[0].lower())
        for position, target in zip(df['posi'], df['target']):
            first = int(position) - 1
            last = min(first + len(target), len(residues))
            for k in range(first, last):
                residues[k] = residues[k].upper()
        seq = "".join(residues)

        # "peptide,score,count" entries, one per unique peptide, best score first
        peptide_counts = df['target'].value_counts().to_dict()
        peptide_entries = {}
        for peptide, score in zip(df['target'], df['score']):
            peptide_entries[peptide] = f"{peptide},{round(score,3)},{peptide_counts[peptide]}"
        peptide_string = "|".join(peptide_entries.values())

        out_list.append([protein_name, protein_score, highest_peptide_score, peptide_matches, unique_peptide_matches, seq, peptide_string])

    dense_out = pd.DataFrame(out_list, columns=out_list_columns)
    dense_out = dense_out.sort_values(by='total score', ascending=False)

    # Derive the dense file name by swapping a trailing ".csv" for ".dense.csv".
    # If csv_name has no such extension the suffix is appended, so the dense
    # file can never end up with the same name as the full output file.
    if csv_name.lower().endswith(".csv"):
        dense_csv_name = csv_name[:-len(".csv")] + '.dense.csv'
    else:
        dense_csv_name = csv_name + '.dense.csv'
    dense_out.to_csv(dense_csv_name)