In [None]:
from collections import defaultdict
import pandas as pd

# Position-specific score matrix for a 5-residue window.
# Each row below is a one-letter amino-acid code followed by its score at
# window positions 0 through 4; score_sequence() indexes a row by position.
score_table = {
    residue: [int(value) for value in values]
    for residue, *values in map(
        str.split,
        """
        A  17   9   4   4   4
        C   0   0   0   0   0
        D   1   0   3   0   0
        E   0   1   1   0   0
        F   1   2   0   5   0
        G   2   2   3   0  36
        H   0   0   0   0   0
        I   1  12   0   8   0
        K   2   0   2   0   1
        L   1   9   2  11   0
        M   0   1   0   1   0
        N   1   0   2   1   3
        P   2   0   0   0   0
        Q   0   0   2   1   0
        R   0   0   0   0   1
        S   5   1  12   2   1
        T   7   2  21  16   6
        V   2  12   0   3   0
        W   0   0   0   0   0
        Y  10   1   0   0   0
        """.strip().splitlines(),
    )
}

def score_sequence(seq, table=None):
    """Return the summed position-specific score of a 5-residue window.

    Each residue contributes the value found in its table row at the index
    of its position within the window. Residues with no row in the table
    contribute 0.

    Args:
        seq: Sequence of exactly 5 residue codes (e.g. a 5-character string).
        table: Optional mapping of residue code to a list of at least 5
            per-position scores. Defaults to the module-level ``score_table``.

    Returns:
        The sum of the five per-position scores.

    Raises:
        AssertionError: If ``len(seq)`` is not 5.
        IndexError: If a matching table row has fewer than 5 entries.
    """
    if len(seq) != 5:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O; the exception type and message
        # are the same as before.
        raise AssertionError("Sequence length must be 5")
    if table is None:
        table = score_table
    no_score = [0] * 5  # row used for residues absent from the table
    return sum(
        table.get(residue, no_score)[position]
        for position, residue in enumerate(seq)
    )

# The protein sequence to search within, written as one-letter residue codes.
# NOTE(review): the string contains a '~' character, which has no row in
# score_table. Windows overlapping it are still scored, with '~' contributing
# 0 at its position -- confirm this is intended.
protein_sequence = """MINAKMALPLLTALACMPSQASAATLLGQNIQSLSVFSNTYATTGANSSVYGSLMTGDVGTSGAGGFISGSFTAVGAATIGGGASRVGGNITAGGALTTGASSSVGGNITSGGAASVGAAGIIGGNIASAGAVSTGTASTIGGNVQSGGAASIGVGGSVGGSVSAVGAYTQGAGSSVPAHQPIVTPPVDAAVLIASLTDTVAFNQLQLLAAQAAFSRMVTTTFLDATITADTTLFSGVYSADSLATTASTTITLDGQNKIDQFWVFNIADILTTGASSKIVMINGANSNSIIWNAGGYSALGAGSTFLGTLISRANISVGANAEALGAGLSCGGLFSALSYLSTGDGAKIGGEGCTGVGAGFEVNSDGVAYHIDSSEPTSA~VPEPETWAMLMLGLGLVGTSMRRKSRAAVAA"""

# Slide a 5-residue window along the protein and score every window.
# `results` maps each distinct 5-mer to a list of (score, (start, end))
# tuples, one per occurrence; positions are 0-based and end-exclusive.
window_size = 5  # must match the length score_sequence() accepts
results = defaultdict(list)

for start in range(len(protein_sequence) - window_size + 1):
    end = start + window_size
    subseq = protein_sequence[start:end]
    results[subseq].append((score_sequence(subseq), (start, end)))

# Order the distinct 5-mers by their best score, highest first. The sort is
# stable, so equally scored 5-mers stay in order of first occurrence.
sorted_results = sorted(
    results.items(),
    key=lambda item: max(score for score, _ in item[1]),
    reverse=True,
)

# Flatten to one [sequence, score, position] row per occurrence.
sorted_results_display = []
for seq, matches in sorted_results:
    sorted_results_display.extend([seq, score, pos] for score, pos in matches)

# Create a DataFrame from the sorted results
df = pd.DataFrame(sorted_results_display, columns=["Sequence", "Score", "Position"])

# Save all results to a CSV file
df.to_csv('motif_example_results.csv', index=False)

# Show the top 10 results. Printed explicitly: a bare `df` expression in the
# middle of a cell or script displays nothing.
print(df.head(10))

# # Get the top 10 sequences
# top_10_sequences = [seq for seq, _ in sorted_results[:10]]

# # Function to highlight a sequence in the protein sequence
# def highlight_sequence(protein_sequence, seq):
#     return protein_sequence.replace(seq, f'\033[1;31m{seq}\033[0m')

# # Highlight the top 10 sequences in the protein sequence
# highlighted_protein_sequence = protein_sequence
# for seq in top_10_sequences:
#     highlighted_protein_sequence = highlight_sequence(highlighted_protein_sequence, seq)

# # Print the highlighted protein sequence
# print(highlighted_protein_sequence)