In [3]:
!pip install biopython

# Leaderboard Cyclopeptide Sequencing
# Using 4Q21 FASTA (small fragment)

from Bio import SeqIO
from collections import Counter

# ------------------------------
# Step 1: Read FASTA
# ------------------------------

# Input file and the length of the leading fragment to sequence. They are
# named here so the experiment can be re-run on another protein or a longer
# fragment without hunting for literals in the statements below.
FASTA_PATH = "rcsb_pdb_4Q21.fasta"
PEPTIDE_LENGTH = 6

# NOTE(review): SeqIO.read is documented to require exactly one record in the
# file and to raise ValueError otherwise -- confirm against the Biopython docs.
record = SeqIO.read(FASTA_PATH, "fasta")
sequence = str(record.seq)

print("Full Protein Length:", len(sequence))

# Only the first PEPTIDE_LENGTH residues are used as the peptide to sequence.
peptide_string = sequence[:PEPTIDE_LENGTH]
print("Selected Peptide:", peptide_string)

# ------------------------------
# Step 2: Mass Table
# ------------------------------

# Integer residue mass for each one-letter amino-acid code
# (presumably in Daltons -- confirm the unit).
# I/L both map to 113 and K/Q both map to 128, so the 20 letters yield only
# 18 distinct masses; a mass therefore cannot always be mapped back to a
# single letter.
# NOTE: keep the entry order as written. It determines the order in which
# letters sharing a mass are listed when masses are converted back to letters
# later in this file.
mass_table = {
    'G': 57, 'A': 71, 'S': 87, 'P': 97,
    'V': 99, 'T': 101, 'C': 103, 'I': 113,
    'L': 113, 'N': 114, 'D': 115, 'K': 128,
    'Q': 128, 'E': 129, 'M': 131, 'H': 137,
    'F': 147, 'R': 156, 'Y': 163, 'W': 186
}

# Translate the selected peptide letter by letter into masses; a letter that
# is missing from mass_table raises KeyError here.
original_masses = [mass_table[aa] for aa in peptide_string]
print("Original Mass List:", original_masses)


# ------------------------------
# Step 3: Spectrum Functions
# ------------------------------

def cyclic_spectrum(peptide):
    """Return the sorted theoretical spectrum of a cyclic peptide.

    Args:
        peptide: Sequence of residue masses, in ring order.

    Returns:
        A sorted list containing 0, the mass of every contiguous fragment of
        the ring (including fragments that wrap around the end of the
        sequence), and the mass of the whole peptide exactly once.
    """
    n = len(peptide)

    # cumulative[k] is the total mass of the first k residues.
    cumulative = [0] * (n + 1)
    for k, residue_mass in enumerate(peptide, start=1):
        cumulative[k] = cumulative[k - 1] + residue_mass
    total = cumulative[n]

    masses = [0]
    for start in range(n):
        for end in range(start + 1, n + 1):
            fragment = cumulative[end] - cumulative[start]
            masses.append(fragment)
            # A fragment that touches neither end of the linear order has a
            # complementary fragment that wraps around the ring. Fragments
            # touching an end are skipped because their complement is already
            # produced as an ordinary prefix or suffix.
            if 0 < start and end < n:
                masses.append(total - fragment)

    masses.sort()
    return masses


def linear_spectrum(peptide):
    """Return the sorted theoretical spectrum of a linear peptide.

    Args:
        peptide: Sequence of residue masses, in chain order.

    Returns:
        A sorted list containing 0 and the mass of every contiguous fragment
        of the chain (no wrap-around fragments).
    """
    # Running totals: cumulative[k] is the mass of the first k residues.
    cumulative = [0]
    for residue_mass in peptide:
        cumulative.append(cumulative[-1] + residue_mass)

    # Every fragment mass is the difference between a later and an earlier
    # running total.
    masses = [0]
    for index, low in enumerate(cumulative[:-1]):
        masses.extend(high - low for high in cumulative[index + 1:])

    masses.sort()
    return masses


# Build the "experimental" spectrum from the known peptide so the search
# below has a target to recover.
experimental_spectrum = cyclic_spectrum(original_masses)

# The heaviest peak is the mass of the intact peptide.
parent_mass = max(experimental_spectrum)

print("\nExperimental Spectrum:", experimental_spectrum, sep="\n")


# ------------------------------
# Step 4: Scoring Function
# ------------------------------

def score(peptide, spectrum):
    """Count the peaks shared by a peptide's cyclic spectrum and `spectrum`.

    Args:
        peptide: Sequence of residue masses.
        spectrum: Iterable of masses to compare against.

    Returns:
        The number of masses common to both spectra, counting multiplicity:
        a mass occurring a times in one and b times in the other contributes
        min(a, b).
    """
    theoretical = Counter(cyclic_spectrum(peptide))
    observed = Counter(spectrum)

    # Counter intersection keeps, per mass, the smaller of the two counts.
    shared = theoretical & observed
    return sum(shared.values())


# ------------------------------
# Step 5: Expand
# ------------------------------

def expand(peptides):
    """Extend every peptide by one residue in every possible way.

    Args:
        peptides: Iterable of peptides, each a list of residue masses.

    Returns:
        A new list holding, for each input peptide in order, one copy per
        distinct mass in `mass_table` with that mass appended. The input
        lists are not modified.
    """
    # set(...) collapses letters that share a mass so each mass is tried once.
    return [
        peptide + [mass]
        for peptide in peptides
        for mass in set(mass_table.values())
    ]


# ------------------------------
# Step 6: Trim Leaderboard
# ------------------------------

def trim(leaderboard, spectrum, N):
    """Keep the N best-scoring peptides of a leaderboard, including ties.

    Args:
        leaderboard: List of peptides, each a list of residue masses.
        spectrum: Spectrum every peptide is scored against with `score`.
        N: Number of places on the board.

    Returns:
        The peptides ordered by descending score. When the board holds more
        than N peptides, only those scoring at least as high as the N-th best
        are kept, so the result may be longer than N if several peptides tie
        for last place. An empty list is returned when N is not positive.
    """
    # A board with no places keeps nothing. Without this guard, scored[N - 1]
    # would index from the end of the list (N == 0 reads the lowest score and
    # keeps everything; negative N picks an arbitrary cutoff).
    if N <= 0:
        return []

    scored = [(peptide, score(peptide, spectrum)) for peptide in leaderboard]
    # list.sort is stable, so equally scored peptides keep their input order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    if len(scored) <= N:
        return [peptide for peptide, _ in scored]

    # Score of the peptide in the last place on the board; everything tied
    # with it is retained as well.
    cutoff_score = scored[N - 1][1]
    return [peptide for peptide, sc in scored if sc >= cutoff_score]


# ------------------------------
# Step 7: Leaderboard Algorithm
# ------------------------------

def leaderboard_cyclopeptide_sequencing(spectrum, N):
    """Search for the cyclic peptide whose spectrum best matches `spectrum`.

    Candidates are grown one residue at a time with `expand`; after each
    round the board is cut back with `trim` so that only about N candidates
    (plus ties) are carried forward.

    Args:
        spectrum: Experimental spectrum as a collection of masses. Its
            largest value is taken as the mass of the whole peptide.
        N: Leaderboard size passed on to `trim`.

    Returns:
        The highest-scoring peptide (a list of residue masses) whose total
        mass equals the largest mass in `spectrum`. When several peptides tie
        for the best score, the first one encountered is returned. An empty
        list is returned if no peptide of the right mass scores above 0 or if
        `spectrum` is empty.
    """
    if not spectrum:
        return []

    # Take the target mass from the argument itself rather than from a
    # module-level variable, so the function works for any spectrum passed in.
    target_mass = max(spectrum)

    leaderboard = [[]]
    leader_peptide = []
    leader_score = 0

    while leaderboard:
        # Collect survivors into a fresh list instead of removing from the
        # list being scanned; order is preserved and each round stays linear.
        survivors = []
        for peptide in expand(leaderboard):
            mass = sum(peptide)

            # Too heavy: no extension of this candidate can ever match.
            if mass > target_mass:
                continue

            if mass == target_mass:
                current_score = score(peptide, spectrum)
                if current_score > leader_score:
                    leader_peptide = peptide
                    leader_score = current_score

            survivors.append(peptide)

        leaderboard = trim(survivors, spectrum, N)

    return leader_peptide


# ------------------------------
# Step 8: Run Leaderboard
# ------------------------------

# Number of places on the leaderboard; a larger value widens the search.
N = 10
result = leaderboard_cyclopeptide_sequencing(experimental_spectrum, N)

print("\nBest Peptide (Mass Format):", result, sep="\n")


# ------------------------------
# Step 9: Convert Back to Letters
# ------------------------------

# Invert mass_table: each mass maps to the list of letters that carry it,
# in the order those letters appear in mass_table.
reverse_mass = {}
for aa, mass in mass_table.items():
    if mass not in reverse_mass:
        reverse_mass[mass] = []
    reverse_mass[mass].append(aa)

print("\nBest Peptide (Amino Acid Format):")

# A mass shared by two letters makes the spelling ambiguous, so every
# combination is generated, extending all partial strings one position
# at a time.
possible_strings = ['']
for mass in result:
    possible_strings = [
        prefix + letter
        for prefix in possible_strings
        for letter in reverse_mass[mass]
    ]

print(possible_strings)

Full Protein Length: 189
Selected Peptide: MTEYKL
Original Mass List: [131, 101, 129, 163, 128, 113]

Experimental Spectrum:
[0, 101, 113, 128, 129, 131, 163, 230, 232, 241, 244, 291, 292, 345, 361, 372, 393, 404, 420, 473, 474, 521, 524, 533, 535, 602, 634, 636, 637, 652, 664, 765]

Best Peptide (Mass Format):
[128, 163, 129, 101, 131, 113]

Best Peptide (Amino Acid Format):
['KYETMI', 'KYETML', 'QYETMI', 'QYETML']
