In [24]:
from Bio import SeqIO
from Bio.Seq import Seq
import re

def parse_fasta(fasta_content):
    """Parse FASTA content and return dictionary of sequences."""
    sequences = {}
    current_header = None
    current_sequence = []
    
    for line in fasta_content.split('\n'):
        if line.startswith('>'):
            if current_header:
                sequences[current_header] = ''.join(current_sequence)
            current_header = line.split('|')[1].split('_')[0]  # Get protein identifier
            current_sequence = []
        else:
            current_sequence.append(line.strip())
    
    if current_header:
        sequences[current_header] = ''.join(current_sequence)
    
    return sequences

def parse_peptides(peptides_content):
    """Parse peptides file content and return list of peptides."""
    peptides = []
    for line in peptides_content.split('\n'):
        peptide = line.strip()
        if peptide and peptide != "Peptide":  # Skip header and empty lines
            peptides.append(peptide)
    return peptides

def search_peptides(peptides, protein_sequences):
    """Search for exact matches of peptides in protein sequences."""
    matches = []
    
    for peptide in peptides:
        if not peptide:  # Skip empty peptides
            continue
            
        for protein_id, sequence in protein_sequences.items():
            if peptide in sequence:
                matches.append({
                    'peptide': peptide,
                    'peptide_length': len(peptide),
                    'protein': protein_id,
                    'position': sequence.find(peptide) + 1  # 1-based position
                })
    
    return matches

def main():
    # Read the peptides file
    with open('cow_peptides.txt', 'r') as f:
        peptides_content = f.read()
    
    # Read the FASTA file
    with open('main_homo_proteins.fasta', 'r') as f:
        fasta_content = f.read()
    
    # Parse the input files
    global peptides, matches
    peptides = parse_peptides(peptides_content)
    protein_sequences = parse_fasta(fasta_content)
    
    # Search for matches
    matches = search_peptides(peptides, protein_sequences)
    
    # Sort matches by peptide length (descending) and protein ID
    matches.sort(key=lambda x: (-x['peptide_length'], x['protein']))
    
    # Print results
    print(f"Found {len(matches)} matches:")
    print("\nPeptide\tLength\tProtein\tPosition")
    print("-" * 50)
    
    for match in matches:
        print(f"{match['peptide']}\t{match['peptide_length']}\t{match['protein']}\t{match['position']}")
    
    # Print summary statistics
    unique_peptides = len(set(match['peptide'] for match in matches))
    unique_proteins = len(set(match['protein'] for match in matches))
    
    print(f"\nSummary:")
    print(f"Total matches: {len(matches)}")
    print(f"Unique peptides with matches: {unique_peptides}")
    print(f"Proteins with at least one match: {unique_proteins}")

if __name__ == "__main__":
    main()

Found 104 matches:

Peptide	Length	Protein	Position
--------------------------------------------------
ENLHLPLPLL	10	P05814	137
NLHLPLPLL	9	P05814	138
WLAHKAL	7	P00709	123
LHLPLPL	7	P05814	139
LAHKAL	6	P00709	124
PYKLRP	6	P02788	90
LRPVAA	6	P02788	93
LPLPLL	6	P05814	141
KVLILA	6	P05814	2
VLPVPQ	6	P05814	90
VPYPQR	6	P05814	184
WLAHK	5	P00709	123
YKLRP	5	P02788	91
LPVPQ	5	P05814	91
LPLPL	5	P05814	141
YGLF	4	P00709	69
KLRP	4	P02788	92
LAMA	4	P02788	610
LPVP	4	P05814	91
WLA	3	P00709	123
YGL	3	P00709	69
PEL	3	P00709	43
YGG	3	P00709	37
LRP	3	P02788	93
VPP	3	P02788	329
MAP	3	P02788	612
IQA	3	P02788	65
YLG	3	P02788	338
LKK	3	P02788	693
LPQ	3	P05814	77
LPL	3	P05814	82
VEP	3	P05814	69
LQP	3	P05814	146
LPP	3	P05814	163
TVY	3	P05814	107
IPP	3	P07498	120
YPY	3	P07498	55
VEP	3	P07498	140
YQK	3	P07498	39
YPY	3	P47710	145
LPL	3	P47710	19
PFP	3	P47710	160
RPK	3	P47710	16
LP	2	P00709	42
FP	2	P00709	14
LF	2	P00709	7
LL	2	P00709	30
VP	2	P00709	5
WL	2	P00709	123
EL	2	P00709	26
LW	2	P00709	78
YG	2	P00709	37

In [10]:
matches.key(['peptide'])

AttributeError: 'list' object has no attribute 'key'

In [29]:
unique_match = []
for m in matches:
    unique_match.append(m['peptide'])
len(set(unique_match))   
#unique_match

65

In [26]:
fourpeptides = []
for i in peptides:
    if len(i) >= 4:
        fourpeptides.append(i)

len(fourpeptides)

412

In [14]:

def main():
    # Read the peptides file
    with open('human_peptides.txt', 'r') as f:
        peptides_content = f.read()
    
    # Read the FASTA file
    with open('CapraHircus_Proteins.fasta', 'r') as f:
        fasta_content = f.read()
    
    # Parse the input files
    global peptides, matches
    peptides = parse_peptides(peptides_content)
    protein_sequences = parse_fasta(fasta_content)
    
    # Search for matches
    matches = search_peptides(peptides, protein_sequences)
    
    # Sort matches by peptide length (descending) and protein ID
    matches.sort(key=lambda x: (-x['peptide_length'], x['protein']))
    
    # Print results
    print(f"Found {len(matches)} matches:")
    print("\nPeptide\tLength\tProtein\tPosition")
    print("-" * 50)
    
    for match in matches:
        print(f"{match['peptide']}\t{match['peptide_length']}\t{match['protein']}\t{match['position']}")
    
    # Print summary statistics
    unique_peptides = len(set(match['peptide'] for match in matches))
    unique_proteins = len(set(match['protein'] for match in matches))
    
    print(f"\nSummary:")
    print(f"Total matches: {len(matches)}")
    print(f"Unique peptides with matches: {unique_peptides}")
    print(f"Proteins with at least one match: {unique_proteins}")

if __name__ == "__main__":
    main()

Found 5 matches:

Peptide	Length	Protein	Position
--------------------------------------------------
YLGYLE	6	P18626	106
YPVEPF	6	P33048	129
EMPFPK	6	P33048	123
YGLF	4	A5JSS8	69
YLLF	4	P02756	120

Summary:
Total matches: 5
Unique peptides with matches: 5
Proteins with at least one match: 4


In [27]:
with open('human_peptides.txt', 'r') as f:
    peptides_content = f.read()

humn_peptides = parse_peptides(peptides_content)
humn_peptides = list(set(humn_peptides))
humn_peptides

['QKTAP',
 'YYGTNLYQRRPAIAINNPYVPRTYYANPAVVRPHAQIPQRQYLPNSHPPTVVRRPNLHPSFIAIPPKKIQDKIIIPTI',
 'NLHLPLP',
 'PQTLALP',
 'CFQWQRNMRKVRGPPVSCIKRD',
 'VPQPIP',
 'TKCFQWQRNMRKVRGPPVSCIKRDS',
 'NLHLP',
 'PYPQ',
 'HL',
 'VPQP',
 'GRRRSVQW',
 'PQPIP',
 'VA',
 'LHLP',
 'PSFQP',
 'NILP',
 'ENLHLP',
 'YLGSGY',
 'ANPAVVRP',
 'FQWQRNMRKVR',
 'YQRRPAIAINNPYVPRTYYANPAVVRPHAQIPQRQYLPNSHPPTVVRRPNLHPSF',
 'EVPKA',
 'ILP',
 'FL',
 'TKCFQWQRN',
 'WNLLRQAQEKFGKDKSP',
 'PNSHP',
 'GRRRS',
 'VLPIPQ',
 'PEATKCFQWQRNMRKVR',
 'PVPQP',
 'DKIYPSFQPQPLIYP',
 'RRSVQWCA',
 'YVPFP',
 'LALPP',
 'FQPQPLIYP',
 'AVVRP',
 'TVYTKGRVMP',
 'IPM',
 'FVEPIP',
 'GRVMPVLKSPTIPFFDPQIP',
 'ENLHLPLP',
 'YPFVEPI',
 'QPLIYP',
 'QPQPLIYP',
 'KCFQWQRNMRKVR',
 'SPTIPFFDPQIPK',
 'LHLPLP',
 'HLPLP',
 'EATKCFQWQRNMRKVRGPPVSCIKR',
 'EPIPYGFLP',
 'PTPAP',
 'VPYPQ',
 'HNPI',
 'GRVMP',
 'GFL',
 'QVPQPIP',
 'IYPF',
 'SFQPQPLIYP',
 'KCFQWQRNMRKVRGPPVSCI',
 'PLAQPA',
 'YGFL',
 'YVPR',
 'VPNSYP',
 'LENLHLPLP',
 'IAIPP',
 'VVRP',
 'KIYPSFQPQPLIYP',
 

In [28]:
count = 0
for u in unique_match:
    if u in humn_peptides:
        count += 1
print(count)

0
