In [1]:
import pandas as pd
from Bio import SeqIO
import glob
import re

In [2]:
modified_peptiedes_df = pd.read_csv('Data/Cleaned_Modifications.tsv', sep = '\t')
modified_peptiedes_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,scannum,Peptide,Observed Modifications,Protein ID,Gene,Delta Mass,Organism,best_locs
0,0,0,0,2397,AEATHPAPAESGNGAEGGK,Xlink:EGS[115](115.026943),A5FZF9,secB,115.0261,Acidiphilium_cryptum,AEATHPAPAESGNGAEGGk
1,1,1,1,2415,ASGAGGQHVNKTESAVR,Methyl(14.015650),A5FX99,prfA,14.0112,Acidiphilium_cryptum,ASGAggqHVNKTESAVR
2,5,6,6,2857,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0170,Acidiphilium_cryptum,ASGAggqHVNKTESAIR
3,6,7,7,2883,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0188,Acidiphilium_cryptum,ASGAggqHVNKTESAIR
4,9,13,13,4828,SHGDLSENAEYHSAR,Carboxy->Thiocarboxy(15.977156),A5FWZ7,greA,15.9849,Acidiphilium_cryptum,SHGDLSENAEyHSAR
...,...,...,...,...,...,...,...,...,...,...,...
26914,62464,102697,102697,38192,QMLLEAVADVDDALMEK,Ammonia-loss(-17.026549),A0A1R0IL53,fusA,-17.0240,Sulfobacillus_thermosulfidooxidans,qmLLEAVADVDDALMEK
26915,62465,102698,102698,38226,NMITGAAQMDGAILVVSAADGPMPQTR,Label:2H(3)+Oxidation(19.013745),A0A1R0IKZ6,tuf,19.0210,Sulfobacillus_thermosulfidooxidans,NMITGAAQMDGAIlvvSAADGPMPQTR
26916,62470,102706,102706,38463,ISAFYAPASSLAEMVEAILKDER,Carboxy(43.989829),A0A2T2WPV9,mdh,43.9851,Sulfobacillus_thermosulfidooxidans,ISAFYAPASSLAEMVEAILKDER
26917,62474,102714,102714,38688,IDPIPLIGFAGAPFTLASYIIEGGPSK,Methyl:2H(3)13C(1)(18.037835),A0A2T2WVR2,hemE,18.0330,Sulfobacillus_thermosulfidooxidans,IDPIPLIGFAGAPFTLASYIIEGGPSK


In [3]:
files = glob.glob('Data/Gene_FASTA_Files/*.fasta')

In [4]:
def parse_fasta(file):
    i = 0
    j = 0
    with open(file) as f:
        sequences = []
        genes = []
        sequence = ''
        for line in f:
            if '>' in line:
                if sequence != '':
                    i += 1
                    sequences.append(sequence)
                    sequence = ''
                line = re.sub(r'\n', '', line)
                line = re.sub('>', '', line)
                j += 1
                genes.append(line)  
            else:
                sequence += re.sub('\n', '', line)
        if sequence != '':
            sequences.append(sequence)
        df = pd.DataFrame({'ID': genes, 'Sequence': sequences})
        return (df)        

In [5]:
dfs = []
for file in files:
    df = parse_fasta(file)
    dfs.append(df)
fasta_df = pd.concat(dfs)
fasta_df

Unnamed: 0,ID,Sequence
0,Acidiphilium_cryptum(accA),MRHFLDFEKPVAELEAKIEELRRMTDPGELNIAEEVTLLSDKAERQ...
1,Bacillus_cereus(accA),MAELEFEKPVVELRNKIRELKDYTKNSQMDFSEEIRILEEKLENLE...
2,Bacillus_subtilis(accA),MAPRLEFEKPVIELQTKIAELKKFTQDSDMDLSAEIERLEDRLAKL...
3,Cellulophaga_baltica(accA),MEYLDFELPIKELEEQLDKCMIIGEESDVDVTETCKQIEKKLLETR...
4,Chryseobacterium_indologenes(accA),MEYLSFELPIKELMDQYQTCSLVGEESGVDVKLACSQIEDKILEKK...
...,...,...
16,Rhizobium_radiobacter(zwf)!alt_v_1!,MSSQIIPVEPFDCVVFGGTGDLAERKLLPALYHRQIEGQFTEPTRI...
17,Rhodopseudomonas_palustris(zwf),MTTQTDPTLPEGCAFVIFGVTGDLTHRLVIPALYNLAEANLLPEKF...
18,Stigmatella_aurantiaca(zwf),METQGLHIETTPREGEPIVRAGQPDPCVIVLFGATGDLAQRKLFPA...
19,Sulfobacillus_thermosulfidooxidans(zwf),MTDAHSVTNPLREGLVESRRPQPFTMIIFGAAGDLAHRKLFPALYN...


In [6]:
start_positions = []
end_positions = []
IDs = []
best_locs_positions = []
for index, row in modified_peptiedes_df.iterrows():
    peptide = row.Peptide
    organism = row.Organism
    gene = row.Gene
    df = fasta_df[fasta_df.ID.str.contains(organism)]
    df = df[df.ID.str.contains(gene)]
    start_pos = -1
    end_pos = -1
    best_locs_pos = []
    ID = float('NaN')
    for index1, row1 in df.iterrows():
        protein_sequence = row1.Sequence
        if peptide in protein_sequence:
            start_pos = protein_sequence.find(peptide)
            end_pos = start_pos + len(peptide) - 1
            ID = row1.ID
            # find best_locs_pos
            best_locs_peptide = row.best_locs           
            for i, c in zip(range(0,len(best_locs_peptide)), best_locs_peptide):
                if c.islower():
                    best_locs_pos.append(start_pos+i)
            break
    start_positions.append(start_pos)
    end_positions.append(end_pos)
    IDs.append(ID)
    best_locs_positions.append(best_locs_pos)

modified_peptiedes_df['start_position'] = start_positions
modified_peptiedes_df['end_position'] = end_positions
modified_peptiedes_df['fasta_id'] = IDs
modified_peptiedes_df['best_locs_positions'] = best_locs_positions
modified_peptiedes_df = modified_peptiedes_df.dropna()
modified_peptiedes_df.to_csv('Data/Cleaned_Modifications_with_start_end_positions.tsv', sep = '\t')

In [7]:
modified_peptiedes_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,scannum,Peptide,Observed Modifications,Protein ID,Gene,Delta Mass,Organism,best_locs,start_position,end_position,fasta_id,best_locs_positions
0,0,0,0,2397,AEATHPAPAESGNGAEGGK,Xlink:EGS[115](115.026943),A5FZF9,secB,115.0261,Acidiphilium_cryptum,AEATHPAPAESGNGAEGGk,64,82,Acidiphilium_cryptum(secB),[82]
1,1,1,1,2415,ASGAGGQHVNKTESAVR,Methyl(14.015650),A5FX99,prfA,14.0112,Acidiphilium_cryptum,ASGAggqHVNKTESAVR,222,238,Acidiphilium_cryptum(prfA),"[226, 227, 228]"
2,5,6,6,2857,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0170,Acidiphilium_cryptum,ASGAggqHVNKTESAIR,190,206,Acidiphilium_cryptum(prfB),"[194, 195, 196]"
3,6,7,7,2883,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0188,Acidiphilium_cryptum,ASGAggqHVNKTESAIR,190,206,Acidiphilium_cryptum(prfB),"[194, 195, 196]"
4,9,13,13,4828,SHGDLSENAEYHSAR,Carboxy->Thiocarboxy(15.977156),A5FWZ7,greA,15.9849,Acidiphilium_cryptum,SHGDLSENAEyHSAR,37,51,Acidiphilium_cryptum(greA),[47]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26914,62464,102697,102697,38192,QMLLEAVADVDDALMEK,Ammonia-loss(-17.026549),A0A1R0IL53,fusA,-17.0240,Sulfobacillus_thermosulfidooxidans,qmLLEAVADVDDALMEK,211,227,Sulfobacillus_thermosulfidooxidans(fusA),"[211, 212]"
26915,62465,102698,102698,38226,NMITGAAQMDGAILVVSAADGPMPQTR,Label:2H(3)+Oxidation(19.013745),A0A1R0IKZ6,tuf,19.0210,Sulfobacillus_thermosulfidooxidans,NMITGAAQMDGAIlvvSAADGPMPQTR,90,116,Sulfobacillus_thermosulfidooxidans(tuf),"[103, 104, 105]"
26916,62470,102706,102706,38463,ISAFYAPASSLAEMVEAILKDER,Carboxy(43.989829),A0A2T2WPV9,mdh,43.9851,Sulfobacillus_thermosulfidooxidans,ISAFYAPASSLAEMVEAILKDER,223,245,Sulfobacillus_thermosulfidooxidans(mdh),[]
26917,62474,102714,102714,38688,IDPIPLIGFAGAPFTLASYIIEGGPSK,Methyl:2H(3)13C(1)(18.037835),A0A2T2WVR2,hemE,18.0330,Sulfobacillus_thermosulfidooxidans,IDPIPLIGFAGAPFTLASYIIEGGPSK,139,165,Sulfobacillus_thermosulfidooxidans(hemE),[]
