In [1]:
import pandas as pd
import glob
import re

In [2]:
# Collect every MEGA alignment file. The list is sorted because glob returns
# matches in arbitrary order, and the row order of the combined alignment
# table built from this list would otherwise vary between runs and machines.
# NOTE(review): the pattern is relative to the kernel's working directory; an
# empty result usually means the notebook was started from a different folder.
files = sorted(glob.glob('Data/Alignments/*.Meg'))
files

[]

In [37]:
def parse_meg(file):
    """Parse a MEGA (.meg) alignment file into numbered [id, sequence] records.

    A line containing '#' starts a new record: the line with every '#' removed
    becomes the record's ID. All following lines up to the next such line are
    concatenated (newlines removed) into the record's aligned sequence. The
    '#mega' format marker and anything before the first ID line are ignored.

    Parameters
    ----------
    file : str or path-like
        Path of the alignment file to read.

    Returns
    -------
    dict
        Maps a running integer (0, 1, 2, ...) to ``[fasta_id, sequence]`` in
        file order. Records with an empty ID or an empty sequence are skipped.
    """
    records = dict()
    fasta_id = ''
    seq_parts = []

    def store_current():
        # Keep the record only when both an ID and some sequence text exist;
        # this is also what discards the header lines before the first ID.
        seq = ''.join(seq_parts)
        if len(seq) > 0 and len(fasta_id) > 0:
            records[len(records)] = [fasta_id, seq]

    # The context manager closes the handle even if parsing raises.
    with open(file) as handle:
        for raw_line in handle:
            line = raw_line.replace('\n', '')
            # Only the bare format marker counts as the header. A substring
            # test for 'mega' would also swallow IDs such as
            # 'Bacillus_megaterium(...)' and would miss an uppercase '#MEGA'.
            is_format_marker = line.strip().lower() == '#mega'
            if '#' in line and not is_format_marker:
                store_current()
                fasta_id = line.replace('#', '')
                seq_parts = []
            else:
                seq_parts.append(line)

    # The last record has no following '#' line to trigger a flush.
    store_current()
    return records

In [38]:
# Parse each alignment file into its own frame (one row per sequence), then
# stack the frames into one table with columns 'fasta_id' and 'alignment'.
if not files:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # report the actual cause instead (same exception type).
    raise ValueError(
        "No alignment files to load: `files` is empty. "
        "Check the glob pattern and the working directory."
    )
alignment_dfs = []
for file in files:
    d = parse_meg(file)
    df = pd.DataFrame.from_dict(d, orient='index', columns = ['fasta_id', 'alignment'])
    alignment_dfs.append(df)
# Row labels restart at 0 for every file, so the combined index is not unique.
alignment_df = pd.concat(alignment_dfs)
alignment_df

Unnamed: 0,fasta_id,alignment
0,Acidiphilium_cryptum(accA),----------------------------------------------...
1,Bacillus_cereus(accA),----------------------------------------------...
2,Bacillus_subtilis(accA),----------------------------------------------...
3,Cellulophaga_baltica(accA),----------------------------------------------...
4,Chryseobacterium_indologenes(accA),----------------------------------------------...
...,...,...
15,Rhizobium_radiobacter(zwf),------------------------MNKQRSDVLVVFGATGDLAYKM...
16,Rhizobium_radiobacter(zwf)_alt_v_1,--------------------MSSQIIPVEPFDCVVFGGTGDLAERK...
17,Rhodopseudomonas_palustris(zwf),------------------MTTQTDPTLPEGCAFVIFGVTGDLTHRL...
18,Stigmatella_aurantiaca(zwf),----METQGLHIETTPREGEPIVRAGQPDPCVIVLFGATGDLAQRK...


In [19]:
# Load the cleaned modification table and discard every row that has a
# missing value in any column.
modification_df = pd.read_csv(
    'Data/Cleaned_Modifications_with_start_end_positions.csv'
).dropna()

In [56]:
# Rewrite the Fasta_ID column. IDs containing '[' get '[' turned into '_',
# ']' removed, surrounding whitespace stripped and inner spaces turned into
# '_'; all other IDs are only stripped.
# NOTE(review): presumably this aligns the IDs with the spelling used in the
# alignment files -- confirm against the .Meg sequence names.
mod_ids = [
    raw_id.replace('[', '_').replace(']', "").strip().replace(' ', '_')
    if '[' in raw_id
    else raw_id.strip()
    for raw_id in modification_df.Fasta_ID
]
modification_df['Fasta_ID'] = mod_ids
modification_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,scannum,Peptide,Observed Modifications,Protein ID,Gene,Delta Mass,Organism,best_locs,start_position,end_position,Fasta_ID
0,0,0,0,2397,AEATHPAPAESGNGAEGGK,Xlink:EGS[115](115.026943),A5FZF9,secB,115.0261,Acidiphilium_cryptum,AEATHPAPAESGNGAEGGk,64,83,Acidiphilium_cryptum(secB)
1,1,1,1,2415,ASGAGGQHVNKTESAVR,Methyl(14.015650),A5FX99,prfA,14.0112,Acidiphilium_cryptum,ASGAggqHVNKTESAVR,222,239,Acidiphilium_cryptum(prfA)
2,2,3,6,2857,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0170,Acidiphilium_cryptum,ASGAggqHVNKTESAIR,190,207,Acidiphilium_cryptum(prfB)
3,3,4,7,2883,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0188,Acidiphilium_cryptum,ASGAggqHVNKTESAIR,190,207,Acidiphilium_cryptum(prfB)
4,4,6,13,4828,SHGDLSENAEYHSAR,Carboxy->Thiocarboxy(15.977156),A5FWZ7,greA,15.9849,Acidiphilium_cryptum,SHGDLSENAEyHSAR,37,52,Acidiphilium_cryptum(greA)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27760,27760,42678,102697,38192,QMLLEAVADVDDALMEK,Ammonia-loss(-17.026549),A0A1R0IL53,fusA,-17.0240,Sulfobacillus_thermosulfidooxidans,qmLLEAVADVDDALMEK,211,228,Sulfobacillus_thermosulfidooxidans(fusA)
27761,27761,42679,102698,38226,NMITGAAQMDGAILVVSAADGPMPQTR,Label:2H(3)+Oxidation(19.013745),A0A1R0IKZ6,tuf,19.0210,Sulfobacillus_thermosulfidooxidans,NMITGAAQMDGAIlvvSAADGPMPQTR,90,117,Sulfobacillus_thermosulfidooxidans(tuf)
27762,27762,42682,102706,38463,ISAFYAPASSLAEMVEAILKDER,Carboxy(43.989829),A0A2T2WPV9,mdh,43.9851,Sulfobacillus_thermosulfidooxidans,ISAFYAPASSLAEMVEAILKDER,223,246,Sulfobacillus_thermosulfidooxidans(mdh)
27763,27763,42686,102714,38688,IDPIPLIGFAGAPFTLASYIIEGGPSK,Methyl:2H(3)13C(1)(18.037835),A0A2T2WVR2,hemE,18.0330,Sulfobacillus_thermosulfidooxidans,IDPIPLIGFAGAPFTLASYIIEGGPSK,139,166,Sulfobacillus_thermosulfidooxidans(hemE)


In [65]:
# Spot check: modification rows whose Fasta_ID is exactly this alignment ID.
modification_df.loc[modification_df['Fasta_ID'] == 'Acidiphilium_cryptum(accA)']

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,scannum,Peptide,Observed Modifications,Protein ID,Gene,Delta Mass,Organism,best_locs,start_position,end_position,Fasta_ID


In [64]:
# Exploratory pass over the aligned sequences: for each one, build a lookup
# from ungapped residue index to alignment column, print its ID, and select
# the modification rows with the same Fasta_ID. The loop stops at the first
# sequence that has at least one matching modification row.
for index, row in alignment_df.iterrows():
    # alignment_map[i] is the 0-based alignment column of the i-th non-gap
    # character of this sequence.
    alignment_map = dict()
    alignment = row['alignment']    
    # a_pos walks every alignment column; seq_pos counts non-gap characters.
    a_pos = 0
    seq_pos = 0
    for c in alignment:
        if c == '-':
            a_pos += 1
        else:
            alignment_map[seq_pos] = a_pos
            seq_pos += 1
            a_pos += 1
    fasta_id = row['fasta_id'].strip()
    print(fasta_id)
    df = modification_df[modification_df.Fasta_ID == fasta_id]
    if len(df) != 0:
        print(df)
        # NOTE(review): alignment_map is built but not read anywhere in this
        # cell. start_position/end_position in modification_df are presumably
        # 1-based while alignment_map keys start at 0 -- confirm before
        # combining them.
        break

Acidiphilium_cryptum(accA)
Bacillus_cereus(accA)
Bacillus_subtilis(accA)
     Unnamed: 0  Unnamed: 0.1  Unnamed: 0.1.1  scannum     Peptide  \
404         404           536            1745     9559  LSEEELVQQR   

    Observed Modifications Protein ID  Gene  Delta Mass           Organism  \
404       TMAB(128.107539)     O34847  accA    128.0986  Bacillus_subtilis   

      best_locs  start_position  end_position                 Fasta_ID  
404  lSEEELVQQR             295           305  Bacillus_subtilis(accA)  


In [50]:
# Spot check: every modification row whose Fasta_ID contains 'cryptum'.
modification_df.loc[modification_df['Fasta_ID'].str.contains('cryptum')]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,scannum,Peptide,Observed Modifications,Protein ID,Gene,Delta Mass,Organism,best_locs,start_position,end_position,Fasta_ID
0,0,0,0,2397,AEATHPAPAESGNGAEGGK,Xlink:EGS[115](115.026943),A5FZF9,secB,115.0261,Acidiphilium_cryptum,AEATHPAPAESGNGAEGGk,64,83,Acidiphilium_cryptum(secB)
1,1,1,1,2415,ASGAGGQHVNKTESAVR,Methyl(14.015650),A5FX99,prfA,14.0112,Acidiphilium_cryptum,ASGAggqHVNKTESAVR,222,239,Acidiphilium_cryptum(prfA)
2,2,3,6,2857,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0170,Acidiphilium_cryptum,ASGAggqHVNKTESAIR,190,207,Acidiphilium_cryptum(prfB)
3,3,4,7,2883,ASGAGGQHVNKTESAIR,Methyl(14.015650),A5FZ59,prfB,14.0188,Acidiphilium_cryptum,ASGAggqHVNKTESAIR,190,207,Acidiphilium_cryptum(prfB)
4,4,6,13,4828,SHGDLSENAEYHSAR,Carboxy->Thiocarboxy(15.977156),A5FWZ7,greA,15.9849,Acidiphilium_cryptum,SHGDLSENAEyHSAR,37,52,Acidiphilium_cryptum(greA)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,384,485,1623,34147,ADLANLVEQLSSLTVLEAAELSK,Oxidation(15.994915),A5FZX2,rplL,15.9954,Acidiphilium_cryptum,ADLANLVEQLSSLTVlEAAELSK,1,24,Acidiphilium_cryptum(rplL)
385,385,486,1625,34345,NMITGAAQMDGAILVVSAADGPMPQTR,Methyl:2H(2)(16.028204),A5FZW7,tuf,16.0260,Acidiphilium_cryptum,NMITGAAQMDGAilvvsaADGPMPQTR,90,117,Acidiphilium_cryptum(tuf)
386,386,487,1626,34355,ADLANLVEQLSSLTVLEAAELSK,Oxidation(15.994915),A5FZX2,rplL,15.9924,Acidiphilium_cryptum,ADLANLVEQLSSLTVlEAAELSK,1,24,Acidiphilium_cryptum(rplL)
387,387,488,1629,34773,NMITGAAQMDGAILVVSAADGPMPQTR,Oxidation(15.994915),A5FZW7,tuf,16.0013,Acidiphilium_cryptum,NMITGAAQMDGAIlvVSAADGPMPQTR,90,117,Acidiphilium_cryptum(tuf)
