In [None]:
import os
import pandas as pd

In [None]:
# Enter the path to the project directory here.
# The later cells of this notebook look for their input files in this directory
# and write the DirectMS1Quant results to it.
infolder = "/home/mark/final_test_astral/"

In [None]:
# Create the project directory if it does not exist yet.
# exist_ok=True makes the call a no-op for an existing directory and avoids the
# race between a separate "is it there?" check and the creation itself.
os.makedirs(infolder, exist_ok=True)

In [None]:
# Move to the project directory, so that commands which are given no explicit
# output path (e.g. the wget download below) operate inside it
%cd $infolder
# Display the current working directory to confirm the change
%pwd

In [None]:
# Download raw files from ProteomeXchange

!wget -b ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2024/02/PXD046417/*.raw

In [None]:
# For the next step, you should install ThermoRawFileParser (https://github.com/compomics/ThermoRawFileParser)

In [None]:
# Run ThermoRawFileParser for all raw files with an option to extract only MS1 spectra

for fn in os.listdir(infolder):
    if fn.endswith('.raw'):
        infile1 = os.path.join(infolder, fn)
        !ThermoRawFileParser -i $infile1 -L 1 -o $infolder

In [None]:
# (Optional)
# Remove all raw files which were already converted

# for fn in os.listdir(infolder):
#     if fn.endswith('.mzML'):
#         mzmlfile = os.path.join(infolder, fn)
#         rawname = mzmlfile.replace('.mzML', '.raw')
#         if os.path.getsize(mzmlfile) > 78502:
#             if os.path.exists(rawname):
#                 os.remove(rawname)
#         else:
#             print('small size', mzmlfile, os.path.getsize(mzmlfile))

In [None]:
# For the next step, you should install biosaur2 (https://github.com/markmipt/biosaur2)
!pip install biosaur2

In [None]:
# Run biosaur2 for all mzML files to extract peptide isotope clusters.

for fn in os.listdir(infolder):
    if fn.endswith('.mzML'):
        mzmlname = os.path.join(infolder, fn)
        # These are the options for 180 min DDA data
        if 'QE5_nLC11' in fn:
            !biosaur2 $mzmlname -minlh 5
        # These are the options for all Astral data
        else:
            !biosaur2 $mzmlname -minlh 1

In [None]:
# (Optional)
# Remove all mzML files which were already converted

# for fn in os.listdir(infolder):
#     if fn.endswith('.features.tsv'):
#         ftrfile = os.path.join(infolder, fn)
#         mzmlfile = ftrfile.replace('.features.tsv', '.mzML')
#         if os.path.getsize(ftrfile) > 1000:
#             if os.path.exists(mzmlfile):
#                 os.remove(mzmlfile)

In [None]:
# For the next step, you should install ms1searchpy (https://github.com/markmipt/ms1searchpy)
!pip install ms1searchpy

In [None]:
# For the next step, you should install DeepLC (https://github.com/compomics/DeepLC)
# The recommended version is the clone available at https://github.com/markmipt/DeepLC
# The latest official DeepLC versions should work too, but ms1searchpy processing time will be much longer
!pip install https://github.com/markmipt/DeepLC/archive/refs/heads/alternative_best_model.zip

In [None]:
# Before the next step, place the protein FASTA database in the project directory (infolder).
# The database (sprot_human_shuffled.fasta) is provided alongside this notebook on GitHub.

In [None]:
# Run ms1searchpy for all *features.tsv files generated by biosaur2

infasta = os.path.join(infolder, 'sprot_human_shuffled.fasta')
for fn in os.listdir(infolder):
    if fn.endswith('.features.tsv'):
        ftrfile = os.path.join(infolder, fn)
        protfile = ftrfile.replace('.features.tsv', '.features_proteins.tsv')
        if not os.path.exists(protfile):
            !ms1searchpy $ftrfile -d $infasta -sc 1 -i 2 -nproc 8 -mc 0 -cmin 1 -ptol 8 -fdr 5 -ts 2 -ml 1 -deeplc 1 -lmin 7 -mcalib 0 -deeplc_library /tmp/deeplc345.lib

In [None]:
# Before the next step, place the sample list for the MSA and Control samples in the project directory (infolder).
# The sample list (Astral_Sample.tsv) is provided alongside this notebook on GitHub.

In [None]:
# Build the lookup table: data file name -> group name (value of the phenotype column)

sample_sheet_path = os.path.join(infolder, 'Astral_Sample.tsv')
# Surrounding whitespace is stripped from the file names before they are used as keys
df1 = pd.read_table(sample_sheet_path).assign(
    **{'comment[data file]': lambda sheet: sheet['comment[data file]'].str.strip()}
)
f_to_cond_map = (
    df1
    .set_index('comment[data file]')
    ['characteristics[phenotype]']
    .to_dict()
)

In [None]:
# Create a dictionary which contains the lists of files used in the DirectMS1Quant analysis for 4 runs:
# 7min: all 3 technical replicates for 200 SPD data
# 7min1r: single technical replicate per sample for 200 SPD data
# 28min: 40 SPD data
# 180min: files with 'QE5_nLC11' in their name

# Every run gets its own pair of independent CTRL / MSA lists
file_dict = {
    run_name: {'CTRL': [], 'MSA': []}
    for run_name in ('7min', '7min1r', '28min', '180min')
}

for result_name in os.listdir(infolder):
    # Only the full protein tables are used; NOR_QC files are left out
    if not result_name.endswith('_proteins_full.tsv') or 'NOR_QC' in result_name:
        continue

    result_path = os.path.join(infolder, result_name)
    # The part of the file name before '.features' is the key for f_to_cond_map
    sample_name = result_name.split('.features')[0]

    if '_250ng_01' in result_name:
        # This particular file is not included in the 28min run
        if '2023115_Astral03_Evo5_UHG_SA_DIA_MSA_RR-135_250ng_01' in result_name:
            continue
        # The group is looked up under the name with '_250ng_01' removed
        condition = f_to_cond_map[sample_name.replace('_250ng_01', '')]
        file_dict['28min'][condition].append(result_path)
    elif 'QE5_nLC11' in result_name:
        # For these files the group is taken from the file name itself
        condition = 'MSA' if 'SA_MSA' in result_name else 'CTRL'
        file_dict['180min'][condition].append(result_path)
    else:
        condition = f_to_cond_map[sample_name]
        file_dict['7min'][condition].append(result_path)
        # Files with 'RR_1_4th' in the name additionally form the single-replicate run
        if 'RR_1_4th' in sample_name:
            file_dict['7min1r'][condition].append(result_path)
                    

In [None]:
# Run DirectMS1Quant for single techical replicate per sample for 200 SPD data

S_first_list = ' '.join(file_dict['7min1r']['CTRL'])
S_second_list = ' '.join(file_dict['7min1r']['MSA'])

out_name = os.path.join(infolder, 'directms1quant_out_7min1r')
!directms1quant\
-S1 $S_first_list\
-S2 $S_second_list -out $out_name

In [None]:
# Run DirectMS1Quant for all 3 technical replicates for 200 SPD data

S_first_list = ' '.join(file_dict['7min']['CTRL'])
S_second_list = ' '.join(file_dict['7min']['MSA'])

out_name = os.path.join(infolder, 'directms1quant_out_7min')
!directms1quant\
-S1 $S_first_list\
-S2 $S_second_list -out $out_name

In [None]:
# Run DirectMS1Quant for 40 SPD data

S_first_list = ' '.join(file_dict['28min']['CTRL'])
S_second_list = ' '.join(file_dict['28min']['MSA'])

out_name = os.path.join(infolder, 'directms1quant_out_28min')
!directms1quant\
-S1 $S_first_list\
-S2 $S_second_list -out $out_name