In [9]:
import requests
import pandas as pd
import pymzml
import os
import shutil
from contextlib import closing

In [5]:
def download_file(url, local_filename):
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)

def process_masst_xic(input_results_df):
    ms2_records = input_results_df.to_dict(orient="records")
    for record in ms2_records:
        try:
            # Making data file available
            ftp_url = "ftp://massive.ucsd.edu/" + record["filename"][2:]
            local_filename = os.path.basename(record["filename"])
            proxy_url = "https://gnps-external.ucsd.edu/massiveftpproxy?ftppath={}".format(ftp_url)
            download_file(proxy_url, local_filename)
            
            # Converting
            filename, file_extension = os.path.splitext('local_filename')
            print(file_extension)
            if file_extension == ".mzXML":
                # Converting
                new_local_filename = filename + ".mzML"
                !FileConverter -in $local_filename -out $new_local_filename
                os.remove(local_filename)
                local_filename = new_local_filename

            # Finding the scan
            run = pymzml.run.Reader(local_filename)

            target_scan = record["filescan"]
            for spectrum in run:
                if str(spectrum.ID) == str(target_scan):
                    selected_precursors = spectrum.selected_precursors
                    precursor_dict = selected_precursors[0]
                    precursor_mz = precursor_dict["mz"]
                    precursor_i = precursor_dict["i"]

                    record["precursor_mz"] = precursor_mz
                    record["precursor_i"] = precursor_i
                    record["rt"] = spectrum.scan_time_in_minutes()        

            # Perform XIC
            target_mz = record["precursor_mz"]
            lower_rt = record["rt"] - 0.1
            upper_rt = record["rt"] + 0.1
            run = pymzml.run.Reader(local_filename, MS_precisions={1 : 5e-6, 2 : 20e-6})
            time_dependent_intensities = []

            for spectrum in run:
                spectrum_rt = float(spectrum.scan_time_in_minutes())
                if spectrum_rt < lower_rt or spectrum_rt > upper_rt:
                    continue

                if spectrum.ms_level == 1:
                    has_peak_matches = spectrum.has_peak(target_mz)
                    if has_peak_matches != []:
                        for mz, I in has_peak_matches:
                            time_dependent_intensities.append(
                                [spectrum.scan_time_in_minutes(), I, mz]
                            )

            intensity = sum([peak[2] for peak in time_dependent_intensities])
            record["xic_sum"] = (intensity)
        except KeyboardInterrupt:
            raise
        except:
            pass

        print(record)

        os.remove(local_filename)

    return pd.DataFrame(ms2_records)

In [6]:
### Attention, change this Task ID to your job!

GNPS_MASST_job = "7e9615544d1441319ab525930ebf40cd"

In [7]:
# Downloading Data. Data will appear in specs_ms.mgf file
masst_data_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=all_dataset_spectra_matches/".format(GNPS_MASST_job)
df = pd.read_csv(masst_data_url, sep="\t")

In [8]:
# Do stuff with it

enriched_df = process_masst_xic(df)

enriched_df.to_csv("masst_xic.tsv", sep="\t", index=False)


{'cluster_scan': 132950, 'dataset_id': 'MSV000082074', 'filename': 'f.MSV000082074/ccms_peak/Samples/G74022_5x_BG11_01_18395.mzML', 'filescan': 570, 'metadata': ' ', 'precursor_mz': 741.23959635, 'precursor_i': 3067.0, 'rt': 2.59645, 'xic_sum': 2964.95361328125}

{'cluster_scan': 132950, 'dataset_id': 'MSV000082074', 'filename': 'f.MSV000082074/ccms_peak/Samples/G74022_5x_BG11_01_18395.mzML', 'filescan': 574, 'metadata': ' ', 'precursor_mz': 741.23606441, 'precursor_i': 4021.0, 'rt': 2.615283333333333, 'xic_sum': 3706.1810302734375}

{'cluster_scan': 300, 'dataset_id': 'MSV000084237', 'filename': 'f.MSV000084237/ccms_peak/Raw/std_mix2_4.mzML', 'filescan': 938, 'metadata': ' ', 'precursor_mz': 741.241656539386, 'precursor_i': 2024564.875, 'rt': 1.6918989, 'xic_sum': 27425.880004882812}

{'cluster_scan': 300, 'dataset_id': 'MSV000084237', 'filename': 'f.MSV000084237/ccms_peak/Raw/std_mix2_4.mzML', 'filescan': 941, 'metadata': ' ', 'precursor_mz': 741.237789504636, 'precursor_i': 2411160

KeyboardInterrupt: 