In [None]:
# handle imports
from urllib.request import urlretrieve
from pathlib import Path
from zipfile import ZipFile

# Fetch the zipped algae sample set and unpack it below data/algae.
url = 'https://github.com/philmaweb/BreathAnalysis.github.io/raw/master/data/algae.zip'
dst_dir = Path("data/algae/")
zip_dst = Path("data/algae.zip")

# Creating data/algae together with its parents also makes sure that data/
# exists, which is the directory the archive itself is written to.
dst_dir.mkdir(parents=True, exist_ok=True)
urlretrieve(url, zip_dst)

# Unpack all archive members into the destination directory.
with ZipFile(zip_dst, mode="r") as archive_handle:
    archive_handle.extractall(path=dst_dir)

In [None]:
import os
from pathlib import Path
from breathpy.model.BreathCore import construct_default_parameters,construct_default_processing_evaluation_steps
from breathpy.model.ProcessingMethods import GCMSPeakDetectionMethod, PerformanceMeasure, FeatureReductionMethod
from breathpy.model.GCMSTest import run_gcms_platform_multicore
from breathpy.generate_sample_data import generate_train_test_set_helper

"""
Runs analysis of the algae sample set (Sun M, Yang Z and Wawrik B (2018) Metabolomic Fingerprints 
of Individual Algal Cells Using the Single-Probe Mass Spectrometry Technique. 
Front. Plant Sci. 9:571. doi: 10.3389/fpls.2018.00571)

19 samples from four conditions - light, dark, nitrogen-limited and replete (post nitrogen-limited)
Samples originated from single-probe mass spectrometry files - we import created featureXML files.
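cross_val_num is passed to both the train/test split helper and the construction
of the default evaluation steps (presumably the number of cross-validation folds).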
"""
# Handed to the train/test split helper and to the construction of the
# default evaluation steps below.
cross_val_num = 3

# Point source_dir at a local dataset to analyse your own data instead.
source_dir = Path("data/algae")
target_dir = Path("data")

# Set name given to the default parameter construction; the directory of the
# same name below target_dir is the one that gets analysed.
set_name = "train_algae"
make_plots = True

# Regenerates the split: a previous split is deleted and the data rewritten.
train_df, test_df = generate_train_test_set_helper(
    source_dir,
    target_dir,
    cross_val_num=cross_val_num,
)
train_dir = target_dir / set_name

# Default plot and file parameters for this set.
plot_parameters, file_parameters = construct_default_parameters(
    set_name,
    set_name,
    make_plots=make_plots,
)

# Peak detection via the isotope wavelet method with the hr_data option on.
preprocessing_params_dict = {
    GCMSPeakDetectionMethod.ISOTOPEWAVELET: {"hr_data": True},
}

# Start from the default evaluation steps and override only the threshold of
# the percentage-based feature removal.
_, evaluation_params_dict = construct_default_processing_evaluation_steps(cross_val_num)
reduction_params = evaluation_params_dict[FeatureReductionMethod.REMOVE_PERCENTAGE_FEATURES]
reduction_params['percentage_threshold'] = 0.9

# Running the full analysis takes less than 30 minutes of computation time
# using 6 cores - in this example most if not all computations are single
# core though.
# NOTE(review): the keyword is spelled `evaluation_parms`; presumably this
# matches the library's parameter name - confirm before "correcting" it.
run_gcms_platform_multicore(
    sample_dir=train_dir,
    preprocessing_params=preprocessing_params_dict,
    evaluation_parms=evaluation_params_dict,
    num_cores=6,
)