### This is the example from the breathPy paper - including sample preparation and imports

In [None]:
# handle imports
from urllib.request import urlretrieve
from shutil import move as file_move
import numpy as np
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
import joblib

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display

from breathpy.generate_sample_data import generate_train_test_sets, generate_train_test_set_helper
from breathpy.model.BreathCore import (MccImsAnalysis, MccImsMeasurement, PredictionModel,
                              construct_default_parameters,
                              construct_default_processing_evaluation_steps,
                              construct_custom_processing_evaluation_dict)
from breathpy.model.ProcessingMethods import FeatureReductionMethod, PerformanceMeasure, GCMSPeakDetectionMethod, GCMSAlignmentMethod
REMOVE_PERCENTAGE_FEATURES = FeatureReductionMethod.REMOVE_PERCENTAGE_FEATURES
from breathpy.tools.tools import get_peax_binary_path
from breathpy.view.BreathVisualizations import ClusterPlot, HeatmapPlot, RocCurvePlot, BoxPlot, TreePlot, TimeSeriesPlot

# remote location of the example archive, plus where it is stored and unpacked
url = 'https://github.com/philmaweb/BreathAnalysis.github.io/raw/master/data/full_candy.zip'
dst_dir = Path("data/full_candy/")
zip_dst = Path("data/full_candy.zip")

# creating the extraction directory (with parents) also creates "data/",
# which is where the downloaded archive is written
dst_dir.mkdir(parents=True, exist_ok=True)
urlretrieve(url, zip_dst)

# unpack all archive members into the extraction directory
with ZipFile(zip_dst, mode="r") as archive_handle:
    archive_handle.extractall(path=dst_dir)

# split the full set into a train and a validation set - use 1/3 for
# validation; the fixed seed keeps the call arguments reproducible
generate_train_test_sets(
    dir_full_set="data/full_candy/",
    root_target_dir="data/",
    cross_val_num=3,
    seed=42,
)

# default parameters: a pair of (preprocessing steps, evaluation parameters)
d_params = construct_default_processing_evaluation_steps()
pre_steps, eval_params = d_params

# directories holding the training and the test (validation) split
train_dir = Path("data/train_full_candy/")
validation_dir = Path("data/test_full_candy/")

# Visualnow layer and class label file used for the analysis
vn_layer_path = Path("data/train_full_candy/candy_layer.xls")
train_class_label_dict_path = Path(
    "data/train_full_candy/candy_class_labels.csv"
)

# load the raw mcc-ims measurements of the training set in sorted path order
train_measurements = list(
    map(MccImsMeasurement, sorted(train_dir.glob("*ims.csv")))
)

# set up the analysis on the training measurements
ims_analysis = MccImsAnalysis(
    train_measurements,
    pre_steps,
    performance_measure_parameters=eval_params,
    class_label_file=train_class_label_dict_path,
    dataset_name="train_full_candy",
    visualnow_layer_file=vn_layer_path,
    peax_binary_path=get_peax_binary_path(),
)

# Run the analysis pipeline on the training set. The calls below are issued
# in this fixed order on the same `ims_analysis` object.

# normalization, denoising and peak_detection for measurements
# NOTE(review): num_cores=6 is hard-coded - presumably the number of worker
# processes; adjust to the cores available on the executing machine.
ims_analysis.preprocess_multicore(num_cores=6)
# align peak detection results
ims_analysis.align_peaks()
# apply feature reduction - only the REMOVE_PERCENTAGE_FEATURES method is requested
ims_analysis.reduce_features(
        [REMOVE_PERCENTAGE_FEATURES])
# evaluate model performance using 3-fold cross-validation
ims_analysis.evaluate_performance()
# export results
# NOTE(review): the argument looks like an output location for the csv files;
# confirm whether it must exist before this call.
ims_analysis.export_results_to_csv("results/data/train_full_candy")

# prediction - will use parameters and features from previous steps
# (the model is built once; the original cell repeated this setup verbatim,
# constructing the model and loading every validation file twice)
predictionModel = PredictionModel(ims_analysis)
# read in validation measurements in sorted path order;
# validation_dir is already a Path, so it can be globbed directly
vm_fns = sorted(validation_dir.glob("*ims.csv"))
validation_measurements = [MccImsMeasurement(fn) for fn in vm_fns]
# pre-processing and alignment for validation set and prediction
prediction = predictionModel.predict(validation_measurements, num_cores=6)