### This tutorial demonstrates the binary analysis workflow of `BreathPy`.
Initially, sample data (MCC-IMS measurements of breath after consuming either `menthol` or `citrus` candy; see the material section of https://www.mdpi.com/2218-1989/10/10/393 for more information) is downloaded and split into a training and a test fraction - the test samples will later serve to validate the created random forest classifier. The samples are normalized and denoised. Afterwards, several peak-detection methods are applied, including the `VisualnowLayer` contained in the sample data. Subsequently, peaks are aligned using the `ProbeClustering` approach and the features are reduced using `RemovePercentageFeatures` - which limits the reported features to the ones present in at least `percentage_threshold` of the minority class - in this case `citrus`.
Features are then weighted using the `PerformanceMeasure`s `RANDOM_FOREST_CLASSIFICATION` and `FDR_CORRECTED_P_VALUE` - leaving 10 features each. 
Two decision trees and a `RandomForestClassifier` are trained. They serve as a visual interpretation of the classification strategy based on each `PerformanceMeasure`.
After training, the `RandomForestClassifier` is used to predict the class labels of the test samples.
Finally, plots are created and saved in the `results/plots/` directory.

In [None]:
# handle imports
from urllib.request import urlretrieve
from shutil import move as file_move
import numpy as np
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
import joblib

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display

from breathpy.generate_sample_data import generate_train_test_sets, generate_train_test_set_helper
from breathpy.model.BreathCore import (MccImsAnalysis, MccImsMeasurement, PredictionModel,
                              construct_default_parameters,
                              construct_default_processing_evaluation_steps,
                              construct_custom_processing_evaluation_dict)
from breathpy.model.ProcessingMethods import FeatureReductionMethod, PerformanceMeasure, GCMSPeakDetectionMethod, GCMSAlignmentMethod
from breathpy.tools.tools import get_peax_binary_path

from breathpy.view.BreathVisualizations import ClusterPlot, HeatmapPlot, RocCurvePlot, BoxPlot, TreePlot, TimeSeriesPlot

In [None]:
# Fetch the candy sample archive, unpack it and split it into a train and a test fraction.

url = 'https://github.com/philmaweb/BreathAnalysis.github.io/raw/master/data/full_candy.zip'
zip_dst = Path("data/full_candy.zip")
dst_dir = Path("data/full_candy/")

# Creating the extraction directory with its parents also creates `data/`,
# which has to exist before the archive is written to `zip_dst`.
dst_dir.mkdir(parents=True, exist_ok=True)
urlretrieve(url, zip_dst)

# Unpack the archive into the data subdirectory (read mode is the ZipFile default).
with ZipFile(zip_dst) as archive_handle:
    archive_handle.extractall(dst_dir)

# The extracted directory is the full data set; `data/` is the root for the split.
raw_dir = dst_dir
target_dir = Path("data/")

# split into train and test fraction - use 1/3 of samples for validation
generate_train_test_sets(
    dir_full_set=raw_dir,
    root_target_dir=target_dir,
    cross_val_num=3,
    seed=42,
)

### Now we simplify and go step-by-step through the methods called by `breathpy.model.CoreTest.run_start_to_end_pipeline`

In [None]:
# Default parameters plus the train / test directories used by the following cells.
file_prefix = 'train_full_candy'
folder_name = file_prefix

plot_parameters, file_parameters = construct_default_parameters(
    file_prefix,
    folder_name,
    make_plots=True,
)

# default steps for preprocessing and default parameters for evaluation
preprocessing_steps, evaluation_params_dict = construct_default_processing_evaluation_steps()

# directories holding the training and the test set
train_dir = Path("data/train_full_candy/")
test_dir = Path("data/test_full_candy/")

# class label file belonging to the training set
train_class_label_dict_fn = MccImsAnalysis.guess_class_label_extension(train_dir)

# Load every raw MCC-IMS measurement (`*ims.csv`) found in the training directory.
# NOTE(review): glob order is not sorted here (the test set is sorted later on) -
# confirm that nothing downstream depends on the measurement order.
train_measurements = list(map(MccImsMeasurement, train_dir.glob("*ims.csv")))

In [None]:
# Set up the analysis: it needs the path of the peax binary and the visualnow layer file.
peax_binary_path = get_peax_binary_path()

# First file in the training directory whose name ends in `layer.csv` or `layer.xls`;
# the `[0]` raises an IndexError if no such file exists.
visualnow_layer_path = [
    candidate for candidate in train_dir.glob("*layer*")
    if str(candidate).endswith(("layer.csv", "layer.xls"))
][0]

# make sure the output directory exists
out_dir_path = Path(file_parameters['out_dir'])
if not out_dir_path.exists():
    out_dir_path.mkdir(parents=True, exist_ok=True)

# create analysis
ims_analysis = MccImsAnalysis(
    train_measurements,
    preprocessing_steps,
    outfile_names=[],
    performance_measure_parameters=evaluation_params_dict,
    class_label_file=train_class_label_dict_fn,
    dir_level=file_parameters['dir_level'],
    dataset_name=folder_name,
    visualnow_layer_file=visualnow_layer_path,
    peax_binary_path=peax_binary_path,
)

# Run normalization, denoising and peak detection for the measurements using 6 cores.
# For peak detection the [PEAX, WATERSHED, VISUALNOWLAYER, TOPHAT] methods defined in
# preprocessing_steps are run.
# If one wants to change default parameters, pass updated parameters for preprocessing_parameters.
ims_analysis.preprocess_multicore(num_cores=6)

In [None]:
# Render the heatmap of one preprocessed measurement and show the saved image.
test_measurement = ims_analysis.measurements[0]
HeatmapPlot.FastIntensityMatrix(
    test_measurement,
    plot_parameters=plot_parameters,
    title=str(test_measurement),
)

# NOTE(review): the image name is hard-coded to measurement BD18_1408280834 - verify that
# it corresponds to `ims_analysis.measurements[0]` on your machine.
heatmap_fn = Path("results/plots/heatmaps/fast_train_full_candy_intentsity_plot_BD18_1408280834_ims.png")
Image(heatmap_fn)

In [None]:
# align peak detection results
# (per the tutorial introduction, peaks are aligned using the `ProbeClustering` approach)
ims_analysis.align_peaks()

#### Visualize
* show average chromatogram for each candy type - after normalization and denoising
* show clusters for each peak detection method

In [None]:
# Create the cluster plots and the class-wise overlay plots for the aligned peaks.
clusters = ClusterPlot.ClusterBasic(ims_analysis, plot_parameters=plot_parameters)
overlays = ClusterPlot.OverlayClasswiseAlignment(ims_analysis, plot_parameters=plot_parameters)

# Paths of the images: the cluster entry is used as returned, the overlay entries
# are resolved relative to the overlay plot directory.
overlay_dir = Path("results/plots/overlay/")
cluster_fn = Path(clusters[2][-1])
overlay_fn_citrus = overlay_dir / overlays[2][-1]
overlay_fn_menthol = overlay_dir / overlays[-2][-1]

# display images for the TOPHAT method
# NOTE(review): the indices above are assumed to select the TOPHAT results - confirm.
images = [cluster_fn, overlay_fn_citrus, overlay_fn_menthol]
for fn in images:
    display(Image(fn))

In [None]:
# apply feature reduction
# (every method listed in the analysis' `AVAILABLE_FEATURE_REDUCTION_METHODS` is passed)
ims_analysis.reduce_features(ims_analysis.AVAILABLE_FEATURE_REDUCTION_METHODS)
# evaluate model performance using 3-fold cross-validation
# NOTE(review): the fold count is not set in this cell - presumably it comes from
# `evaluation_params_dict` passed to the analysis; confirm.
ims_analysis.evaluate_performance()

In [None]:
# Show where the results are written to.
results_dir = file_parameters['out_dir']
print(results_dir)

# export preprocessed files, peak detection results and feature_matrixes to csv into results directory
ims_analysis.export_results_to_csv(results_dir)

#### Visualize the analysis:
* show estimated model performance - ROC curve
* show best features superimposed for each candy type
* plot boxplot and time-series plot for each feature


In [None]:
# Create ROC curve, box plot, decision tree and time-series plots for the analysis
# and display one image of each kind.
roc_plots = RocCurvePlot.ROCCurve(ims_analysis.analysis_result, plot_parameters=plot_parameters)
box_plots = BoxPlot.BoxPlotBestFeature(ims_analysis.analysis_result, plot_parameters=plot_parameters)

try:
    dt_plots = TreePlot.DecisionTrees(ims_analysis.analysis_result, plot_parameters=plot_parameters, limit_to_peak_detection_method_name="TOPHAT")
except FileNotFoundError:
    # might not both be installed - need system executable and python library
    print("Probably graphviz is not installed - install via `conda install graphviz python-graphviz`")
    # bare `raise` re-raises the active exception with its original traceback
    raise

ts_plots = TimeSeriesPlot.TimeSeriesFromAnalysis(ims_analysis, plot_parameters=plot_parameters, limit_to_pdmn=['TOPHAT'], limit_to_features=['Peak_0178','Peak_0231'])

roc_fn = roc_plots[2][-1]
box_plot_fn = box_plots[0][-1]
# Always bind `dt_fn`: previously it was only assigned when `dt_plots` was non-empty,
# so building the image list below failed with a NameError (or silently reused a
# stale value from an earlier run of the cell) when no tree plots were returned.
dt_fn = Path(dt_plots[1][-1][-1]) if dt_plots else None

ts_fn0 = ts_plots[1][0]
ts_fn1 = ts_plots[1][1]

# display images for the TOPHAT method - the decision tree is skipped when unavailable
images = [
    image_fn
    for image_fn in (roc_fn, box_plot_fn, dt_fn, ts_fn0, ts_fn1)
    if image_fn is not None
]

for fn in images:
    display(Image(fn))

In [None]:
# Prediction: persist the trained predictors, load them again and build a PredictionModel.

# export prediction model and import from pickle
predictor_path = Path(file_parameters['out_dir']) / "predictors.sav"
ims_analysis.analysis_result.export_prediction_models(path_to_save=predictor_path)
predictors = joblib.load(predictor_path)

# every preprocessing step gets its own empty parameter dict
predictionModel = PredictionModel(
    preprocessing_params={step: {} for step in preprocessing_steps},
    evaluation_params=ims_analysis.performance_measure_parameter_dict,
    scipy_predictor_by_pdm=predictors,
    feature_names_by_pdm=ims_analysis.analysis_result.feature_names_by_pdm,
    peax_binary_path=peax_binary_path,
    visualnow_layer_file=visualnow_layer_path,
)

#  preparation - replace train_ with test_
#  otherwise can't find measurements - as the class labels don't match the measurement names
test_folder_path = file_parameters['folder_path'].replace("train_", "test_")
file_parameters['folder_path'] = test_folder_path
test_dir = test_folder_path

test_result_dir = file_parameters['out_dir'].replace("train_", "test_")

# read the test measurements in sorted file name order
test_measurements_fns = sorted(Path(test_dir).glob("*ims.csv"))
test_measurements = list(map(MccImsMeasurement, test_measurements_fns))

In [None]:
# predict - and run full preprocessing and alignment on test_measurements
# (the result is iterated with `.items()` in the next cell: one entry of predicted
# class indices per peak detection method)
prediction = predictionModel.predict(test_measurements)

In [None]:
# Compare the predicted labels of the test measurements with their true class labels.
test_labels_dict_fn = MccImsAnalysis.guess_class_label_extension(test_dir)
test_labels_dict = MccImsAnalysis.parse_class_labels(test_labels_dict_fn)

# Predictions are indices into the sorted unique class labels of the training measurements.
class_labels = np.unique([m.class_label for m in ims_analysis.measurements])
test_measurements_names = [path.name for path in test_measurements_fns]

for pdm, prediction_index in prediction.items():
    # translate the predicted indices into class labels, keyed by measurement file name
    predicted_labels = {
        test_name: class_labels[p]
        for p, test_name in zip(prediction_index, test_measurements_names)
    }

    # sort every prediction into the matching bucket
    correct, false = {}, {}
    for fn, predicted_label in predicted_labels.items():
        bucket = correct if predicted_label == test_labels_dict[fn] else false
        bucket[fn] = predicted_label

    print(f"resulting_labels for {pdm.name} are: {predicted_labels}")
    print(f"Falsely classified: {false}\n")
    print(f"That's {len(correct)} correct vs {len(false)} false\n")