# PROJECT - METHOD - YYMMDD - NPC NMR Data Pipeline

#### This document provides a pipeline for the import of NMR data (pre-processing), and any associated sample metadata, followed by summaries and quality control reports of the data, implementation of quality control analysis and output of a final dataset ready for sharing with collaborators and data modeling. See SOP # for further details of requirements, descriptions of expected outputs and options for optimising data quality.

#### By default all summary reports (with the exception of the final report) will be output only to this notebook. The notebook (including outputs) can be saved using >File>Save and Checkpoint. However, if html copies of any reports are required these can be automatically saved to the save directory by adding the optional input argument output=saveDir.

# 1. Initial Setup

### Define file paths

In [None]:
# Acquisition settings passed to nPYc.NMRDataset when the raw data are imported.
pulseProgram = 'noesygppr1d'
sop = 'GenericNMRblood'

# Local checkouts of the two toolboxes; both are appended to sys.path by the import cell.
chemometricsPath = r'/local path to pyChemometrics-toolbox'
toolboxPath = r'/local path to npyc-toolbox/phenomecentre/npyc-toolbox'

# Bruker raw data, pointing at either a single rack or the full dataset directory.
rawDataPath = r'/path to Bruker data either rack or full dataset dir level/'

# Sample metadata files: LIMS file and subject information file.
limsFilePath = r'/path to LIMS file/PROJECT dataset LIMSfile.csv'
manifestPath = r'/path to subject information file/PROJECT SubjectINFOfile.csv'

# Root directory for everything this notebook saves (reports and exported datasets).
saveDir = r'/path to save directory/Projects/PROJECT/METHOD DATE/'

### Import code

In [None]:
import os
import matplotlib.pyplot as plt
import scipy
import pandas
import numpy
import pickle
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
%matplotlib inline
import sys
sys.path.append(chemometricsPath)
sys.path.append(toolboxPath)
import nPYc
import copy
from nPYc.enumerations import VariableType, DatasetLevel, AssayRole, SampleType
from nPYc.utilities.normalisation import NullNormaliser, TotalAreaNormaliser, ProbabilisticQuotientNormaliser

In [None]:
# Stamp the notebook output with the toolbox version and the time of the run,
# so that saved copies of the notebook record what produced them.
import datetime

# Import __version__ from the package itself. `from nPYc.__init__ import ...`
# executes __init__.py a second time under the module name 'nPYc.__init__'
# instead of reusing the package that is already imported as `nPYc`.
from nPYc import __version__ as version

print('Run with branch ' + version + ' on ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))

# 2. Import Data and Sample Metadata

### Import acquired data and associated acquisition parameters

In [None]:
# Build the NMRDataset from the Bruker data under rawDataPath, using the
# pulseProgram and sop values set in the file-paths cell.
# Template note: the defaults are the noesygppr1d pulse program and urine;
# for urine samples use sop='GenericNMRurine'.
nmrData = nPYc.NMRDataset(
    rawDataPath,
    sop=sop,
    pulseProgram=pulseProgram,
    pdata=1,
)

### Match Acquired Samples to Sample IDs (LIMS file) and subject information (if available)

In [None]:
# Match the acquired samples against the sample IDs listed in the LIMS file.
# NOTE(review): manifestPath (subject information) is defined in the file-paths
# cell but is not used in any visible cell - confirm whether it should be added here.
nmrData.addSampleInfo(
    filePath=limsFilePath,
    descriptionFormat='Basic CSV',
)

# 3. Sample & Feature Summary Reports

### Generate sample summary report

In [None]:
# Sample summary for the imported dataset. No output argument is given, so the
# report is shown in this notebook only (see the introduction for output=saveDir).
nPYc.reports.generateReport(nmrData, 'sample summary')


### Generate feature summary report

In [None]:
# Feature summary for the imported dataset, shown in this notebook only
# (no output argument is given).
nPYc.reports.generateReport(nmrData,'feature summary')

# 4. Analytical Multivariate Quality Control

### Select the samples and use the quality control checks to mask samples

In [None]:
# Select study samples, study pool and external reference samples, and mask
# samples using the four listed quality control checks. Feature filtering is
# switched off at this stage.
nmrData.updateMasks(
    sampleTypes=[
        SampleType.StudySample,
        SampleType.StudyPool,
        SampleType.ExternalReference,
    ],
    qcSampleChecks=[
        'LineWidthFail',
        'CalibrationFail',
        'BaselineFail',
        'WaterPeakFail',
    ],
    filterFeatures=False,
)

### Run analytical multivariate QC

#### There are several parameters in both the PCA model generation and the multivariate report that can be optimised depending on your dataset; please see the documentation for details.

In [None]:
# Fit the PCA model used by the analytical multivariate QC report.
# NOTE: the toolbox defaults are scaling=1 ('uv') and withExclusions=False
# (masks not applied); both are overridden here.
PCAmodelAnalytical = nPYc.multivariate.exploratoryAnalysisPCA(
    nmrData,
    scaling=0,
    withExclusions=True,
)

In [None]:
# Analytical multivariate QC report for the PCA model fitted in the previous
# cell, using the same withExclusions setting as the model.
nPYc.reports.multivariateQCreport(
    nmrData,
    PCAmodelAnalytical,
    withExclusions=True,
    reportType='analytical',
)

### IF REQUIRED: generate interactive scores and loadings plots

In [None]:
# Interactive scores plot, e.g., plotting the scores for the first two components coloured by run order

# data = nPYc.plotting.plotScoresInteractive(nmrData, PCAmodelAnalytical, 'Run Order', components=[1, 2], withExclusions=True)
# iplot(data)

In [None]:
# Interactive loadings plot, e.g., plotting the loadings for component 2

# data = nPYc.plotting.plotLoadingsInteractive(nmrData, PCAmodelAnalytical, component=2, withExclusions=True)
# iplot(data)

# 5. Save QC Reports for Review

In [None]:
# Save the QC reports into a 'QC' sub-folder of saveDir so they can be reviewed
# outside the notebook.
qcDir = os.path.join(saveDir, 'QC')

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, avoiding the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(qcDir, exist_ok=True)

# Passing output=... saves a copy of each report to that directory instead of
# only displaying it in the notebook.
nPYc.reports.generateReport(nmrData, 'sample summary', output=qcDir)
nPYc.reports.generateReport(nmrData, 'feature summary', withExclusions=True, output=qcDir)
nPYc.reports.multivariateQCreport(nmrData, PCAmodelAnalytical, reportType='analytical', withExclusions=True, output=qcDir)

# 6. Finalise & Export Dataset

### Select the samples (default is SampleType.StudySample and SampleType.StudyPool samples only)

In [None]:
# Re-run the masking with the default sample selection (per the heading above:
# study samples and study pool only), this time with feature filtering enabled.
nmrData.updateMasks(filterFeatures=True)

### Generate PCA model with updated settings

In [None]:
# Refit the analytical PCA model now that the masks have been updated; this
# replaces the model held in PCAmodelAnalytical.
PCAmodelAnalytical = nPYc.multivariate.exploratoryAnalysisPCA(
    nmrData,
    scaling=0,
    withExclusions=True,
)

In [None]:
# Analytical multivariate QC report for the refitted PCA model (notebook only,
# since no output directory is passed).
nPYc.reports.multivariateQCreport(
    nmrData,
    PCAmodelAnalytical,
    reportType='analytical',
    withExclusions=True,
)

### IF REQUIRED: mark samples for exclusion based on multivariate QC results

In [None]:
# For example, mark outlying samples for exclusion (e.g., from interactive scores plot)
# nmrData.excludeSamples(nmrData.sampleMetadata.iloc[nmrData.sampleMetadata['Sample File Name'].values=='Sample File Name']['Sample File Name'], on='Sample File Name', message='Outlier in PCA scores')

In [None]:
# Repeat PCA modelling
# PCAmodelAnalytical = nPYc.multivariate.exploratoryAnalysisPCA(nmrData, withExclusions=True, scaling=0)
# nPYc.reports.multivariateQCreport(nmrData, PCAmodelAnalytical, reportType='analytical', withExclusions=True)

### Check final dataset output if current masks applied

In [None]:
# Preview the final report with the current masks applied, before anything is
# exported. No output directory is passed, so it is shown in the notebook only.
nPYc.reports.generateReport(
    nmrData,
    'final report',
    pcaModel=PCAmodelAnalytical,
    withExclusions=True,
)

### Export data

In [None]:
# Export final dataset
# Written to saveDir; no saveFormat is passed, so exportDataset's default format is used.
nmrData.exportDataset(destinationPath=saveDir)

In [None]:
# Export final summary report
# withExclusions=True is passed explicitly so the saved report is generated with
# the same arguments as the 'final report' preview earlier in this section,
# rather than relying on the function's default.
nPYc.reports.generateReport(nmrData, 'final report', withExclusions=True, output=saveDir, pcaModel=PCAmodelAnalytical)

In [None]:
# Additionally export the dataset in the combined 'UnifiedCSV' format
# (e.g., the format for SIMCA), into the same save directory.
nmrData.exportDataset(
    saveFormat='UnifiedCSV',
    destinationPath=saveDir,
)

### IF REQUIRED: change normalisation

In [None]:
# Choose ONE normaliser. Probabilistic quotient normalisation is the active
# choice below; to use total area normalisation instead, uncomment that line
# and comment out the probabilistic quotient line.

# For total area normalisation
# nmrData.Normalisation = TotalAreaNormaliser()

# For probabilistic quotient normalisation
nmrData.Normalisation = ProbabilisticQuotientNormaliser()

### PCA of normalised dataset

In [None]:
PCAmodelAnalytical_normalised = nPYc.multivariate.exploratoryAnalysisPCA(nmrData, scaling=0)

In [None]:
nPYc.reports.multivariateQCreport(nmrData, PCAmodelAnalytical_normalised, reportType='analytical')

### Export normalised data

In [None]:
# Normalised outputs go into their own sub-folder of saveDir.
normalisedDir = os.path.join(saveDir, 'Normalised data')

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, avoiding the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(normalisedDir, exist_ok=True)

In [None]:
# Export the dataset into the 'Normalised data' folder; no saveFormat is
# passed, so exportDataset's default format is used.
nmrData.exportDataset(destinationPath=normalisedDir)

In [None]:
# Save the final report for the normalised data alongside the normalised export.
# NOTE(review): unlike the 'final report' preview in section 6, withExclusions
# is not passed here - confirm the default applies the current masks.
nPYc.reports.generateReport(
    nmrData,
    'final report',
    output=normalisedDir,
    pcaModel=PCAmodelAnalytical_normalised,
)

In [None]:
# Also export the dataset in the combined 'UnifiedCSV' format into the
# 'Normalised data' folder.
nmrData.exportDataset(
    saveFormat='UnifiedCSV',
    destinationPath=normalisedDir,
)

# 7. Biological Multivariate Report

In [None]:
# Restrict the sample selection to study samples for the biological report,
# while keeping every feature (feature filtering switched off).
nmrData.updateMasks(
    filterFeatures=False,
    sampleTypes=[SampleType.StudySample],
)

In [None]:
# Fit the PCA model for the biological report with the current masks applied.
PCAmodelBiological = nPYc.multivariate.exploratoryAnalysisPCA(
    nmrData,
    scaling=0,
    withExclusions=True,
)

In [None]:
# Biological multivariate report for the PCA model fitted in the previous cell
# (notebook only, since no output directory is passed).
nPYc.reports.multivariateQCreport(
    nmrData,
    PCAmodelBiological,
    reportType='biological',
    withExclusions=True,
)

In [None]:
# Save the biological report to disk.
# NOTE: this writes to normalisedDir - check that this is the correct output
# directory for whether the data were normalised or not.
nPYc.reports.multivariateQCreport(
    nmrData,
    PCAmodelBiological,
    reportType='biological',
    withExclusions=True,
    output=normalisedDir,
)

### IF REQUIRED: define subset of biological parameters, with defined type, for plotting

In [None]:
# Define parameters to plot, keys as column names, values as data type
# biologicalMeasurements = {'Test': 'categorical', 'Age': 'continuous'}

# Repeat PCA
# PCAmodelBiological = nPYc.multivariate.exploratoryAnalysisPCA(nmrData, scaling=0)
# nPYc.reports.multivariateQCreport(nmrData, PCAmodelBiological, reportType='biological', biologicalMeasurements=biologicalMeasurements)