# PROJECT - METHOD - YYMMDD - NPC Targeted Data Pipeline

#### This document provides a pipeline for the import of Targeted data (post TargetLynx pre-processing for example), and any associated sample metadata, followed by summaries and quality control reports of the data (both in sample and feature dimensions), implementation of batch correction and feature selection and output of a final dataset ready for sharing with collaborators and data modeling. See SOP # for further details of requirements, descriptions of expected outputs and options for optimising data quality.

#### By default, all summary reports (with the exception of the final report) are output only to this notebook. The notebook (including outputs) can be saved using File > Save and Checkpoint. If HTML copies of any reports are required, they can be saved automatically to the save directory by adding the optional input argument output=saveDir to the report call.

# 1. Initial Setup

### Define file paths

In [None]:
# Location of the nPYc toolbox (appended to sys.path before nPYc is imported)
toolboxPath = '/local path to npyc-toolbox/phenomecentre/npyc-toolbox'

# Sample metadata: Basic CSV file and subject information file
basicCsvFilePath = '/path to basicCSV file/PROJECT dataset basicCsvMetadata.csv'
manifestPath = '/path to subject information file/PROJECT SubjectINFOfile.csv'

# TargetLynx data: one data file and one calibration report per batch
TargetlynxDataPath1 = '/path to Targetlynx file/PROJECT dataset PIfile Batch1.xml'
calibrationReportPath1 = '/path to calibration report/PROJECT dataset calibration report file1.csv'
TargetlynxDataPath2 = '/path to Targetlynx file/PROJECT dataset PIfile Batch2.xml'
calibrationReportPath2 = '/path to calibration report/PROJECT dataset calibration report file2.csv'

# NMR (Bruker) data: the parent folder, then one folder per rack
nmrRawDataPath = '/path to NMR data folder/'
nmrRawDataPath1 = '/path to NMR data folder/Rack01 date/'
nmrRawDataPath2 = '/path to NMR data folder/Rack02 date/'

# Directory that receives saved data objects, reports and exported datasets
saveDir = '/path to save directory/Projects/PROJECT/METHOD DATE/'

### Import code

In [None]:
import os
import matplotlib.pyplot as plt
import scipy
import pandas
import numpy
import pickle
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
%matplotlib inline
import sys
sys.path.append(toolboxPath)
import nPYc
import copy
from nPYc.enumerations import VariableType, DatasetLevel, AssayRole, SampleType, CalibrationMethod, QuantificationType

### Date and version

In [None]:
# Print the toolbox version and the time of this run so they are kept with the notebook outputs
import datetime
# Import the version from the package itself: importing from 'nPYc.__init__' would
# execute the package initialiser a second time under a separate module name
from nPYc import __version__ as version
print('Run with branch ' + version + ' on ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))

### Create saveDir

In [None]:
# Create the save directory together with its 'data objects' sub-folder.
# os.makedirs creates missing parent folders, and exist_ok=True makes the cell safe to re-run;
# it also guarantees 'data objects' exists when saveDir was already present,
# which the pickle save later in this notebook relies on.
os.makedirs(os.path.join(saveDir, 'data objects'), exist_ok=True)

# 2. Import Data and Sample Metadata

## Import acquired data and associated acquisition parameters

---
### fileType = 'TargetLynx'

#### Import each plate

In [None]:
# Import every plate as its own TargetedDataset (TargetLynx file + matching calibration report)
targetedData1 = nPYc.TargetedDataset(
    TargetlynxDataPath1,
    fileType='TargetLynx',
    calibrationReportPath=calibrationReportPath1,
    sop='OxylipinMS',
)
targetedData2 = nPYc.TargetedDataset(
    TargetlynxDataPath2,
    fileType='TargetLynx',
    calibrationReportPath=calibrationReportPath2,
    sop='OxylipinMS',
)

# Optional keyword arguments that can be added to the calls above:
#
# Use a different SOP
# sop='AminoAcidMS'
#
# Keep the Internal Standards instead of filtering them out (default False)
# keepIS = True
#
# Choose which MassLynx SampleTypes are processed (default ['Study Sample','QC'])
# sampleTypeToProcess = ['Study Sample','QC','Blank','Other']
#
# Replace values <LLOQ by the noise concentration equivalent (default: replace by -inf)
# noiseFilled = True
#   Select the calibration sample used as response reference (default None, use the middle of the calibration curve)
#   responseReference = str or list of str
#
# Replace only values <LLOQ (default False, both <LLOQ and >ULOQ are replaced)
# onlyLLOQ = True
#
# Keep peak characteristics (default False) (peak area, peak response, peak concentration deviation, peak integration flag, peak RT) in self.peakInfo and self.calibration['calibPeakInfo']
# keepPeakInfo = True
#
# Keep import exclusions in sampleMetadataExcluded,... (default False)
# keepExcluded = True

---
### Bruker Quant-UR
### fileType = 'Bruker Quantification', sop='BrukerQuant-UR'

In [None]:
# Load plate separately, or from a parent folder, all sub-folders will be searched
# NOTE: this cell assigns the same targetedData1/targetedData2 names as the other import cells,
# so only run the import cell that matches the acquired data.
# fileNamePattern is a regular expression, so it is written as a raw string (r'...') to keep
# the '\.' escape intact without triggering Python's invalid-escape-sequence warning.
#tData = nPYc.TargetedDataset(nmrRawDataPath, fileType='Bruker Quantification', sop='BrukerQuant-UR', fileNamePattern=r'.*?urine_quant_report_b\.xml$', unit='mmol/mol Crea')
targetedData1 = nPYc.TargetedDataset(nmrRawDataPath1, fileType='Bruker Quantification', sop='BrukerQuant-UR', fileNamePattern=r'.*?urine_quant_report_b\.xml$', unit='mmol/mol Crea')
targetedData2 = nPYc.TargetedDataset(nmrRawDataPath2, fileType='Bruker Quantification', sop='BrukerQuant-UR', fileNamePattern=r'.*?urine_quant_report_b\.xml$', unit='mmol/mol Crea')

# Further options are accepted:
#
# To provide the regex to recognise the data xml files
# fileNamePattern = ''
#
# To select the right pdata (default 1)
# pdata = 1
#
# If the same features are present multiple times with different units, use 'unit' to only select a specific unit (default None, all entries)
# unit = 'mmol/L'
# unit = 'mmol/mol Crea'

---
### Bruker BI-LISA
### fileType = 'Bruker Quantification', sop='BrukerBI-LISA'

In [None]:
# Load plate separately, or from a parent folder, all sub-folders will be searched
# NOTE: this cell assigns the same targetedData1/targetedData2 names as the other import cells,
# so only run the import cell that matches the acquired data.
# fileNamePattern is a regular expression, so it is written as a raw string (r'...') to keep
# the '\.' escape intact without triggering Python's invalid-escape-sequence warning.
#tData = nPYc.TargetedDataset(nmrRawDataPath, fileType='Bruker Quantification', sop='BrukerBI-LISA', fileNamePattern=r'.*?results\.xml$')
targetedData1 = nPYc.TargetedDataset(nmrRawDataPath1, fileType='Bruker Quantification', sop='BrukerBI-LISA', fileNamePattern=r'.*?results\.xml$')
targetedData2 = nPYc.TargetedDataset(nmrRawDataPath2, fileType='Bruker Quantification', sop='BrukerBI-LISA', fileNamePattern=r'.*?results\.xml$')

# Further options are accepted:
#
# To provide the regex to recognise the data xml files
# fileNamePattern = ''
#
# To select the right pdata (default 1)
# pdata = 1

-----
### Merge all plates in a single dataset

In [None]:
# Merge imported plates
# The two plate datasets are combined with '+' into tData, the dataset used by all following cells
tData = targetedData1 + targetedData2

Rename the dataset

In [None]:
# The dataset name is also used to build the file name of the pickled data object saved later in this notebook
tData.name = 'Targeted Project'

---
### Match acquired samples to Sample File Name (Basic CSV file) and subject information (if available)

In [None]:
# Add sample information from the Basic CSV file set in the 'Define file paths' cell
tData.addSampleInfo(descriptionFormat='Basic CSV', filePath=basicCsvFilePath)

In [None]:
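# Alternative: add sample information from an NPC LIMS file.
# limsFilePath is not defined in the 'Define file paths' cell; define it before enabling this line.
#tData.addSampleInfo(descriptionFormat='NPC LIMS', filePath=limsFilePath)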

### Sample summary
Samples acquired and acquisition structure

In [None]:
# The report is shown in the notebook; add output=saveDir to also save an html copy
nPYc.reports.generateReport(tData, 'sample summary')

In [None]:
# To exclude any samples of 'Unknown' type:
# tData.excludeSamples(tData.sampleMetadata[pandas.isnull(tData.sampleMetadata['Sample Base Name'])]['Sample File Name'], on='Sample File Name', message='Unknown type')

# Then apply masks:
# tData.applyMasks()

#### Merge Limits of Quantification across all plates
Assess the impact of the common Limits of Quantification

In [None]:
# Report on the effect of using common Limits of Quantification, to be reviewed before they are merged in the next cell
nPYc.reports.generateReport(tData, reportType='merge LOQ assessment')

# To change the number of plots on each row
# numberPlotPerRowLOQ = 3

If the new LOQs are suitable, merge the Limits of Quantification. Otherwise, some batches might have to be reprocessed.

In [None]:
# Update the limits of quantification, keeping the lowest common denominator across all batches: highest LLOQ, lowest ULOQ
tData.mergeLimitsOfQuantification()

# Optional keyword arguments:
#
# To keep each batch LOQ (default False)
# keepBatchLOQ = True
#
# To replace only <LLOQ (default False, both <LLOQ and >ULOQ)
# onlyLLOQ = True

### Save/load data

In [None]:
# Save the imported dataset to the 'data objects' folder so it can be reloaded without repeating the import.
# The 'with' block closes the file (flushing the pickle to disk) even if pickling raises.
with open(os.path.join(saveDir, 'data objects', tData.name + "_targetedDataImported.p"), "wb") as pickleFile:
    pickle.dump(tData, pickleFile)

In [None]:
#tData = pickle.load( open('path to data objects/targetedDataImported.p', "rb"))

# 3. Quality check

### Feature Summary

In [None]:
# The report is shown in the notebook; add output=saveDir to also save an html copy
nPYc.reports.generateReport(tData, 'feature summary')

# Optional keyword arguments:

# To modify the Accuracy and Precision percentage (default +/-20%)
# Accuracy 80%-120%, Precision 0-20%
#percentRange = 20
#percentRange = None

# To change the number of plots on each row
# numberPlotPerRowFeature = 2

# 4. Analytical Multivariate Quality Control

### Select the samples

In [None]:
# Select study samples, study pools and external references for the analytical QC
# filterFeatures=False: presumably leaves the feature selection unchanged so all features are kept - confirm against the nPYc documentation
tData.updateMasks(sampleTypes=[SampleType.StudySample, SampleType.StudyPool, SampleType.ExternalReference], filterFeatures=False)

### Run analytical multivariate QC

In [None]:
# SVD cannot handle missing values, so a ValueError raised while building the model
# or the report is caught and a message is printed instead of stopping the notebook
try:
    PCAmodelAnalytical = nPYc.multivariate.exploratoryAnalysisPCA(
        tData,
        withExclusions=True,
        scaling=1.0,
    )
    nPYc.reports.multivariateReport.multivariateQCreport(
        tData,
        PCAmodelAnalytical,
        reportType='analytical',
        withExclusions=True,
    )
except ValueError:
    print('Multivariate analysis is not currently possible with values <LLOQ or >ULOQ.')

### IF REQUIRED: generate interactive scores and loadings plots

In [None]:
# Interactive scores plot, e.g., plotting the scores for the first two components coloured by run order

# data = nPYc.plotting.plotScoresInteractive(PCAmodelAnalytical, 'Run Order', components=[1, 2])
# iplot(data)

In [None]:
# Interactive loadings plot, e.g., plotting the loadings for component 2

# data = nPYc.plotting.plotLoadingsInteractive(PCAmodelAnalytical, component=2)
# iplot(data)

# 6. Save QC Reports for Review

In [None]:
# Save copies of the QC reports to a 'QC' sub-folder of the save directory
qcDir = os.path.join(saveDir, 'QC')
# exist_ok=True replaces the check-then-create pattern and keeps the cell safe to re-run
os.makedirs(qcDir, exist_ok=True)
nPYc.reports.generateReport(tData, 'sample summary', output=qcDir)
nPYc.reports.generateReport(tData, 'feature summary', withExclusions=True, percentRange=20, output=qcDir)
# The PCA model is rebuilt here so the saved report does not depend on the earlier cell having run;
# SVD does not accept missing values, hence the ValueError handler
try:
    PCAmodelAnalytical = nPYc.multivariate.exploratoryAnalysisPCA(tData, withExclusions=True, scaling=1.0)
    nPYc.reports.multivariateReport.multivariateQCreport(tData, PCAmodelAnalytical, reportType='analytical', withExclusions=True, output=qcDir)
except ValueError:
    print('Multivariate analysis is not currently possible with values <LLOQ or >ULOQ.')

# 7. Finalise and export dataset

### Select the samples (default is SampleType.StudySample and SampleType.StudyPool samples only)

In [None]:
# Called without arguments so that the default sample selection is used
tData.updateMasks()

### IF REQUIRED: remove features that are only monitored and not quantified

In [None]:
#tData.updateMasks(filterSamples=False, quantificationTypes=[QuantificationType.IS, QuantificationType.QuantOwnLabeledAnalogue, QuantificationType.QuantAltLabeledAnalogue, QuantificationType.QuantOther])

### Generate PCA model with updated settings

In [None]:
# Rebuild the PCA model now that the masks have been updated; this model is also passed to the final report
PCAmodelAnalytical = nPYc.multivariate.exploratoryAnalysisPCA(tData, withExclusions=True, scaling=1.0)

# The fitted model is passed as the second argument, as in the other analytical QC report calls in this notebook
nPYc.reports.multivariateReport.multivariateQCreport(tData, PCAmodelAnalytical, reportType='analytical', withExclusions=True)

### Check final dataset output if current masks applied

In [None]:
# Preview of the final report, shown in the notebook only (no output argument), using the PCA model built in the previous cell
nPYc.reports.generateReport(tData, 'final report', withExclusions=True, pcaModel=PCAmodelAnalytical)

### Apply masks

In [None]:
# Apply the current masks to the dataset before it is exported
# NOTE(review): presumably this removes the masked samples/features from tData - confirm against the nPYc documentation
tData.applyMasks()

### Export Data

In [None]:
# Export final dataset
# Files are written to the save directory; no saveFormat is given, so the default export format is used
tData.exportDataset(destinationPath=saveDir)

In [None]:
# Export final summary report
# output=saveDir saves the report to the save directory
# The commented-out variant passes pcaModel=None instead of the fitted PCA model
#nPYc.reports.generateReport(tData, 'final report', output=saveDir, pcaModel=None)
nPYc.reports.generateReport(tData, 'final report', output=saveDir, pcaModel=PCAmodelAnalytical)

In [None]:
# To export combined dataset (e.g., format for SIMCA)
# Written to the same save directory as the default export, using the 'UnifiedCSV' format
tData.exportDataset(destinationPath=saveDir, saveFormat='UnifiedCSV')

# 8. Biological Multivariate Report

In [None]:
# Keep study samples only, but all features
# (filterFeatures=False is the setting that keeps all features)
tData.updateMasks(sampleTypes=[SampleType.StudySample], filterFeatures=False)

In [None]:
# Build the PCA model first, then pass it to the report, following the same
# model-then-report pattern as the analytical QC cells in this notebook.
# NOTE(review): the previous call passed scale_method='uv' to the report and stored the report's
# return value as the model; scaling=1.0 (the setting used for every other PCA model here)
# is presumably the unit-variance equivalent - confirm against the installed nPYc version.
# SVD does not accept missing values, hence the ValueError handler.
try:
    PCAmodelBiological = nPYc.multivariate.exploratoryAnalysisPCA(tData, withExclusions=True, scaling=1.0)
    nPYc.reports.multivariateReport.multivariateQCreport(tData, PCAmodelBiological, reportType='biological', withExclusions=True)
except ValueError:
    print('Multivariate analysis is not currently possible with values <LLOQ or >ULOQ.')