Skip to content

Commit

Permalink
updated batch correction to ignore specified sampletype/assayrole com…
Browse files Browse the repository at this point in the history
…binations
  • Loading branch information
carolinesands committed Jan 17, 2024
1 parent 699ccae commit 3f78659
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 54 deletions.
26 changes: 23 additions & 3 deletions Tests/test_batchandrocorrection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
sys.path.append("..")
import nPYc
from generateTestDataset import generateTestDataset
from nPYc.enumerations import SampleType
from nPYc.enumerations import SampleType, AssayRole


class test_rocorrection(unittest.TestCase):
Expand All @@ -21,12 +21,17 @@ def setUp(self):
self.noSamp = numpy.random.randint(500, high=2000, size=None)
self.noFeat = numpy.random.randint(50, high=100, size=None)

self.msData = generateTestDataset(self.noSamp, self.noFeat, dtype='MSDataset')
self.msDataERCorrection = generateTestDataset(20, 2, dtype='MSDataset')
self.msData = generateTestDataset(self.noSamp, self.noFeat, dtype='MSDataset', sop='GenericMS')

self.msDataERCorrection = generateTestDataset(20, 2, dtype='MSDataset', sop='GenericMS')
self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudyPool, :] = 0.5
self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudySample, :] = 2
self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.ExternalReference, :] = 1

self.msDataLRCorrection = generateTestDataset(20, 2, dtype='MSDataset', sop='GenericMS')
self.msDataLRCorrection.sampleMetadata['SampleType'] = SampleType.StudyPool
self.msDataLRCorrection.sampleMetadata.loc[self.msDataLRCorrection.sampleMetadata['AssayRole'] != AssayRole.PrecisionReference, 'AssayRole'] = AssayRole.LinearityReference

def test_correctMSDataset(self):
with self.subTest(msg='Test correctMSdataset parallelisation'):
"""
Expand Down Expand Up @@ -56,6 +61,21 @@ def test_correctMSDataset(self):

numpy.testing.assert_array_almost_equal(correctedDataER.intensityData, expectedCorrectedERDataIntensity, err_msg="Corrected intensities are not equal")

with self.subTest(msg='Test linearity reference samples are not corrected'):
"""
Check that linearity reference samples are excluded from correction by default (see 'GenericMS.json')
"""

correctedDataLR = nPYc.batchAndROCorrection.correctMSdataset(self.msDataLRCorrection,
correctionSampleType=SampleType.StudyPool)

LRmask = (correctedDataLR.sampleMetadata['SampleType'] == SampleType.StudyPool) & \
(correctedDataLR.sampleMetadata['AssayRole'] == AssayRole.LinearityReference)

numpy.testing.assert_array_almost_equal(correctedDataLR.intensityData[LRmask, :],
self.msDataLRCorrection.intensityData[LRmask, :],
err_msg="By default, linearity reference samples should not be corrected")


class test_rocorrection_synthetic(unittest.TestCase):

Expand Down
3 changes: 2 additions & 1 deletion nPYc/StudyDesigns/SOP/GenericMS.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@
"LM Resolution", "Resolution", "Sampling Cone", "Scan Time (sec)", "Source Offset"],
"featureMetadataNotExported":["mzmin", "mzmax", "rtmin", "rtmax", "npeaks", "ms_level", "Exclusion Details",
"User Excluded","blankFilter", "blankValue", "artifactualFilter", "rsdFilter", "rsdSP",
"correlationToDilution", "correlationToDilutionFilter", "varianceRatioFilter", "rsdSS/rsdSP"]
"correlationToDilution", "correlationToDilutionFilter", "varianceRatioFilter", "rsdSS/rsdSP"],
"samplesNotCorrected": {"SampleType" : ["StudyPool"], "AssayRole" : ["LinearityReference"]}
}
27 changes: 24 additions & 3 deletions nPYc/batchAndROCorrection/_batchAndROCorrection.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,20 @@
from ..enumerations import AssayRole, SampleType


def correctMSdataset(data, window=11, method='LOWESS', align='median', parallelise=True, excludeFailures=True, correctionSampleType=SampleType.StudyPool):
def correctMSdataset(data,
window=11,
method='LOWESS',
align='median',
parallelise=True,
excludeFailures=True,
correctionSampleType=SampleType.StudyPool):
"""
Conduct run-order correction and batch alignment on the :py:class:`~nPYc.objects.MSDataset` instance *data*, returning a new instance with corrected intensity values.
Sample are seperated into batches acording to the *'Correction Batch'* column in *data.sampleMetadata*.
Samples are separated into batches according to the *'Correction Batch'* column in *data.sampleMetadata*.
Samples are only corrected if they have a value in *'Correction Batch'* AND an *'AssayRole'*/*'SampleType'*
combination not defined in 'samplesNotCorrected' (taken from the sop/data.Attributes['samplesNotCorrected'])
:param data: MSDataset object with measurements to be corrected
:type data: MSDataset
Expand Down Expand Up @@ -54,13 +63,25 @@ def correctMSdataset(data, window=11, method='LOWESS', align='median', paralleli
if not isinstance(correctionSampleType,SampleType):
raise TypeError("correctionType must be a SampleType")

# Define the samples to be corrected (only corrected if have value in 'Correction Batch' and not listed for
# exclusion in 'samplesNotCorrected'
samplesForCorrection = data.sampleMetadata['Correction Batch'].values.astype(float)

for s in numpy.arange(len(data.Attributes['samplesNotCorrected']['SampleType'])):
try:
mask = (data.sampleMetadata['SampleType'] == SampleType[data.Attributes['samplesNotCorrected']['SampleType'][s]]) & \
(data.sampleMetadata['AssayRole'] == AssayRole[data.Attributes['samplesNotCorrected']['AssayRole'][s]])
samplesForCorrection[mask] = numpy.nan
except KeyError:
raise KeyError('data.Attributes[\'samplesNotCorrected\'] must contain valid SampleType/AssayRole enumeration entries')

with warnings.catch_warnings():
warnings.simplefilter('ignore', category=RuntimeWarning)

correctedP = _batchCorrectionHead(data.intensityData,
data.sampleMetadata['Run Order'].values,
(data.sampleMetadata['SampleType'].values == correctionSampleType) & (data.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference),
data.sampleMetadata['Correction Batch'].values,
samplesForCorrection,
window=window,
method=method,
align=align,
Expand Down
110 changes: 63 additions & 47 deletions nPYc/reports/_generateReportMS.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ._generateBasicPCAReport import generateBasicPCAReport
from ..reports._finalReportPeakPantheR import _finalReportPeakPantheR
from ..utilities._filters import blankFilter
from ..batchAndROCorrection import correctMSdataset

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
Expand Down Expand Up @@ -1720,68 +1721,83 @@ def batchCorrectionTest(dataset, nFeatures=10, window=11):
from ..batchAndROCorrection._batchAndROCorrection import _batchCorrection

# Samplemask
SSmask = (dataset.sampleMetadata['SampleType'].values == SampleType.StudySample) & (
dataset.sampleMetadata['AssayRole'].values == AssayRole.Assay)
SPmask = (dataset.sampleMetadata['SampleType'].values == SampleType.StudyPool) & (
dataset.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference)
ERmask = (dataset.sampleMetadata['SampleType'].values == SampleType.ExternalReference) & (
dataset.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference)
LRmask = (dataset.sampleMetadata['SampleType'].values == SampleType.ExternalReference) & (
dataset.sampleMetadata['AssayRole'].values == AssayRole.LinearityReference)
sampleMask = (SSmask | SPmask | ERmask | LRmask) & (dataset.sampleMask == True).astype(bool)
# SSmask = (dataset.sampleMetadata['SampleType'].values == SampleType.StudySample) & (
# dataset.sampleMetadata['AssayRole'].values == AssayRole.Assay)
# SPmask = (dataset.sampleMetadata['SampleType'].values == SampleType.StudyPool) & (
# dataset.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference)
# ERmask = (dataset.sampleMetadata['SampleType'].values == SampleType.ExternalReference) & (
# dataset.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference)
# LRmask = (dataset.sampleMetadata['SampleType'].values == SampleType.ExternalReference) & (
# dataset.sampleMetadata['AssayRole'].values == AssayRole.LinearityReference)
# sampleMask = (SSmask | SPmask | ERmask | LRmask) & (dataset.sampleMask == True).astype(bool)

# Exclude features with zero values
zeroMask = sum(dataset.intensityData[sampleMask, :] == 0)
zeroMask = zeroMask == 0
# zeroMask = sum(dataset.intensityData[sampleMask, :] == 0)
# zeroMask = sum(dataset.intensityData == 0)
# zeroMask = zeroMask == 0

# Exclude features which fail correlation to dilution
try:
passMask = numpy.logical_and(zeroMask, dataset.correlationToDilution >= dataset.Attributes['corrThreshold'])
#passMask = numpy.logical_and(zeroMask, dataset.correlationToDilution >= dataset.Attributes['corrThreshold'])
passMask = dataset.correlationToDilution >= dataset.Attributes['corrThreshold']
except:
passMask = zeroMask
#passMask = zeroMask
passMask = numpy.squeeze(numpy.ones([dataset.noFeatures, 1], dtype=bool), axis=1)

# Select subset of features on which to perform batch correction
maskNum = [i for i, x in enumerate(passMask) if x]
random.shuffle(maskNum)

if len(maskNum) > nFeatures:
maskNum = maskNum[:nFeatures]

preData = copy.deepcopy(dataset)
preData.featureMask = numpy.squeeze(numpy.zeros([preData.noFeatures, 1], dtype=bool), axis=1)
preData.featureMask[maskNum] = True
preData.applyMasks()

# Do batch correction
featureList = []
correctedData = numpy.zeros([dataset.intensityData.shape[0], nFeatures])
fits = numpy.zeros([dataset.intensityData.shape[0], nFeatures])
featureIX = 0
parameters = dict()
parameters['window'] = window
parameters['method'] = 'LOWESS'
parameters['align'] = 'median'

for feature in maskNum:
correctedP = _batchCorrection(dataset.intensityData[:, feature],
dataset.sampleMetadata['Run Order'].values,
SPmask,
dataset.sampleMetadata['Correction Batch'].values,
range(0, 1), # All features
parameters,
0)

if sum(numpy.isfinite(correctedP[0][1])) == dataset.intensityData.shape[0]:
correctedData[:, featureIX] = correctedP[0][1]
fits[:, featureIX] = correctedP[0][2]
featureList.append(feature)
featureIX = featureIX + 1

if featureIX == nFeatures:
break
postData = correctMSdataset(preData)

# Do batch correction
#featureList = []
#correctedData = numpy.zeros([dataset.intensityData.shape[0], nFeatures])
#fits = numpy.zeros([dataset.intensityData.shape[0], nFeatures])
#featureIX = 0
#parameters = dict()
#parameters['window'] = window
#parameters['method'] = 'LOWESS'
#parameters['align'] = 'median'

#for feature in maskNum:
# correctedP = _batchCorrection(dataset.intensityData[:, feature],
# dataset.sampleMetadata['Run Order'].values,
# SPmask,
# dataset.sampleMetadata['Correction Batch'].values,
# range(0, 1), # All features
# parameters,
# 0)

# if sum(numpy.isfinite(correctedP[0][1])) == dataset.intensityData.shape[0]:
# correctedData[:, featureIX] = correctedP[0][1]
# fits[:, featureIX] = correctedP[0][2]
# featureList.append(feature)
# featureIX = featureIX + 1

# if featureIX == nFeatures:
# break

# Create copy of dataset and trim
preData = copy.deepcopy(dataset)
preData.intensityData = dataset.intensityData[:, featureList]
preData.featureMetadata = dataset.featureMetadata.loc[featureList, :]
preData.featureMetadata.reset_index(drop=True, inplace=True)
#preData = copy.deepcopy(dataset)
#preData.intensityData = dataset.intensityData[:, featureList]
#preData.featureMetadata = dataset.featureMetadata.loc[featureList, :]
#preData.featureMetadata.reset_index(drop=True, inplace=True)
#preData.featureMask = preData.featureMask[featureList]

# Run batch correction
postData = copy.deepcopy(preData)
postData.intensityData = correctedData
postData.fit = fits
#postData = copy.deepcopy(preData)
#postData.intensityData = correctedData
#postData.fit = fits

# Return results
return preData, postData, featureList
return preData, postData, maskNum #featureList

0 comments on commit 3f78659

Please sign in to comment.