Skip to content

Commit

Permalink
update inferBatches and sample info matching reporting
Browse files Browse the repository at this point in the history
  • Loading branch information
carolinesands committed Jan 9, 2024
1 parent b4425f0 commit 73c7397
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 14 deletions.
6 changes: 3 additions & 3 deletions nPYc/StudyDesigns/SOP/Generic.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"LinearityReference": "red", "Linearity Reference": "red",
"MethodReference": "blue", "Method Reference": "blue",
"ProceduralBlank": "aquamarine", "Procedural Blank": "aquamarine", "Blank": "aquamarine",
"Other": "grey", "UnknownRole": "grey", "Unknown": "grey", "UnknownType": "grey", "nan": "grey", "NaN": "grey", "NA": "grey", "Unspecified SampleType or AssayRole": "grey"
"Other": "grey", "UnknownRole": "grey", "Unknown Role": "grey", "Unknown Type": "grey", "Unknown": "grey", "UnknownType": "grey", "nan": "grey", "NaN": "grey", "NA": "grey", "Unspecified SampleType or AssayRole": "grey"
},
"sampleTypeMarkers":{
"StudySample": "8", "Assay": "8", "Study Sample": "8", "Sample": "x",
Expand All @@ -26,7 +26,7 @@
"LinearityReference": "4", "Linearity Reference": "4",
"MethodReference": "3", "Method Reference": "3",
"ProceduralBlank": "x", "Procedural Blank": "x", "Blank": "x",
"UnknownType": "d", "UnknownRole": "d", "Unknown": "d", "nan": "d", "NaN": "d", "NA": "d", "Unspecified SampleType or AssayRole": "d"
"UnknownType": "d", "UnknownRole": "d", "Unknown Role": "d", "Unknown Type": "d", "Unknown": "d", "nan": "d", "NaN": "d", "NA": "d", "Unspecified SampleType or AssayRole": "d"
},
"sampleTypeAbbr":{
"StudySample": "SS", "Assay": "Assay", "Study Sample": "SS", "Sample": "Sample",
Expand All @@ -35,6 +35,6 @@
"LinearityReference": "SRD", "Linearity Reference": "SRD",
"MethodReference": "MR", "Method Reference": "MR",
"ProceduralBlank": "Blank", "Procedural Blank": "Blank", "Blank": "Blank",
"UnknownType": "NA", "UnknownRole": "NA", "Unknown": "NA", "nan": "NA", "NaN": "NA", "NA": "NA", "Unspecified SampleType or AssayRole": "NA"
"UnknownType": "NA", "UnknownRole": "NA", "Unknown Role": "NA", "Unknown Type": "NA", "Unknown": "NA", "nan": "NA", "NaN": "NA", "NA": "NA", "Unspecified SampleType or AssayRole": "NA"
}
}
9 changes: 4 additions & 5 deletions nPYc/StudyDesigns/SOP/GenericMS.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@
"analyticalMeasurements": {"Study" : "categorical", "Chromatography" : "categorical", "Ionisation" : "categorical", "Instrument" : "categorical", "Re-Run" : "categorical", "Suplemental Injections" : "categorical", "Matrix" : "categorical", "Well" : "categorical", "Plate" : "categorical", "Batch" : "categorical", "Dilution" : "continuous", "Measurement Date" : "date", "Measurement Time" : "date", "$$ Instrument:" : "categorical", "Backing" : "continuous", "Capillary (kV)" : "continuous", "Collision" : "continuous", "Collision Energy" : "continuous", "Column Serial Number:" : "categorical", "ColumnType:" : "categorical", "Cone Gas Flow (L/Hr)" : "continuous", "Desolvation Gas Flow (L/Hr)" : "continuous", "Desolvation Temperature (°C)" : "continuous", "Detector" : "continuous", "Detector Unit" : "categorical", "End Mass" : "continuous", "HM Resolution" : "continuous", "Interscan Time (sec)" : "continuous", "LM Resolution" : "continuous", "Polarity" : "categorical", "Resolution" : "continuous", "Sampling Cone" : "continuous", "Scan Time (sec)" : "continuous", "Source Offset" : "continuous", "Source Temperature (°C)" : "continuous", "Start Mass" : "continuous", "TOF" : "continuous", "Warnings" : "categorical", "Acquired Time" : "date", "Run Order" : "continuous", "Correction Batch" : "categorical", "Assay data name": "categorical", "Assay data location": "categorical", "Sample position": "categorical", "Sample batch": "categorical", "Acquisition batch": "categorical", "Plot Sample Type": "categorical", "AssayRole": "categorical", "SampleType": "categorical", "Exclusion Details": "categorical", "Skipped": "categorical", "Assay protocol": "categorical"},
"excludeFromPlotting": ["Sample File Name", "Sample Base Name", "Batch Termini", "Study Reference", "Long-Term Reference", "Method Reference", "Dilution Series", "Skipped", "Study Sample", "File Path", "Exclusion Details", "Assay protocol", "Status", "Measurement Date", "Measurement Time", "Data Present", "LIMS Present", "LIMS Marked Missing", "Assay data name", "Assay data location", "AssayRole", "SampleType", "Sampling ID", "Plot Sample Type", "SubjectInfoData", "Detector Unit"],
"sampleMetadataNotExported":["Metadata Available", "Sample Base Name", "Study", "Chromatography", "Ionisation", "Re-Run",
"Suplemental Injections", "Skipped", "Matrix", "Well", "Plate", "Batch", "Correction Batch",
"Dilution", "Exclusion Details", "Measurement Date", "Measurement Time", "File Path", "Polarity", "Warnings",
"Acquired Time", "Dilution Series", "Assay data name", "Assay data location", "Sample position", "Sample batch", "Assay protocol",
"Suplemental Injections", "Skipped", "Matrix", "Well", "Plate", "Exclusion Details", "Measurement Date",
"Measurement Time", "File Path", "Polarity", "Warnings","Source Temperature (°C)", "Start Mass", "TOF",
"Dilution Series", "Assay data name", "Assay data location", "Sample position", "Sample batch", "Assay protocol",
"Status", "Data Present", "LIMS Present", "LIMS Marked Missing", "$$ Acquired Date:", "$$ Acquired Time:",
"$$ Instrument:", "Acquired Full Time String", "Backing", "Capillary (kV)",
"Collision Energy", "Collision", "Column Serial Number:", "ColumnType:",
"Cone Gas Flow (L/Hr)", "Desolvation Gas Flow (L/Hr)", "Desolvation Temperature (°C)",
"Detector", "End Mass", "HM Resolution", "Instrument", "Interscan Time (sec)",
"LM Resolution", "Resolution", "Sampling Cone", "Scan Time (sec)", "Source Offset",
"Source Temperature (°C)", "Start Mass", "TOF"],
"LM Resolution", "Resolution", "Sampling Cone", "Scan Time (sec)", "Source Offset"],
"featureMetadataNotExported":["mzmin", "mzmax", "rtmin", "rtmax", "npeaks", "ms_level", "Exclusion Details",
"User Excluded","blankFilter", "blankValue", "artifactualFilter", "rsdFilter", "rsdSP",
"correlationToDilution", "correlationToDilutionFilter", "varianceRatioFilter", "rsdSS/rsdSP"]
Expand Down
9 changes: 9 additions & 0 deletions nPYc/objects/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ..utilities import normalisation
from ..utilities.normalisation._normaliserABC import Normaliser
import warnings
from IPython.display import display


class Dataset:
Expand Down Expand Up @@ -1316,6 +1317,11 @@ def _matchBasicCSV(self, filePath):
# self.sampleMask[metadataNotAvailable] = False
# joinedTable.loc[metadataNotAvailable, 'Exclusion Details'] = 'No Metadata in CSV'

# Print warning that samples should be added to basic CSV or excluded from dataset
print('The following samples should be added to "Basic CSV" file, or excluded from dataset or nPYc-Toolbox functionality may be compromised:')
#display(acquired_butnotcsv)
print(*acquired_butnotcsv['Sample File Name'].values, sep='\n')

# 1) ACQ and in "include Sample" - drop and set mask to false
# Samples Not ACQ and in "include Sample" set to False - drop and ignore from the dataframe

Expand Down Expand Up @@ -1343,6 +1349,9 @@ def _matchBasicCSV(self, filePath):
if 'Batch' not in self.sampleMetadata:
self.sampleMetadata['Batch'] = 1

# Check all samples have metadata


self.Attributes['Log'].append([datetime.now(), 'Basic CSV matched from %s' % (filePath)])

def _getSampleMetadataFromFilename(self, filenameSpec):
Expand Down
78 changes: 72 additions & 6 deletions nPYc/objects/_msDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1274,10 +1274,10 @@ def _getSampleMetadataFromRawData(self, rawDataPath, filetype="Waters .raw"):
if 'Exclusion Details' not in self.sampleMetadata:
self.sampleMetadata['Exclusion Details'] = None

# Flag samples with missing instrument parameters
# Print a warning listing samples with missing info: their raw files should be located, or the samples should be excluded from the dataset
missingSampleInfo = ~self.sampleMetadata['Sample File Name'].isin(instrumentParams['Sample File Name'])
if sum(missingSampleInfo) > 0:
print("Missing information for {0} samples\n".format(sum(missingSampleInfo)))
print('Raw data for the following samples should be added to the raw data folder, or samples should be excluded from dataset else nPYc-Toolbox functionality may be compromised:\n')
print(*self.sampleMetadata.loc[missingSampleInfo, 'Sample File Name'].values, sep='\n')

def _getSampleMetadataFromFilename(self, filenameSpec):
Expand Down Expand Up @@ -1421,8 +1421,75 @@ def _inferBatches(self, gapLength=24):
:rtype: pandas.Dataframe
"""
sampleMetadata = self.sampleMetadata.copy()
# Loop over samples in run order

# Try to infer batches from 'Acquired Time' (preference) or 'Run Order'
try:

# Generate sampleMetadata sorted by run order
if ('Run Order' not in sampleMetadata.columns):
sampleMetadata['Order'] = sampleMetadata.sort_values(by='Acquired Time').index
sampleMetadata['Run Order'] = sampleMetadata.sort_values(by='Order').index
sampleMetadata.drop('Order', axis=1, inplace=True)

sortedSampleMetadata = sampleMetadata.sort_values(by='Run Order')

# Use 'Acquired Time' (preference) or 'Run Order'
if ('Acquired Time' in sampleMetadata.columns):
usefield = 'Acquired Time'
else:
usefield = 'Run Order'



# Set first batch
sampleMetadata['Correction Batch'] = 1
sampleMetadata['Batch'] = 1

# Calculate the consecutive time differences
timeDelta = sortedSampleMetadata[usefield].diff()

batchTimeSplits = [sortedSampleMetadata.loc[idx, usefield] for idx, x in
sortedSampleMetadata.iterrows() if timeDelta.loc[idx] > timedelta(hours=gapLength)]
batchTimeSplits.extend([sortedSampleMetadata[usefield].max()])
batchNumber = 1

for idx, batchSplit in enumerate(batchTimeSplits):
currentBatchIndex = sampleMetadata[usefield] <= batchSplit
if idx > 0:
currentBatchIndex &= sampleMetadata[usefield] >= batchTimeSplits[idx - 1]
sampleMetadata.loc[currentBatchIndex, 'Correction Batch'] = batchNumber
sampleMetadata.loc[currentBatchIndex, 'Batch'] = batchNumber
batchNumber += 1

# Handle the 'Dilution Series' field
if sum(sampleMetadata['AssayRole'] == AssayRole.LinearityReference) > 0:
SRD_series = 1
previousDilutionRunOrder = sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, 'Run Order'].min()
previousBatch = 1
for idx, row in sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, :].iterrows():
if (row['Run Order'] - previousDilutionRunOrder > 1) or (row['Batch'] > previousBatch):
SRD_series += 1
sampleMetadata.loc[idx, 'Dilution Series'] = SRD_series
previousDilutionRunOrder = row['Run Order']
previousBatch = row['Batch']

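# Method Reference, Dilution Series, and Blanks should have "Correction Batch" = nan (NOTE(review): only Blank and LinearityReference roles are selected below; Method Reference is not - confirm intent)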
SamplesNoBatchCorrection = sampleMetadata['AssayRole'].isin([AssayRole.Blank, AssayRole.LinearityReference])
sampleMetadata.loc[SamplesNoBatchCorrection, 'Correction Batch'] = numpy.nan

# Handle cases where a first batch contains only blanks or pre-injection blanks.
if numpy.nanmin(sampleMetadata['Correction Batch']) > 1:
batchDiff = numpy.nanmin(sampleMetadata['Correction Batch']) - 1
sampleMetadata['Correction Batch'] -= batchDiff

self.sampleMetadata = sampleMetadata

except (AttributeError, KeyError, TypeError):
warnings.warn('Unable to infer batches without complete run order or acquired time info, skipping.')

"""
# If 'Acquired Time' data present
if ('Acquired Time' in sampleMetadata.columns) and (not sampleMetadata['Acquired Time'].isnull().all()):
Expand All @@ -1434,6 +1501,7 @@ def _inferBatches(self, gapLength=24):
sortedSampleMetadata = sampleMetadata.sort_values(by='Run Order')
sampleMetadata['Correction Batch'] = 1
sampleMetadata['Batch'] = 1
timeDelta = sortedSampleMetadata['Acquired Time'].diff()
batchTimeSplits = [sortedSampleMetadata.loc[idx, 'Acquired Time'] for idx, x in
Expand Down Expand Up @@ -1505,9 +1573,7 @@ def _inferBatches(self, gapLength=24):
sampleMetadata['Correction Batch'] -= batchDiff
self.sampleMetadata = sampleMetadata

else:
warnings.warn('Unable to infer batches without run order or acquired time info, skipping.')
"""

def amendBatches(self, sampleRunOrder):
"""
Expand Down

0 comments on commit 73c7397

Please sign in to comment.