Skip to content

Commit

Permalink
storage: Add normalization extensions to setup params. #TASK-5861, #TASK-5448
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed May 23, 2024
1 parent f5ea5a4 commit e396772
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 109 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,18 @@ private void inferParams(VariantSetupParams params) {
params.setAverageSamplesPerFile(params.getExpectedSamples().floatValue() / params.getExpectedFiles().floatValue());
} else {
switch (params.getDataDistribution()) {
case SINGLE_SAMPLE_FILES:
case SINGLE_SAMPLE_PER_FILE:
params.setAverageSamplesPerFile(1f);
break;
case MULTI_SAMPLE_FILES:
case MULTIPLE_SAMPLES_PER_FILE:
params.setAverageSamplesPerFile(params.getExpectedSamples().floatValue() / params.getExpectedFiles().floatValue());
break;
case MULTIPLE_FILE_PER_SAMPLE:
case MULTIPLE_FILES_PER_SAMPLE:
// Hard to tell. Let's assume 2 samples per file
params.setAverageSamplesPerFile(2f);
break;
case MULTI_SAMPLE_FILES_SPLIT_BY_CHROMOSOME:
case MULTI_SAMPLE_FILES_SPLIT_BY_REGION:
case FILES_SPLIT_BY_CHROMOSOME:
case FILES_SPLIT_BY_REGION:
params.setAverageSamplesPerFile(params.getExpectedSamples().floatValue());
break;
default:
Expand Down Expand Up @@ -132,10 +132,10 @@ private void check(String studyStr, VariantSetupParams params, String token) thr
}

if (params.getExpectedFiles() == null || params.getExpectedFiles() <= 0) {
throw new IllegalArgumentException("Missing expectedFilesNumber");
throw new IllegalArgumentException("Missing expectedFiles");
}
if (params.getExpectedSamples() == null || params.getExpectedSamples() <= 0) {
throw new IllegalArgumentException("Missing expectedSamplesNumber");
throw new IllegalArgumentException("Missing expectedSamples");
}

if (params.getAverageFileSize() == null && params.getFileType() == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ protected void check() throws Exception {
params.putIfNotEmpty(VariantStorageOptions.INCLUDE_GENOTYPE.key(), indexParams.getIncludeGenotypes());
params.put(VariantStorageOptions.STATS_AGGREGATION.key(), indexParams.getAggregated());
params.putIfNotEmpty(VariantStorageOptions.STATS_AGGREGATION_MAPPING_FILE.key(), indexParams.getAggregationMappingFile());
params.put(VariantStorageOptions.GVCF.key(), indexParams.isGvcf());
if (indexParams.isGvcf()) {
params.put(VariantStorageOptions.GVCF.key(), indexParams.isGvcf());
}

// queryOptions.putIfNotNull(VariantFileIndexerStorageOperation.TRANSFORMED_FILES, indexParams.transformedPaths);

Expand All @@ -98,7 +100,9 @@ protected void check() throws Exception {
params.put(VariantStorageOptions.FAMILY.key(), indexParams.isFamily());
params.put(VariantStorageOptions.SOMATIC.key(), indexParams.isSomatic());
params.putIfNotEmpty(VariantStorageOptions.LOAD_SPLIT_DATA.key(), indexParams.getLoadSplitData());
params.put(VariantStorageOptions.LOAD_MULTI_FILE_DATA.key(), indexParams.isLoadMultiFileData());
if (indexParams.isLoadMultiFileData()) {
params.put(VariantStorageOptions.LOAD_MULTI_FILE_DATA.key(), indexParams.isLoadMultiFileData());
}
params.putIfNotEmpty(VariantStorageOptions.LOAD_SAMPLE_INDEX.key(), indexParams.getLoadSampleIndex());
params.putIfNotEmpty(VariantStorageOptions.LOAD_ARCHIVE.key(), indexParams.getLoadArchive());
params.putIfNotEmpty(VariantStorageOptions.LOAD_HOM_REF.key(), indexParams.getLoadHomRef());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
import org.opencb.opencga.core.testclassification.duration.LongTests;
import org.opencb.opencga.core.tools.result.ExecutionResult;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager;
import org.opencb.opencga.storage.core.metadata.models.SampleMetadata;
import org.opencb.opencga.storage.core.metadata.models.VariantScoreMetadata;
import org.opencb.opencga.storage.core.utils.CellBaseUtils;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
Expand Down Expand Up @@ -94,8 +96,9 @@ public class VariantOperationsTest {
public static final String USER = "user";
public static final String PASSWORD = TestParamConstants.PASSWORD;
public static final String PROJECT = "project";
public static final String PROJECT_FQN = ORGANIZATION + '@' + PROJECT;
public static final String STUDY = "study";
public static final String STUDY_FQN = ORGANIZATION + '@' + PROJECT + ':' + STUDY;
public static final String STUDY_FQN = PROJECT_FQN + ':' + STUDY;
public static final String PHENOTYPE_NAME = "myPhenotype";
public static final Phenotype PHENOTYPE = new Phenotype(PHENOTYPE_NAME, PHENOTYPE_NAME, "mySource")
.setStatus(Phenotype.Status.OBSERVED);
Expand Down Expand Up @@ -331,14 +334,11 @@ public void setUpCatalogManager() throws Exception {

@Test
public void testSetup() throws Exception {
VariantSetupParams setupParams = new VariantSetupParams();
setupParams
.setFileType(VariantSetupParams.FileType.GENOME_VCF)
.setDataDistribution(VariantSetupParams.DataDistribution.MULTI_SAMPLE_FILES)
.setExpectedFiles(20)
.setExpectedSamples(100);
String study2 = "study2";
catalogManager.getStudyManager().create(PROJECT, study2, null, "Phase 1", "Done", null, null, null, null, null, token);
String study2fqn = catalogManager.getStudyManager()
.create(PROJECT, study2, null, "Phase 1", "Done", null, null, null, null, null, token)
.first().getFqn();
File file = opencga.createFile(study2, "variant-test-file.vcf.gz", token);

try {
toolRunner.execute(VariantIndexOperationTool.class, study2,
Expand All @@ -352,9 +352,55 @@ public void testSetup() throws Exception {
MatcherAssert.assertThat(e.getCause().getMessage(), CoreMatchers.containsString("The variant storage has not been setup for study"));
}

try {
VariantSetupParams setupParams = new VariantSetupParams()
.setFileType(VariantSetupParams.FileType.GENOME_VCF)
.setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_SAMPLES_PER_FILE)
.setExpectedFiles(20)
.setExpectedSamples(100)
.setNormalizeExtensions(Arrays.asList("VS", "SV"));
variantStorageManager.variantSetup(study2, setupParams, token);
fail("should have failed");
} catch (Exception e) {
System.err.println(e.getMessage());
MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("Unsupported normalize extensions"));
}

try {
VariantSetupParams setupParams = new VariantSetupParams()
.setFileType(VariantSetupParams.FileType.GENOME_VCF)
.setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_SAMPLES_PER_FILE)
.setExpectedSamples(100)
.setNormalizeExtensions(Arrays.asList("VS", "SV"));
variantStorageManager.variantSetup(study2, setupParams, token);
fail("should have failed");
} catch (Exception e) {
System.err.println(e.getMessage());
MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("Missing expectedFiles"));
}

VariantSetupParams setupParams = new VariantSetupParams()
.setFileType(VariantSetupParams.FileType.GENOME_VCF)
.setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_FILES_PER_SAMPLE)
.setExpectedFiles(20)
.setAverageSamplesPerFile(2.5f)
.setExpectedSamples(10)
.setNormalizeExtensions(Arrays.asList("SV", "VAF"));
VariantSetupResult result = variantStorageManager.variantSetup(study2, setupParams, token);
assertEquals(VariantSetupResult.Status.READY, result.getStatus());

toolRunner.execute(VariantIndexOperationTool.class, study2,
new VariantIndexParams()
.setFile(file.getId())
.setLoadHomRef(YesNoAuto.YES.name()),
Paths.get(opencga.createTmpOutdir("_index")), "index", token);

VariantStorageMetadataManager metadataManager = opencga.getVariantStorageEngineByProject(PROJECT_FQN).getMetadataManager();
int studyId = metadataManager.getStudyId(study2fqn);
int sampleId = metadataManager.getSampleId(studyId, "NA19600");
SampleMetadata sampleMetadata = metadataManager.getSampleMetadata(studyId, sampleId);
assertEquals(VariantStorageEngine.SplitData.MULTI, sampleMetadata.getSplitData());

try {
variantStorageManager.variantSetup(STUDY, setupParams, token);
fail("Should fail");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import org.opencb.commons.annotations.DataField;
import org.opencb.opencga.core.tools.ToolParams;

import java.util.List;

public class VariantSetupParams extends ToolParams {

@DataField(description = "Expected number of samples that will be loaded. Used to infer some parameters. "
Expand Down Expand Up @@ -38,6 +40,9 @@ public class VariantSetupParams extends ToolParams {
@DataField(description = "Data distribution of the files. This parameter is used to infer the number of samples per file.")
private DataDistribution dataDistribution;

@DataField(description = "List of normalization extensions")
private List<String> normalizeExtensions;

public VariantSetupParams(VariantSetupParams params) {
this.expectedSamples = params.expectedSamples;
this.expectedFiles = params.expectedFiles;
Expand All @@ -46,6 +51,7 @@ public VariantSetupParams(VariantSetupParams params) {
this.variantsPerSample = params.variantsPerSample;
this.averageSamplesPerFile = params.averageSamplesPerFile;
this.dataDistribution = params.dataDistribution;
this.normalizeExtensions = params.normalizeExtensions;
}

public VariantSetupParams() {
Expand All @@ -58,30 +64,30 @@ public enum DataDistribution {
// - Cancer germline
// - RD germline without family calling
@DataField(description = "Single sample VCF files. One file per sample. e.g. Platinum gVCF, Cancer germline, RD germline without family calling")
SINGLE_SAMPLE_FILES,
SINGLE_SAMPLE_PER_FILE,

// Multi samples VCF files. One file with multiple samples.
// e.g.
// - Corpasome
// - RD germline with family calling
@DataField(description = "Multi samples VCF files. One file with multiple samples. e.g. Corpasome, RD germline with family calling")
MULTI_SAMPLE_FILES,
MULTIPLE_SAMPLES_PER_FILE,

// Multiple files per sample. Each file might have multiple samples.
// e.g.
// - Somatic study with multiple callers
@DataField(description = "Multiple files per sample. Each file might have multiple samples. e.g. Somatic study with multiple callers")
MULTIPLE_FILE_PER_SAMPLE,
MULTIPLE_FILES_PER_SAMPLE,

// Large aggregated/joined/merged files. Each file has all samples. Each file contains a specific set of chromosomes.
// e.g.
// - 1000 genomes
@DataField(description = "Large aggregated/joined/merged files. Each file has all samples. Each file contains a specific set of chromosomes. e.g. 1000 genomes")
MULTI_SAMPLE_FILES_SPLIT_BY_CHROMOSOME,
FILES_SPLIT_BY_CHROMOSOME,

// Large aggregated/joined/merged files. Each file has all samples. Each file contains an arbitrary region.
@DataField(description = "Large aggregated/joined/merged files. Each file has all samples. Each file contains an arbitrary region.")
MULTI_SAMPLE_FILES_SPLIT_BY_REGION,
FILES_SPLIT_BY_REGION,
}

public enum FileType {
Expand Down Expand Up @@ -155,4 +161,13 @@ public VariantSetupParams setDataDistribution(DataDistribution dataDistribution)
this.dataDistribution = dataDistribution;
return this;
}

/**
 * Gets the list of normalization extensions configured in these setup params.
 *
 * @return the list exactly as stored in the field (the same reference, not a copy);
 *         may be {@code null} if it was never set
 */
public List<String> getNormalizeExtensions() {
return normalizeExtensions;
}

/**
 * Sets the list of normalization extensions.
 * The given list is stored by reference; no copy or validation is performed here.
 *
 * @param normalizeExtensions list of normalization extensions
 * @return this same instance, to allow chained setter calls
 */
public VariantSetupParams setNormalizeExtensions(List<String> normalizeExtensions) {
this.normalizeExtensions = normalizeExtensions;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.metadata.SampleVariantStats;
import org.opencb.biodata.models.variant.metadata.VariantMetadata;
import org.opencb.biodata.tools.variant.normalizer.extensions.VariantNormalizerExtensionFactory;
import org.opencb.cellbase.client.config.ClientConfiguration;
import org.opencb.cellbase.client.rest.CellBaseClient;
import org.opencb.commons.datastore.core.*;
import org.opencb.opencga.core.api.ParamConstants;
import org.opencb.opencga.core.common.TimeUtils;
import org.opencb.opencga.core.config.storage.StorageConfiguration;
import org.opencb.opencga.core.models.operations.variant.VariantAggregateFamilyParams;
Expand Down Expand Up @@ -154,7 +156,12 @@ public static SplitData from(ObjectMap options) {
String loadSplitDataStr = options.getString(LOAD_SPLIT_DATA.key());
boolean multiFile = options.getBoolean(LOAD_MULTI_FILE_DATA.key());
if (StringUtils.isNotEmpty(loadSplitDataStr) && multiFile) {
throw new IllegalArgumentException("Unable to mix loadSplitFile and loadMultiFile");
if (loadSplitDataStr.equalsIgnoreCase("multi")) {
return MULTI;
} else {
throw new IllegalArgumentException("Unable to mix " + LOAD_MULTI_FILE_DATA.key() + "=true and "
+ LOAD_SPLIT_DATA.key() + "='" + loadSplitDataStr + "'");
}
}
if (StringUtils.isEmpty(loadSplitDataStr) && !multiFile) {
return null;
Expand Down Expand Up @@ -1408,6 +1415,38 @@ public VariantAggregationExecutor getVariantAggregationExecutor(Query query, Que
/**
 * Builds the storage options that can be derived from the given variant setup params.
 *
 * <p>Two aspects of the params are translated into options:
 * <ul>
 *   <li>Normalization extensions: when a non-empty list is provided it is stored under
 *       {@code NORMALIZATION_EXTENSIONS}. Unless the list is exactly the single value
 *       {@code ParamConstants.ALL}, every entry must be contained in
 *       {@code VariantNormalizerExtensionFactory.ALL_EXTENSIONS}.</li>
 *   <li>Data distribution: files split by chromosome or by region set {@code LOAD_SPLIT_DATA}
 *       accordingly; multiple files per sample sets both {@code LOAD_MULTI_FILE_DATA} and
 *       {@code LOAD_SPLIT_DATA}. Any other (or missing) distribution adds nothing.</li>
 * </ul>
 *
 * @param params variant setup params to read the extensions and data distribution from
 * @return a new map with the inferred options; empty when nothing can be inferred
 * @throws IllegalArgumentException if any requested normalization extension is not supported
 */
public ObjectMap inferConfigurationParams(VariantSetupParams params) {
    ObjectMap inferred = new ObjectMap();

    List<String> extensions = params.getNormalizeExtensions();
    boolean extensionsRequested = extensions != null && !extensions.isEmpty();
    if (extensionsRequested) {
        // The single wildcard value selects every extension, so there is nothing to validate.
        boolean wildcard = extensions.equals(Collections.singletonList(ParamConstants.ALL));
        if (!wildcard) {
            List<String> unknown = new ArrayList<>();
            for (String extension : extensions) {
                if (VariantNormalizerExtensionFactory.ALL_EXTENSIONS.contains(extension)) {
                    continue;
                }
                unknown.add(extension);
            }
            if (!unknown.isEmpty()) {
                throw new IllegalArgumentException("Unsupported normalize extensions: " + unknown + ". Supported "
                        + "extensions are: " + VariantNormalizerExtensionFactory.ALL_EXTENSIONS);
            }
        }
        inferred.put(NORMALIZATION_EXTENSIONS.key(), extensions);
    }

    // Reference comparison on the enum is null-safe, so a missing distribution falls through untouched.
    VariantSetupParams.DataDistribution distribution = params.getDataDistribution();
    if (distribution == VariantSetupParams.DataDistribution.FILES_SPLIT_BY_CHROMOSOME) {
        inferred.put(LOAD_SPLIT_DATA.key(), SplitData.CHROMOSOME);
    } else if (distribution == VariantSetupParams.DataDistribution.FILES_SPLIT_BY_REGION) {
        inferred.put(LOAD_SPLIT_DATA.key(), SplitData.REGION);
    } else if (distribution == VariantSetupParams.DataDistribution.MULTIPLE_FILES_PER_SAMPLE) {
        inferred.put(LOAD_MULTI_FILE_DATA.key(), true);
        inferred.put(LOAD_SPLIT_DATA.key(), SplitData.MULTI);
    }

    return inferred;
}
Expand Down
Loading

0 comments on commit e396772

Please sign in to comment.