From 3a8570701b2f622ff6a072e2e166453c239e0742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 31 Aug 2018 18:39:04 +0100 Subject: [PATCH] storage: Handle overlap with symbolic NO_VARIATION #877 --- .../hadoop/variant/gaps/FillGapsTask.java | 37 +++++++++++++-- .../hadoop/variant/gaps/FillGapsTaskTest.java | 45 ++++++++++++++++--- .../src/test/resources/gaps2/file1.genome.vcf | 15 +++++++ .../src/test/resources/gaps2/file2.genome.vcf | 18 ++++++++ 4 files changed, 106 insertions(+), 9 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file1.genome.vcf create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file2.genome.vcf diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java index 8fca8acadc1..63dd2749d78 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java @@ -1,5 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.gaps; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.vcf.VCFConstants; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; @@ -143,23 +144,51 @@ public VariantOverlappingStatus fillGaps(Variant variant, Set missingSa Variant archiveVariant = convertToVariant(vcfSlice, vcfRecord, fileId); if (archiveVariant.getType().equals(VariantType.NO_VARIATION)) { - overlappingStatus = processReferenceOverlap(missingSamples, put, archiveVariant); + overlappingStatus = processReferenceOverlap(missingSamples, put, variant, archiveVariant); } else { overlappingStatus = processVariantOverlap(variant, missingSamples, put, sampleIndexPuts, archiveVariant); } return overlappingStatus; } - protected VariantOverlappingStatus processReferenceOverlap(Set missingSamples, Put put, Variant archiveVariant) { + protected VariantOverlappingStatus processReferenceOverlap(Set missingSamples, Put put, + Variant variant, Variant archiveVariant) { VariantOverlappingStatus overlappingStatus = REFERENCE; FileEntry fileEntry = archiveVariant.getStudies().get(0).getFiles().get(0); fileEntry.getAttributes().remove(VCFConstants.END_KEY); if (StringUtils.isEmpty(fileEntry.getCall())) { - fileEntry.setCall(archiveVariant.getStart() + ":" + archiveVariant.getReference() + ":.:0"); + fileEntry.setCall(archiveVariant.getStart() + ":" + archiveVariant.getReference() + ":" + archiveVariant.getAlternate() + ":0"); } + // SYMBOLIC reference overlap -- <*> , + if (VariantType.NO_VARIATION.equals(archiveVariant.getType()) + && !archiveVariant.getAlternate().isEmpty() + && !archiveVariant.getAlternate().equals(Allele.NO_CALL_STRING)) { - studyConverter.convert(archiveVariant, put, missingSamples, overlappingStatus); + // Create template variant + Variant mergedVariant = new Variant( + variant.getChromosome(), + variant.getStart(), + variant.getEnd(), + variant.getReference(), + variant.getAlternate()); + + StudyEntry studyEntry = new StudyEntry(); + studyEntry.setFormat(archiveVariant.getStudies().get(0).getFormat()); + studyEntry.setSortedSamplesPosition(new LinkedHashMap<>()); + studyEntry.setSamplesData(new ArrayList<>()); + mergedVariant.addStudyEntry(studyEntry); + mergedVariant.setType(variant.getType()); + + // Merge NO_VARIATION into the template variant + mergedVariant = variantMerger.merge(mergedVariant, archiveVariant); + + // Convert study information to PUT + studyConverter.convert(mergedVariant, put, missingSamples, overlappingStatus); + + } else { + studyConverter.convert(archiveVariant, put, missingSamples, overlappingStatus); + } return overlappingStatus; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java index b1654ca0862..3f86b3a5725 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java @@ -9,7 +9,9 @@ import org.junit.rules.ExternalResource; import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.AlternateCoordinate; import org.opencb.biodata.models.variant.avro.FileEntry; +import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.commons.ProgressLogger; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; @@ -21,6 +23,7 @@ import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass; +import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryUtils; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; @@ -40,9 +43,7 @@ import static org.hamcrest.CoreMatchers.containsString; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertThat; -import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.everyResult; -import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.withSampleData; -import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.withStudy; +import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.*; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.MISSING_GENOTYPES_UPDATED; import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.printVariants; import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.removeFile; @@ -156,6 +157,29 @@ public void testFillGapsConflictingFiles() throws Exception { getResourceUri("gaps/file1.genome.vcf"), getResourceUri("gaps/file2.genome.vcf"))); + checkConflictingFiles(studyConfiguration); + } + + @Test + public void testFillGapsConflictingFilesNonRef() throws Exception { + StudyConfiguration studyConfiguration = load(new QueryOptions(), Arrays.asList( + getResourceUri("gaps2/file1.genome.vcf"), + getResourceUri("gaps2/file2.genome.vcf"))); + + checkConflictingFiles(studyConfiguration); + + VariantDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor(); + + Variant variantMulti = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10035:A:G"), null).first(); + assertEquals("0/0", variantMulti.getStudies().get(0).getSampleData("s1", "GT")); + assertEquals(new AlternateCoordinate("1", 10035, 10035, "A", "<*>", VariantType.NO_VARIATION), + variantMulti.getStudies().get(0).getSecondaryAlternates().get(0)); + assertEquals("4,0,1", variantMulti.getStudies().get(0).getSampleData("s1", "AD")); + assertEquals("0/1", variantMulti.getStudies().get(0).getSampleData("s2", "GT")); + assertEquals("13,23,0", variantMulti.getStudies().get(0).getSampleData("s2", "AD")); + } + + public void checkConflictingFiles(StudyConfiguration studyConfiguration) throws Exception { HadoopVariantStorageEngine variantStorageEngine = (HadoopVariantStorageEngine) this.variantStorageEngine; VariantHadoopDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor(); @@ -163,7 +187,7 @@ public void testFillGapsConflictingFiles() throws Exception { sampleIds.sort(Integer::compareTo); fillGaps(variantStorageEngine, studyConfiguration, sampleIds); - printVariants(dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first(), dbAdaptor, newOutputUri()); + printVariants(dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first(), dbAdaptor, newOutputUri(1)); checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton("1:10020:A:T")); checkSampleIndexTable(dbAdaptor); @@ -344,7 +368,8 @@ protected void checkFillMissing(VariantHadoopDBAdaptor dbAdaptor, List for (Variant variant : dbAdaptor) { StudyEntry studyEntry = variant.getStudies().get(0); - boolean newVariant = !missingGenotypesUpdated && studyEntry.getFiles().stream().map(FileEntry::getFileId).map(Integer::valueOf).allMatch(newFilesSet::contains); + boolean newVariant = !missingGenotypesUpdated && studyEntry.getFiles().stream().map(FileEntry::getFileId) + .map(studyConfiguration.getFileIds()::get).allMatch(newFilesSet::contains); List> samplesData = studyEntry.getSamplesData(); for (int i = 0; i < samplesData.size(); i++) { List data = samplesData.get(i); @@ -390,6 +415,16 @@ protected void checkSampleIndexTable(VariantHadoopDBAdaptor dbAdaptor) throws IO countFromIndex += variants.size(); VariantQueryResult result = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), variants) .append(VariantQueryParam.INCLUDE_SAMPLE.key(), sampleId), null); + Set expected = variants.stream().map(Variant::toString).collect(Collectors.toSet()); + Set actual = result.getResult().stream().map(Variant::toString).collect(Collectors.toSet()); + if (!expected.equals(actual)) { + HashSet extra = new HashSet<>(actual); + extra.removeAll(expected); + HashSet missing = new HashSet<>(expected); + missing.removeAll(actual); + System.out.println("missing = " + missing); + System.out.println("extra = " + extra); + } assertEquals(message, variants.size(), result.getNumResults()); for (Variant variant : result.getResult()) { assertEquals(message, gt, variant.getStudies().get(0).getSampleData(0).get(0)); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file1.genome.vcf b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file1.genome.vcf new file mode 100644 index 00000000000..cdc369be6d2 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file1.genome.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.2 +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 +1 1 . N . . END=10003 GT:DP .:. +1 10004 . C . . END=10010 GT:DP:AD 0/0:3:2,1 +1 10011 . ATTT A 2 . . GT:DP:AD 0/1:41:20,21 +1 10015 . A . . END=10020 GT:DP:AD 0/0:7:6,1 +1 10020 . A T 2 . . GT:DP:AD 0/1:42:20,22 +1 10021 . A . . END=10030 GT:DP:AD 0/0:7:6,1 +1 10031 . T TAAA 1 . . GT:DP:AD 0/1:43:20,23 +1 10032 . A . . END=10043 GT:DP:AD 0/0:5:4,1 \ No newline at end of file diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file2.genome.vcf b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file2.genome.vcf new file mode 100644 index 00000000000..0fd9876108c --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps2/file2.genome.vcf @@ -0,0 +1,18 @@ +##fileformat=VCFv4.2 +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##GAPS=1:10015-10030 with 1:10020:A:T +##MULTI_OVERLAP=1:10013:T:C and 1:10014:A:T with 1:10011:ATTT:A +##INSERTION_GAP=1:10031:T:TAAA does not overlap with any from here +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2 +1 1 . N . . END=10003 GT:DP .:. +1 10004 . C . . END=10012 GT:DP:AD 0/0:3:2,1 +1 10013 . T C 2 . . GT:DP:AD 0/1:30:10,20 +1 10014 . T A 2 . . GT:DP:AD 0/1:31:11,21 +1 10031 . T G 1 . . GT:DP:AD 0/1:32:12,22 +1 10032 . A . . END=10034 GT:DP:AD 0/0:6:5,1 +1 10035 . A G 1 . . GT:DP:AD 0/1:33:13,23 +1 10036 . A . . END=10043 GT:DP:AD 0/0:5:4,1 \ No newline at end of file