diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/HBaseToStudyEntryConverter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/HBaseToStudyEntryConverter.java
index bb015fb2e0f..4f354236ef7 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/HBaseToStudyEntryConverter.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/HBaseToStudyEntryConverter.java
@@ -38,6 +38,7 @@
 import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
 import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
 import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
+import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
 import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryUtils;
 import org.opencb.opencga.storage.hadoop.variant.converters.AbstractPhoenixConverter;
 import org.opencb.opencga.storage.hadoop.variant.converters.HBaseToVariantConverter;
@@ -364,6 +365,11 @@ protected StudyEntry newStudyEntry(StudyConfiguration studyConfiguration, List sampleData) {
         sampleData = remapSamplesData(sampleData, formatsMap);
+        Integer gtIdx = studyEntry.getFormatPositions().get("GT");
+        // Replace UNKNOWN_GENOTYPE, if any
+        if (gtIdx != null && sampleData.get(gtIdx).equals(GenotypeClass.UNKNOWN_GENOTYPE)) {
+            sampleData.set(gtIdx, unknownGenotype);
+        }
         String sampleName = studyConfiguration.getSampleIds().inverse().get(sampleId);
         studyEntry.addSampleData(sampleName, sampleData);
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/StudyEntryToHBaseConverter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/StudyEntryToHBaseConverter.java
index 1851595a19b..e0a7e668bf6 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/StudyEntryToHBaseConverter.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/study/StudyEntryToHBaseConverter.java
@@ -172,14 +172,7 @@ private List remapFileData(Variant variant, StudyEntry studyEntry, FileE
         }
 
         // Trim all leading null values
-        for (int i = fileColumn.size() - 1; i >= 0; i--) {
-            if (fileColumn.get(i) != null) {
-                if (i != fileColumn.size() - 1) {
-                    fileColumn = fileColumn.subList(0, i + 1);
-                }
-                break;
-            }
-        }
+        fileColumn = trimLeadingNullValues(fileColumn, HBaseToStudyEntryConverter.FILE_INFO_START_IDX);
         return fileColumn;
     }
@@ -258,8 +251,20 @@ private List remapSampleData(StudyEntry studyEntry, int[] formatReMap, L
 //            }
 //            sampleDataIdx++;
 //        }
+        remappedSampleData = trimLeadingNullValues(remappedSampleData, 1);
         return remappedSampleData;
     }
 
+    private List trimLeadingNullValues(List values, int minSize) {
+        int i = values.size() - 1;
+        while (i >= minSize && values.get(i) == null) {
+            i--;
+        }
+        if (i != values.size() - 1) {
+            values = values.subList(0, i + 1);
+        }
+        return values;
+    }
+
 }
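Reviewer note (not part of the patch): despite its name, the new `trimLeadingNullValues` helper trims trailing nulls, never shrinking the list below `minSize`. A minimal self-contained sketch of that behaviour follows; the class name, the generic `trim` method and the sample lists are made up for illustration.

```java
import java.util.Arrays;
import java.util.List;

public class TrimTrailingNullsSketch {

    // Same idea as the helper added to StudyEntryToHBaseConverter above:
    // drop trailing nulls, but keep at least minSize elements.
    static <T> List<T> trim(List<T> values, int minSize) {
        int i = values.size() - 1;
        while (i >= minSize && values.get(i) == null) {
            i--;
        }
        return i == values.size() - 1 ? values : values.subList(0, i + 1);
    }

    public static void main(String[] args) {
        // ["0/1", "12", null, null] with minSize 1 -> ["0/1", "12"]
        System.out.println(trim(Arrays.asList("0/1", "12", null, null), 1));
        // An all-null list keeps at least minSize elements: [null, null] -> [null]
        System.out.println(trim(Arrays.<String>asList(null, null), 1));
    }
}
```

This mirrors why `remapSampleData` passes `minSize = 1` (presumably so the first format field, typically GT, is always kept) while `remapFileData` passes `HBaseToStudyEntryConverter.FILE_INFO_START_IDX`.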
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java
index fcc19a30ab9..7b665e0f42f 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTask.java
@@ -7,6 +7,7 @@
 import org.apache.hadoop.hbase.client.Put;
 import org.opencb.biodata.models.variant.StudyEntry;
 import org.opencb.biodata.models.variant.Variant;
+import org.opencb.biodata.models.variant.VariantBuilder;
 import org.opencb.biodata.models.variant.avro.FileEntry;
 import org.opencb.biodata.models.variant.avro.VariantType;
 import org.opencb.biodata.models.variant.protobuf.VariantProto;
@@ -14,6 +15,7 @@
 import org.opencb.biodata.tools.variant.converters.proto.VcfRecordProtoToVariantConverter;
 import org.opencb.biodata.tools.variant.merge.VariantMerger;
 import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
+import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
 import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
 import org.opencb.opencga.storage.hadoop.variant.converters.study.StudyEntryToHBaseConverter;
 import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexConverter;
@@ -24,8 +26,7 @@
 import java.util.*;
 import java.util.stream.Collectors;
 
-import static org.opencb.opencga.storage.hadoop.variant.gaps.VariantOverlappingStatus.REFERENCE;
-import static org.opencb.opencga.storage.hadoop.variant.gaps.VariantOverlappingStatus.VARIANT;
+import static org.opencb.opencga.storage.hadoop.variant.gaps.VariantOverlappingStatus.*;
 
 /**
  * Created on 15/01/18.
@@ -53,7 +54,7 @@ public FillGapsTask(StudyConfiguration studyConfiguration, GenomeHelper helper,
         this.helper = helper;
         studyConverter = new StudyEntryToHBaseConverter(this.helper.getColumnFamily(), studyConfiguration, true,
-                Collections.singleton("?/?"), // Do not skip any genotype
+                Collections.emptySet(), // Do not skip any genotype
                 null); // Do not update release
         variantMerger = new VariantMerger(false).configure(studyConfiguration.getVariantHeader());
     }
@@ -98,18 +99,26 @@ public VariantOverlappingStatus fillGaps(Variant variant, Set missingSa
             }
         }
 
-        VcfSliceProtos.VcfRecord vcfRecord;
-        VcfSliceProtos.VcfSlice vcfSlice;
+        final VcfSliceProtos.VcfRecord vcfRecord;
+        final VcfSliceProtos.VcfSlice vcfSlice;
         if (overlappingRecords.isEmpty()) {
-            // TODO: There was a gap in the gVCF?
-            // May happen that the variant to fill is an insertion, and there is no overlapping
-            // Should write "./." ? "0/0" for insertions?
-            logger.debug("Not overlap for fileId " + fileId + " in variant " + variant);
-            // Nothing to do!
-            return null;
-        } else if (overlappingRecords.size() > 1) {
-            // TODO: What if multiple overlaps?
+            if (skipReferenceVariants) {
+                // We are not reading reference blocks, so these gaps are expected and read as HOM_REF
+                return null;
+            } else {
+                // There was a gap in the gVCF?
+//                logger.debug("Not overlap for fileId " + fileId + " in variant " + variant);
+                if (variant.getType().equals(VariantType.INDEL) && variant.getReference().isEmpty()) {
+                    // May happen that the variant to fill is an insertion, and there is no overlapping
+                    // Write HOM_REF genotype for these samples
+                    return processVariantFileGap(variant, missingSamples, put, fileId, "0/0");
+                } else {
+                    // There was a gap in the original file
+                    return processVariantFileGap(variant, missingSamples, put, fileId, GenotypeClass.UNKNOWN_GENOTYPE);
+                }
+            }
+        } else if (overlappingRecords.size() > 1) {
             // Discard ref_blocks
             List<Pair<VcfSliceProtos.VcfSlice, VcfSliceProtos.VcfRecord>> realVariants = overlappingRecords
                     .stream()
@@ -120,66 +129,142 @@ public VariantOverlappingStatus fillGaps(Variant variant, Set missingSa
                 vcfRecord = realVariants.get(0).getRight();
                 vcfSlice = realVariants.get(0).getLeft();
             } else {
-                String msg = "Found multiple overlaps for variant " + variant + " in file " + fileId;
-                if (!quiet) {
-//                    throw new IllegalStateException(msg);
-                    logger.warn(msg);
-                }
-                return null;
+//                String msg = "Found multiple overlaps for variant " + variant + " in file " + fileId;
+//                if (!quiet) {
+////                    throw new IllegalStateException(msg);
+//                    logger.warn(msg);
+//                }
+                return processMultipleOverlappings(variant, missingSamples, put, fileId);
             }
         } else {
             vcfRecord = overlappingRecords.get(0).getRight();
             vcfSlice = overlappingRecords.get(0).getLeft();
         }
         Variant archiveVariant = convertToVariant(vcfSlice, vcfRecord, fileId);
-        if (VcfRecordProtoToVariantConverter.getVariantType(vcfRecord.getType()).equals(VariantType.NO_VARIATION)) {
-            FileEntry fileEntry = archiveVariant.getStudies().get(0).getFiles().get(0);
-            fileEntry.getAttributes().remove(VCFConstants.END_KEY);
-            if (StringUtils.isEmpty(fileEntry.getCall())) {
-                fileEntry.setCall(archiveVariant.getStart() + ":" + archiveVariant.getReference() + ":.:0");
-            }
-            overlappingStatus = REFERENCE;
-            studyConverter.convert(archiveVariant, put, missingSamples, overlappingStatus);
+
+        if (archiveVariant.getType().equals(VariantType.NO_VARIATION)) {
+            overlappingStatus = processReferenceOverlap(missingSamples, put, archiveVariant);
         } else {
-            Variant mergedVariant = new Variant(
-                    variant.getChromosome(),
-                    variant.getStart(),
-                    variant.getEnd(),
-                    variant.getReference(),
-                    variant.getAlternate());
-            StudyEntry studyEntry = new StudyEntry();
-            studyEntry.setFormat(archiveVariant.getStudies().get(0).getFormat());
-            studyEntry.setSortedSamplesPosition(new LinkedHashMap<>());
-            studyEntry.setSamplesData(new ArrayList<>());
-
-            mergedVariant.addStudyEntry(studyEntry);
-            mergedVariant.setType(variant.getType());
-
-            mergedVariant = variantMerger.merge(mergedVariant, archiveVariant);
-            overlappingStatus = VARIANT;
-
-            if (studyEntry.getFormatPositions().containsKey("GT")) {
-                int samplePosition = 0;
-                Integer gtIdx = studyEntry.getFormatPositions().get("GT");
-                for (String sampleName : studyEntry.getOrderedSamplesName()) {
-                    Integer sampleId = studyConfiguration.getSampleIds().get(sampleName);
-                    if (missingSamples.contains(sampleId)) {
-                        String gt = studyEntry.getSamplesData().get(samplePosition).get(gtIdx);
-                        // Only genotypes without the main alternate (0/2, 2/3, ...) should be written as pending.
-                        if (SampleIndexDBLoader.validGenotype(gt) && !hasMainAlternate(gt)) {
-                            Put sampleIndexPut = new Put(
-                                    SampleIndexConverter.toRowKey(sampleId, variant.getChromosome(), variant.getStart()),
-                                    put.getTimeStamp());
-                            sampleIndexPut.addColumn(helper.getColumnFamily(), SampleIndexConverter.toPendingColumn(variant, gt), null);
-                            sampleIndexPuts.add(sampleIndexPut);
-                        }
+            overlappingStatus = processVariantOverlap(variant, missingSamples, put, sampleIndexPuts, archiveVariant);
+        }
+        return overlappingStatus;
+    }
+
+    protected VariantOverlappingStatus processReferenceOverlap(Set<Integer> missingSamples, Put put, Variant archiveVariant) {
+        VariantOverlappingStatus overlappingStatus = REFERENCE;
+
+        FileEntry fileEntry = archiveVariant.getStudies().get(0).getFiles().get(0);
+        fileEntry.getAttributes().remove(VCFConstants.END_KEY);
+        if (StringUtils.isEmpty(fileEntry.getCall())) {
+            fileEntry.setCall(archiveVariant.getStart() + ":" + archiveVariant.getReference() + ":.:0");
+        }
+
+        studyConverter.convert(archiveVariant, put, missingSamples, overlappingStatus);
+        return overlappingStatus;
+    }
+
+    protected VariantOverlappingStatus processVariantOverlap(Variant variant, Set<Integer> missingSamples, Put put,
+                                                             List<Put> sampleIndexPuts, Variant archiveVariant) {
+        VariantOverlappingStatus overlappingStatus = VARIANT;
+
+        Variant mergedVariant = new Variant(
+                variant.getChromosome(),
+                variant.getStart(),
+                variant.getEnd(),
+                variant.getReference(),
+                variant.getAlternate());
+        StudyEntry studyEntry = new StudyEntry();
+        studyEntry.setFormat(archiveVariant.getStudies().get(0).getFormat());
+        studyEntry.setSortedSamplesPosition(new LinkedHashMap<>());
+        studyEntry.setSamplesData(new ArrayList<>());
+
+        mergedVariant.addStudyEntry(studyEntry);
+        mergedVariant.setType(variant.getType());
+
+        mergedVariant = variantMerger.merge(mergedVariant, archiveVariant);
+
+
+        if (studyEntry.getFormatPositions().containsKey("GT")) {
+            int samplePosition = 0;
+            Integer gtIdx = studyEntry.getFormatPositions().get("GT");
+            for (String sampleName : studyEntry.getOrderedSamplesName()) {
+                Integer sampleId = studyConfiguration.getSampleIds().get(sampleName);
+                if (missingSamples.contains(sampleId)) {
+                    String gt = studyEntry.getSamplesData().get(samplePosition).get(gtIdx);
+                    // Only genotypes without the main alternate (0/2, 2/3, ...) should be written as pending.
+                    if (SampleIndexDBLoader.validGenotype(gt) && !hasMainAlternate(gt)) {
+                        Put sampleIndexPut = new Put(
+                                SampleIndexConverter.toRowKey(sampleId, variant.getChromosome(), variant.getStart()),
+                                put.getTimeStamp());
+                        sampleIndexPut.addColumn(helper.getColumnFamily(), SampleIndexConverter.toPendingColumn(variant, gt), null);
+                        sampleIndexPuts.add(sampleIndexPut);
                     }
-                    samplePosition++;
                 }
+                samplePosition++;
             }
+        }
+
+        studyConverter.convert(mergedVariant, put, missingSamples, overlappingStatus);
+        return overlappingStatus;
+    }
+
+    protected VariantOverlappingStatus processVariantFileGap(Variant variant, Set<Integer> missingSamples, Put put, Integer fileId,
-            studyConverter.convert(mergedVariant, put, missingSamples, overlappingStatus);
+                                                             String gt) {
+        return processVariantFile(variant, missingSamples, put, fileId, GAP, gt);
+    }
+
+    private VariantOverlappingStatus processVariantFile(Variant variant, Set<Integer> missingSamples, Put put, Integer fileId,
+                                                        VariantOverlappingStatus overlappingStatus, String gt) {
+        LinkedHashMap<String, Integer> samplePosition = getSamplePosition(fileId);
+        List<List<String>> samplesData = new ArrayList<>(samplePosition.size());
+        for (int i = 0; i < samplePosition.size(); i++) {
+            samplesData.add(Collections.singletonList(gt));
         }
+
+        VariantBuilder builder = Variant.newBuilder(
+                variant.getChromosome(),
+                variant.getStart(),
+                variant.getEnd(),
+                variant.getReference(),
+                variant.getAlternate())
+                .setStudyId(String.valueOf(studyConfiguration.getStudyId()))
+                .setFormat("GT")
+                .setFileId(fileId.toString())
+                .setSamplesPosition(samplePosition)
+                .setSamplesData(samplesData);
+
+        studyConverter.convert(builder.build(), put, missingSamples, overlappingStatus);
+        return overlappingStatus;
+    }
+
+    protected VariantOverlappingStatus processMultipleOverlappings(Variant variant, Set<Integer> missingSamples, Put put, Integer fileId) {
+        VariantOverlappingStatus overlappingStatus = MULTI;
+
+        LinkedHashMap<String, Integer> samplePosition = getSamplePosition(fileId);
+        List<List<String>> samplesData = new ArrayList<>(samplePosition.size());
+        for (int i = 0; i < samplePosition.size(); i++) {
+            samplesData.add(Collections.singletonList("2/2"));
+        }
+
+        VariantBuilder builder = Variant.newBuilder(
+                variant.getChromosome(),
+                variant.getStart(),
+                variant.getEnd(),
+                variant.getReference(),
+                variant.getAlternate())
+                .addAlternate("<*>")
+                .setStudyId(String.valueOf(studyConfiguration.getStudyId()))
+                .setFileId(fileId.toString())
+                // add overlapping variants as attributes
+                .setFormat("GT")
+                .setSamplesPosition(samplePosition)
+                .setSamplesData(samplesData);
+
+
+//        processVariantOverlap(variant, missingSamples, put, sampleIndexPuts, builder.build());
+        studyConverter.convert(builder.build(), put, missingSamples, overlappingStatus);
+        return overlappingStatus;
     }
@@ -203,7 +288,24 @@ public boolean getOverlappingVariants(Variant variant, int fileId,
             String reference = vcfRecord.getReference();
             String alternate = vcfRecord.getAlternate();
             // If the VcfRecord starts after the variant, stop looking for variants
-            if (isRegionAfterVariantStart(start, end, variant)) {
+            if (overlapsWith(variant, chromosome, start, end)) {
+                if (resetPosition == null) {
+                    resetPosition = Math.max(iterator.previousIndex() - 1, firstIndex);
+                }
+                if (skipReferenceVariants && hasAllReferenceGenotype(vcfSlice, vcfRecord)) {
+                    // Skip this variant
+                    continue;
+                }
+
+                // If the same variant is present for this file in the VcfSlice, the variant is already loaded
+                if (isVariantAlreadyLoaded(variant, vcfSlice, vcfRecord, chromosome, start, end, reference, alternate)) {
+                    // Variant already loaded. Nothing to do!
+                    isAlreadyPresent = true;
+                    break;
+                }
+
+                overlappingRecords.add(ImmutablePair.of(vcfSlice, vcfRecord));
+            } else if (isRegionAfterVariantStart(start, end, variant)) {
                 if (resetPosition == null) {
                     resetPosition = Math.max(iterator.previousIndex() - 1, firstIndex);
                 }
@@ -225,25 +327,12 @@
                 } else {
                     break;
                 }
-            } else if (overlapsWith(variant, chromosome, start, end)) {
-                if (resetPosition == null) {
-                    resetPosition = Math.max(iterator.previousIndex() - 1, firstIndex);
-                }
-                if (skipReferenceVariants && hasAllReferenceGenotype(vcfSlice, vcfRecord)) {
-                    // Skip this variant
-                    continue;
-                }
-
-                // If the same variant is present for this file in the VcfSlice, the variant is already loaded
-                if (isVariantAlreadyLoaded(variant, vcfSlice, vcfRecord, chromosome, start, end, reference, alternate)) {
-                    // Variant already loaded. Nothing to do!
-                    isAlreadyPresent = true;
-                    break;
-                }
-
-                overlappingRecords.add(ImmutablePair.of(vcfSlice, vcfRecord));
             }
         }
+        if (resetPosition == null && !iterator.hasNext()) {
+            // If the iterator reaches the end without finding any reset point, reset the iterator
+            resetPosition = firstIndex;
+        }
         // Send back the iterator
         if (resetPosition != null) {
 //            logger.info("Reset from " + iterator.nextIndex() + " to " + resetPosition + ". fileId : " + fileId + " variant " + variant);
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/VariantOverlappingStatus.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/VariantOverlappingStatus.java
index 9ac86ab8f05..b94d7f366cf 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/VariantOverlappingStatus.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/gaps/VariantOverlappingStatus.java
@@ -14,6 +14,14 @@ public enum VariantOverlappingStatus {
      * A reference block from this file is overlapping with this variant.
      */
     REFERENCE("R"),
+    /**
+     * There was a gap in the original file.
+     */
+    GAP("G"),
+    /**
+     * Multiple records from this file overlap with this variant.
+     */
+    MULTI("M"),
     /**
      * This variant is present on this file.
      */
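Reviewer note (not part of the patch): the decision among the overlap statuses introduced above is spread across several refactored methods, so here is a condensed, illustrative sketch. The `classify` method, its `Pair`-based return value and the assumption that reference blocks were loaded (`skipReferenceVariants == false`) are inventions for the example; the real logic lives in `FillGapsTask.fillGaps`.

```java
import java.util.List;

import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos;
import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
import org.opencb.opencga.storage.hadoop.variant.gaps.VariantOverlappingStatus;

public class OverlappingStatusSketch {

    // Which status (and genotype for the missing samples) would fillGaps write for one file?
    static Pair<VariantOverlappingStatus, String> classify(List<VcfSliceProtos.VcfRecord> overlapping,
                                                           List<VcfSliceProtos.VcfRecord> realVariants,
                                                           boolean isInsertion) {
        if (overlapping.isEmpty()) {
            // No overlapping record in the archive slice: insertions are filled as HOM_REF,
            // anything else is a genuine gap in the original gVCF and stays unknown.
            return isInsertion
                    ? Pair.of(VariantOverlappingStatus.GAP, "0/0")
                    : Pair.of(VariantOverlappingStatus.GAP, GenotypeClass.UNKNOWN_GENOTYPE);
        }
        if (overlapping.size() > 1 && realVariants.size() != 1) {
            // Several overlapping records but not exactly one real variant among them:
            // flagged as MULTI and written as 2/2 against the symbolic "<*>" alternate.
            return Pair.of(VariantOverlappingStatus.MULTI, "2/2");
        }
        // A single overlap (or a single real variant among reference blocks) keeps the
        // previous behaviour: REFERENCE for reference blocks, VARIANT for real variants.
        return null;
    }
}
```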
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHbaseTestUtils.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHbaseTestUtils.java
index ae893de8c00..0b2088a33b1 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHbaseTestUtils.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHbaseTestUtils.java
@@ -36,6 +36,7 @@
 import org.opencb.commons.datastore.core.ObjectMap;
 import org.opencb.commons.datastore.core.Query;
 import org.opencb.commons.datastore.core.QueryOptions;
+import org.opencb.commons.io.DataWriter;
 import org.opencb.commons.utils.CompressionUtils;
 import org.opencb.opencga.core.common.TimeUtils;
 import org.opencb.opencga.storage.core.StoragePipelineResult;
@@ -44,8 +45,10 @@
 import org.opencb.opencga.storage.core.metadata.local.FileStudyConfigurationAdaptor;
 import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
 import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
+import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
 import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator;
 import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
+import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;
 import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
 import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHBaseQueryParser;
 import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
@@ -371,6 +374,7 @@ public static void printVariants(Collection studyConfigurati
             FileStudyConfigurationAdaptor.write(studyConfiguration, outDir.resolve("study_configuration_" + studyConfiguration.getStudyName() + ".json"));
             printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, outDir);
             printArchiveTable(studyConfiguration, dbAdaptor, outDir);
+            printVcf(studyConfiguration, dbAdaptor, outDir);
         }
         printMetaTable(dbAdaptor, outDir);
         printSamplesIndexTable(dbAdaptor, outDir);
@@ -379,6 +383,19 @@ public static void printVariants(Collection studyConfigurati
         HBaseToVariantConverter.setFailOnWrongVariants(old);
     }
 
+    private static void printVcf(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, Path outDir) throws IOException {
+        try (OutputStream os = new FileOutputStream(outDir.resolve("variant." + studyConfiguration.getStudyName() + ".vcf").toFile())) {
+            Query query = new Query(VariantQueryParam.STUDY.key(), studyConfiguration.getStudyName()).append(VariantQueryParam.UNKNOWN_GENOTYPE.key(), ".");
+            QueryOptions queryOptions = new QueryOptions();
+            DataWriter<Variant> writer = new VariantWriterFactory(dbAdaptor).newDataWriter(VariantWriterFactory.VariantOutputFormat.VCF, os, query, queryOptions);
+            writer.open();
+            writer.pre();
+            writer.write(dbAdaptor.get(query, queryOptions).getResult());
+            writer.post();
+            writer.close();
+        }
+    }
+
     private static void printSamplesIndexTable(VariantHadoopDBAdaptor dbAdaptor, Path outDir) throws IOException {
         for (Integer studyId : dbAdaptor.getStudyConfigurationManager().getStudies(null).values()) {
             String sampleGtTableName = dbAdaptor.getTableNameGenerator().getSampleIndexTableName(studyId);
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java
index 2ba80381344..bf37375eea1 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskTest.java
@@ -20,6 +20,7 @@
 import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
 import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
 import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
+import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
 import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
 import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryUtils;
 import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver;
@@ -150,6 +151,32 @@ public void testFillGapsPlatinumFiles(ObjectMap options) throws Exception {
         checkQueryGenotypes(dbAdaptor);
     }
 
+    @Test
+    public void testFillGapsConflictingFiles() throws Exception {
+        StudyConfiguration studyConfiguration = load(new QueryOptions(), Arrays.asList(getResourceUri("gaps/file1.genome.vcf"), getResourceUri("gaps/file2.genome.vcf")));
+
+        HadoopVariantStorageEngine variantStorageEngine = (HadoopVariantStorageEngine) this.variantStorageEngine;
+
+        VariantHadoopDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor();
+        List<Integer> sampleIds = new ArrayList<>(studyConfiguration.getSampleIds().values());
+        sampleIds.sort(Integer::compareTo);
+
+        fillGaps(variantStorageEngine, studyConfiguration, sampleIds);
+        printVariants(dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first(), dbAdaptor, newOutputUri());
+        checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton("1:10020:A:T"));
+        checkSampleIndexTable(dbAdaptor);
+
+        Variant variantGap = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10020:A:T"), null).first();
+        assertEquals("0/1", variantGap.getStudies().get(0).getSampleData("s1", "GT"));
+        assertEquals(GenotypeClass.UNKNOWN_GENOTYPE, variantGap.getStudies().get(0).getSampleData("s2", "GT"));
+
+        Variant variantMulti = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10012:TTT:-"), null).first();
+        assertEquals("<*>", variantMulti.getStudies().get(0).getSecondaryAlternates().get(0).getAlternate());
+        assertEquals("0/1", variantMulti.getStudies().get(0).getSampleData("s1", "GT"));
+        assertEquals("2/2", variantMulti.getStudies().get(0).getSampleData("s2", "GT"));
+
+    }
+
     @Test
     public void testFillMissingPlatinumFiles() throws Exception {
         ObjectMap options = new ObjectMap()
@@ -245,10 +272,6 @@ private StudyConfiguration loadPlatinum(ObjectMap extraParams, int max) throws E
 
     private StudyConfiguration loadPlatinum(ObjectMap extraParams, int from, int to) throws Exception {
 
-        StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
-        HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
-        VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor();
-
         List<URI> inputFiles = new LinkedList<>();
 
         for (int fileId = from; fileId <= to; fileId++) {
@@ -256,10 +279,20 @@ private StudyConfiguration loadPlatinum(ObjectMap extraParams, int from, int to)
             inputFiles.add(getResourceUri(fileName));
         }
 
+        return load(extraParams, inputFiles);
+    }
+
+    private StudyConfiguration load(ObjectMap extraParams, List<URI> inputFiles) throws Exception {
+        StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
+        HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
+        VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor();
+
         ObjectMap options = variantStorageManager.getConfiguration().getStorageEngine(variantStorageManager.getStorageEngineId()).getVariant().getOptions();
         options.put(VariantStorageEngine.Options.STUDY.key(), studyConfiguration.getStudyName());
+        options.put(VariantStorageEngine.Options.GVCF.key(), true);
         options.put(HadoopVariantStorageEngine.VARIANT_TABLE_INDEXES_SKIP, true);
         options.put(HadoopVariantStorageEngine.HADOOP_LOAD_ARCHIVE_BATCH_SIZE, 1);
+        options.put(VariantStorageEngine.Options.MERGE_MODE.key(), VariantStorageEngine.MergeMode.BASIC);
         options.putAll(extraParams);
 
         List<StoragePipelineResult> index = variantStorageManager.index(inputFiles, outputUri, true, true, true);
@@ -275,6 +308,10 @@ private StudyConfiguration loadPlatinum(ObjectMap extraParams, int from, int to)
     }
 
     protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, List<Integer> sampleIds) {
+        checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton("1:10178:-:C"));
+    }
+
+    protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, List<Integer> sampleIds, Set<String> variantsWithGaps) {
         for (Variant variant : dbAdaptor) {
             boolean anyUnknown = false;
             boolean allUnknown = true;
@@ -284,14 +321,11 @@ protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoo
                 allUnknown &= unknown;
             }
             // Fail if any, but not all samples are unknown
-            try {
-                Assert.assertFalse(variant.toString(), anyUnknown && !allUnknown);
-//                Assert.assertTrue(allUnknown || !anyUnknown);
-            } catch (AssertionError e) {
-                if (variant.toString().equals("1:10178:-:C")) {
+            if (anyUnknown && !allUnknown) {
+                if (variantsWithGaps.contains(variant.toString())) {
                     System.out.println("Gaps in variant " + variant);
                 } else {
-                    throw e;
+                    Assert.fail("Gaps in variant " + variant);
                 }
             }
         }
@@ -330,7 +364,9 @@ private void checkQueryGenotypes(VariantHadoopDBAdaptor dbAdaptor) {
         for (String sample : StudyConfiguration.getIndexedSamples(sc).keySet()) {
             VariantQueryResult<Variant> queryResult = dbAdaptor.get(new Query(VariantQueryParam.SAMPLE.key(), sample)
                     .append(VariantQueryParam.INCLUDE_SAMPLE.key(), VariantQueryUtils.ALL).append(VariantQueryParam.INCLUDE_FILE.key(), VariantQueryUtils.ALL), new QueryOptions());
-            assertThat(queryResult, everyResult(allVariants, withStudy(STUDY_NAME, withSampleData(sample, "GT", anyOf(containsString("1"), containsString("2"), containsString("3"))))));
+            assertThat(queryResult, everyResult(allVariants,
+                    withStudy(STUDY_NAME,
+                            withSampleData(sample, "GT", anyOf(containsString("1"), containsString("2"), containsString("3"))))));
         }
     }
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskUtilsTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskUtilsTest.java
index 63e189947b1..dee5077bf41 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskUtilsTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/gaps/FillGapsTaskUtilsTest.java
@@ -25,7 +25,7 @@ public class FillGapsTaskUtilsTest {
 
     @Test
-    public void testgetOverlappingVariants() {
+    public void testGetOverlappingVariants() {
         FillGapsTask a = new FillGapsTask(new StudyConfiguration(1, "a"), new GenomeHelper(new Configuration()), true);
 
         VcfSliceProtos.VcfSlice vcfSlice = buildVcfSlice("17:29113:T:C", "17:29185:A:G", "17:29190-29189::AAAAAAAA", "17:29464:G:");
@@ -49,7 +49,7 @@ public void testgetOverlappingVariants() {
 
     @Test
-    public void testgetOverlappingVariants2() {
+    public void testGetOverlappingVariants2() {
         FillGapsTask a = new FillGapsTask(new StudyConfiguration(1, "a"), new GenomeHelper(new Configuration()), true);
 
         VcfSliceProtos.VcfSlice vcfSlice = buildVcfSlice(
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf
new file mode 100644
index 00000000000..f1785101328
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf
@@ -0,0 +1,13 @@
+##fileformat=VCFv4.2
+##FORMAT=
+##FORMAT=
+##INFO=
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1
+1 1 . N . . . END=10003 GT:DP .:.
+1 10004 . C . . . END=10010 GT:DP 0/0:3
+1 10011 . ATTT A 2 . . GT:DP 0/1:40
+1 10015 . A . . . END=10020 GT:DP 0/0:7
+1 10020 . A T 2 . . GT:DP 0/1:41
+1 10021 . A . . . END=10030 GT:DP 0/0:7
+1 10031 . T TAAA 1 . . GT:DP 0/1:42
+1 10032 . A . . . END=10043 GT:DP 0/0:5
\ No newline at end of file
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf
new file mode 100644
index 00000000000..df3e0316b29
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf
@@ -0,0 +1,15 @@
+##fileformat=VCFv4.2
+##FORMAT=
+##FORMAT=
+##INFO=
+##GAPS=1:10015-10030 with 1:10020:A:T
+##MULTI_OVERLAP=1:10013:T:C and 1:10014:T:A with 1:10011:ATTT:A
+##INSERTION_GAP=1:10031:T:TAAA does not overlap with any from here
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2
+1 1 . N . . . END=10003 GT:DP .:.
+1 10004 . C . . . END=10012 GT:DP 0/0:3
+1 10013 . T C 2 . . GT:DP 0/1:30
+1 10014 . T A 2 . . GT:DP 0/1:31
+1 10031 . T G 1 . . GT:DP 0/1:32
+1 10032 . A G 1 . . GT:DP 0/1:33
+1 10033 . A . . . END=10043 GT:DP 0/0:5
\ No newline at end of file
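Reviewer note (not part of the patch): the `##GAPS`, `##MULTI_OVERLAP` and `##INSERTION_GAP` header comments above describe the three scenarios these fixtures exercise. The snippet below only restates the outcomes asserted in `testFillGapsConflictingFiles` for sample s2 (plus the insertion case, which the test does not assert); the class and map are illustrative.

```java
import java.util.LinkedHashMap;
import java.util.Map;

import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;

public class ConflictingFilesExpectations {
    public static void main(String[] args) {
        Map<String, String> expectedS2Genotypes = new LinkedHashMap<>();
        // ##GAPS: no record in file2 covers 1:10020:A:T -> GAP status, s2 genotype stays unknown.
        expectedS2Genotypes.put("1:10020:A:T", GenotypeClass.UNKNOWN_GENOTYPE);
        // ##MULTI_OVERLAP: 1:10013:T:C and 1:10014:T:A both overlap 1:10011:ATTT:A (normalized 1:10012:TTT:-)
        // -> MULTI status, written as 2/2 against the symbolic "<*>" secondary alternate.
        expectedS2Genotypes.put("1:10012:TTT:-", "2/2");
        // ##INSERTION_GAP: 1:10031:T:TAAA has no overlapping record in file2, so the insertion branch
        // in FillGapsTask would fill it as HOM_REF ("0/0") for s2 (not asserted by the test).
        expectedS2Genotypes.forEach((variant, gt) -> System.out.println(variant + " -> " + gt));
    }
}
```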