Skip to content

Commit

Permalink
storage: Handle gaps in VCFs and multiple overlapps at FillGaps opera…
Browse files Browse the repository at this point in the history
…tion. #877
  • Loading branch information
j-coll committed Jul 24, 2018
1 parent 3315f6b commit f88cc06
Show file tree
Hide file tree
Showing 9 changed files with 289 additions and 100 deletions.
Expand Up @@ -38,6 +38,7 @@
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryUtils;
import org.opencb.opencga.storage.hadoop.variant.converters.AbstractPhoenixConverter;
import org.opencb.opencga.storage.hadoop.variant.converters.HBaseToVariantConverter;
Expand Down Expand Up @@ -364,6 +365,11 @@ protected StudyEntry newStudyEntry(StudyConfiguration studyConfiguration, List<S
protected void addMainSampleDataColumn(StudyConfiguration studyConfiguration, StudyEntry studyEntry,
int[] formatsMap, Integer sampleId, List<String> sampleData) {
sampleData = remapSamplesData(sampleData, formatsMap);
Integer gtIdx = studyEntry.getFormatPositions().get("GT");
// Replace UNKNOWN_GENOTYPE, if any
if (gtIdx != null && sampleData.get(gtIdx).equals(GenotypeClass.UNKNOWN_GENOTYPE)) {
sampleData.set(gtIdx, unknownGenotype);
}

String sampleName = studyConfiguration.getSampleIds().inverse().get(sampleId);
studyEntry.addSampleData(sampleName, sampleData);
Expand Down
Expand Up @@ -172,14 +172,7 @@ private List<String> remapFileData(Variant variant, StudyEntry studyEntry, FileE
}

// Trim all leading null values
for (int i = fileColumn.size() - 1; i >= 0; i--) {
if (fileColumn.get(i) != null) {
if (i != fileColumn.size() - 1) {
fileColumn = fileColumn.subList(0, i + 1);
}
break;
}
}
fileColumn = trimLeadingNullValues(fileColumn, HBaseToStudyEntryConverter.FILE_INFO_START_IDX);

return fileColumn;
}
Expand Down Expand Up @@ -258,8 +251,20 @@ private List<String> remapSampleData(StudyEntry studyEntry, int[] formatReMap, L
// }
// sampleDataIdx++;
// }
remappedSampleData = trimLeadingNullValues(remappedSampleData, 1);

return remappedSampleData;
}

private List<String> trimLeadingNullValues(List<String> values, int minSize) {
int i = values.size() - 1;
while (i >= minSize && values.get(i) == null) {
i--;
}
if (i != values.size() - 1) {
values = values.subList(0, i + 1);
}
return values;
}

}

Large diffs are not rendered by default.

Expand Up @@ -14,6 +14,14 @@ public enum VariantOverlappingStatus {
* A reference block from this file is overlapping with this variant.
*/
REFERENCE("R"),
/**
* There was a gap in the original file.
*/
GAP("G"),
/**
*
*/
MULTI("M"),
/**
* This variant is present on this file.
*/
Expand Down
Expand Up @@ -36,6 +36,7 @@
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.io.DataWriter;
import org.opencb.commons.utils.CompressionUtils;
import org.opencb.opencga.core.common.TimeUtils;
import org.opencb.opencga.storage.core.StoragePipelineResult;
Expand All @@ -44,8 +45,10 @@
import org.opencb.opencga.storage.core.metadata.local.FileStudyConfigurationAdaptor;
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHBaseQueryParser;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
Expand Down Expand Up @@ -371,6 +374,7 @@ public static void printVariants(Collection<StudyConfiguration> studyConfigurati
FileStudyConfigurationAdaptor.write(studyConfiguration, outDir.resolve("study_configuration_" + studyConfiguration.getStudyName() + ".json"));
printVariantsFromArchiveTable(dbAdaptor, studyConfiguration, outDir);
printArchiveTable(studyConfiguration, dbAdaptor, outDir);
printVcf(studyConfiguration, dbAdaptor, outDir);
}
printMetaTable(dbAdaptor, outDir);
printSamplesIndexTable(dbAdaptor, outDir);
Expand All @@ -379,6 +383,19 @@ public static void printVariants(Collection<StudyConfiguration> studyConfigurati
HBaseToVariantConverter.setFailOnWrongVariants(old);
}

private static void printVcf(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, Path outDir) throws IOException {
try (OutputStream os = new FileOutputStream(outDir.resolve("variant." + studyConfiguration.getStudyName() + ".vcf").toFile())) {
Query query = new Query(VariantQueryParam.STUDY.key(), studyConfiguration.getStudyName()).append(VariantQueryParam.UNKNOWN_GENOTYPE.key(), ".");
QueryOptions queryOptions = new QueryOptions();
DataWriter<Variant> writer = new VariantWriterFactory(dbAdaptor).newDataWriter(VariantWriterFactory.VariantOutputFormat.VCF, os, query, queryOptions);
writer.open();
writer.pre();
writer.write(dbAdaptor.get(query, queryOptions).getResult());
writer.post();
writer.close();
}
}

private static void printSamplesIndexTable(VariantHadoopDBAdaptor dbAdaptor, Path outDir) throws IOException {
for (Integer studyId : dbAdaptor.getStudyConfigurationManager().getStudies(null).values()) {
String sampleGtTableName = dbAdaptor.getTableNameGenerator().getSampleIndexTableName(studyId);
Expand Down
Expand Up @@ -20,6 +20,7 @@
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryUtils;
import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver;
Expand Down Expand Up @@ -150,6 +151,32 @@ public void testFillGapsPlatinumFiles(ObjectMap options) throws Exception {
checkQueryGenotypes(dbAdaptor);
}

@Test
public void testFillGapsConflictingFiles() throws Exception {
StudyConfiguration studyConfiguration = load(new QueryOptions(), Arrays.asList(getResourceUri("gaps/file1.genome.vcf"), getResourceUri("gaps/file2.genome.vcf")));

HadoopVariantStorageEngine variantStorageEngine = (HadoopVariantStorageEngine) this.variantStorageEngine;

VariantHadoopDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor();
List<Integer> sampleIds = new ArrayList<>(studyConfiguration.getSampleIds().values());
sampleIds.sort(Integer::compareTo);

fillGaps(variantStorageEngine, studyConfiguration, sampleIds);
printVariants(dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first(), dbAdaptor, newOutputUri());
checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton("1:10020:A:T"));
checkSampleIndexTable(dbAdaptor);

Variant variantGap = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10020:A:T"), null).first();
assertEquals("0/1", variantGap.getStudies().get(0).getSampleData("s1", "GT"));
assertEquals(GenotypeClass.UNKNOWN_GENOTYPE, variantGap.getStudies().get(0).getSampleData("s2", "GT"));

Variant variantMulti = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10012:TTT:-"), null).first();
assertEquals("<*>", variantMulti.getStudies().get(0).getSecondaryAlternates().get(0).getAlternate());
assertEquals("0/1", variantMulti.getStudies().get(0).getSampleData("s1", "GT"));
assertEquals("2/2", variantMulti.getStudies().get(0).getSampleData("s2", "GT"));

}

@Test
public void testFillMissingPlatinumFiles() throws Exception {
ObjectMap options = new ObjectMap()
Expand Down Expand Up @@ -245,21 +272,27 @@ private StudyConfiguration loadPlatinum(ObjectMap extraParams, int max) throws E

private StudyConfiguration loadPlatinum(ObjectMap extraParams, int from, int to) throws Exception {

StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor();

List<URI> inputFiles = new LinkedList<>();

for (int fileId = from; fileId <= to; fileId++) {
String fileName = "platinum/1K.end.platinum-genomes-vcf-NA" + fileId + "_S1.genome.vcf.gz";
inputFiles.add(getResourceUri(fileName));
}

return load(extraParams, inputFiles);
}

private StudyConfiguration load(ObjectMap extraParams, List<URI> inputFiles) throws Exception {
StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
HadoopVariantStorageEngine variantStorageManager = getVariantStorageEngine();
VariantHadoopDBAdaptor dbAdaptor = variantStorageManager.getDBAdaptor();

ObjectMap options = variantStorageManager.getConfiguration().getStorageEngine(variantStorageManager.getStorageEngineId()).getVariant().getOptions();
options.put(VariantStorageEngine.Options.STUDY.key(), studyConfiguration.getStudyName());
options.put(VariantStorageEngine.Options.GVCF.key(), true);
options.put(HadoopVariantStorageEngine.VARIANT_TABLE_INDEXES_SKIP, true);
options.put(HadoopVariantStorageEngine.HADOOP_LOAD_ARCHIVE_BATCH_SIZE, 1);
options.put(VariantStorageEngine.Options.MERGE_MODE.key(), VariantStorageEngine.MergeMode.BASIC);
options.putAll(extraParams);
List<StoragePipelineResult> index = variantStorageManager.index(inputFiles, outputUri, true, true, true);

Expand All @@ -275,6 +308,10 @@ private StudyConfiguration loadPlatinum(ObjectMap extraParams, int from, int to)
}

protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, List<Integer> sampleIds) {
checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton("1:10178:-:C"));
}

protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, List<Integer> sampleIds, Set<String> variantsWithGaps) {
for (Variant variant : dbAdaptor) {
boolean anyUnknown = false;
boolean allUnknown = true;
Expand All @@ -284,14 +321,11 @@ protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoo
allUnknown &= unknown;
}
// Fail if any, but not all samples are unknown
try {
Assert.assertFalse(variant.toString(), anyUnknown && !allUnknown);
// Assert.assertTrue(allUnknown || !anyUnknown);
} catch (AssertionError e) {
if (variant.toString().equals("1:10178:-:C")) {
if (anyUnknown && !allUnknown) {
if (variantsWithGaps.contains(variant.toString())) {
System.out.println("Gaps in variant " + variant);
} else {
throw e;
Assert.fail("Gaps in variant " + variant);
}
}
}
Expand Down Expand Up @@ -330,7 +364,9 @@ private void checkQueryGenotypes(VariantHadoopDBAdaptor dbAdaptor) {
for (String sample : StudyConfiguration.getIndexedSamples(sc).keySet()) {
VariantQueryResult<Variant> queryResult = dbAdaptor.get(new Query(VariantQueryParam.SAMPLE.key(), sample)
.append(VariantQueryParam.INCLUDE_SAMPLE.key(), VariantQueryUtils.ALL).append(VariantQueryParam.INCLUDE_FILE.key(), VariantQueryUtils.ALL), new QueryOptions());
assertThat(queryResult, everyResult(allVariants, withStudy(STUDY_NAME, withSampleData(sample, "GT", anyOf(containsString("1"), containsString("2"), containsString("3"))))));
assertThat(queryResult, everyResult(allVariants,
withStudy(STUDY_NAME,
withSampleData(sample, "GT", anyOf(containsString("1"), containsString("2"), containsString("3"))))));
}
}

Expand Down
Expand Up @@ -25,7 +25,7 @@ public class FillGapsTaskUtilsTest {


@Test
public void testgetOverlappingVariants() {
public void testGetOverlappingVariants() {
FillGapsTask a = new FillGapsTask(new StudyConfiguration(1, "a"), new GenomeHelper(new Configuration()), true);

VcfSliceProtos.VcfSlice vcfSlice = buildVcfSlice("17:29113:T:C", "17:29185:A:G", "17:29190-29189::AAAAAAAA", "17:29464:G:");
Expand All @@ -49,7 +49,7 @@ public void testgetOverlappingVariants() {


@Test
public void testgetOverlappingVariants2() {
public void testGetOverlappingVariants2() {
FillGapsTask a = new FillGapsTask(new StudyConfiguration(1, "a"), new GenomeHelper(new Configuration()), true);

VcfSliceProtos.VcfSlice vcfSlice = buildVcfSlice(
Expand Down
@@ -0,0 +1,13 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1
1 1 . N . . . END=10003 GT:DP .:.
1 10004 . C . . . END=10010 GT:DP 0/0:3
1 10011 . ATTT A 2 . . GT:DP 0/1:40
1 10015 . A . . . END=10020 GT:DP 0/0:7
1 10020 . A T 2 . . GT:DP 0/1:41
1 10021 . A . . . END=10030 GT:DP 0/0:7
1 10031 . T TAAA 1 . . GT:DP 0/1:42
1 10032 . A . . . END=10043 GT:DP 0/0:5
@@ -0,0 +1,15 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
##GAPS=1:10015-10030 with 1:10020:A:T
##MULTI_OVERLAP=1:10013:T:C and 1:10014:A:T with 1:10011:ATTT:A
##INSERTION_GAP=1:10031:T:TAAA does not overlap with any from here
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2
1 1 . N . . . END=10003 GT:DP .:.
1 10004 . C . . . END=10012 GT:DP 0/0:3
1 10013 . T C 2 . . GT:DP 0/1:30
1 10014 . T A 2 . . GT:DP 0/1:31
1 10031 . T G 1 . . GT:DP 0/1:32
1 10032 . A G 1 . . GT:DP 0/1:33
1 10033 . A . . . END=10043 GT:DP 0/0:5

0 comments on commit f88cc06

Please sign in to comment.