Skip to content

Commit

Permalink
storage: Handle overlap with symbolic NO_VARIATION #877
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Aug 31, 2018
1 parent 479a76b commit 3a85707
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 9 deletions.
@@ -1,5 +1,6 @@
package org.opencb.opencga.storage.hadoop.variant.gaps;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
Expand Down Expand Up @@ -143,23 +144,51 @@ public VariantOverlappingStatus fillGaps(Variant variant, Set<Integer> missingSa
Variant archiveVariant = convertToVariant(vcfSlice, vcfRecord, fileId);

if (archiveVariant.getType().equals(VariantType.NO_VARIATION)) {
overlappingStatus = processReferenceOverlap(missingSamples, put, archiveVariant);
overlappingStatus = processReferenceOverlap(missingSamples, put, variant, archiveVariant);
} else {
overlappingStatus = processVariantOverlap(variant, missingSamples, put, sampleIndexPuts, archiveVariant);
}
return overlappingStatus;
}

protected VariantOverlappingStatus processReferenceOverlap(Set<Integer> missingSamples, Put put, Variant archiveVariant) {
protected VariantOverlappingStatus processReferenceOverlap(Set<Integer> missingSamples, Put put,
Variant variant, Variant archiveVariant) {
VariantOverlappingStatus overlappingStatus = REFERENCE;

FileEntry fileEntry = archiveVariant.getStudies().get(0).getFiles().get(0);
fileEntry.getAttributes().remove(VCFConstants.END_KEY);
if (StringUtils.isEmpty(fileEntry.getCall())) {
fileEntry.setCall(archiveVariant.getStart() + ":" + archiveVariant.getReference() + ":.:0");
fileEntry.setCall(archiveVariant.getStart() + ":" + archiveVariant.getReference() + ":" + archiveVariant.getAlternate() + ":0");
}
// SYMBOLIC reference overlap -- <*> , <NON_REF>
if (VariantType.NO_VARIATION.equals(archiveVariant.getType())
&& !archiveVariant.getAlternate().isEmpty()
&& !archiveVariant.getAlternate().equals(Allele.NO_CALL_STRING)) {

studyConverter.convert(archiveVariant, put, missingSamples, overlappingStatus);
// Create template variant
Variant mergedVariant = new Variant(
variant.getChromosome(),
variant.getStart(),
variant.getEnd(),
variant.getReference(),
variant.getAlternate());

StudyEntry studyEntry = new StudyEntry();
studyEntry.setFormat(archiveVariant.getStudies().get(0).getFormat());
studyEntry.setSortedSamplesPosition(new LinkedHashMap<>());
studyEntry.setSamplesData(new ArrayList<>());
mergedVariant.addStudyEntry(studyEntry);
mergedVariant.setType(variant.getType());

// Merge NO_VARIATION into the template variant
mergedVariant = variantMerger.merge(mergedVariant, archiveVariant);

// Convert study information to PUT
studyConverter.convert(mergedVariant, put, missingSamples, overlappingStatus);

} else {
studyConverter.convert(archiveVariant, put, missingSamples, overlappingStatus);
}
return overlappingStatus;
}

Expand Down
Expand Up @@ -9,7 +9,9 @@
import org.junit.rules.ExternalResource;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.commons.ProgressLogger;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
Expand All @@ -21,6 +23,7 @@
import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryUtils;
import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver;
Expand All @@ -40,9 +43,7 @@
import static org.hamcrest.CoreMatchers.containsString;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertThat;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.everyResult;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.withSampleData;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.withStudy;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.*;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.MISSING_GENOTYPES_UPDATED;
import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.printVariants;
import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.removeFile;
Expand Down Expand Up @@ -156,14 +157,37 @@ public void testFillGapsConflictingFiles() throws Exception {
getResourceUri("gaps/file1.genome.vcf"),
getResourceUri("gaps/file2.genome.vcf")));

checkConflictingFiles(studyConfiguration);
}

@Test
public void testFillGapsConflictingFilesNonRef() throws Exception {
StudyConfiguration studyConfiguration = load(new QueryOptions(), Arrays.asList(
getResourceUri("gaps2/file1.genome.vcf"),
getResourceUri("gaps2/file2.genome.vcf")));

checkConflictingFiles(studyConfiguration);

VariantDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor();

Variant variantMulti = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10035:A:G"), null).first();
assertEquals("0/0", variantMulti.getStudies().get(0).getSampleData("s1", "GT"));
assertEquals(new AlternateCoordinate("1", 10035, 10035, "A", "<*>", VariantType.NO_VARIATION),
variantMulti.getStudies().get(0).getSecondaryAlternates().get(0));
assertEquals("4,0,1", variantMulti.getStudies().get(0).getSampleData("s1", "AD"));
assertEquals("0/1", variantMulti.getStudies().get(0).getSampleData("s2", "GT"));
assertEquals("13,23,0", variantMulti.getStudies().get(0).getSampleData("s2", "AD"));
}

public void checkConflictingFiles(StudyConfiguration studyConfiguration) throws Exception {
HadoopVariantStorageEngine variantStorageEngine = (HadoopVariantStorageEngine) this.variantStorageEngine;

VariantHadoopDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor();
List<Integer> sampleIds = new ArrayList<>(studyConfiguration.getSampleIds().values());
sampleIds.sort(Integer::compareTo);

fillGaps(variantStorageEngine, studyConfiguration, sampleIds);
printVariants(dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first(), dbAdaptor, newOutputUri());
printVariants(dbAdaptor.getStudyConfigurationManager().getStudyConfiguration(studyConfiguration.getStudyId(), null).first(), dbAdaptor, newOutputUri(1));
checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton("1:10020:A:T"));
checkSampleIndexTable(dbAdaptor);

Expand Down Expand Up @@ -344,7 +368,8 @@ protected void checkFillMissing(VariantHadoopDBAdaptor dbAdaptor, List<Integer>

for (Variant variant : dbAdaptor) {
StudyEntry studyEntry = variant.getStudies().get(0);
boolean newVariant = !missingGenotypesUpdated && studyEntry.getFiles().stream().map(FileEntry::getFileId).map(Integer::valueOf).allMatch(newFilesSet::contains);
boolean newVariant = !missingGenotypesUpdated && studyEntry.getFiles().stream().map(FileEntry::getFileId)
.map(studyConfiguration.getFileIds()::get).allMatch(newFilesSet::contains);
List<List<String>> samplesData = studyEntry.getSamplesData();
for (int i = 0; i < samplesData.size(); i++) {
List<String> data = samplesData.get(i);
Expand Down Expand Up @@ -390,6 +415,16 @@ protected void checkSampleIndexTable(VariantHadoopDBAdaptor dbAdaptor) throws IO
countFromIndex += variants.size();
VariantQueryResult<Variant> result = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), variants)
.append(VariantQueryParam.INCLUDE_SAMPLE.key(), sampleId), null);
Set<String> expected = variants.stream().map(Variant::toString).collect(Collectors.toSet());
Set<String> actual = result.getResult().stream().map(Variant::toString).collect(Collectors.toSet());
if (!expected.equals(actual)) {
HashSet<String> extra = new HashSet<>(actual);
extra.removeAll(expected);
HashSet<String> missing = new HashSet<>(expected);
missing.removeAll(actual);
System.out.println("missing = " + missing);
System.out.println("extra = " + extra);
}
assertEquals(message, variants.size(), result.getNumResults());
for (Variant variant : result.getResult()) {
assertEquals(message, gt, variant.getStudies().get(0).getSampleData(0).get(0));
Expand Down
@@ -0,0 +1,15 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1
1 1 . N <NON_REF> . . END=10003 GT:DP .:.
1 10004 . C <NON_REF> . . END=10010 GT:DP:AD 0/0:3:2,1
1 10011 . ATTT A 2 . . GT:DP:AD 0/1:41:20,21
1 10015 . A <NON_REF> . . END=10020 GT:DP:AD 0/0:7:6,1
1 10020 . A T 2 . . GT:DP:AD 0/1:42:20,22
1 10021 . A <NON_REF> . . END=10030 GT:DP:AD 0/0:7:6,1
1 10031 . T TAAA 1 . . GT:DP:AD 0/1:43:20,23
1 10032 . A <NON_REF> . . END=10043 GT:DP:AD 0/0:5:4,1
@@ -0,0 +1,18 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Filtered basecall depth used for site genotyping">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the region described in this record">
##GAPS=1:10015-10030 with 1:10020:A:T
##MULTI_OVERLAP=1:10013:T:C and 1:10014:A:T with 1:10011:ATTT:A
##INSERTION_GAP=1:10031:T:TAAA does not overlap with any from here
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2
1 1 . N <NON_REF> . . END=10003 GT:DP .:.
1 10004 . C <NON_REF> . . END=10012 GT:DP:AD 0/0:3:2,1
1 10013 . T C 2 . . GT:DP:AD 0/1:30:10,20
1 10014 . T A 2 . . GT:DP:AD 0/1:31:11,21
1 10031 . T G 1 . . GT:DP:AD 0/1:32:12,22
1 10032 . A <NON_REF> . . END=10034 GT:DP:AD 0/0:6:5,1
1 10035 . A G 1 . . GT:DP:AD 0/1:33:13,23
1 10036 . A <NON_REF> . . END=10043 GT:DP:AD 0/0:5:4,1

0 comments on commit 3a85707

Please sign in to comment.