Skip to content

Commit

Permalink
storage: Allow filter region+ct and gene+ct using sample index #1881
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Jan 21, 2022
1 parent a122c43 commit 7da0666
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -316,7 +316,7 @@ public Query next(int batchSize) {
// Always execute "next" over variantsIterator, to fail if empty
variants.add(variantsIterator.next());
} while (variantsIterator.hasNext() && variants.size() < batchSize);
newQuery.append(VariantQueryParam.ID.key(), variants);
newQuery.append(VariantQueryUtils.ID_INTERSECT.key(), variants);
lastBatch = variants;
totalBatchSizeCount += variants.size();
lastBatchSize = variants.size();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ public final class VariantQueryUtils {
public static final String GT = "GT";

// Some private query params
public static final QueryParam ID_INTERSECT = QueryParam.create("id_intersect",
"List of VariantIds that should be intersected with the rest of positional filters", QueryParam.Type.TEXT_ARRAY);
public static final QueryParam ANNOT_EXPRESSION_GENES = QueryParam.create("annot_expression_genes", "", QueryParam.Type.TEXT_ARRAY);
public static final QueryParam ANNOT_GO_GENES = QueryParam.create("annot_go_genes", "", QueryParam.Type.TEXT_ARRAY);
public static final QueryParam ANNOT_GENE_REGIONS = QueryParam.create("annot_gene_regions", "", QueryParam.Type.TEXT_ARRAY);
Expand All @@ -87,8 +89,11 @@ public final class VariantQueryUtils {
"", QueryParam.Type.TEXT_ARRAY);
public static final QueryParam NUM_SAMPLES = QueryParam.create("numSamples", "", QueryParam.Type.INTEGER);
public static final QueryParam NUM_TOTAL_SAMPLES = QueryParam.create("numTotalSamples", "", QueryParam.Type.INTEGER);

public static final String NON_EXISTING_REGION = "non_existing_region";
public static final List<QueryParam> INTERNAL_VARIANT_QUERY_PARAMS = Arrays.asList(ANNOT_EXPRESSION_GENES,
public static final List<QueryParam> INTERNAL_VARIANT_QUERY_PARAMS = Arrays.asList(
ID_INTERSECT,
ANNOT_EXPRESSION_GENES,
ANNOT_GO_GENES,
ANNOT_GENE_REGIONS,
VARIANTS_TO_INDEX,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,18 @@
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.core.response.VariantQueryResult;
import org.opencb.opencga.core.config.storage.StorageConfiguration;
import org.opencb.opencga.core.response.VariantQueryResult;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.exceptions.VariantSearchException;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.*;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils;
import org.opencb.opencga.storage.core.variant.search.solr.VariantSearchManager;
import org.opencb.opencga.storage.core.variant.search.solr.SolrNativeIterator;
import org.opencb.opencga.storage.core.variant.search.solr.VariantSearchManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -29,7 +32,6 @@
import java.util.stream.Collectors;

import static org.opencb.opencga.storage.core.variant.VariantStorageOptions.*;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam.ID;
import static org.opencb.opencga.storage.core.variant.query.VariantQueryUtils.MODIFIER_QUERY_PARAMS;
import static org.opencb.opencga.storage.core.variant.search.VariantSearchUtils.*;
import static org.opencb.opencga.storage.core.variant.search.solr.VariantSearchManager.SEARCH_ENGINE_ID;
Expand Down Expand Up @@ -207,7 +209,7 @@ public VariantQueryResult<Long> approximateCount(Query query, QueryOptions optio
// Do not count if empty. It will not apply the filter and count through the whole database.
numResults = 0;
} else {
engineQuery.put(ID.key(), variantIds);
engineQuery.put(VariantQueryUtils.ID_INTERSECT.key(), variantIds);
numResults = dbAdaptor.count(engineQuery).first();
}
logger.debug("NumResults: {}, NumSearchResults: {}, NumSamples: {}", numResults, numSearchResults, sampling);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package org.opencb.opencga.storage.hadoop.variant.adaptors;

import com.google.common.collect.Iterables;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hbase.client.Scan;
Expand Down Expand Up @@ -116,6 +117,11 @@ public static Set<String> unsupportedParamsFromQuery(Query query) {
}
otherParams.remove(ID);
}

if ((isValidParam(query, REGION) || isValidParam(query, ANNOT_GENE_REGIONS) || isValidParam(query, ID))
&& isValidParam(query, ID_INTERSECT)) {
messages.add("Unable to mix REGION or ID with ID_INTERSECT");
}
// if (otherParams.contains(REGION)) {
// if (query.getAsStringList(REGION.key()).size() != 1) {
// messages.add("Only one region is supported at a time");
Expand Down Expand Up @@ -203,6 +209,7 @@ public List<Scan> parseQueryMultiRegion(VariantQueryProjection selectElements, Q

List<Region> regions = getRegions(query);
List<Variant> variants = xrefs.getVariants();
List<Variant> idIntersect = query.getAsStringList(ID_INTERSECT.key()).stream().map(Variant::new).collect(Collectors.toList());

regions = mergeRegions(regions);
if (!regions.isEmpty()) {
Expand All @@ -215,15 +222,16 @@ public List<Scan> parseQueryMultiRegion(VariantQueryProjection selectElements, Q
}

List<Scan> scans;
if ((regions.isEmpty() || regions.size() == 1) && variants.isEmpty()) {
if ((regions.isEmpty() || regions.size() == 1) && variants.isEmpty() && idIntersect.isEmpty()) {
scans = Collections.singletonList(parseQuery(selectElements, query, options));
} else {
scans = new ArrayList<>(regions.size() + variants.size());
scans = new ArrayList<>(regions.size() + variants.size() + idIntersect.size());
Query subQuery = new Query(query);
subQuery.remove(REGION.key());
subQuery.remove(ANNOT_GENE_REGIONS.key());
subQuery.remove(ANNOT_XREF.key());
subQuery.remove(ID.key());
subQuery.remove(ID_INTERSECT.key());

subQuery.put(REGION.key(), "MULTI_REGION");
Scan templateScan = parseQuery(selectElements, subQuery, options);
Expand All @@ -239,11 +247,11 @@ public List<Scan> parseQueryMultiRegion(VariantQueryProjection selectElements, Q
}
}
subQuery.remove(REGION.key());
for (Variant variant : variants) {

for (Variant variant : Iterables.concat(variants, idIntersect)) {
subQuery.put(ID.key(), variant);
try {
Scan scan = new Scan(templateScan);
scan.setSmall(true);
addVariantIdFilter(scan, variant);
scans.add(scan);
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,22 @@ protected List<String> getRegionFilters(Query query, List<String> otherFilters)
}
}

if (isValidParam(query, ID_INTERSECT)) {
List<Variant> idIntersect = query.getAsStringList(ID_INTERSECT.key()).stream().map(Variant::new).collect(Collectors.toList());
String idIntersectFilter = getVariantFilter(idIntersect);
if (regionFilters.isEmpty()) {
logger.info("ID_INTERSECT with {} variants", idIntersect.size());
regionFilters.add(idIntersectFilter);
} else {
logger.info("ID_INTERSECT with {} variants, and {} other region filters", idIntersect.size(), regionFilters.size());
String allRegionFilters = appendFilters(regionFilters, QueryOperation.OR);
String allRegionFiltersAndIdIntersect = appendFilters(
Arrays.asList(idIntersectFilter, allRegionFilters), QueryOperation.AND);
regionFilters.clear();
regionFilters.add(allRegionFiltersAndIdIntersect);
}
}

// if (regionFilters.isEmpty()) {
// // chromosome != _METADATA
// regionFilters.add(VariantColumn.CHROMOSOME + " != '" + genomeHelper.getMetaRowKeyString() + "'");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,24 +424,50 @@ public SampleIndexQuery parse(Query query) {
}
}

ParsedVariantQuery.VariantQueryXref xref = VariantQueryParser.parseXrefs(query);
boolean mixRegionsOrIdsWithGenesAndCtBt =
(isValidParam(query, REGION) || !xref.getVariants().isEmpty())
&&
(!xref.getGenes().isEmpty() && (isValidParam(query, ANNOT_CONSEQUENCE_TYPE) || isValidParam(query, ANNOT_BIOTYPE)));

// Extract regions
List<Region> regions = new ArrayList<>();
if (isValidParam(query, REGION)) {
regions.addAll(Region.parseRegions(query.getString(REGION.key()), true));
query.remove(REGION.key());
if (!mixRegionsOrIdsWithGenesAndCtBt) {
query.remove(REGION.key());
}
// else {
// logger.info("Keep region!");
// }
}
// Extract IDs
List<Variant> variants = xref.getVariants();
if (!variants.isEmpty()) {
if (!mixRegionsOrIdsWithGenesAndCtBt) {
query.remove(ID.key());
}
// else {
// logger.info("Keep variants!");
// }
}

if (isValidParam(query, ANNOT_GENE_REGIONS)) {
regions.addAll(Region.parseRegions(query.getString(ANNOT_GENE_REGIONS.key()), true));
if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE) || isValidParam(query, ANNOT_BIOTYPE)) {
query.put(ANNOT_GENE_REGIONS.key(), SKIP_GENE_REGIONS);
} else {
query.remove(ANNOT_GENE_REGIONS.key());
query.remove(GENE.key());
if (!mixRegionsOrIdsWithGenesAndCtBt) {
if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE) || isValidParam(query, ANNOT_BIOTYPE)) {
query.put(ANNOT_GENE_REGIONS.key(), SKIP_GENE_REGIONS);
} else {
query.remove(ANNOT_GENE_REGIONS.key());
query.remove(GENE.key());
}
}
// else {
// logger.info("Keep genes!");
// }
}

Collection<LocusQuery> regionGroups = buildLocusQueries(regions, VariantQueryParser.parseXrefs(query).getVariants());
Collection<LocusQuery> regionGroups = buildLocusQueries(regions, variants);

return new SampleIndexQuery(schema, regionGroups, variantTypes, study, samplesMap, multiFileSamples, negatedSamples,
fatherFilterMap, motherFilterMap,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,11 @@ public void testQueryAnnotationIndex() throws Exception {
// Should NOT return the variant // 11:62951221:C:G
testQueryAnnotationIndex(new Query().append(GENE.key(), "SLC22A10").append(ANNOT_CONSEQUENCE_TYPE.key(), "missense_variant"));

// Should return the variant // 11:62951221:C:G
testQueryAnnotationIndex(new Query().append(GENE.key(), "SLC22A25")
.append(REGION.key(), "2")
.append(ANNOT_CONSEQUENCE_TYPE.key(), "missense_variant"));


testQueryAnnotationIndex(new Query(ANNOT_CONSEQUENCE_TYPE.key(), "missense_variant,stop_gained")
.append(ANNOT_BIOTYPE.key(), "protein_coding")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package org.opencb.opencga.storage.mongodb.variant.adaptors;

import com.google.common.collect.Iterables;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.QueryBuilder;
Expand Down Expand Up @@ -118,10 +119,11 @@ protected Document parseQuery(ParsedVariantQuery parsedVariantQuery) {
variantQueryXref.getOtherXrefs(), builder, QueryOperation.OR);
}

if (!variantQueryXref.getVariants().isEmpty()) {
List<Variant> idIntersect = query.getAsStringList(ID_INTERSECT.key()).stream().map(Variant::new).collect(Collectors.toList());
if (!variantQueryXref.getVariants().isEmpty() || !idIntersect.isEmpty()) {
nonGeneRegionFilter = true;
List<String> mongoIds = new ArrayList<>(variantQueryXref.getVariants().size());
for (Variant variant : variantQueryXref.getVariants()) {
List<String> mongoIds = new ArrayList<>(variantQueryXref.getVariants().size() + idIntersect.size());
for (Variant variant : Iterables.concat(idIntersect, variantQueryXref.getVariants())) {
mongoIds.add(STRING_ID_CONVERTER.buildId(variant));
}
if (mongoIds.size() == 1) {
Expand Down

0 comments on commit 7da0666

Please sign in to comment.