From cb9c9ef7914f50915af2cfac65db49b96ad830c3 Mon Sep 17 00:00:00 2001 From: pfurio Date: Tue, 10 Jan 2023 12:56:04 +0100 Subject: [PATCH 01/27] analysis: properly count deletion overlap pairs, #TASK-2478 --- .../org/opencb/opencga/analysis/rga/RgaUtils.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java index 10bec244387..74c8325746e 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java @@ -978,7 +978,7 @@ public int getNumPairedCompHetIds() { } public int getNumPairedDelOverlapIds() { - int numPairedDelOverlap = 0; + Set delOverlapPairs = new HashSet<>(); for (Map.Entry> entry : transcriptDelOverlapIdsMap.entrySet()) { Set chSet = entry.getValue(); if (chSet.size() > 1) { @@ -987,13 +987,14 @@ public int getNumPairedDelOverlapIds() { for (int j = i + 1; j < variantList.size(); j++) { // We simply check if two variants overlap. If they do, they are a valid pair if (variantList.get(i).overlapWith(variantList.get(j), true)) { - numPairedDelOverlap++; + String pair = concatSortedVariants(variantList.get(i).toString(), variantList.get(j).toString()); + delOverlapPairs.add(pair); } } } } } - return numPairedDelOverlap; + return delOverlapPairs.size(); } public int getNumHomIds() { @@ -1030,6 +1031,10 @@ public Map> getTranscriptCompHetIdsMap() { } return compHetMap; } + + private String concatSortedVariants(String v1, String v2) { + return StringUtils.compare(v1, v2) <= 0 ? v1 + "__" + v2 : v2 + "__" + v1; + } } } From 005ad3667404f479c3a1b2c91fd89c8a0436b0ae Mon Sep 17 00:00:00 2001 From: pfurio Date: Wed, 11 Jan 2023 12:30:49 +0100 Subject: [PATCH 02/27] analysis: exclude missense variants for DEL_OVERLAP, #TASK-2478 --- .../opencga/analysis/rga/RgaQueryParser.java | 63 +++++++++++++++++-- .../opencb/opencga/analysis/rga/RgaUtils.java | 31 ++++----- 2 files changed, 73 insertions(+), 21 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index a4f27825c1d..32f8c4db934 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -15,6 +15,7 @@ import java.util.*; import java.util.function.Predicate; import java.util.regex.Pattern; +import java.util.stream.Collectors; import static org.opencb.opencga.analysis.rga.RgaQueryParams.*; import static org.opencb.opencga.core.models.analysis.knockout.KnockoutVariant.KnockoutType.*; @@ -28,6 +29,25 @@ public class RgaQueryParser { protected static Logger logger = LoggerFactory.getLogger(RgaQueryParser.class); + private static final List ALL_CONSEQUENCE_TYPES; + private static final List ALL_PAIRED_CONSEQUENCE_TYPES; + private static final List INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES; + private static final List INCLUDED_DEL_OVERLAP_PAIR_CTS; + + static { + List excludedDelOverlapCts = getEncodedConsequenceTypes(Collections.singletonList("missense_variant")); +// List excludedDelOverlapCts = getEncodedConsequenceTypes(Collections.singletonList("transcript_ablation")); + + // Exclude DELETION_OVERLAP variants with consequence types: missense_variant + ALL_CONSEQUENCE_TYPES = getEncodedConsequenceTypes(RgaUtils.CONSEQUENCE_TYPE_LIST); + ALL_PAIRED_CONSEQUENCE_TYPES = generateSortedCombinations(ALL_CONSEQUENCE_TYPES); + INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES = ALL_CONSEQUENCE_TYPES + .stream() + .filter(ct -> !excludedDelOverlapCts.contains(ct)) + .collect(Collectors.toList()); + INCLUDED_DEL_OVERLAP_PAIR_CTS = generateSortedCombinations(INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES); + } + public RgaQueryParser() { this(CompHetQueryMode.SINGLE); } @@ -265,7 +285,7 @@ private void buildComplexQueryFilter(List filterList, List knock buildComplexQuery(koValues, filterValues, ctValues, popFreqQueryList, filterList); } - private List getEncodedConsequenceTypes(List originalCtList) { + private static List getEncodedConsequenceTypes(List originalCtList) { if (CollectionUtils.isEmpty(originalCtList)) { return Collections.emptyList(); } @@ -281,6 +301,8 @@ private void buildComplexQuery(List koValues, List filterValues, Map> popFreqQueryList, List filterList) throws RgaException { String encodedChString = RgaUtils.encode(COMP_HET.name()); + String delOverlap = RgaUtils.parseKnockoutTypeQuery(Collections.singletonList(DELETION_OVERLAP.name())).get(0); + List chFilterValues = filterValues; List chCtValues = ctValues; if (compHetQueryMode.equals(CompHetQueryMode.PAIR)) { @@ -294,8 +316,13 @@ private void buildComplexQuery(List koValues, List filterValues, List orFilterList = new LinkedList<>(); for (String koValue : koValues) { List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; + List ctList = koValue.equals(delOverlap) ? INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES : ALL_CONSEQUENCE_TYPES; for (String filterVal : finalFilterValues) { - orFilterList.add(koValue + SEPARATOR + filterVal); + // This is how it should be filtered +// orFilterList.add(koValue + SEPARATOR + filterVal); + for (String ctValue : ctList) { + orFilterList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue); + } } } parseStringValue(orFilterList, RgaDataModel.COMPOUND_FILTERS, filterList, "||"); @@ -313,6 +340,10 @@ private void buildComplexQuery(List koValues, List filterValues, List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; for (String filterVal : finalFilterValues) { for (String ctValue : finalCtValues) { + if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_PAIR_CTS.contains(ctValue)) { + // Don't process this filter + continue; + } orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR + sortedPopFreq.get(1)); } @@ -329,6 +360,10 @@ private void buildComplexQuery(List koValues, List filterValues, List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; for (String filterVal : finalFilterValues) { for (String ctValue : finalCtValues) { + if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { + // Don't process this filter + continue; + } if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreq + SEPARATOR + popFreq); @@ -351,6 +386,10 @@ private void buildComplexQuery(List koValues, List filterValues, List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; for (String filterVal : finalFilterValues) { for (String ctValue : finalCtValues) { + if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { + // Don't process this filter + continue; + } orFilterList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue); } } @@ -367,10 +406,15 @@ private void buildComplexQuery(List koValues, List filterValues, for (List sortedPopFreq : sortedPopFreqs) { for (String koValue : koValues) { List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; + List ctList = koValue.equals(delOverlap) ? INCLUDED_DEL_OVERLAP_PAIR_CTS : ALL_PAIRED_CONSEQUENCE_TYPES; for (String filterVal : finalFilterValues) { - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR - + sortedPopFreq.get(1)); - + // This is how it should be filtered +// orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR +// + sortedPopFreq.get(1)); + for (String ctValue : ctList) { + orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) + + SEPARATOR + sortedPopFreq.get(1)); + } } } } @@ -380,9 +424,15 @@ private void buildComplexQuery(List koValues, List filterValues, List orQueryList = new LinkedList<>(); for (String popFreq : tmpPopFreqList) { for (String koValue : koValues) { + List ctList = koValue.equals(delOverlap) ? INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES + : ALL_CONSEQUENCE_TYPES; List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; for (String filterVal : finalFilterValues) { - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + popFreq); + // This is how it should be filtered +// orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + popFreq); + for (String ctValue : ctList) { + orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreq); + } } } } @@ -391,6 +441,7 @@ private void buildComplexQuery(List koValues, List filterValues, } parseStringValue(andQueryList, RgaDataModel.COMPOUND_FILTERS, filterList, "&&"); } + } public static List generateSortedCombinations(List list) { diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java index 74c8325746e..03c70b64522 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java @@ -43,6 +43,21 @@ class RgaUtils { public static final Set ALL_PARAMS; public static final Map> PARAM_TYPES; + // CONSEQUENCE TYPE + public static final List CONSEQUENCE_TYPE_LIST = Arrays.asList("start_retained_variant", "upstream_variant", + "3_prime_UTR_variant", "splice_acceptor_variant", "transcript_amplification", "upstream_gene_variant", + "RNA_polymerase_promoter", "non_coding_transcript_exon_variant", "non_coding_transcript_variant", "inframe_variant", + "transcript_ablation", "splice_donor_variant", "synonymous_variant", "feature_elongation", "feature_truncation", + "miRNA_target_site", "exon_variant", "downstream_gene_variant", "stop_retained_variant", "TF_binding_site_variant", + "initiator_codon_variant", "coding_sequence_variant", "protein_altering_variant", "intergenic_variant", + "terminator_codon_variant", "frameshift_variant", "DNAseI_hypersensitive_site", "feature_variant", "2KB_downstream_variant", + "intron_variant", "splice_region_variant", "5_prime_UTR_variant", "SNP", "stop_gained", "regulatory_region_amplification", + "2KB_upstream_variant", "miRNA", "lincRNA", "start_lost", "SNV", "CpG_island", "downstream_variant", + "NMD_transcript_variant", "2KB_downstream_gene_variant", "TFBS_amplification", "missense_variant", + "regulatory_region_ablation", "mature_miRNA_variant", "stop_lost", "structural_variant", "regulatory_region_variant", + "TFBS_ablation", "copy_number_change", "2KB_upstream_gene_variant", "polypeptide_variation_site", "inframe_deletion", + "inframe_insertion", "incomplete_terminal_codon_variant"); + private static final Logger logger; static { @@ -60,21 +75,7 @@ class RgaUtils { ENCODE_MAP.put(PASS, "P"); ENCODE_MAP.put(NOT_PASS, "NP"); - // CONSEQUENCE TYPE - List consequenceTypeList = Arrays.asList("start_retained_variant", "upstream_variant", "3_prime_UTR_variant", - "splice_acceptor_variant", "transcript_amplification", "upstream_gene_variant", "RNA_polymerase_promoter", - "non_coding_transcript_exon_variant", "non_coding_transcript_variant", "inframe_variant", "transcript_ablation", - "splice_donor_variant", "synonymous_variant", "feature_elongation", "feature_truncation", "miRNA_target_site", - "exon_variant", "downstream_gene_variant", "stop_retained_variant", "TF_binding_site_variant", "initiator_codon_variant", - "coding_sequence_variant", "protein_altering_variant", "intergenic_variant", "terminator_codon_variant", - "frameshift_variant", "DNAseI_hypersensitive_site", "feature_variant", "2KB_downstream_variant", "intron_variant", - "splice_region_variant", "5_prime_UTR_variant", "SNP", "stop_gained", "regulatory_region_amplification", - "2KB_upstream_variant", "miRNA", "lincRNA", "start_lost", "SNV", "CpG_island", "downstream_variant", - "NMD_transcript_variant", "2KB_downstream_gene_variant", "TFBS_amplification", "missense_variant", - "regulatory_region_ablation", "mature_miRNA_variant", "stop_lost", "structural_variant", "regulatory_region_variant", - "TFBS_ablation", "copy_number_change", "2KB_upstream_gene_variant", "polypeptide_variation_site", "inframe_deletion", - "inframe_insertion", "incomplete_terminal_codon_variant"); - for (String consequenceType : consequenceTypeList) { + for (String consequenceType : CONSEQUENCE_TYPE_LIST) { ENCODE_MAP.put(consequenceType, String.valueOf(VariantQueryUtils.parseConsequenceType(consequenceType))); } From 47b509c5bec3c85011a1c0c81f1cfd01627d6146 Mon Sep 17 00:00:00 2001 From: pfurio Date: Wed, 18 Jan 2023 11:25:01 +0100 Subject: [PATCH 03/27] analysis: add cts to exclude, #TASK-2478 --- .../java/org/opencb/opencga/analysis/rga/RgaQueryParser.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index 32f8c4db934..589252cdbfc 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -35,8 +35,9 @@ public class RgaQueryParser { private static final List INCLUDED_DEL_OVERLAP_PAIR_CTS; static { - List excludedDelOverlapCts = getEncodedConsequenceTypes(Collections.singletonList("missense_variant")); -// List excludedDelOverlapCts = getEncodedConsequenceTypes(Collections.singletonList("transcript_ablation")); + List excludedDelOverlapCts = getEncodedConsequenceTypes(Arrays.asList("missense_variant", "frameshift_variant", + "incomplete_terminal_codon_variant", "start_lost", "stop_gained", "stop_lost", "splice_acceptor_variant", + "splice_donor_variant", "splice_region_variant")); // Exclude DELETION_OVERLAP variants with consequence types: missense_variant ALL_CONSEQUENCE_TYPES = getEncodedConsequenceTypes(RgaUtils.CONSEQUENCE_TYPE_LIST); From 99a25da1478de1b1bf6d0267857da6eb2253a452 Mon Sep 17 00:00:00 2001 From: pfurio Date: Tue, 24 Jan 2023 16:52:22 +0100 Subject: [PATCH 04/27] analysis: calculate missing chPairVariantStats, #TASK-2478 --- .../analysis/rga/GeneRgaConverter.java | 2 +- .../opencga/analysis/rga/RgaManager.java | 23 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/GeneRgaConverter.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/GeneRgaConverter.java index bfc61a2fab2..84a0203a155 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/GeneRgaConverter.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/GeneRgaConverter.java @@ -19,7 +19,7 @@ public class GeneRgaConverter extends AbstractRgaConverter { static { CONVERTER_MAP = new HashMap<>(); // We always include individual id in the response because we always want to return the numIndividuals populated - CONVERTER_MAP.put("id", Arrays.asList(RgaDataModel.GENE_ID, RgaDataModel.INDIVIDUAL_ID)); + CONVERTER_MAP.put("id", Arrays.asList(RgaDataModel.GENE_ID, RgaDataModel.INDIVIDUAL_ID, RgaDataModel.CH_PAIRS)); CONVERTER_MAP.put("name", Arrays.asList(RgaDataModel.GENE_ID, RgaDataModel.GENE_NAME, RgaDataModel.INDIVIDUAL_ID)); CONVERTER_MAP.put("chromosome", Arrays.asList(RgaDataModel.GENE_ID, RgaDataModel.CHROMOSOME, RgaDataModel.INDIVIDUAL_ID)); CONVERTER_MAP.put("start", Arrays.asList(RgaDataModel.GENE_ID, RgaDataModel.START, RgaDataModel.INDIVIDUAL_ID)); diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index c534e03d9ec..ba28c0a7750 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1533,14 +1533,26 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer // 1. Get KnockoutByGene information Query individualQuery = new Query(RgaQueryParams.GENE_ID.key(), geneId); QueryOptions options = new QueryOptions() - .append(QueryOptions.LIMIT, 1) .append(QueryOptions.EXCLUDE, "individuals"); RgaIterator rgaIterator = rgaEngine.geneQuery(collection, individualQuery, options); if (!rgaIterator.hasNext()) { throw RgaException.noResultsMatching(); } - RgaDataModel rgaDataModel = rgaIterator.next(); + + KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); + RgaDataModel rgaDataModel = null; + while (rgaIterator.hasNext()) { + rgaDataModel = rgaIterator.next(); + if (CollectionUtils.isNotEmpty(rgaDataModel.getChPairs())) { + for (String chPair : rgaDataModel.getChPairs()) { + CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(chPair); + knockoutTypeCount.processChPairFeature(codedChPairVariants); + } + } + } + + // To get the basic gene information, we can use any document from RgaDataModel. In this case, we use the last document KnockoutByGeneSummary geneSummary = new KnockoutByGeneSummary(rgaDataModel.getGeneId(), rgaDataModel.getGeneName(), rgaDataModel.getChromosome(), rgaDataModel.getStart(), rgaDataModel.getEnd(), rgaDataModel.getStrand(), rgaDataModel.getGeneBiotype(), null, null); @@ -1550,13 +1562,6 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.VARIANT_SUMMARY); DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, knockoutTypeFacet); - KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); - if (CollectionUtils.isNotEmpty(rgaDataModel.getChPairs())) { - for (String chPair : rgaDataModel.getChPairs()) { - CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(chPair); - knockoutTypeCount.processChPairFeature(codedChPairVariants); - } - } for (FacetField.Bucket variantBucket : facetFieldDataResult.first().getBuckets()) { CodedVariant codedFeature = CodedVariant.parseEncodedId(variantBucket.getValue()); From 7aeddf735d436bfb9bea7c6b7e607a46f6826721 Mon Sep 17 00:00:00 2001 From: pfurio Date: Wed, 25 Jan 2023 16:23:58 +0100 Subject: [PATCH 05/27] analysis: fix HOM_ALT filters with 2 pop freqs, #TASK-2478 --- .../opencga/analysis/rga/RgaQueryParser.java | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index 589252cdbfc..e79dcc32442 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -301,8 +301,7 @@ private static List getEncodedConsequenceTypes(List originalCtLi private void buildComplexQuery(List koValues, List filterValues, List ctValues, Map> popFreqQueryList, List filterList) throws RgaException { String encodedChString = RgaUtils.encode(COMP_HET.name()); - - String delOverlap = RgaUtils.parseKnockoutTypeQuery(Collections.singletonList(DELETION_OVERLAP.name())).get(0); + String delOverlap = RgaUtils.encode(DELETION_OVERLAP.name()); List chFilterValues = filterValues; List chCtValues = ctValues; @@ -399,7 +398,7 @@ private void buildComplexQuery(List koValues, List filterValues, } else { // POP_FREQ not empty // KT + FILTER + POP_FREQ List andQueryList = new ArrayList<>(popFreqQueryList.size()); - if (popFreqQueryList.size() == 2) { + if (popFreqQueryList.size() == 2) { // + 2x POP FREQ ArrayList popFreqKeys = new ArrayList<>(popFreqQueryList.keySet()); List> sortedPopFreqs = RgaUtils.generateSortedCombinations(popFreqQueryList.get(popFreqKeys.get(0)), popFreqQueryList.get(popFreqKeys.get(1))); @@ -412,15 +411,36 @@ private void buildComplexQuery(List koValues, List filterValues, // This is how it should be filtered // orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR // + sortedPopFreq.get(1)); - for (String ctValue : ctList) { - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) - + SEPARATOR + sortedPopFreq.get(1)); + if (koValue.equals(delOverlap)) { + for (String ctValue : ctList) { + orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) + + SEPARATOR + sortedPopFreq.get(1)); + } +// } else if (koValue.equals(encodedChString)) { +// orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR +// + sortedPopFreq.get(1)); +// } else { +// List tmpAndQueryList = new ArrayList<>(2); +// tmpAndQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0)); +// tmpAndQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(1)); +// parseStringValue(tmpAndQueryList, "", orQueryList, "&&"); +// } + } else { + List tmpAndQueryList = new ArrayList<>(2); + tmpAndQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0)); + tmpAndQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(1)); + parseStringValue(tmpAndQueryList, "", orQueryList, "&&"); + + if (koValue.equals(encodedChString)) { + orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR + + sortedPopFreq.get(1)); + } } } } } parseStringValue(orQueryList, "", andQueryList, "||"); - } else { + } else { // + 1x POP FREQ for (List tmpPopFreqList : popFreqQueryList.values()) { List orQueryList = new LinkedList<>(); for (String popFreq : tmpPopFreqList) { From d0f5eeeb7a4da0e4635662c9f60454a3d811bd62 Mon Sep 17 00:00:00 2001 From: pfurio Date: Thu, 26 Jan 2023 12:03:40 +0100 Subject: [PATCH 06/27] analysis: ensure RGA always have the same behaviour, #TASK-2478 When no KnockoutType filters are applied, we internally add all possible values to the query to ensure we always get the same behaviour. --- .../opencga/analysis/rga/RgaManager.java | 95 ++++++++++++------- .../rga/iterators/SolrNativeIterator.java | 3 + 2 files changed, 63 insertions(+), 35 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index ba28c0a7750..c3f334177ba 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -69,6 +69,8 @@ public class RgaManager implements AutoCloseable { private final GeneRgaConverter geneConverter; private final VariantRgaConverter variantConverter; + private static final RgaQueryParams.CompHetQueryMode COMP_HET_QUERY_MODE = RgaQueryParams.CompHetQueryMode.PAIR; + private final Logger logger; private static final int KNOCKOUT_INSERT_BATCH_SIZE = 25; @@ -78,7 +80,7 @@ public RgaManager(CatalogManager catalogManager, VariantStorageManager variantSt this.catalogManager = catalogManager; this.storageConfiguration = variantStorageManager.getStorageConfiguration(); // TODO: Add CompHetQueryMode to configuration file in v2.5.0 - this.rgaEngine = new RgaEngine(this.storageConfiguration, RgaQueryParams.CompHetQueryMode.PAIR); + this.rgaEngine = new RgaEngine(this.storageConfiguration, COMP_HET_QUERY_MODE); this.variantStorageManager = variantStorageManager; this.individualRgaConverter = new IndividualRgaConverter(); @@ -508,11 +510,13 @@ public OpenCGAResult individualQuery(String studyStr, Quer Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String collection = getMainCollectionName(study.getFqn()); + Query finalQuery = parseQuery(query); + StopWatch stopWatch = new StopWatch(); stopWatch.start(); Preprocess preprocess; try { - preprocess = individualQueryPreprocess(study, query, options, token); + preprocess = individualQueryPreprocess(study, finalQuery, options, token); } catch (RgaException e) { if (RgaException.NO_RESULTS_FOUND.equals(e.getMessage())) { stopWatch.stop(); @@ -524,7 +528,7 @@ public OpenCGAResult individualQuery(String studyStr, Quer } VariantDBIterator variantDBIterator = VariantDBIterator.EMPTY_ITERATOR; - if (query.containsKey(RgaQueryParams.VARIANTS.key())) { + if (finalQuery.containsKey(RgaQueryParams.VARIANTS.key())) { try { variantDBIterator = variantStorageQuery(studyStr, preprocess.getQuery().getAsStringList(RgaQueryParams.SAMPLE_ID.key()), preprocess.getQuery(), QueryOptions.empty(), token); @@ -599,7 +603,7 @@ public OpenCGAResult geneQuery(String studyStr, Query query, List includeIndividuals = queryOptions.getAsStringList(RgaQueryParams.INCLUDE_INDIVIDUAL); Boolean isOwnerOrAdmin = catalogManager.getAuthorizationManager().isOwnerOrAdmin(study.getUid(), userId); - Query auxQuery = query != null ? new Query(query) : new Query(); + Query finalQuery = parseQuery(query); // Get number of matches Future numMatchesFuture = null; @@ -607,7 +611,7 @@ public OpenCGAResult geneQuery(String studyStr, Query query, numMatchesFuture = executor.submit(() -> { QueryOptions facetOptions = new QueryOptions(QueryOptions.FACET, "unique(" + RgaQueryParams.GENE_ID.key() + ")"); try { - DataResult result = rgaEngine.facetedQuery(collection, auxQuery, facetOptions); + DataResult result = rgaEngine.facetedQuery(collection, finalQuery, facetOptions); return ((Number) result.first().getAggregationValues().get(0)).intValue(); } catch (Exception e) { logger.error("Could not obtain the count: {}", e.getMessage(), e); @@ -618,8 +622,8 @@ public OpenCGAResult geneQuery(String studyStr, Query query, List geneIds; try { - geneIds = getGeneIds(collection, auxQuery, queryOptions); - auxQuery.put(RgaQueryParams.GENE_ID.key(), geneIds); + geneIds = getGeneIds(collection, finalQuery, queryOptions); + finalQuery.put(RgaQueryParams.GENE_ID.key(), geneIds); } catch (RgaException e) { if (RgaException.NO_RESULTS_FOUND.equals(e.getMessage())) { return OpenCGAResult.empty(RgaKnockoutByGene.class, (int) stopWatch.getTime(TimeUnit.MILLISECONDS)); @@ -641,7 +645,7 @@ public OpenCGAResult geneQuery(String studyStr, Query query, includeSampleIds = new HashSet<>((List) authorisedSampleIdResult.getResults()); } else { // 2. Check permissions - DataResult result = rgaEngine.facetedQuery(collection, auxQuery, + DataResult result = rgaEngine.facetedQuery(collection, finalQuery, new QueryOptions(QueryOptions.FACET, RgaDataModel.SAMPLE_ID).append(QueryOptions.LIMIT, -1)); if (result.getNumResults() == 0) { stopWatch.stop(); @@ -674,7 +678,7 @@ public OpenCGAResult geneQuery(String studyStr, Query query, includeSampleIds = new HashSet<>((List) sampleResult.getResults()); } - RgaIterator rgaIterator = rgaEngine.geneQuery(collection, auxQuery, queryOptions); + RgaIterator rgaIterator = rgaEngine.geneQuery(collection, finalQuery, queryOptions); int skipIndividuals = queryOptions.getInt(RgaQueryParams.SKIP_INDIVIDUAL); int limitIndividuals = queryOptions.getInt(RgaQueryParams.LIMIT_INDIVIDUAL, RgaQueryParams.DEFAULT_INDIVIDUAL_LIMIT); @@ -733,12 +737,12 @@ public OpenCGAResult variantQuery(String studyStr, Query quer List includeIndividuals = queryOptions.getAsStringList(RgaQueryParams.INCLUDE_INDIVIDUAL); Boolean isOwnerOrAdmin = catalogManager.getAuthorizationManager().isOwnerOrAdmin(study.getUid(), userId); - Query auxQuery = query != null ? new Query(query) : new Query(); + Query finalQuery = parseQuery(query); ResourceIds resourceIds; try { - resourceIds = getVariantIds(collection, auxCollection, auxQuery, queryOptions, executor); - auxQuery.put(RgaDataModel.VARIANTS, resourceIds.getIds()); + resourceIds = getVariantIds(collection, auxCollection, finalQuery, queryOptions, executor); + finalQuery.put(RgaDataModel.VARIANTS, resourceIds.getIds()); } catch (RgaException e) { if (RgaException.NO_RESULTS_FOUND.equals(e.getMessage())) { return OpenCGAResult.empty(KnockoutByVariant.class, (int) stopWatch.getTime(TimeUnit.MILLISECONDS)); @@ -759,7 +763,7 @@ public OpenCGAResult variantQuery(String studyStr, Query quer includeSampleIds = new HashSet<>((List) authorisedSampleIdResult.getResults()); } else { // 2. Check permissions - DataResult result = rgaEngine.facetedQuery(collection, auxQuery, + DataResult result = rgaEngine.facetedQuery(collection, finalQuery, new QueryOptions(QueryOptions.FACET, RgaDataModel.SAMPLE_ID).append(QueryOptions.LIMIT, -1)); if (result.getNumResults() == 0) { stopWatch.stop(); @@ -794,10 +798,10 @@ public OpenCGAResult variantQuery(String studyStr, Query quer } Future variantFuture = executor.submit( - () -> variantStorageQuery(study.getFqn(), new ArrayList<>(includeSampleIds), auxQuery, options, token) + () -> variantStorageQuery(study.getFqn(), new ArrayList<>(includeSampleIds), finalQuery, options, token) ); - Future rgaIteratorFuture = executor.submit(() -> rgaEngine.variantQuery(collection, auxQuery, queryOptions)); + Future rgaIteratorFuture = executor.submit(() -> rgaEngine.variantQuery(collection, finalQuery, queryOptions)); VariantDBIterator variantDBIterator; try { @@ -818,7 +822,7 @@ public OpenCGAResult variantQuery(String studyStr, Query quer // 4. Solr gene query List knockoutResultList = variantConverter.convertToDataModelType(rgaIterator, variantDBIterator, - auxQuery.getAsStringList(RgaQueryParams.VARIANTS.key()), includeIndividuals, skipIndividuals, limitIndividuals); + finalQuery.getAsStringList(RgaQueryParams.VARIANTS.key()), includeIndividuals, skipIndividuals, limitIndividuals); int time = (int) stopWatch.getTime(TimeUnit.MILLISECONDS); OpenCGAResult knockoutResult = new OpenCGAResult<>(time, Collections.emptyList(), knockoutResultList.size(), @@ -880,6 +884,8 @@ public OpenCGAResult individualSummary(String study Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String collection = getMainCollectionName(study.getFqn()); + Query finalQuery = parseQuery(query); + ExecutorService executor = Executors.newFixedThreadPool(4); // Check number of individuals matching query without checking their permissions @@ -892,7 +898,7 @@ public OpenCGAResult individualSummary(String study } else { totalIndividualsFuture = executor.submit(() -> { QueryOptions facetOptions = new QueryOptions(QueryOptions.FACET, "unique(" + RgaDataModel.INDIVIDUAL_ID + ")"); - DataResult result = rgaEngine.facetedQuery(collection, query, facetOptions); + DataResult result = rgaEngine.facetedQuery(collection, finalQuery, facetOptions); return ((Number) result.first().getAggregationValues().get(0)).intValue(); }); } @@ -900,7 +906,7 @@ public OpenCGAResult individualSummary(String study Preprocess preprocess; try { - preprocess = individualQueryPreprocess(study, query, options, token); + preprocess = individualQueryPreprocess(study, finalQuery, options, token); } catch (RgaException e) { if (RgaException.NO_RESULTS_FOUND.equals(e.getMessage())) { stopWatch.stop(); @@ -1010,7 +1016,7 @@ public OpenCGAResult geneSummary(String studyStr, Query q ExecutorService executor = Executors.newFixedThreadPool(4); QueryOptions queryOptions = setDefaultLimit(options); - Query auxQuery = query != null ? new Query(query) : new Query(); + Query finalQuery = parseQuery(query); // Get number of matches Future numMatchesFuture = null; @@ -1018,7 +1024,7 @@ public OpenCGAResult geneSummary(String studyStr, Query q numMatchesFuture = executor.submit(() -> { QueryOptions facetOptions = new QueryOptions(QueryOptions.FACET, "unique(" + RgaQueryParams.GENE_ID.key() + ")"); try { - DataResult result = rgaEngine.facetedQuery(collection, auxQuery, facetOptions); + DataResult result = rgaEngine.facetedQuery(collection, finalQuery, facetOptions); return ((Number) result.first().getAggregationValues().get(0)).intValue(); } catch (Exception e) { logger.error("Could not obtain the count: {}", e.getMessage(), e); @@ -1029,8 +1035,8 @@ public OpenCGAResult geneSummary(String studyStr, Query q List geneIds; try { - geneIds = getGeneIds(collection, auxQuery, queryOptions); - auxQuery.remove(RgaQueryParams.GENE_ID.key()); + geneIds = getGeneIds(collection, finalQuery, queryOptions); + finalQuery.remove(RgaQueryParams.GENE_ID.key()); } catch (RgaException e) { if (RgaException.NO_RESULTS_FOUND.equals(e.getMessage())) { return OpenCGAResult.empty(KnockoutByGeneSummary.class, (int) stopWatch.getTime(TimeUnit.MILLISECONDS)); @@ -1040,7 +1046,7 @@ public OpenCGAResult geneSummary(String studyStr, Query q List> geneSummaryFutureList = new ArrayList<>(geneIds.size()); for (String geneId : geneIds) { - geneSummaryFutureList.add(executor.submit(() -> calculateGeneSummary(collection, auxQuery, geneId))); + geneSummaryFutureList.add(executor.submit(() -> calculateGeneSummary(collection, finalQuery, geneId))); } List knockoutByGeneSummaryList = new ArrayList<>(geneIds.size()); @@ -1090,12 +1096,12 @@ public OpenCGAResult variantSummary(String studyStr, Q ExecutorService executor = Executors.newFixedThreadPool(4); QueryOptions queryOptions = setDefaultLimit(options); - Query auxQuery = query != null ? new Query(query) : new Query(); + Query finalQuery = parseQuery(query); ResourceIds resourceIds; try { - resourceIds = getVariantIds(collection, auxCollection, auxQuery, queryOptions, executor); - auxQuery.put(RgaDataModel.VARIANTS, resourceIds.getIds()); + resourceIds = getVariantIds(collection, auxCollection, finalQuery, queryOptions, executor); + finalQuery.put(RgaDataModel.VARIANTS, resourceIds.getIds()); } catch (RgaException e) { if (RgaException.NO_RESULTS_FOUND.equals(e.getMessage())) { return OpenCGAResult.empty(KnockoutByVariantSummary.class, (int) stopWatch.getTime(TimeUnit.MILLISECONDS)); @@ -1104,12 +1110,12 @@ public OpenCGAResult variantSummary(String studyStr, Q } Future variantFuture = executor.submit( - () -> variantStorageQuery(study.getFqn(), Collections.emptyList(), auxQuery, QueryOptions.empty(), token) + () -> variantStorageQuery(study.getFqn(), Collections.emptyList(), finalQuery, QueryOptions.empty(), token) ); List> variantSummaryList = new ArrayList<>(resourceIds.getIds().size()); for (String variantId : resourceIds.getIds()) { - variantSummaryList.add(executor.submit(() -> calculatePartialSolrVariantSummary(collection, auxQuery, variantId))); + variantSummaryList.add(executor.submit(() -> calculatePartialSolrVariantSummary(collection, finalQuery, variantId))); } Map variantSummaryMap = new HashMap<>(); @@ -1175,6 +1181,24 @@ public OpenCGAResult variantSummary(String studyStr, Q return result; } + private Query parseQuery(Query query) { + Query myQuery = query != null ? new Query(query) : new Query(); + // That's the condition we would need to apply to change these filters. + // TODO: + // Because we are also adding some special filters for the DELETION_OVERLAP variants, we ALWAYS need to ensure that the query + // filters by knockout type. In the future, we should fix the DELETION_OVERLAP issue and then we will be able to uncomment + // the condition below. +// if (COMP_HET_QUERY_MODE.equals(RgaQueryParams.CompHetQueryMode.PAIR) && !myQuery.containsKey(RgaQueryParams.KNOCKOUT.key())) { + // Fill with all knockout types to ensure comp_het queries are performed as pairs + List knockoutValues = EnumSet.allOf(KnockoutVariant.KnockoutType.class) + .stream() + .map(Enum::name) + .collect(Collectors.toList()); + myQuery.append(RgaQueryParams.KNOCKOUT.key(), knockoutValues); +// } + return myQuery; + } + public OpenCGAResult aggregationStats(String studyStr, Query query, QueryOptions options, String fields, String token) throws CatalogException, IOException, RgaException { Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); @@ -1628,7 +1652,6 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection // 1. Get KnockoutByIndividual information QueryOptions options = new QueryOptions() - .append(QueryOptions.LIMIT, 1) .append(QueryOptions.EXCLUDE, "genes"); RgaIterator rgaIterator = rgaEngine.individualQuery(collection, auxQuery, options); @@ -1637,15 +1660,17 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection } KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); - RgaDataModel rgaDataModel = rgaIterator.next(); - if (CollectionUtils.isNotEmpty(rgaDataModel.getChPairs())) { - for (String chPair : rgaDataModel.getChPairs()) { - CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(chPair); - knockoutTypeCount.processChPairFeature(codedChPairVariants); + RgaDataModel rgaDataModel = null; + while (rgaIterator.hasNext()) { + rgaDataModel = rgaIterator.next(); + if (CollectionUtils.isNotEmpty(rgaDataModel.getChPairs())) { + for (String chPair : rgaDataModel.getChPairs()) { + CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(chPair); + knockoutTypeCount.processChPairFeature(codedChPairVariants); + } } } - KnockoutByIndividual knockoutByIndividual = AbstractRgaConverter.fillIndividualInfo(rgaDataModel); KnockoutByIndividualSummary knockoutByIndividualSummary = new KnockoutByIndividualSummary(knockoutByIndividual); diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/iterators/SolrNativeIterator.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/iterators/SolrNativeIterator.java index 10fe15491d0..0ccc66a5603 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/iterators/SolrNativeIterator.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/iterators/SolrNativeIterator.java @@ -68,6 +68,9 @@ public SolrNativeIterator(SolrClient solrClient, String collection, SolrQuery so @Override public boolean hasNext() { + if (listBuffer.isEmpty()) { + fetchNextBatch(); + } return !listBuffer.isEmpty(); } From 528b49081b9e38f4f615aa534f67252d8247a0c7 Mon Sep 17 00:00:00 2001 From: pfurio Date: Thu, 26 Jan 2023 13:13:59 +0100 Subject: [PATCH 07/27] analysis: fix aux collection query parser, #TASK-2478 --- .../opencb/opencga/analysis/rga/RgaQueryParser.java | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index e79dcc32442..34068880786 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -172,7 +172,9 @@ private void parseMainCollCompoundFilters(Query query, List filterList) count += ctValues.isEmpty() ? 0 : 1; count += popFreqValues.isEmpty() ? 0 : 1; - if (count == 1) { + boolean simpleFilter = !knockoutValues.contains(COMP_HET.name()) && !knockoutValues.contains(DELETION_OVERLAP.name()) && count == 1; + + if (simpleFilter) { // Simple filter parseStringValue(query, KNOCKOUT, RgaDataModel.KNOCKOUT_TYPES, filterList); parseStringValue(query, FILTER, RgaDataModel.FILTERS, filterList); @@ -185,7 +187,7 @@ private void parseMainCollCompoundFilters(Query query, List filterList) parseStringValue(entry.getValue(), RgaDataModel.POPULATION_FREQUENCIES.replace("*", entry.getKey()), filterList, "||"); } } - } else if (count > 1) { + } else { buildComplexQueryFilter(filterList, knockoutValues, filterValue, ctValues, popFreqValues); } } @@ -202,9 +204,7 @@ private void parseAuxCollCompoundFilters(Query query, List filterList) t count += ctValues.isEmpty() ? 0 : 1; count += popFreqValues.isEmpty() ? 0 : 1; - // In this case, we may need to use both filters if users are filtering by COMP_HET and another ko type + (ct | pf) - boolean simpleFilter = !knockoutValues.contains(COMP_HET.name()) || count == 1; - boolean complexFilter = knockoutValues.contains(COMP_HET.name()) && count > 1; + boolean simpleFilter = !knockoutValues.contains(COMP_HET.name()) && !knockoutValues.contains(DELETION_OVERLAP.name()) && count == 1; if (simpleFilter) { // Simple filters @@ -222,8 +222,7 @@ private void parseAuxCollCompoundFilters(Query query, List filterList) t AuxiliarRgaDataModel.POPULATION_FREQUENCIES.replace("*", entry.getKey()), filterList, "||"); } } - } - if (complexFilter) { + } else { buildComplexQueryFilter(filterList, knockoutValues, "", ctValues, popFreqValues); } } From b15d64181eaa3d36e39ed674832234eaa6670618 Mon Sep 17 00:00:00 2001 From: pfurio Date: Fri, 27 Jan 2023 12:32:39 +0100 Subject: [PATCH 08/27] analysis: fix facet queries, #TASK-2478 --- .../opencga/analysis/rga/RgaEngine.java | 16 +++--- .../opencga/analysis/rga/RgaManager.java | 13 +++-- .../opencga/analysis/rga/RgaQueryParser.java | 49 +++++++++++-------- 3 files changed, 44 insertions(+), 34 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java index d623dcf3dc5..01456f19a1f 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java @@ -283,13 +283,15 @@ public long count(String collection, Query query) throws RgaException, IOExcepti public DataResult joinFacetQuery(String collection, String externalCollection, Query query, Query externalQuery, QueryOptions queryOptions) throws RgaException, IOException { SolrQuery mainSolrQuery = parser.parseAuxQuery(query); - SolrQuery externalSolrQuery = parser.parseQuery(externalQuery); - - if (externalSolrQuery.getFilterQueries() != null && externalSolrQuery.getFilterQueries().length > 0) { - String externalQueryStr = StringUtils.join(externalSolrQuery.getFilterQueries(), " AND "); - mainSolrQuery.set("v1", externalQueryStr); - mainSolrQuery.addFilterQuery("{!join from=" + RgaDataModel.VARIANTS + " to=" + AuxiliarRgaDataModel.ID - + " fromIndex=" + externalCollection + " v=$v1}"); + if (!externalQuery.isEmpty()) { + SolrQuery externalSolrQuery = parser.parseQuery(externalQuery); + + if (externalSolrQuery.getFilterQueries() != null && externalSolrQuery.getFilterQueries().length > 0) { + String externalQueryStr = StringUtils.join(externalSolrQuery.getFilterQueries(), " AND "); + mainSolrQuery.set("v1", externalQueryStr); + mainSolrQuery.addFilterQuery("{!join from=" + RgaDataModel.VARIANTS + " to=" + AuxiliarRgaDataModel.ID + + " fromIndex=" + externalCollection + " v=$v1}"); + } } return facetedQuery(collection, mainSolrQuery, queryOptions); diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index c3f334177ba..d226884dd88 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1326,8 +1326,8 @@ private ResourceIds getVariantIdsJoiningCollections(String mainCollection, Strin ExecutorService executor) throws RgaException, IOException { Future numMatchesFuture = null; List ids; - Query mainCollQuery = generateQuery(query, AuxiliarRgaDataModel.MAIN_TO_AUXILIAR_DATA_MODEL_MAP.keySet(), true); - Query auxCollQuery = generateQuery(query, AuxiliarRgaDataModel.MAIN_TO_AUXILIAR_DATA_MODEL_MAP.keySet(), false); + Query mainCollQuery = new Query(query); // Everything is used for the main collection + Query auxCollQuery = generateQuery(query, AuxiliarRgaDataModel.MAIN_TO_AUXILIAR_DATA_MODEL_MAP.keySet()); // Make a join with the main collection to get all the data we need !! @@ -1369,15 +1369,14 @@ private boolean isQueryingByIndividualFields(Query query) { /** * Generate a new query based on the original query. * - * @param query Original query from where it will be generated the new query. - * @param fields Fields to be added in the new query (unless inverse is true). - * @param inverse Flag indicating to generate a new query with the fields passed or absent. + * @param query Original query from where it will be generated the new query. + * @param fields Fields to be added in the new query. * @return a new query object. */ - private Query generateQuery(Query query, Set fields, boolean inverse) { + private Query generateQuery(Query query, Set fields) { Query newQuery = new Query(); for (Map.Entry entry : query.entrySet()) { - if ((fields.contains(entry.getKey()) && !inverse) || (!fields.contains(entry.getKey()) && inverse)) { + if (fields.contains(entry.getKey())) { newQuery.put(entry.getKey(), entry.getValue()); } } diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index 34068880786..3b9ebb083e8 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -314,13 +314,19 @@ private void buildComplexQuery(List koValues, List filterValues, // KT + FILTER List orFilterList = new LinkedList<>(); for (String koValue : koValues) { - List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; - List ctList = koValue.equals(delOverlap) ? INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES : ALL_CONSEQUENCE_TYPES; - for (String filterVal : finalFilterValues) { - // This is how it should be filtered -// orFilterList.add(koValue + SEPARATOR + filterVal); - for (String ctValue : ctList) { - orFilterList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue); + if (koValue.equals(delOverlap)) { + for (String filterVal : filterValues) { + for (String ctValue : INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES) { + orFilterList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue); + } + } + } else if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { + for (String filterVal : chFilterValues) { + orFilterList.add(koValue + SEPARATOR + filterVal); + } + } else { + for (String filterVal : filterValues) { + orFilterList.add(koValue + SEPARATOR + filterVal); } } } @@ -353,20 +359,23 @@ private void buildComplexQuery(List koValues, List filterValues, } else { for (List tmpPopFreqList : popFreqQueryList.values()) { List orQueryList = new LinkedList<>(); - for (String popFreq : tmpPopFreqList) { - for (String koValue : koValues) { - List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; - List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; - for (String filterVal : finalFilterValues) { - for (String ctValue : finalCtValues) { - if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { - // Don't process this filter - continue; + for (String koValue : koValues) { + List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; + List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; + for (String filterVal : finalFilterValues) { + for (String ctValue : finalCtValues) { + if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { + // Don't process this filter + continue; + } + if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString) + && tmpPopFreqList.size() > 1) { + List sortedCombinations = generateSortedCombinations(tmpPopFreqList); + for (String popFreqPair : sortedCombinations) { + orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreqPair); } - if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreq - + SEPARATOR + popFreq); - } else { + } else { + for (String popFreq : tmpPopFreqList) { orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreq); } } From 35ae29aa3fd5f96eb12bb20236876aca8031019a Mon Sep 17 00:00:00 2001 From: pfurio Date: Mon, 30 Jan 2023 12:30:53 +0100 Subject: [PATCH 09/27] analysis: fix query parser options, #TASK-2478 --- .../opencga/analysis/rga/RgaQueryParser.java | 77 +++++++++++-------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index 3b9ebb083e8..bc45f5b1afe 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -314,19 +314,19 @@ private void buildComplexQuery(List koValues, List filterValues, // KT + FILTER List orFilterList = new LinkedList<>(); for (String koValue : koValues) { - if (koValue.equals(delOverlap)) { - for (String filterVal : filterValues) { - for (String ctValue : INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES) { - orFilterList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue); - } - } - } else if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { + if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { for (String filterVal : chFilterValues) { orFilterList.add(koValue + SEPARATOR + filterVal); } } else { for (String filterVal : filterValues) { - orFilterList.add(koValue + SEPARATOR + filterVal); + if (koValue.equals(delOverlap)) { + for (String ctValue : INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES) { + orFilterList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue); + } + } else { + orFilterList.add(koValue + SEPARATOR + filterVal); + } } } } @@ -335,26 +335,39 @@ private void buildComplexQuery(List koValues, List filterValues, // KT + FILTER + CT + POP_FREQ List andQueryList = new ArrayList<>(popFreqQueryList.size()); if (popFreqQueryList.size() == 2) { - ArrayList popFreqKeys = new ArrayList<>(popFreqQueryList.keySet()); - List> sortedPopFreqs = RgaUtils.generateSortedCombinations(popFreqQueryList.get(popFreqKeys.get(0)), - popFreqQueryList.get(popFreqKeys.get(1))); - for (List sortedPopFreq : sortedPopFreqs) { - List orQueryList = new LinkedList<>(); - for (String koValue : koValues) { - List finalFilterValues = koValue.equals(encodedChString) ? chFilterValues : filterValues; - List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; - for (String filterVal : finalFilterValues) { - for (String ctValue : finalCtValues) { - if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_PAIR_CTS.contains(ctValue)) { - // Don't process this filter - continue; - } - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) + List orQueryList = new LinkedList<>(); + for (String koValue : koValues) { + if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { + ArrayList popFreqKeys = new ArrayList<>(popFreqQueryList.keySet()); + List> sortedPopFreqs = RgaUtils.generateSortedCombinations(popFreqQueryList.get(popFreqKeys.get(0)), + popFreqQueryList.get(popFreqKeys.get(1))); + for (List sortedPopFreq : sortedPopFreqs) { + for (String filterVal : chFilterValues) { + for (String ctValue : chCtValues) { + orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR + sortedPopFreq.get(1)); + } + } + } + } else { + for (String ctValue : ctValues) { + if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { + // Don't process this filter + continue; + } + for (String filterValue : filterValues) { + List tmpAndQueryList = new ArrayList<>(popFreqQueryList.size()); + for (List popFreqs : popFreqQueryList.values()) { + List tmpOrQueryList = new ArrayList<>(popFreqs.size()); + for (String popFreq : popFreqs) { + tmpOrQueryList.add(koValue + SEPARATOR + filterValue + SEPARATOR + ctValue + SEPARATOR + popFreq); + } + parseStringValue(tmpOrQueryList, "", tmpAndQueryList, "||"); + } + parseStringValue(tmpAndQueryList, "", orQueryList, "&&"); } } } - parseStringValue(orQueryList, "", andQueryList, "||"); } } else { for (List tmpPopFreqList : popFreqQueryList.values()) { @@ -364,10 +377,6 @@ private void buildComplexQuery(List koValues, List filterValues, List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; for (String filterVal : finalFilterValues) { for (String ctValue : finalCtValues) { - if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { - // Don't process this filter - continue; - } if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString) && tmpPopFreqList.size() > 1) { List sortedCombinations = generateSortedCombinations(tmpPopFreqList); @@ -375,6 +384,10 @@ private void buildComplexQuery(List koValues, List filterValues, orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreqPair); } } else { + if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { + // Don't process this filter + continue; + } for (String popFreq : tmpPopFreqList) { orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreq); } @@ -421,8 +434,12 @@ private void buildComplexQuery(List koValues, List filterValues, // + sortedPopFreq.get(1)); if (koValue.equals(delOverlap)) { for (String ctValue : ctList) { - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) - + SEPARATOR + sortedPopFreq.get(1)); + List tmpAndQueryList = new ArrayList<>(2); + tmpAndQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + + sortedPopFreq.get(0)); + tmpAndQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + + sortedPopFreq.get(1)); + parseStringValue(tmpAndQueryList, "", orQueryList, "&&"); } // } else if (koValue.equals(encodedChString)) { // orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + sortedPopFreq.get(0) + SEPARATOR From 269f1b0fb8d4df70d3428112653259370bc74bea Mon Sep 17 00:00:00 2001 From: pfurio Date: Mon, 30 Jan 2023 14:24:41 +0100 Subject: [PATCH 10/27] analysis: fix ko filter, #TASK-2478 --- .../main/java/org/opencb/opencga/analysis/rga/RgaManager.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index d226884dd88..c5a25281c78 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1190,11 +1190,13 @@ private Query parseQuery(Query query) { // the condition below. // if (COMP_HET_QUERY_MODE.equals(RgaQueryParams.CompHetQueryMode.PAIR) && !myQuery.containsKey(RgaQueryParams.KNOCKOUT.key())) { // Fill with all knockout types to ensure comp_het queries are performed as pairs + if (!myQuery.containsKey(RgaQueryParams.KNOCKOUT.key())) { List knockoutValues = EnumSet.allOf(KnockoutVariant.KnockoutType.class) .stream() .map(Enum::name) .collect(Collectors.toList()); myQuery.append(RgaQueryParams.KNOCKOUT.key(), knockoutValues); + } // } return myQuery; } From 24381c41b1a0efa3c29bff7b9065a1b2b8db951f Mon Sep 17 00:00:00 2001 From: pfurio Date: Tue, 31 Jan 2023 12:51:10 +0100 Subject: [PATCH 11/27] analysis: improve summary performance, #TASK-2478 --- .../opencga/analysis/rga/RgaEngine.java | 4 +- .../opencga/analysis/rga/RgaManager.java | 81 +++++++++++++------ 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java index 01456f19a1f..ea33cae2f92 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java @@ -192,7 +192,7 @@ private void fixIndividualOptions(QueryOptions queryOptions, Query query, SolrQu public RgaIterator geneQuery(String collection, Query query, QueryOptions queryOptions) throws RgaException { SolrQuery solrQuery = parser.parseQuery(query); fixGeneOptions(queryOptions, query, solrQuery); - solrQuery.setRows(Integer.MAX_VALUE); + solrQuery.setRows(queryOptions.getInt(QueryOptions.LIMIT, Integer.MAX_VALUE)); try { return new RgaIterator(solrManager.getSolrClient(), collection, solrQuery); } catch (SolrServerException e) { @@ -226,7 +226,7 @@ private void fixGeneOptions(QueryOptions queryOptions, Query query, SolrQuery so public RgaIterator variantQuery(String collection, Query query, QueryOptions queryOptions) throws RgaException { SolrQuery solrQuery = parser.parseQuery(query); fixVariantOptions(queryOptions, query, solrQuery); - solrQuery.setRows(Integer.MAX_VALUE); + solrQuery.setRows(queryOptions.getInt(QueryOptions.LIMIT, Integer.MAX_VALUE)); try { return new RgaIterator(solrManager.getSolrClient(), collection, solrQuery); } catch (SolrServerException e) { diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index c5a25281c78..e0a64cccb95 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1189,7 +1189,7 @@ private Query parseQuery(Query query) { // filters by knockout type. In the future, we should fix the DELETION_OVERLAP issue and then we will be able to uncomment // the condition below. // if (COMP_HET_QUERY_MODE.equals(RgaQueryParams.CompHetQueryMode.PAIR) && !myQuery.containsKey(RgaQueryParams.KNOCKOUT.key())) { - // Fill with all knockout types to ensure comp_het queries are performed as pairs + // Fill with all knockout types to ensure comp_het queries are performed as pairs if (!myQuery.containsKey(RgaQueryParams.KNOCKOUT.key())) { List knockoutValues = EnumSet.allOf(KnockoutVariant.KnockoutType.class) .stream() @@ -1555,38 +1555,48 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer Query auxQuery = new Query(query); auxQuery.put(RgaQueryParams.GENE_ID.key(), geneId); + StopWatch stopWatch = StopWatch.createStarted(); // 1. Get KnockoutByGene information - Query individualQuery = new Query(RgaQueryParams.GENE_ID.key(), geneId); + Query geneQuery = new Query(RgaQueryParams.GENE_ID.key(), geneId); QueryOptions options = new QueryOptions() - .append(QueryOptions.EXCLUDE, "individuals"); - RgaIterator rgaIterator = rgaEngine.geneQuery(collection, individualQuery, options); + .append(QueryOptions.EXCLUDE, "individuals") + .append(QueryOptions.LIMIT, 1); + RgaIterator rgaIterator = rgaEngine.geneQuery(collection, geneQuery, options); + logger.debug("Gene query: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); if (!rgaIterator.hasNext()) { throw RgaException.noResultsMatching(); } KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); - RgaDataModel rgaDataModel = null; - while (rgaIterator.hasNext()) { - rgaDataModel = rgaIterator.next(); - if (CollectionUtils.isNotEmpty(rgaDataModel.getChPairs())) { - for (String chPair : rgaDataModel.getChPairs()) { - CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(chPair); - knockoutTypeCount.processChPairFeature(codedChPairVariants); - } - } + RgaDataModel rgaDataModel = rgaIterator.next(); + + stopWatch.reset(); + stopWatch.start(); + QueryOptions variantFacet = new QueryOptions() + .append(QueryOptions.LIMIT, -1) + .append(QueryOptions.FACET, RgaDataModel.CH_PAIRS); + DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, geneQuery, variantFacet); + logger.debug("Gene CH pairs facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); + for (FacetField.Bucket variantBucket : facetFieldDataResult.first().getBuckets()) { + CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(variantBucket.getValue()); + knockoutTypeCount.processChPairFeature(codedChPairVariants); } + logger.debug("Gene CH pairs facet and process: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); // To get the basic gene information, we can use any document from RgaDataModel. In this case, we use the last document KnockoutByGeneSummary geneSummary = new KnockoutByGeneSummary(rgaDataModel.getGeneId(), rgaDataModel.getGeneName(), rgaDataModel.getChromosome(), rgaDataModel.getStart(), rgaDataModel.getEnd(), rgaDataModel.getStrand(), rgaDataModel.getGeneBiotype(), null, null); + stopWatch.reset(); + stopWatch.start(); // 2. Get KnockoutType counts QueryOptions knockoutTypeFacet = new QueryOptions() .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.VARIANT_SUMMARY); - DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, knockoutTypeFacet); + facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, knockoutTypeFacet); + logger.debug("Gene VariantSummary facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); for (FacetField.Bucket variantBucket : facetFieldDataResult.first().getBuckets()) { CodedVariant codedFeature = CodedVariant.parseEncodedId(variantBucket.getValue()); @@ -1596,12 +1606,16 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer knockoutTypeCount.getNumCompHetIds(), knockoutTypeCount.getNumPairedCompHetIds(), knockoutTypeCount.getNumPairedDelOverlapIds(), knockoutTypeCount.getNumHetIds(), knockoutTypeCount.getNumDelOverlapIds()); geneSummary.setVariantStats(variantStats); + logger.debug("Gene VariantSummary facet and process: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); + stopWatch.reset(); + stopWatch.start(); // 3. Get individual knockout type counts QueryOptions geneFacet = new QueryOptions() .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.INDIVIDUAL_SUMMARY); facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, geneFacet); + logger.debug("Gene IndividualSummary facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); KnockoutTypeCount noParentsCount = new KnockoutTypeCount(auxQuery); KnockoutTypeCount singleParentCount = new KnockoutTypeCount(auxQuery); KnockoutTypeCount bothParentsCount = new KnockoutTypeCount(auxQuery); @@ -1639,6 +1653,7 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer bothParentsCount.getNumDelOverlapIds(), bothParentsCount.getNumHomAltCompHetIds(), bothParentsCount.getNumCompHetDelOverlapIds() ); + logger.debug("Gene IndividualSummary facet and process: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); geneSummary.setIndividualStats(new GlobalIndividualKnockoutStats(noParentIndividualStats, singleParentIndividualStats, bothParentIndividualStats)); @@ -1651,35 +1666,46 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection Query auxQuery = new Query(query); auxQuery.put(RgaQueryParams.SAMPLE_ID.key(), sampleId); + StopWatch stopWatch = StopWatch.createStarted(); // 1. Get KnockoutByIndividual information QueryOptions options = new QueryOptions() - .append(QueryOptions.EXCLUDE, "genes"); + .append(QueryOptions.EXCLUDE, "genes") + .append(QueryOptions.LIMIT, 1); RgaIterator rgaIterator = rgaEngine.individualQuery(collection, auxQuery, options); + logger.debug("Individual query: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); if (!rgaIterator.hasNext()) { throw RgaException.noResultsMatching(); } KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); - RgaDataModel rgaDataModel = null; - while (rgaIterator.hasNext()) { - rgaDataModel = rgaIterator.next(); - if (CollectionUtils.isNotEmpty(rgaDataModel.getChPairs())) { - for (String chPair : rgaDataModel.getChPairs()) { - CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(chPair); - knockoutTypeCount.processChPairFeature(codedChPairVariants); - } - } + RgaDataModel rgaDataModel = rgaIterator.next(); + + stopWatch.reset(); + stopWatch.start(); + QueryOptions variantFacet = new QueryOptions() + .append(QueryOptions.LIMIT, -1) + .append(QueryOptions.FACET, RgaDataModel.CH_PAIRS); + DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, variantFacet); + logger.debug("Individual CH pairs facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); + for (FacetField.Bucket variantBucket : facetFieldDataResult.first().getBuckets()) { + CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(variantBucket.getValue()); + knockoutTypeCount.processChPairFeature(codedChPairVariants); } + logger.debug("Individual CH pairs facet and process: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); KnockoutByIndividual knockoutByIndividual = AbstractRgaConverter.fillIndividualInfo(rgaDataModel); KnockoutByIndividualSummary knockoutByIndividualSummary = new KnockoutByIndividualSummary(knockoutByIndividual); + stopWatch.reset(); + stopWatch.start(); // 2. Get KnockoutType counts QueryOptions knockoutTypeFacet = new QueryOptions() .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.VARIANT_SUMMARY); - DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, knockoutTypeFacet); + facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, knockoutTypeFacet); + logger.debug("Individual VariantSummary facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); + for (FacetField.Bucket variantBucket : facetFieldDataResult.first().getBuckets()) { CodedVariant codedFeature = CodedVariant.parseEncodedId(variantBucket.getValue()); knockoutTypeCount.processFeature(codedFeature); @@ -1688,6 +1714,7 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection knockoutTypeCount.getNumCompHetIds(), knockoutTypeCount.getNumPairedCompHetIds(), knockoutTypeCount.getNumPairedDelOverlapIds(), knockoutTypeCount.getNumHetIds(), knockoutTypeCount.getNumDelOverlapIds()); knockoutByIndividualSummary.setVariantStats(variantStats); + logger.debug("Individual VariantSummary facet and process: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); // Use list of variants filtered matching all criteria if the number of variants is lower than 100. Otherwise, variants will not be // used to get the list of genes. If we don't apply this limit, the url may be too long and fail. @@ -1695,16 +1722,20 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection auxQuery.put(RgaQueryParams.VARIANTS.key(), new ArrayList<>(knockoutTypeCount.getIds())); } + stopWatch.reset(); + stopWatch.start(); // 3. Get gene name list QueryOptions geneFacet = new QueryOptions() .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.GENE_NAME); facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, geneFacet); + logger.debug("Individual GeneName facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); List geneIds = facetFieldDataResult.first().getBuckets() .stream() .map(FacetField.Bucket::getValue) .collect(Collectors.toList()); knockoutByIndividualSummary.setGenes(geneIds); + logger.debug("Individual GeneName facet and process: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); return knockoutByIndividualSummary; } From 2ffd48499310ecdcb01330264314de98181f01fc Mon Sep 17 00:00:00 2001 From: pfurio Date: Thu, 18 Aug 2022 16:08:55 +0200 Subject: [PATCH 12/27] analysis: add cache for RGA, #TASK-1750 --- .../opencga/analysis/rga/RgaManager.java | 134 +++++++++++++++--- .../core/config/RgaSearchConfiguration.java | 46 ++++++ .../config/storage/StorageConfiguration.java | 9 +- 3 files changed, 164 insertions(+), 25 deletions(-) create mode 100644 opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index e0a64cccb95..ae31b5d5e4e 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; @@ -67,6 +68,7 @@ public class RgaManager implements AutoCloseable { private final IndividualRgaConverter individualRgaConverter; private final GeneRgaConverter geneConverter; + private final VariantRgaConverter variantConverter; private static final RgaQueryParams.CompHetQueryMode COMP_HET_QUERY_MODE = RgaQueryParams.CompHetQueryMode.PAIR; @@ -75,6 +77,14 @@ public class RgaManager implements AutoCloseable { private static final int KNOCKOUT_INSERT_BATCH_SIZE = 25; + private static Map> cacheMap; + private final int CACHE_SIZE; + private static final int DEFAULT_CACHE_SIZE = 1000; + + static { + cacheMap = new HashMap<>(); + } + public RgaManager(CatalogManager catalogManager, VariantStorageManager variantStorageManager) { this.catalogManager = catalogManager; @@ -88,6 +98,9 @@ public RgaManager(CatalogManager catalogManager, VariantStorageManager variantSt this.variantConverter = new VariantRgaConverter(); this.logger = LoggerFactory.getLogger(getClass()); + this.CACHE_SIZE = storageConfiguration.getRga().getCacheSize() > 0 + ? storageConfiguration.getRga().getCacheSize() + : DEFAULT_CACHE_SIZE; } // Visible for testing @@ -102,10 +115,13 @@ public RgaManager(CatalogManager catalogManager, VariantStorageManager variantSt this.variantConverter = new VariantRgaConverter(); this.logger = LoggerFactory.getLogger(getClass()); + + this.CACHE_SIZE = storageConfiguration.getRga().getCacheSize() > 0 + ? storageConfiguration.getRga().getCacheSize() + : DEFAULT_CACHE_SIZE; } // Data load - public void index(String studyStr, String fileStr, String token) throws CatalogException, RgaException, IOException { File file = catalogManager.getFileManager().get(studyStr, fileStr, FileManager.INCLUDE_FILE_URI_PATH, token).first(); Path filePath = Paths.get(file.getUri()); @@ -856,31 +872,37 @@ public OpenCGAResult variantQuery(String studyStr, Query quer // Added to improve performance issues. Need to be addressed properly and add this information in study internal.rga.stats field @Deprecated private Integer getTotalIndividuals(Study study) { - // In the future, this will need to be fetched from study internal. - // Atm, it will be fetched from study.attributes.rga.stats.totalIndividuals - if (study.getAttributes() == null) { - return null; - } - Object rga = study.getAttributes().get("RGA"); - if (rga == null) { - return null; - } - Object stats = ((Map) rga).get("stats"); - if (stats == null) { - return null; - } - Object totalIndividuals = ((Map) stats).get("totalIndividuals"); - if (totalIndividuals != null) { - return Integer.parseInt(String.valueOf(totalIndividuals)); - } else { - return null; - } + return null; +// // In the future, this will need to be fetched from study internal. +// // Atm, it will be fetched from study.attributes.rga.stats.totalIndividuals +// if (study.getAttributes() == null) { +// return null; +// } +// Object rga = study.getAttributes().get("RGA"); +// if (rga == null) { +// return null; +// } +// Object stats = ((Map) rga).get("stats"); +// if (stats == null) { +// return null; +// } +// Object totalIndividuals = ((Map) stats).get("totalIndividuals"); +// if (totalIndividuals != null) { +// return Integer.parseInt(String.valueOf(totalIndividuals)); +// } else { +// return null; +// } } public OpenCGAResult individualSummary(String studyStr, Query query, QueryOptions options, String token) throws RgaException, CatalogException, IOException { StopWatch stopWatch = StopWatch.createStarted(); + OpenCGAResult cacheResults = getCacheResults("individualSummary", studyStr, query, options, stopWatch); + if (cacheResults != null) { + return cacheResults; + } + Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String collection = getMainCollectionName(study.getFqn()); @@ -997,12 +1019,19 @@ public OpenCGAResult individualSummary(String study result.setEvents(Collections.singletonList(preprocess.getEvent())); } + cacheResults("individualSummary", studyStr, query, options, result); return result; } public OpenCGAResult geneSummary(String studyStr, Query query, QueryOptions options, String token) throws CatalogException, IOException, RgaException { StopWatch stopWatch = StopWatch.createStarted(); + + OpenCGAResult cacheResults = getCacheResults("geneSummary", studyStr, query, options, stopWatch); + if (cacheResults != null) { + return cacheResults; + } + Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String userId = catalogManager.getUserManager().getUserId(token); String collection = getMainCollectionName(study.getFqn()); @@ -1072,13 +1101,21 @@ public OpenCGAResult geneSummary(String studyStr, Query q } int time = (int) stopWatch.getTime(TimeUnit.MILLISECONDS); - return new OpenCGAResult<>(time, Collections.emptyList(), knockoutByGeneSummaryList.size(), knockoutByGeneSummaryList, numMatches); + OpenCGAResult result = new OpenCGAResult<>(time, Collections.emptyList(), knockoutByGeneSummaryList.size(), + knockoutByGeneSummaryList, numMatches); + cacheResults("geneSummary", studyStr, query, options, result); + return result; } public OpenCGAResult variantSummary(String studyStr, Query query, QueryOptions options, String token) throws CatalogException, IOException, RgaException { StopWatch stopWatch = StopWatch.createStarted(); + OpenCGAResult cacheResults = getCacheResults("variantSummary", studyStr, query, options, stopWatch); + if (cacheResults != null) { + return cacheResults; + } + Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String userId = catalogManager.getUserManager().getUserId(token); String collection = getMainCollectionName(study.getFqn()); @@ -1178,6 +1215,8 @@ public OpenCGAResult variantSummary(String studyStr, Q if (CollectionUtils.isNotEmpty(resourceIds.getEvents())) { result.setEvents(resourceIds.getEvents()); } + + cacheResults("variantSummary", studyStr, query, options, result); return result; } @@ -2046,4 +2085,57 @@ public Preprocess setEvent(Event event) { return this; } } + + /* + CACHE METHODS + */ + private String generateCacheKey(String method, String studyStr, Query query, QueryOptions options) { + ObjectMap map = new ObjectMap() + .append("method", method) + .append("study", studyStr); + if (query != null) { + map.putAll(query); + } + if (options != null) { + map.putAll(options); + } + // Sort the keys + List sortedKeys = map.keySet().stream().sorted().collect(Collectors.toList()); + List queryList = new ArrayList<>(map.size()); + for (String key : sortedKeys) { + queryList.add(key + "=" + map.get(key)); + } + return DigestUtils.sha256Hex(StringUtils.join(queryList, ";")); + } + + private void cacheResults(String method, String studyStr, Query query, QueryOptions options, OpenCGAResult result) { + if (!storageConfiguration.getRga().isCache()) { + // Cache is disabled + return; + } + + if (cacheMap.size() > CACHE_SIZE) { + // Cache is already full + logger.warn("Query not cached. Cache is already full (size: {}).", CACHE_SIZE); + return; + } + + String cacheKey = generateCacheKey(method, studyStr, query, options); + cacheMap.put(cacheKey, result); + } + + private OpenCGAResult getCacheResults(String method, String studyStr, Query query, QueryOptions options, StopWatch stopWatch) { + if (!storageConfiguration.getRga().isCache()) { + // Cache is disabled + return null; + } + String cacheKey = generateCacheKey(method, studyStr, query, options); + OpenCGAResult result = cacheMap.get(cacheKey); + if (result != null) { + result.addEvent(new Event(Event.Type.INFO, "Results obtained from cache")); + result.setTime((int) stopWatch.getTime(TimeUnit.MILLISECONDS)); + return (OpenCGAResult) result; + } + return null; + } } diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java b/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java new file mode 100644 index 00000000000..688ad0c7563 --- /dev/null +++ b/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java @@ -0,0 +1,46 @@ +package org.opencb.opencga.core.config; + +import java.util.List; + +public class RgaSearchConfiguration extends SearchConfiguration { + + private boolean cache; + private int cacheSize; + + public RgaSearchConfiguration() { + } + + public RgaSearchConfiguration(List hosts, String configSet, String mode, String user, String password, String manager, + boolean active, int timeout, int insertBatchSize, boolean cache, int cacheSize) { + super(hosts, configSet, mode, user, password, manager, active, timeout, insertBatchSize); + this.cache = cache; + this.cacheSize = cacheSize; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("RgaSearchConfiguration{"); + sb.append("cache=").append(cache); + sb.append(", cacheSize=").append(cacheSize); + sb.append('}'); + return sb.toString(); + } + + public boolean isCache() { + return cache; + } + + public RgaSearchConfiguration setCache(boolean cache) { + this.cache = cache; + return this; + } + + public int getCacheSize() { + return cacheSize; + } + + public RgaSearchConfiguration setCacheSize(int cacheSize) { + this.cacheSize = cacheSize; + return this; + } +} diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/StorageConfiguration.java b/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/StorageConfiguration.java index 89157650f4f..45c5e6091d7 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/StorageConfiguration.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/StorageConfiguration.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.lang3.StringUtils; import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.opencga.core.config.RgaSearchConfiguration; import org.opencb.opencga.core.config.SearchConfiguration; import org.opencb.opencga.core.config.ServerConfiguration; import org.slf4j.Logger; @@ -43,7 +44,7 @@ public class StorageConfiguration { private CacheConfiguration cache; private SearchConfiguration search; private SearchConfiguration clinical; - private SearchConfiguration rga; + private RgaSearchConfiguration rga; private ObjectMap alignment; private StorageEnginesConfiguration variant; private IOConfiguration io; @@ -61,7 +62,7 @@ public StorageConfiguration() { this.cache = new CacheConfiguration(); this.search = new SearchConfiguration(); this.clinical = new SearchConfiguration(); - this.rga = new SearchConfiguration(); + this.rga = new RgaSearchConfiguration(); } @@ -192,11 +193,11 @@ public StorageConfiguration setClinical(SearchConfiguration clinical) { return this; } - public SearchConfiguration getRga() { + public RgaSearchConfiguration getRga() { return rga; } - public StorageConfiguration setRga(SearchConfiguration rga) { + public StorageConfiguration setRga(RgaSearchConfiguration rga) { this.rga = rga; return this; } From 0567f68178effd336890362b3220ea01ce1eccb8 Mon Sep 17 00:00:00 2001 From: pfurio Date: Thu, 18 Aug 2022 20:29:59 +0200 Subject: [PATCH 13/27] analysis: make map thread-safe, #TASK-1750 --- .../main/java/org/opencb/opencga/analysis/rga/RgaManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index ae31b5d5e4e..fe172e2479d 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -82,7 +82,7 @@ public class RgaManager implements AutoCloseable { private static final int DEFAULT_CACHE_SIZE = 1000; static { - cacheMap = new HashMap<>(); + cacheMap = new ConcurrentHashMap<>(); } From 21d1033d0544c9d63f4ec3b4d8382fa720e50f07 Mon Sep 17 00:00:00 2001 From: pfurio Date: Fri, 19 Aug 2022 08:22:12 +0200 Subject: [PATCH 14/27] analysis: remove static modifier, #TASK-1750 --- .../java/org/opencb/opencga/analysis/rga/RgaManager.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index fe172e2479d..f7847a30811 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -77,14 +77,10 @@ public class RgaManager implements AutoCloseable { private static final int KNOCKOUT_INSERT_BATCH_SIZE = 25; - private static Map> cacheMap; + private Map> cacheMap; private final int CACHE_SIZE; private static final int DEFAULT_CACHE_SIZE = 1000; - static { - cacheMap = new ConcurrentHashMap<>(); - } - public RgaManager(CatalogManager catalogManager, VariantStorageManager variantStorageManager) { this.catalogManager = catalogManager; @@ -98,6 +94,8 @@ public RgaManager(CatalogManager catalogManager, VariantStorageManager variantSt this.variantConverter = new VariantRgaConverter(); this.logger = LoggerFactory.getLogger(getClass()); + + this.cacheMap = new ConcurrentHashMap<>(); this.CACHE_SIZE = storageConfiguration.getRga().getCacheSize() > 0 ? storageConfiguration.getRga().getCacheSize() : DEFAULT_CACHE_SIZE; @@ -116,6 +114,7 @@ public RgaManager(CatalogManager catalogManager, VariantStorageManager variantSt this.logger = LoggerFactory.getLogger(getClass()); + this.cacheMap = new ConcurrentHashMap<>(); this.CACHE_SIZE = storageConfiguration.getRga().getCacheSize() > 0 ? storageConfiguration.getRga().getCacheSize() : DEFAULT_CACHE_SIZE; From cac9f0251a00e63dbbe7567b02023deaf0c62e17 Mon Sep 17 00:00:00 2001 From: pfurio Date: Fri, 19 Aug 2022 14:48:30 +0200 Subject: [PATCH 15/27] analysis: cache only queries taking longer than 4 seconds, #TASK-1750 --- .../opencga/analysis/rga/RgaManager.java | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index f7847a30811..a3952d149f0 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -522,13 +522,17 @@ public OpenCGAResult updateRgaInternalIndexStatus(String studyStr, String public OpenCGAResult individualQuery(String studyStr, Query query, QueryOptions options, String token) throws CatalogException, IOException, RgaException { + StopWatch stopWatch = StopWatch.createStarted(); + OpenCGAResult cacheResults = getCacheResults("individualQuery", studyStr, query, options, stopWatch); + if (cacheResults != null) { + return cacheResults; + } + Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String collection = getMainCollectionName(study.getFqn()); Query finalQuery = parseQuery(query); - StopWatch stopWatch = new StopWatch(); - stopWatch.start(); Preprocess preprocess; try { preprocess = individualQueryPreprocess(study, finalQuery, options, token); @@ -597,11 +601,18 @@ public OpenCGAResult individualQuery(String studyStr, Quer result.setEvents(Collections.singletonList(preprocess.getEvent())); } + cacheResults("individualQuery", studyStr, query, options, stopWatch, result); return result; } public OpenCGAResult geneQuery(String studyStr, Query query, QueryOptions options, String token) throws CatalogException, IOException, RgaException { + StopWatch stopWatch = StopWatch.createStarted(); + OpenCGAResult cacheResults = getCacheResults("geneQuery", studyStr, query, options, stopWatch); + if (cacheResults != null) { + return cacheResults; + } + Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String userId = catalogManager.getUserManager().getUserId(token); String collection = getMainCollectionName(study.getFqn()); @@ -609,9 +620,6 @@ public OpenCGAResult geneQuery(String studyStr, Query query, throw new RgaException("Missing RGA indexes for study '" + study.getFqn() + "' or solr server not alive"); } - StopWatch stopWatch = new StopWatch(); - stopWatch.start(); - ExecutorService executor = Executors.newFixedThreadPool(4); QueryOptions queryOptions = setDefaultLimit(options); @@ -712,6 +720,7 @@ public OpenCGAResult geneQuery(String studyStr, Query query, knockoutResult.setNumMatches(-1); } if (isOwnerOrAdmin && includeSampleIds.isEmpty()) { + cacheResults("geneQuery", studyStr, query, options, stopWatch, knockoutResult); return knockoutResult; } else { // 5. Filter out individual or samples for which user does not have permissions @@ -725,12 +734,19 @@ public OpenCGAResult geneQuery(String studyStr, Query query, knockout.setIndividuals(individualList); } + cacheResults("geneQuery", studyStr, query, options, stopWatch, knockoutResult); return knockoutResult; } } public OpenCGAResult variantQuery(String studyStr, Query query, QueryOptions options, String token) throws CatalogException, IOException, RgaException { + StopWatch stopWatch = StopWatch.createStarted(); + OpenCGAResult cacheResults = getCacheResults("variantQuery", studyStr, query, options, stopWatch); + if (cacheResults != null) { + return cacheResults; + } + Study study = catalogManager.getStudyManager().get(studyStr, QueryOptions.empty(), token).first(); String userId = catalogManager.getUserManager().getUserId(token); String collection = getMainCollectionName(study.getFqn()); @@ -742,13 +758,8 @@ public OpenCGAResult variantQuery(String studyStr, Query quer throw new RgaException("Missing auxiliar RGA collection for study '" + study.getFqn() + "'"); } - StopWatch stopWatch = new StopWatch(); - stopWatch.start(); - ExecutorService executor = Executors.newFixedThreadPool(4); - QueryOptions queryOptions = setDefaultLimit(options); - List includeIndividuals = queryOptions.getAsStringList(RgaQueryParams.INCLUDE_INDIVIDUAL); Boolean isOwnerOrAdmin = catalogManager.getAuthorizationManager().isOwnerOrAdmin(study.getUid(), userId); @@ -851,6 +862,7 @@ public OpenCGAResult variantQuery(String studyStr, Query quer knockoutResult.setNumMatches(-1); } if (isOwnerOrAdmin && includeSampleIds.isEmpty()) { + cacheResults("variantQuery", studyStr, query, options, stopWatch, knockoutResult); return knockoutResult; } else { // 5. Filter out individual or samples for which user does not have permissions @@ -864,6 +876,7 @@ public OpenCGAResult variantQuery(String studyStr, Query quer knockout.setIndividuals(individualList); } + cacheResults("variantQuery", studyStr, query, options, stopWatch, knockoutResult); return knockoutResult; } } @@ -1018,7 +1031,7 @@ public OpenCGAResult individualSummary(String study result.setEvents(Collections.singletonList(preprocess.getEvent())); } - cacheResults("individualSummary", studyStr, query, options, result); + cacheResults("individualSummary", studyStr, query, options, stopWatch, result); return result; } @@ -1102,7 +1115,7 @@ public OpenCGAResult geneSummary(String studyStr, Query q int time = (int) stopWatch.getTime(TimeUnit.MILLISECONDS); OpenCGAResult result = new OpenCGAResult<>(time, Collections.emptyList(), knockoutByGeneSummaryList.size(), knockoutByGeneSummaryList, numMatches); - cacheResults("geneSummary", studyStr, query, options, result); + cacheResults("geneSummary", studyStr, query, options, stopWatch, result); return result; } @@ -1215,7 +1228,7 @@ public OpenCGAResult variantSummary(String studyStr, Q result.setEvents(resourceIds.getEvents()); } - cacheResults("variantSummary", studyStr, query, options, result); + cacheResults("variantSummary", studyStr, query, options, stopWatch, result); return result; } @@ -2107,7 +2120,8 @@ private String generateCacheKey(String method, String studyStr, Query query, Que return DigestUtils.sha256Hex(StringUtils.join(queryList, ";")); } - private void cacheResults(String method, String studyStr, Query query, QueryOptions options, OpenCGAResult result) { + private void cacheResults(String method, String studyStr, Query query, QueryOptions options, StopWatch stopWatch, + OpenCGAResult result) { if (!storageConfiguration.getRga().isCache()) { // Cache is disabled return; @@ -2119,6 +2133,10 @@ private void cacheResults(String method, String studyStr, Query query, QueryOpti return; } + if (stopWatch.getTime(TimeUnit.SECONDS) < 4) { + logger.debug("Query not cached. It took less than 4 seconds: {} ms.", stopWatch.getTime(TimeUnit.MILLISECONDS)); + } + String cacheKey = generateCacheKey(method, studyStr, query, options); cacheMap.put(cacheKey, result); } From 40d3c1dc4e72dfc4559d9f0087a14fc755dce730 Mon Sep 17 00:00:00 2001 From: pfurio Date: Tue, 31 Jan 2023 17:04:29 +0100 Subject: [PATCH 16/27] analysis: put if absent, #TASK-2478 --- .../main/java/org/opencb/opencga/analysis/rga/RgaManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index a3952d149f0..c53c6c2ef3b 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -77,7 +77,7 @@ public class RgaManager implements AutoCloseable { private static final int KNOCKOUT_INSERT_BATCH_SIZE = 25; - private Map> cacheMap; + private ConcurrentHashMap> cacheMap; private final int CACHE_SIZE; private static final int DEFAULT_CACHE_SIZE = 1000; @@ -2138,7 +2138,7 @@ private void cacheResults(String method, String studyStr, Query query, QueryOpti } String cacheKey = generateCacheKey(method, studyStr, query, options); - cacheMap.put(cacheKey, result); + cacheMap.putIfAbsent(cacheKey, result); } private OpenCGAResult getCacheResults(String method, String studyStr, Query query, QueryOptions options, StopWatch stopWatch) { From 38fcd58c970e39bf4565a61aa7cf092a132f2004 Mon Sep 17 00:00:00 2001 From: pfurio Date: Thu, 2 Feb 2023 15:42:17 +0100 Subject: [PATCH 17/27] analysis: count CH variant stats properly, #TASK-2478 --- .../opencga/analysis/rga/RgaManager.java | 51 ++-- .../opencb/opencga/analysis/rga/RgaUtils.java | 258 ++++++++++++------ 2 files changed, 209 insertions(+), 100 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index c53c6c2ef3b..42ab636cbe8 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1331,7 +1331,7 @@ private ResourceIds getVariantIdsFromMainCollection(String mainCollection, Query List eventList = new ArrayList<>(); Future numMatchesFuture = null; - KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(query); + VariantKnockoutTypeCount knockoutTypeCount = new VariantKnockoutTypeCount(query, COMP_HET_QUERY_MODE); Set ids = new HashSet<>(); Set skippedIds = new HashSet<>(); List buckets = facetFieldDataResult.first().getBuckets(); @@ -1505,13 +1505,13 @@ private KnockoutByVariantSummary calculatePartialSolrVariantSummary(String colle .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.INDIVIDUAL_SUMMARY); facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, geneFacet); - KnockoutTypeCount noParentsCount = new KnockoutTypeCount(auxQuery); - KnockoutTypeCount singleParentCount = new KnockoutTypeCount(auxQuery); - KnockoutTypeCount bothParentsCount = new KnockoutTypeCount(auxQuery); + IndividualKnockoutTypeCount noParentsCount = new IndividualKnockoutTypeCount(auxQuery); + IndividualKnockoutTypeCount singleParentCount = new IndividualKnockoutTypeCount(auxQuery); + IndividualKnockoutTypeCount bothParentsCount = new IndividualKnockoutTypeCount(auxQuery); for (FacetField.Bucket bucket : facetFieldDataResult.first().getBuckets()) { CodedIndividual codedIndividual = CodedIndividual.parseEncodedId(bucket.getValue()); - KnockoutTypeCount auxKnockoutType; + IndividualKnockoutTypeCount auxKnockoutType; switch (codedIndividual.getNumParents()) { case 0: auxKnockoutType = noParentsCount; @@ -1528,18 +1528,22 @@ private KnockoutByVariantSummary calculatePartialSolrVariantSummary(String colle auxKnockoutType.processFeature(codedIndividual); } + noParentsCount.calculateStats(); + singleParentCount.calculateStats(); + bothParentsCount.calculateStats(); + IndividualKnockoutStats noParentIndividualStats = new IndividualKnockoutStats(noParentsCount.getNumIds(), - noParentsCount.getNumHomIds(), noParentsCount.getNumCompHetIds(), noParentsCount.getNumHetIds(), + noParentsCount.getNumHomAltIds(), noParentsCount.getNumCompHetIds(), noParentsCount.getNumHetIds(), noParentsCount.getNumDelOverlapIds(), noParentsCount.getNumHomAltCompHetIds(), noParentsCount.getNumCompHetDelOverlapIds() ); IndividualKnockoutStats singleParentIndividualStats = new IndividualKnockoutStats(singleParentCount.getNumIds(), - singleParentCount.getNumHomIds(), singleParentCount.getNumCompHetIds(), singleParentCount.getNumHetIds(), + singleParentCount.getNumHomAltIds(), singleParentCount.getNumCompHetIds(), singleParentCount.getNumHetIds(), singleParentCount.getNumDelOverlapIds(), singleParentCount.getNumHomAltCompHetIds(), singleParentCount.getNumCompHetDelOverlapIds() ); IndividualKnockoutStats bothParentIndividualStats = new IndividualKnockoutStats(bothParentsCount.getNumIds(), - bothParentsCount.getNumHomIds(), bothParentsCount.getNumCompHetIds(), bothParentsCount.getNumHetIds(), + bothParentsCount.getNumHomAltIds(), bothParentsCount.getNumCompHetIds(), bothParentsCount.getNumHetIds(), bothParentsCount.getNumDelOverlapIds(), bothParentsCount.getNumHomAltCompHetIds(), bothParentsCount.getNumCompHetDelOverlapIds() ); @@ -1562,7 +1566,7 @@ private KnockoutByVariantSummary calculatePartialSolrVariantSummary(String colle Query knockoutTypeQuery = new Query(query); knockoutTypeQuery.remove(RgaQueryParams.VARIANTS.key()); knockoutTypeQuery.remove(RgaQueryParams.DB_SNPS.key()); - KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(knockoutTypeQuery); + VariantKnockoutTypeCount knockoutTypeCount = new VariantKnockoutTypeCount(knockoutTypeQuery, COMP_HET_QUERY_MODE); for (FacetField.Bucket bucket : facetFieldDataResult.first().getBuckets()) { CodedVariant codedVariant = CodedVariant.parseEncodedId(bucket.getValue()); @@ -1581,6 +1585,7 @@ private KnockoutByVariantSummary calculatePartialSolrVariantSummary(String colle otherVariantSet.add(auxKnockoutVariant); } } + knockoutTypeCount.calculateStats(); List sequenceOntologyTermList = new ArrayList<>(sequenceOntologyTerms.size()); for (String ct : sequenceOntologyTerms) { String ctName = decode(ct); @@ -1619,7 +1624,7 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer throw RgaException.noResultsMatching(); } - KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); + VariantKnockoutTypeCount knockoutTypeCount = new VariantKnockoutTypeCount(auxQuery, COMP_HET_QUERY_MODE); RgaDataModel rgaDataModel = rgaIterator.next(); stopWatch.reset(); @@ -1653,7 +1658,8 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer CodedVariant codedFeature = CodedVariant.parseEncodedId(variantBucket.getValue()); knockoutTypeCount.processFeature(codedFeature); } - VariantKnockoutStats variantStats = new VariantKnockoutStats(knockoutTypeCount.getNumIds(), knockoutTypeCount.getNumHomIds(), + knockoutTypeCount.calculateStats(); + VariantKnockoutStats variantStats = new VariantKnockoutStats(knockoutTypeCount.getNumIds(), knockoutTypeCount.getNumHomAltIds(), knockoutTypeCount.getNumCompHetIds(), knockoutTypeCount.getNumPairedCompHetIds(), knockoutTypeCount.getNumPairedDelOverlapIds(), knockoutTypeCount.getNumHetIds(), knockoutTypeCount.getNumDelOverlapIds()); geneSummary.setVariantStats(variantStats); @@ -1667,13 +1673,13 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer .append(QueryOptions.FACET, RgaDataModel.INDIVIDUAL_SUMMARY); facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, geneFacet); logger.debug("Gene IndividualSummary facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); - KnockoutTypeCount noParentsCount = new KnockoutTypeCount(auxQuery); - KnockoutTypeCount singleParentCount = new KnockoutTypeCount(auxQuery); - KnockoutTypeCount bothParentsCount = new KnockoutTypeCount(auxQuery); + IndividualKnockoutTypeCount noParentsCount = new IndividualKnockoutTypeCount(auxQuery); + IndividualKnockoutTypeCount singleParentCount = new IndividualKnockoutTypeCount(auxQuery); + IndividualKnockoutTypeCount bothParentsCount = new IndividualKnockoutTypeCount(auxQuery); for (FacetField.Bucket bucket : facetFieldDataResult.first().getBuckets()) { CodedIndividual codedIndividual = CodedIndividual.parseEncodedId(bucket.getValue()); - KnockoutTypeCount auxKnockoutType; + IndividualKnockoutTypeCount auxKnockoutType; switch (codedIndividual.getNumParents()) { case 0: auxKnockoutType = noParentsCount; @@ -1690,17 +1696,21 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer auxKnockoutType.processFeature(codedIndividual); } + noParentsCount.calculateStats(); + singleParentCount.calculateStats(); + bothParentsCount.calculateStats(); + IndividualKnockoutStats noParentIndividualStats = new IndividualKnockoutStats(noParentsCount.getNumIds(), - noParentsCount.getNumHomIds(), noParentsCount.getNumCompHetIds(), noParentsCount.getNumHetIds(), + noParentsCount.getNumHomAltIds(), noParentsCount.getNumCompHetIds(), noParentsCount.getNumHetIds(), noParentsCount.getNumDelOverlapIds(), noParentsCount.getNumHomAltCompHetIds(), noParentsCount.getNumCompHetDelOverlapIds() ); IndividualKnockoutStats singleParentIndividualStats = new IndividualKnockoutStats(singleParentCount.getNumIds(), - singleParentCount.getNumHomIds(), singleParentCount.getNumCompHetIds(), singleParentCount.getNumHetIds(), + singleParentCount.getNumHomAltIds(), singleParentCount.getNumCompHetIds(), singleParentCount.getNumHetIds(), singleParentCount.getNumDelOverlapIds(), singleParentCount.getNumHomAltCompHetIds(), singleParentCount.getNumCompHetDelOverlapIds() ); IndividualKnockoutStats bothParentIndividualStats = new IndividualKnockoutStats(bothParentsCount.getNumIds(), - bothParentsCount.getNumHomIds(), bothParentsCount.getNumCompHetIds(), bothParentsCount.getNumHetIds(), + bothParentsCount.getNumHomAltIds(), bothParentsCount.getNumCompHetIds(), bothParentsCount.getNumHetIds(), bothParentsCount.getNumDelOverlapIds(), bothParentsCount.getNumHomAltCompHetIds(), bothParentsCount.getNumCompHetDelOverlapIds() ); @@ -1729,7 +1739,7 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection throw RgaException.noResultsMatching(); } - KnockoutTypeCount knockoutTypeCount = new KnockoutTypeCount(auxQuery); + VariantKnockoutTypeCount knockoutTypeCount = new VariantKnockoutTypeCount(auxQuery, COMP_HET_QUERY_MODE); RgaDataModel rgaDataModel = rgaIterator.next(); stopWatch.reset(); @@ -1761,7 +1771,8 @@ private KnockoutByIndividualSummary calculateIndividualSummary(String collection CodedVariant codedFeature = CodedVariant.parseEncodedId(variantBucket.getValue()); knockoutTypeCount.processFeature(codedFeature); } - VariantKnockoutStats variantStats = new VariantKnockoutStats(knockoutTypeCount.getNumIds(), knockoutTypeCount.getNumHomIds(), + knockoutTypeCount.calculateStats(); + VariantKnockoutStats variantStats = new VariantKnockoutStats(knockoutTypeCount.getNumIds(), knockoutTypeCount.getNumHomAltIds(), knockoutTypeCount.getNumCompHetIds(), knockoutTypeCount.getNumPairedCompHetIds(), knockoutTypeCount.getNumPairedDelOverlapIds(), knockoutTypeCount.getNumHetIds(), knockoutTypeCount.getNumDelOverlapIds()); knockoutByIndividualSummary.setVariantStats(variantStats); diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java index 03c70b64522..eb985a4a2d1 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java @@ -798,23 +798,28 @@ private static CodedVariant decodeEncodedVariantId(String encodedVariant) throws } - public static class KnockoutTypeCount { - private Set variantIdQuery; - private Set dbSnpQuery; - private Set typeQuery; - private Set knockoutTypeQuery; - private Set clinicalSignificanceQuery; - private Set consequenceTypeQuery; - private List> popFreqQuery; - - // Valid CH pair variants - private Map> validChPairVariants; + public abstract static class KnockoutTypeCount { + private final Set variantIdQuery; + private final Set dbSnpQuery; + private final Set typeQuery; + private final Set knockoutTypeQuery; + private final Set clinicalSignificanceQuery; + private final Set consequenceTypeQuery; + private final List> popFreqQuery; private Set ids; - private Map> transcriptCompHetIdsMap; - private Map> transcriptDelOverlapIdsMap; - private Set homIds; - private Set hetIds; + protected Map> transcriptCompHetIdsMap; + protected Map> transcriptDelOverlapIdsMap; + protected Set compHetIds; + protected Set deletionOverlapIds; + protected Set homIds; + protected Set hetIds; + + private int numIds; + private int numHomAltIds; + private int numHetIds; + private int numCompHetIds; + private int numDelOverlapIds; public KnockoutTypeCount(Query query) throws RgaException { variantIdQuery = new HashSet<>(); @@ -824,10 +829,11 @@ public KnockoutTypeCount(Query query) throws RgaException { clinicalSignificanceQuery = new HashSet<>(); typeQuery = new HashSet<>(); consequenceTypeQuery = new HashSet<>(); - validChPairVariants = new HashMap<>(); ids = new HashSet<>(); transcriptCompHetIdsMap = new HashMap<>(); transcriptDelOverlapIdsMap = new HashMap<>(); + compHetIds = new HashSet<>(); + deletionOverlapIds = new HashSet<>(); homIds = new HashSet<>(); hetIds = new HashSet<>(); @@ -849,6 +855,12 @@ public KnockoutTypeCount(Query query) throws RgaException { popFreqQuery.add(new HashSet<>(values)); } } + + numIds = 0; + numHomAltIds = 0; + numHetIds = 0; + numCompHetIds = 0; + numDelOverlapIds = 0; } public boolean passesFilter(RgaUtils.CodedFeature codedFeature) { @@ -893,7 +905,7 @@ public void processFeature(RgaUtils.CodedFeature codedFeature) { return; } - ids.add(codedFeature.getId()); +// ids.add(codedFeature.getId()); KnockoutVariant.KnockoutType knockoutType = KnockoutVariant.KnockoutType.valueOf(codedFeature.getKnockoutType()); switch (knockoutType) { case HOM_ALT: @@ -919,6 +931,114 @@ public void processFeature(RgaUtils.CodedFeature codedFeature) { } } + protected void calculateStats() { + numCompHetIds = compHetIds.size(); + numDelOverlapIds = deletionOverlapIds.size(); + numHomAltIds = homIds.size(); + numHetIds = hetIds.size(); + + ids.addAll(homIds); + ids.addAll(hetIds); + ids.addAll(compHetIds); + ids.addAll(deletionOverlapIds); + numIds = ids.size(); + } + + public Set getIds() { + return ids; + } + + public int getNumIds() { + return numIds; + } + + public int getNumCompHetIds() { + return numCompHetIds; + } + + public int getNumHomAltIds() { + return numHomAltIds; + } + + public int getNumHetIds() { + return numHetIds; + } + + public int getNumDelOverlapIds() { + return numDelOverlapIds; + } + + public Map> getTranscriptCompHetIdsMap() { + Map> compHetMap = new HashMap<>(); + for (Map.Entry> entry : transcriptCompHetIdsMap.entrySet()) { + if (entry.getValue().size() > 1) { + compHetMap.put(entry.getKey(), new ArrayList<>(entry.getValue())); + } + } + return compHetMap; + } + } + + public static class IndividualKnockoutTypeCount extends KnockoutTypeCount { + + private int numHomAltCompHetIds; + private int numCompHetDelOverlapIds; + + public IndividualKnockoutTypeCount(Query query) throws RgaException { + super(query); + } + + @Override + public void calculateStats() { + compHetIds = transcriptCompHetIdsMap.values() + .stream() + .flatMap(Set::stream) + .collect(Collectors.toSet()); + + deletionOverlapIds = transcriptDelOverlapIdsMap.values() + .stream() + .flatMap(Set::stream) + .collect(Collectors.toSet()); + + Set homAltCompHetIds = new HashSet<>(homIds); + homAltCompHetIds.addAll(compHetIds); + numHomAltCompHetIds = homAltCompHetIds.size(); + + Set compHetDelOverlapIds = new HashSet<>(compHetIds); + compHetDelOverlapIds.addAll(deletionOverlapIds); + numCompHetDelOverlapIds = compHetDelOverlapIds.size(); + + super.calculateStats(); + } + + public int getNumHomAltCompHetIds() { + return numHomAltCompHetIds; + } + + public int getNumCompHetDelOverlapIds() { + return numCompHetDelOverlapIds; + } + } + + public static class VariantKnockoutTypeCount extends KnockoutTypeCount { + + // Valid CH pair variants + private Map> validPairedChPairVariants; + private Set validChPairVariants; + + private final RgaQueryParams.CompHetQueryMode compHetQueryMode; + + private int numPairedCompHetIds; + private int numPairedDelOverlapIds; + + public VariantKnockoutTypeCount(Query query, RgaQueryParams.CompHetQueryMode compHetQueryMode) throws RgaException { + super(query); + this.compHetQueryMode = compHetQueryMode; + + this.validPairedChPairVariants = new HashMap<>(); + this.validChPairVariants = new HashSet<>(); + } + public void processChPairFeature(RgaUtils.CodedChPairVariants codedFeature) { String leftVariant = codedFeature.getMaternalCodedVariant().getId(); String rightVariant = codedFeature.getPaternalCodedVariant().getId(); @@ -929,37 +1049,23 @@ public void processChPairFeature(RgaUtils.CodedChPairVariants codedFeature) { } // Keys are always lexicographically less than variants as values - if (!validChPairVariants.containsKey(leftVariant)) { - validChPairVariants.put(leftVariant, new HashSet<>()); + if (!validPairedChPairVariants.containsKey(leftVariant)) { + validPairedChPairVariants.put(leftVariant, new HashSet<>()); } - validChPairVariants.get(leftVariant).add(rightVariant); - } - - public Set getIds() { - return ids; + validPairedChPairVariants.get(leftVariant).add(rightVariant); + validChPairVariants.add(leftVariant); + validChPairVariants.add(rightVariant); } - public int getNumIds() { - return ids.size(); - } - public int getNumCompHetIds() { - return (int) transcriptCompHetIdsMap.values().stream().flatMap(Set::stream).distinct().count(); - } - - public int getNumPairedCompHetIds() { - int threshold = 250; + @Override + public void calculateStats() { + // Calculate number of comp_het pairs Set chPairs = new HashSet<>(); + Set pairedChPairs = new HashSet<>(); for (Map.Entry> entry : transcriptCompHetIdsMap.entrySet()) { Set chSet = entry.getValue(); if (chSet.size() > 1) { - if (chSet.size() > threshold) { - logger.warn("Showing a -1 value for the numPairedCompHet stats. More than {} COMP_HET variants found in" - + " transcript {}", threshold, entry.getKey()); - // Don't calculate this if the number of possible pairs is too big - return -1; - } - // Sort variants lexicographically so we just need to check once List sortedVariants = chSet.stream().sorted(String::compareTo).collect(Collectors.toList()); for (int i = 0; i < sortedVariants.size() - 1; i++) { @@ -967,18 +1073,33 @@ public int getNumPairedCompHetIds() { for (int j = i + 1; j < sortedVariants.size(); j++) { String rightVariant = sortedVariants.get(j); - if (validChPairVariants.containsKey(leftVariant) - && validChPairVariants.get(leftVariant).contains(rightVariant)) { - chPairs.add(leftVariant + "-" + rightVariant); + if (validPairedChPairVariants.containsKey(leftVariant) + && validPairedChPairVariants.get(leftVariant).contains(rightVariant)) { + pairedChPairs.add(leftVariant + "-" + rightVariant); + chPairs.add(leftVariant); + chPairs.add(rightVariant); } } } } } - return chPairs.size(); - } + numPairedCompHetIds = pairedChPairs.size(); + + // If we are searching by pairs, we should only count those that actually formed a pair + if (compHetQueryMode.equals(RgaQueryParams.CompHetQueryMode.PAIR)) { + compHetIds = transcriptCompHetIdsMap.values() + .stream() + .flatMap(Set::stream) + .filter(chPairs::contains) + .collect(Collectors.toSet()); + } else { + compHetIds = transcriptCompHetIdsMap.values() + .stream() + .flatMap(Set::stream) + .collect(Collectors.toSet()); + } - public int getNumPairedDelOverlapIds() { + // Process deletion overlap pairs Set delOverlapPairs = new HashSet<>(); for (Map.Entry> entry : transcriptDelOverlapIdsMap.entrySet()) { Set chSet = entry.getValue(); @@ -986,6 +1107,9 @@ public int getNumPairedDelOverlapIds() { List variantList = chSet.stream().map(Variant::new).collect(Collectors.toList()); for (int i = 0; i < variantList.size() - 1; i++) { for (int j = i + 1; j < variantList.size(); j++) { + deletionOverlapIds.add(variantList.get(i).toString()); + deletionOverlapIds.add(variantList.get(j).toString()); + // We simply check if two variants overlap. If they do, they are a valid pair if (variantList.get(i).overlapWith(variantList.get(j), true)) { String pair = concatSortedVariants(variantList.get(i).toString(), variantList.get(j).toString()); @@ -995,47 +1119,21 @@ public int getNumPairedDelOverlapIds() { } } } - return delOverlapPairs.size(); - } - - public int getNumHomIds() { - return homIds.size(); - } + numPairedDelOverlapIds = delOverlapPairs.size(); - public int getNumHetIds() { - return hetIds.size(); - } - - public int getNumDelOverlapIds() { - return (int) transcriptDelOverlapIdsMap.values().stream().flatMap(Set::stream).distinct().count(); + super.calculateStats(); } - public int getNumHomAltCompHetIds() { - Set ids = new HashSet<>(homIds); - ids.addAll(transcriptCompHetIdsMap.values().stream().flatMap(Set::stream).collect(Collectors.toSet())); - return ids.size(); - } - - public int getNumCompHetDelOverlapIds() { - Set ids = new HashSet<>(); - ids.addAll(transcriptDelOverlapIdsMap.values().stream().flatMap(Set::stream).collect(Collectors.toSet())); - ids.addAll(transcriptCompHetIdsMap.values().stream().flatMap(Set::stream).collect(Collectors.toSet())); - return ids.size(); + private String concatSortedVariants(String v1, String v2) { + return StringUtils.compare(v1, v2) <= 0 ? v1 + "__" + v2 : v2 + "__" + v1; } - public Map> getTranscriptCompHetIdsMap() { - Map> compHetMap = new HashMap<>(); - for (Map.Entry> entry : transcriptCompHetIdsMap.entrySet()) { - if (entry.getValue().size() > 1) { - compHetMap.put(entry.getKey(), new ArrayList<>(entry.getValue())); - } - } - return compHetMap; + public int getNumPairedCompHetIds() { + return numPairedCompHetIds; } - private String concatSortedVariants(String v1, String v2) { - return StringUtils.compare(v1, v2) <= 0 ? v1 + "__" + v2 : v2 + "__" + v1; + public int getNumPairedDelOverlapIds() { + return numPairedDelOverlapIds; } } - } From 822d012fd8c2a9e4c404b0eed46fb1ff3dd8446f Mon Sep 17 00:00:00 2001 From: pfurio Date: Fri, 3 Feb 2023 16:21:28 +0100 Subject: [PATCH 18/27] analysis: improve query parser with 2 pop freqs, #TASK-2478 --- .../opencga/analysis/rga/RgaManager.java | 2 +- .../opencga/analysis/rga/RgaQueryParser.java | 30 ++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index 42ab636cbe8..60b72c11392 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1632,7 +1632,7 @@ private KnockoutByGeneSummary calculateGeneSummary(String collection, Query quer QueryOptions variantFacet = new QueryOptions() .append(QueryOptions.LIMIT, -1) .append(QueryOptions.FACET, RgaDataModel.CH_PAIRS); - DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, geneQuery, variantFacet); + DataResult facetFieldDataResult = rgaEngine.facetedQuery(collection, auxQuery, variantFacet); logger.debug("Gene CH pairs facet: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); for (FacetField.Bucket variantBucket : facetFieldDataResult.first().getBuckets()) { CodedChPairVariants codedChPairVariants = CodedChPairVariants.parseEncodedId(variantBucket.getValue()); diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index bc45f5b1afe..5187deaf719 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -333,23 +333,43 @@ private void buildComplexQuery(List koValues, List filterValues, parseStringValue(orFilterList, RgaDataModel.COMPOUND_FILTERS, filterList, "||"); } else if (!ctValues.isEmpty() && !popFreqQueryList.isEmpty()) { // KT + FILTER + CT + POP_FREQ - List andQueryList = new ArrayList<>(popFreqQueryList.size()); + List andQueryList = new LinkedList<>(); if (popFreqQueryList.size() == 2) { - List orQueryList = new LinkedList<>(); + List koQueryList = new LinkedList<>(); for (String koValue : koValues) { if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { ArrayList popFreqKeys = new ArrayList<>(popFreqQueryList.keySet()); List> sortedPopFreqs = RgaUtils.generateSortedCombinations(popFreqQueryList.get(popFreqKeys.get(0)), popFreqQueryList.get(popFreqKeys.get(1))); + List popFreqAndQueryList = new LinkedList<>(); + List tmpOrQueryList = new LinkedList<>(); for (List sortedPopFreq : sortedPopFreqs) { for (String filterVal : chFilterValues) { for (String ctValue : chCtValues) { - orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + sortedPopFreq.get(0) - + SEPARATOR + sortedPopFreq.get(1)); + // CH__P__P__1583__1583__P1-1__P2-2 + tmpOrQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + + sortedPopFreq.get(0) + SEPARATOR + sortedPopFreq.get(1)); + } + } + } + parseStringValue(tmpOrQueryList, "", popFreqAndQueryList, "||"); + + List filterValuesOrList = new LinkedList<>(); + for (String filterVal : chFilterValues) { + List tmpAndList = new LinkedList<>(); + for (List popFreqList : popFreqQueryList.values()) { + List popFreqOrQueryList = new LinkedList<>(); + for (String popFreq : popFreqList) { + popFreqOrQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + popFreq); } + parseStringValue(popFreqOrQueryList, "", tmpAndList, "||"); } + parseStringValue(tmpAndList, "", filterValuesOrList, "&&"); } + parseStringValue(filterValuesOrList, "", popFreqAndQueryList, "||"); + parseStringValue(popFreqAndQueryList, "", koQueryList, "&&"); } else { + List orQueryList = new LinkedList<>(); for (String ctValue : ctValues) { if (koValue.equals(delOverlap) && !INCLUDED_DEL_OVERLAP_CONSEQUENCE_TYPES.contains(ctValue)) { // Don't process this filter @@ -367,8 +387,10 @@ private void buildComplexQuery(List koValues, List filterValues, parseStringValue(tmpAndQueryList, "", orQueryList, "&&"); } } + parseStringValue(orQueryList, "", koQueryList, "||"); } } + parseStringValue(koQueryList, "", andQueryList, "||"); } else { for (List tmpPopFreqList : popFreqQueryList.values()) { List orQueryList = new LinkedList<>(); From 054d7631c2dcc19071f6d64ab99d282d64151ea2 Mon Sep 17 00:00:00 2001 From: pfurio Date: Mon, 6 Feb 2023 11:38:24 +0100 Subject: [PATCH 19/27] analysis: store paired PFs to be able to filter, #TASK-2478 --- .../opencb/opencga/analysis/rga/RgaUtils.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java index eb985a4a2d1..0dcd7b9ceb7 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java @@ -327,6 +327,8 @@ public static Set generateCompoundHeterozygousPairCombination(List generateCompoundHeterozygousPairCombination(List> sortedFilterList = generateSortedCombinations(variant1.get(1), variant2.get(1)); + List> sortedCtList = generateSortedCombinations(variant1.get(2), variant2.get(2)); List simplifiedPopFreqList = generateSimplifiedPopulationFrequencyList(variant1.get(3), variant2.get(3)); for (List filterList : sortedFilterList) { for (String popFreq : simplifiedPopFreqList) { @@ -375,6 +378,23 @@ public static Set generateCompoundHeterozygousPairCombination(List terms = new LinkedList<>(); + terms.add(knockout); + terms.addAll(filterList); + terms.addAll(simplifiedPopFreqList); + result.add(terms); + + // And: KO - F1 - F2 - CT1 - CT2 - PF1' - PF2' ; where PF' is equivalent to the highest PF of both variants + for (List ctList : sortedCtList) { + terms = new LinkedList<>(); + terms.add(knockout); + terms.addAll(filterList); + terms.addAll(ctList); + terms.addAll(simplifiedPopFreqList); + result.add(terms); + } } Set combinations = new HashSet<>(); From dbab0967b888df8b87a40c253f978276b25f2b88 Mon Sep 17 00:00:00 2001 From: pfurio Date: Mon, 6 Feb 2023 12:45:14 +0100 Subject: [PATCH 20/27] analysis: remove useless CH pair combinations, #TASK-2478 --- .../opencb/opencga/analysis/rga/RgaUtils.java | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java index 0dcd7b9ceb7..a506c98c34b 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaUtils.java @@ -324,8 +324,6 @@ public static Set generateCompoundHeterozygousPairCombination(List generateCompoundHeterozygousPairCombination(List> previousIteration = result; - for (int i = 1; i < 4; i++) { + for (int i = 1; i < 3; i++) { // The list will contain all Filter, CT or PF combinations between variant1 and variant2 in a sorted manner to reduce the // number of terms List> sortedCombinations = generateSortedCombinations(variant1.get(i), variant2.get(i)); @@ -351,16 +349,6 @@ public static Set generateCompoundHeterozygousPairCombination(List> sortedPfCombinations = generateSortedCombinations(variant1.get(3), variant2.get(3)); - for (List previousValues : previousIteration) { - for (List values : sortedPfCombinations) { - List newValues = new ArrayList<>(previousValues); - newValues.addAll(values); - result.add(newValues); - } - } } result.addAll(newResults); previousIteration = newResults; From 09aaaff18421e598503fa40a01737818bd8734c7 Mon Sep 17 00:00:00 2001 From: pfurio Date: Fri, 10 Mar 2023 14:05:29 +0100 Subject: [PATCH 21/27] analysis: fix queries with pop freq < 0.001 --- .../org/opencb/opencga/analysis/rga/RgaQueryParser.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index 5187deaf719..599275f46f6 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -399,8 +399,11 @@ private void buildComplexQuery(List koValues, List filterValues, List finalCtValues = koValue.equals(encodedChString) ? chCtValues : ctValues; for (String filterVal : finalFilterValues) { for (String ctValue : finalCtValues) { - if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString) - && tmpPopFreqList.size() > 1) { + if (compHetQueryMode.equals(CompHetQueryMode.PAIR) && koValue.equals(encodedChString)) { + if (tmpPopFreqList.size() == 1) { + // Replicate the same value so it filters as a pair + tmpPopFreqList.add(tmpPopFreqList.get(0)); + } List sortedCombinations = generateSortedCombinations(tmpPopFreqList); for (String popFreqPair : sortedCombinations) { orQueryList.add(koValue + SEPARATOR + filterVal + SEPARATOR + ctValue + SEPARATOR + popFreqPair); From 4527509c12040b0474d524e433ef71f701b16cc5 Mon Sep 17 00:00:00 2001 From: pfurio Date: Mon, 20 Mar 2023 15:15:43 +0100 Subject: [PATCH 22/27] analysis: add collection suffix for RGA, #TASK-2478 --- .../opencb/opencga/analysis/rga/RgaManager.java | 6 ++++-- .../core/config/RgaSearchConfiguration.java | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index 60b72c11392..db9c36e9793 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1962,11 +1962,13 @@ public void testConnection() throws StorageEngineException { } private String getMainCollectionName(String study) { - return catalogManager.getConfiguration().getDatabasePrefix() + "-rga-" + study.replace("@", "_").replace(":", "_"); + return catalogManager.getConfiguration().getDatabasePrefix() + "-rga-" + study.replace("@", "_").replace(":", "_") + + storageConfiguration.getRga().getSuffix(); } private String getAuxCollectionName(String study) { - return catalogManager.getConfiguration().getDatabasePrefix() + "-rga-aux-" + study.replace("@", "_").replace(":", "_"); + return catalogManager.getConfiguration().getDatabasePrefix() + "-rga-aux-" + study.replace("@", "_").replace(":", "_") + + storageConfiguration.getRga().getSuffix(); } @Override diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java b/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java index 688ad0c7563..fc89f23420d 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/config/RgaSearchConfiguration.java @@ -6,15 +6,17 @@ public class RgaSearchConfiguration extends SearchConfiguration { private boolean cache; private int cacheSize; + private String suffix; public RgaSearchConfiguration() { } public RgaSearchConfiguration(List hosts, String configSet, String mode, String user, String password, String manager, - boolean active, int timeout, int insertBatchSize, boolean cache, int cacheSize) { + boolean active, int timeout, int insertBatchSize, boolean cache, int cacheSize, String suffix) { super(hosts, configSet, mode, user, password, manager, active, timeout, insertBatchSize); this.cache = cache; this.cacheSize = cacheSize; + this.suffix = suffix; } @Override @@ -22,6 +24,7 @@ public String toString() { final StringBuilder sb = new StringBuilder("RgaSearchConfiguration{"); sb.append("cache=").append(cache); sb.append(", cacheSize=").append(cacheSize); + sb.append(", suffix='").append(suffix).append('\''); sb.append('}'); return sb.toString(); } @@ -43,4 +46,13 @@ public RgaSearchConfiguration setCacheSize(int cacheSize) { this.cacheSize = cacheSize; return this; } + + public String getSuffix() { + return suffix; + } + + public RgaSearchConfiguration setSuffix(String suffix) { + this.suffix = suffix; + return this; + } } From cd680096566f4329da19becd15165d36bda62dbd Mon Sep 17 00:00:00 2001 From: pfurio Date: Mon, 20 Mar 2023 15:35:28 +0100 Subject: [PATCH 23/27] analysis: check suffix is null, #TASK-2478 --- .../main/java/org/opencb/opencga/analysis/rga/RgaManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index db9c36e9793..9e4d665ef34 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -1963,12 +1963,12 @@ public void testConnection() throws StorageEngineException { private String getMainCollectionName(String study) { return catalogManager.getConfiguration().getDatabasePrefix() + "-rga-" + study.replace("@", "_").replace(":", "_") - + storageConfiguration.getRga().getSuffix(); + + (storageConfiguration.getRga().getSuffix() != null ? storageConfiguration.getRga().getSuffix() : ""); } private String getAuxCollectionName(String study) { return catalogManager.getConfiguration().getDatabasePrefix() + "-rga-aux-" + study.replace("@", "_").replace(":", "_") - + storageConfiguration.getRga().getSuffix(); + + (storageConfiguration.getRga().getSuffix() != null ? storageConfiguration.getRga().getSuffix() : ""); } @Override From 7293adcb90b865f974ed46e4724f72e22f9ee033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 21 Mar 2023 18:19:07 +0000 Subject: [PATCH 24/27] storage: Fix trio serialization in CompoundHeterozygous variant query. --- .../manager/VariantCatalogQueryUtils.java | 5 ++- .../storage/core/metadata/models/Trio.java | 43 +++++++++++++++++++ .../core/variant/VariantStorageEngine.java | 2 +- .../CompoundHeterozygousQueryExecutor.java | 12 ++++-- ...CompoundHeterozygousQueryExecutorTest.java | 15 +++++++ 5 files changed, 70 insertions(+), 7 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java index 1cdba30f1f3..74114c5e1f2 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java @@ -58,6 +58,7 @@ import org.opencb.opencga.core.response.OpenCGAResult; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.utils.CellBaseUtils; import org.opencb.opencga.storage.core.variant.adaptors.VariantField; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException; @@ -549,7 +550,7 @@ public Query parseQuery(Query query, QueryOptions queryOptions, CellBaseUtils ce "Require at least one parent to get compound heterozygous"); } - query.append(SAMPLE_COMPOUND_HETEROZYGOUS.key(), Arrays.asList(childId, fatherId, motherId)); + query.append(SAMPLE_COMPOUND_HETEROZYGOUS.key(), new Trio(fatherId, motherId, childId)); } else { if (family.getDisorders().isEmpty()) { throw VariantQueryException.malformedParam(FAMILY, familyId, "Family doesn't have disorders"); @@ -1024,7 +1025,7 @@ private void processSampleFilter(Query query, String defaultStudyStr, String tok String fatherId = member.getFather() != null ? member.getFather().getId() : MISSING_SAMPLE; String motherId = member.getMother() != null ? member.getMother().getId() : MISSING_SAMPLE; - query.put(SAMPLE_COMPOUND_HETEROZYGOUS.key(), Arrays.asList(member.getId(), fatherId, motherId)); + query.put(SAMPLE_COMPOUND_HETEROZYGOUS.key(), new Trio(fatherId, motherId, member.getId())); query.remove(SAMPLE.key()); } else if (moi == ClinicalProperty.ModeOfInheritance.DE_NOVO) { query.put(SAMPLE_DE_NOVO.key(), member.getId()); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java index 58e0e6cff7f..2c12a0021e0 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java @@ -1,7 +1,10 @@ package org.opencb.opencga.storage.core.metadata.models; +import org.apache.logging.log4j.util.Strings; + import java.util.ArrayList; import java.util.List; +import java.util.Objects; public class Trio { private final String id; @@ -9,6 +12,21 @@ public class Trio { private final String mother; private final String child; + public Trio(List trio) { + this(null, trio); + } + + public Trio(String id, List trio) { + this.id = id; + this.father = trio.get(1); + this.mother = trio.get(2); + this.child = trio.get(0); + } + + public Trio(String father, String mother, String child) { + this(null, father, mother, child); + } + public Trio(String id, String father, String mother, String child) { this.id = id; this.father = father; @@ -43,4 +61,29 @@ public List toList() { } return list; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Trio trio = (Trio) o; + return Objects.equals(id, trio.id) + && Objects.equals(father, trio.father) + && Objects.equals(mother, trio.mother) + && Objects.equals(child, trio.child); + } + + @Override + public int hashCode() { + return Objects.hash(id, father, mother, child); + } + + @Override + public String toString() { + return Strings.join(toList(), ','); + } } diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java index 5e2558339de..f1ae7490d50 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java @@ -1073,7 +1073,7 @@ public VariantQueryResult getCompoundHeterozygous(String study, String father = StringUtils.isEmpty(father) ? CompoundHeterozygousQueryExecutor.MISSING_SAMPLE : father; mother = StringUtils.isEmpty(mother) ? CompoundHeterozygousQueryExecutor.MISSING_SAMPLE : mother; query = new Query(query) - .append(VariantQueryUtils.SAMPLE_COMPOUND_HETEROZYGOUS.key(), Arrays.asList(child, father, mother)) + .append(VariantQueryUtils.SAMPLE_COMPOUND_HETEROZYGOUS.key(), new Trio(father, mother, child)) .append(VariantQueryParam.STUDY.key(), study); return get(query, options); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutor.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutor.java index ece9915aa76..db0815b5135 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutor.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutor.java @@ -210,15 +210,15 @@ protected List getAndCheckIncludeSample(Query query, String proband, Str // Check it has all required members if (!includeSamples.contains(proband)) { throw VariantQueryException.malformedParam(VariantQueryParam.INCLUDE_SAMPLE, includeSamples.toString(), - "Can not compute CompoundHeterozygous not including the proband in the query"); + "Can not compute CompoundHeterozygous not including the proband '" + proband + "' in the query"); } if (!mother.equals(MISSING_SAMPLE) && !includeSamples.contains(mother)) { throw VariantQueryException.malformedParam(VariantQueryParam.INCLUDE_SAMPLE, includeSamples.toString(), - "Can not compute CompoundHeterozygous not including the mother in the query"); + "Can not compute CompoundHeterozygous not including the mother '" + mother + "' in the query"); } if (!father.equals(MISSING_SAMPLE) && !includeSamples.contains(father)) { throw VariantQueryException.malformedParam(VariantQueryParam.INCLUDE_SAMPLE, includeSamples.toString(), - "Can not compute CompoundHeterozygous not including the father in the query"); + "Can not compute CompoundHeterozygous not including the father '" + father + "' in the query"); } } else { if (father.equals(MISSING_SAMPLE)) { @@ -265,9 +265,13 @@ protected VariantDBIterator getRawIterator(String proband, String father, String } protected Trio getCompHetTrio(Query query) { + Object o = query.get(SAMPLE_COMPOUND_HETEROZYGOUS.key()); + if (o instanceof Trio) { + return ((Trio) o); + } List samples = query.getAsStringList(VariantQueryUtils.SAMPLE_COMPOUND_HETEROZYGOUS.key()); if (samples.size() == 3) { - return new Trio(null, samples.get(2), samples.get(0), samples.get(1)); + return new Trio(samples); } else if (samples.size() == 1) { int studyId = metadataManager.getStudyId(query.getString(VariantQueryParam.STUDY.key())); String sample = samples.get(0); diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutorTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutorTest.java index 23144754972..d0f10800f9b 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutorTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/CompoundHeterozygousQueryExecutorTest.java @@ -5,6 +5,7 @@ import org.mockito.Mockito; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.variant.adaptors.VariantField; import org.opencb.opencga.storage.core.variant.adaptors.VariantIterable; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException; @@ -19,6 +20,7 @@ import static org.junit.Assert.assertFalse; import static org.opencb.opencga.storage.core.variant.adaptors.VariantField.*; import static org.opencb.opencga.storage.core.variant.query.VariantQueryUtils.ALL; +import static org.opencb.opencga.storage.core.variant.query.VariantQueryUtils.SAMPLE_COMPOUND_HETEROZYGOUS; /** * Created on 09/04/19. @@ -65,6 +67,19 @@ public void testBuildQueryOptions() { STUDIES, STUDIES_SAMPLES)), includeFields); } + @Test + public void getCompHetTrio() { + Trio expected = new Trio("F", "M", "C"); + Trio actual = ch.getCompHetTrio(new Query(SAMPLE_COMPOUND_HETEROZYGOUS.key(), expected.toList())); + assertEquals(expected, actual); + + actual = ch.getCompHetTrio(new Query(SAMPLE_COMPOUND_HETEROZYGOUS.key(), expected.toString())); + assertEquals(expected, actual); + + actual = ch.getCompHetTrio(new Query(SAMPLE_COMPOUND_HETEROZYGOUS.key(), expected)); + assertEquals(expected, actual); + } + @Test public void testGetAndCheckIncludeSample() { From 37d809a067e7dbda6ce62e42159725f09fd4a96b Mon Sep 17 00:00:00 2001 From: pfurio Date: Wed, 22 Mar 2023 14:51:42 +0100 Subject: [PATCH 25/27] analysis: remove limit to variant query, #TASK-2478 --- .../main/java/org/opencb/opencga/analysis/rga/RgaEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java index ea33cae2f92..1251874eb1e 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaEngine.java @@ -226,7 +226,7 @@ private void fixGeneOptions(QueryOptions queryOptions, Query query, SolrQuery so public RgaIterator variantQuery(String collection, Query query, QueryOptions queryOptions) throws RgaException { SolrQuery solrQuery = parser.parseQuery(query); fixVariantOptions(queryOptions, query, solrQuery); - solrQuery.setRows(queryOptions.getInt(QueryOptions.LIMIT, Integer.MAX_VALUE)); + solrQuery.setRows(Integer.MAX_VALUE); try { return new RgaIterator(solrManager.getSolrClient(), collection, solrQuery); } catch (SolrServerException e) { From 2d95ee27d039df9ef0feccf9f4f3ae65a92f6fd0 Mon Sep 17 00:00:00 2001 From: pfurio Date: Wed, 5 Apr 2023 11:07:20 +0200 Subject: [PATCH 26/27] analysis: reduce number of parallel tasks, #TASK-2478 --- .../main/java/org/opencb/opencga/analysis/rga/RgaManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java index 9e4d665ef34..848aca3796a 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaManager.java @@ -199,7 +199,7 @@ private void load(String study, Path file, String token) throws RgaException { writer, ParallelTaskRunner.Config.builder() .setBatchSize(1) - .setNumTasks(2) // Write is definitely slower than process. More threads won't help much. + .setNumTasks(1) // Write is definitely slower than process. More threads won't help much. .build() ); From 1a3c9a86245472f3d488ef65c04a697059599e98 Mon Sep 17 00:00:00 2001 From: pfurio Date: Wed, 5 Apr 2023 15:44:26 +0200 Subject: [PATCH 27/27] analysis: fix pop freq queries over CH variants, #TASK-2478 --- .../opencb/opencga/analysis/rga/RgaQueryParser.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java index 599275f46f6..644f70f8cf2 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/rga/RgaQueryParser.java @@ -308,6 +308,18 @@ private void buildComplexQuery(List koValues, List filterValues, // To generate pairs to query for complete COMP_HET variants chFilterValues = generateSortedCombinations(filterValues); chCtValues = generateSortedCombinations(ctValues); + if (popFreqQueryList.size() == 1) { + // Add the missing pair so queries are done properly + if (popFreqQueryList.keySet().contains(RgaUtils.GNOMAD_GENOMES_STUDY)) { + List missingPopFreq = Collections.singletonList(RgaUtils.THOUSAND_GENOMES_STUDY + ":ALL>=0"); + Map> tmpMap = RgaUtils.parsePopulationFrequencyQuery(missingPopFreq); + popFreqQueryList.putAll(tmpMap); + } else if (popFreqQueryList.keySet().contains(RgaUtils.THOUSAND_GENOMES_STUDY)) { + List missingPopFreq = Collections.singletonList(RgaUtils.GNOMAD_GENOMES_STUDY + ":ALL>=0"); + Map> tmpMap = RgaUtils.parsePopulationFrequencyQuery(missingPopFreq); + popFreqQueryList.putAll(tmpMap); + } + } } if (ctValues.isEmpty() && popFreqQueryList.isEmpty()) {