Skip to content

Commit

Permalink
Merge branch 'develop' into next
Browse files Browse the repository at this point in the history
  • Loading branch information
javild committed Mar 18, 2020
2 parents 72ded0f + bf5d8cd commit e910cd5
Show file tree
Hide file tree
Showing 85 changed files with 3,367 additions and 752 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -16,7 +16,7 @@ CellBase constitutes the knowledge-base component of [OpenCB](http://www.opencb.
Note: This repository is a major refactoring of https://github.com/opencb-cloud. All users, please update to this one.

### Documentation
You can find CellBase documentation and tutorials at: https://github.com/opencb/cellbase/wiki.
You can find CellBase documentation and tutorials at: http://docs.opencb.org/display/cellbase.

For documenting RESTful web services [Swagger](http://swagger.io/) has been set-up and is available at http://bioinfo.hpc.cam.ac.uk/cellbase/webservices/.

Expand Down
42 changes: 42 additions & 0 deletions cellbase-app/app/mongodb-scripts/clinical-indexes.js
@@ -0,0 +1,42 @@

/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Creates the indexes of the 'clinical_variants' collection.
// Each entry below is the exact argument list handed to createIndex():
// the key specification, optionally followed by an options document.
// Entries are applied in the order listed.
(function () {
    var clinicalVariants = db.getCollection('clinical_variants');

    var indexArguments = [
        [{'id': 1}],
        [{'type': 1}],
        // Compound index on the genomic coordinates
        [{'chromosome': 1, 'start': 1, 'end': 1}],
        [{'annotation.traitAssociation.source.name': 1}],
        [{'annotation.consequenceTypes.sequenceOntologyTerms.name': 1}],
        [{'_featureXrefs': 1}],
        [{'annotation.traitAssociation.id': 1}],
        [{'annotation.id': 1}],
        [{'annotation.hgvs': 1}],
        // The following four indexes are created as sparse indexes
        [{'annotation.traitAssociation.consistencyStatus': 1}, {sparse: true}],
        [{'annotation.traitAssociation.variantClassification.clinicalSignificance': 1}, {sparse: true}],
        [{'annotation.traitAssociation.heritableTraits.inheritanceMode': 1}, {sparse: true}],
        [{'annotation.traitAssociation.alleleOrigin': 1}, {sparse: true}],
        [{'_traits': 1}]
    ];

    indexArguments.forEach(function (args) {
        // apply() forwards exactly the listed arguments (one or two) to createIndex
        clinicalVariants.createIndex.apply(clinicalVariants, args);
    });
})();

//db.getCollection('clinical_variants').createIndex({'annotation.traitAssociation.heritableTraits.trait':'text',
// 'annotation.traitAssociation.somaticInformation.primarySite': 'text',
// 'annotation.traitAssociation.somaticInformation.siteSubtype': 'text',
// 'annotation.traitAssociation.somaticInformation.primaryHistology': 'text',
// 'annotation.traitAssociation.somaticInformation.histologySubtype': 'text',
// 'annotation.traitAssociation.somaticInformation.sampleSource': 'text',
// 'annotation.traitAssociation.somaticInformation.tumourOrigin': 'text'}, {name: "_diseasePhenotype"})



Expand Up @@ -85,6 +85,7 @@ public class SpeciesAndAssemblyCommandOptions {

/**
 * Assembly name read from the {@code -a} / {@code --assembly} command line option (optional, takes one value).
 * Per the option description, when it is left empty the first assembly in configuration.yml is used;
 * that fallback is not performed here - presumably it is applied by the code reading this field (TODO confirm).
 */
@Parameter(names = {"-a", "--assembly"}, description = "Name of the assembly, if empty the first assembly in configuration.yml will be used",
required = false, arity = 1)

public String assembly;

}
Expand Down
Expand Up @@ -249,6 +249,12 @@ public class VariantAnnotationCommandOptions {
+ " By default imprecision annotation is enabled.", required = false, arity = 0)
public boolean noImprecision;

/**
 * Value of the optional {@code --check-aminoacid-change} command line option.
 * Per the option description, it controls whether the variant match in the clinical variant collection
 * is also performed at the amino acid change level.
 * NOTE(review): the option is declared with arity = 0, so it presumably acts as a presence flag even though
 * the description text mentions "true/false" - confirm the wording is intended.
 */
@Parameter(names = {"--check-aminoacid-change"}, description = "true/false to specify whether variant match in " +
"the clinical variant collection should also be performed at the aminoacid change level",
required = false,
arity = 0)
public boolean checkAminoAcidChange;

@DynamicParameter(names = "-D", description = "Dynamic parameters. Available parameters: "
+ "{population-frequencies=for internal purposes mainly. Full path to a json file containing Variant "
+ "documents that include lists of population frequencies objects. Will allow annotating the input file "
Expand Down
Expand Up @@ -108,7 +108,7 @@ private void removeInvalidVariants(List<VariantAnnotation> variantAnnotationList
*/
private boolean isValid(VariantAnnotation variantAnnotation) {
    String alternateAllele = variantAnnotation.getAlternate();

    // The alternate allele must conform to the accepted variant string pattern.
    // (The equivalent pattern checks on the reference were commented out in the
    // original code and remain disabled here.)
    if (!alternateAllele.matches(VARIANT_STRING_PATTERN)) {
        return false;
    }

    // An alternate identical to the reference is not considered valid
    if (alternateAllele.equals(variantAnnotation.getReference())) {
        return false;
    }

    return true;
}

Expand Down
Expand Up @@ -574,6 +574,8 @@ private void checkParameters() throws IOException {
parsePhaseConfiguration();
decompose = !variantAnnotationCommandOptions.skipDecompose;
leftAlign = !variantAnnotationCommandOptions.skipLeftAlign;
// Update serverQueryOptions
serverQueryOptions.put("checkAminoAcidChange", variantAnnotationCommandOptions.checkAminoAcidChange);

// output file
if (variantAnnotationCommandOptions.output != null) {
Expand Down
@@ -0,0 +1,157 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.app.builders;

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.MapperFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.Before;
import org.junit.Test;
import org.opencb.biodata.models.core.Exon;
import org.opencb.biodata.models.core.Gene;
import org.opencb.biodata.models.core.Transcript;
import org.opencb.cellbase.core.config.SpeciesConfiguration;
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.builders.GeneParser;
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.*;


/**
 * Tests for {@link GeneParser}: each test runs the parser over the resources in the {@code /gene} test folder
 * and then checks the genes read back from the serialized JSON output.
 */
public class GeneParserTest {

    /** File the tests read the parser output from (the serializer is configured to write into /tmp/). */
    private static final String SERIALIZED_GENES_FILE = "/tmp/gene.json.gz";

    private GeneParser geneParser;
    private ObjectMapper jsonObjectMapper;

    /**
     * Intentionally empty: the fixture is built by {@link #init()}, which JUnit runs before every test because
     * of its {@code @Before} annotation. Calling it from here as well would build the parser and its
     * serializer twice per test.
     *
     * @throws URISyntaxException kept in the signature for backward compatibility
     */
    public GeneParserTest() throws URISyntaxException {
    }

    /**
     * Builds the parser under test and the JSON mapper used to read its output back.
     *
     * @throws URISyntaxException if a test resource URL cannot be converted to a URI
     */
    @Before
    public void init() throws URISyntaxException {
        Path genomeSequenceFastaFile
                = Paths.get(getClass().getResource("/gene/Homo_sapiens.GRCh38.fa.gz").toURI());
        Path geneDirectoryPath = Paths.get(getClass().getResource("/gene").toURI());
        // put the results in /tmp
        CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "gene",
                true);
        SpeciesConfiguration species = new SpeciesConfiguration("hsapiens", "Homo sapiens",
                "human", null, null, null);
        geneParser = new GeneParser(geneDirectoryPath, genomeSequenceFastaFile, species, serializer);
        jsonObjectMapper = new ObjectMapper();
        jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
        jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
    }

    /**
     * Checks a case in which the stop codon is the first 3 nts of an exon. ENSE00003800362 is in the negative strand.
     * genomicCodingEnd, cdnaCodingStart and cdsStart must be set "manually" within the parser as there's no CDS line
     * in the GTF since the stop codon itself is not part of the coding sequence (but historically considered part of
     * the coding region in CellBase)
     */
    @Test
    public void testEdgeExonCodingStart() throws Exception {
        geneParser.parse();
        List<Gene> genes = loadSerializedGenes(SERIALIZED_GENES_FILE);
        Exon exon = getExon("ENSE00003800362", genes);
        assertNotNull(exon);
        assertEquals(28477630, exon.getGenomicCodingEnd());
        assertEquals(1302, exon.getCdnaCodingStart());
        assertEquals(1198, exon.getCdsStart());
    }

    /**
     * Finds an exon by id across all transcripts of all the given genes.
     *
     * @param exonId id of the exon to look for
     * @param genes genes to search
     * @return the first exon whose id equals {@code exonId}, or {@code null} if there is none
     */
    private Exon getExon(String exonId, List<Gene> genes) {
        for (Gene gene : genes) {
            for (Transcript transcript : gene.getTranscripts()) {
                for (Exon exon : transcript.getExons()) {
                    if (exonId.equals(exon.getId())) {
                        return exon;
                    }
                }
            }
        }

        return null;
    }

    /** Checks the cDNA sequence stored for transcript ENST00000456328. */
    @Test
    public void testTranscriptSequence() throws Exception {
        geneParser.parse();
        final String TRANSCRIPT_SEQUENCE = "GTTAACTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGACTTCCCGTGTCCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTTCATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGGGATGGGCCATTGTTCATCTTCTGGCCCCTGTTGTCTGCATGTAACTTAATACCACAACCAGGCATAGGGGAAAGATTGGAGGAAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCAGTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAGGCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCTCCTGGACCAGTGATACACCCGGCACCCTGTCCTGGACACGCTGTTGGCCTGGATCTGAGCCCTGGTGGAGGTCAAAGCCACCTTTGGTTCTGCCATTGCTGCTGTGTGGAAGTTCACTCCTGCCTTTTCCTTTCCCTAGAGCCTCCACCACCCCGAGATCACATTTCTCACTGCCTTTTGTCTGCCCAGTTTCACCAGAAGTAGGCCTCTTCCTGACAGGCAGCTGCACCACTGCCTGGCGCTGTGCCCTTCCTTTGCTCTGCCCGCTGGAGACGGTGTTTGTCATGGGCCTGGTCTGCAGGGATCCTGCTACAAAGGTGAAACCCAGGAGAGTGTGGAGTCCAGAGTGTTGCCAGGACCCAGGCACAGGCATTAGTGCCCGTTGGAGAAAACAGGGGAATCCCGAAGAAATGGTGGGTCCTGGCCATCCGTGAGATCTTCCCAGGGCAGCTCCCCTCTGTGGAATCCAATCTGTCTTCCATCCTGCGTGGCCGAGGGCCAGGCTTCTCACTGGGCCTCTGCAGGAGGCTGCCATTTGTCCTGCCCACCTTCTTAGAAGCGAGACGGAGCAGACCCATCTGCTACTGCCCTTTCTATAATAACTAAAGTTAGCTGCCCTGGACTATTCACCCCCTAGTCTCAATTTAAGAAGATCCCCATGGCCACAGGGCCCCTGCCTGGGGGCTTGTCACCTCCCCCACCTTCTTCCTGAGTCATTCCTGCAGCCTTGCTCCCTAACCTGCCCCACAGCCTTGCCTGGATTTCTATCTCCCTGGCTTGGTGCCAGTTCCTCCAAGTCGATGGCACCTCCCTCCCTCTCAACCACTTGAGCAAACTCCAAGACATCTTCTACCCCAACACCAGCAATTGTGCCAAGGGCCATTAGGCTCTCAGCATGACTATTTTTAGAGACCCCGTGTCTGTCACTGAAACCTTTTTTGTGGGAGACTATTCCTCCCATCTGCAACAGCTGCCCCTGCTGACTGCCCTTCTCTCCTCCCTCTCATCCCAGAGAAACAGGTCAGCTGGGAGCTTCTGCCCCCACTGCCTAGGGACCAACAGGGGCAGGAGGCAGTCACTGACCCCGAGACGTTTGCATCCTGCACAGCTAGAGATCCTTTATTAAAAGCACACTGTTGGTTTCTG";
        List<Gene> genes = loadSerializedGenes(SERIALIZED_GENES_FILE);
        Transcript transcript = getTranscript("ENST00000456328", genes);
        assertNotNull(transcript);
        assertEquals(TRANSCRIPT_SEQUENCE, transcript.getcDnaSequence());
    }

    /**
     * Finds a transcript by id across all the given genes.
     *
     * @param transcriptId id of the transcript to look for
     * @param geneList genes to search
     * @return the first transcript whose id equals {@code transcriptId}, or {@code null} if there is none
     */
    private Transcript getTranscript(String transcriptId, List<Gene> geneList) {
        for (Gene gene : geneList) {
            for (Transcript transcript : gene.getTranscripts()) {
                if (transcript.getId().equals(transcriptId)) {
                    return transcript;
                }
            }
        }

        return null;
    }

    /**
     * Checks the number of serialized genes and the protein sequence of transcript ENST00000456328 within gene
     * ENSG00000223972.
     * NOTE(review): the protein sequence assertion only runs if that gene/transcript pair is found; if it is
     * absent from the output only the gene count is verified - confirm whether that is intended.
     */
    @Test
    public void testProteinSequence() throws Exception {
        geneParser.parse();
        final String PROTEIN_SEQUENCE = "MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF";
        List<Gene> genes = loadSerializedGenes(SERIALIZED_GENES_FILE);
        assertEquals(15, genes.size());
        for (Gene gene : genes) {
            if (gene.getId().equals("ENSG00000223972")) {
                for (Transcript transcript : gene.getTranscripts()) {
                    if (transcript.getId().equals("ENST00000456328")) {
                        assertEquals(PROTEIN_SEQUENCE, transcript.getProteinSequence());
                    }
                }
            }
        }
    }

    /**
     * Reads a file containing one JSON-serialized {@link Gene} per line. Lines starting with '#' and blank lines
     * are skipped.
     *
     * @param fileName path of the file to read
     * @return the genes in file order
     * @throws IOException if the file cannot be read or a line cannot be deserialized; propagated so that the
     *                     calling test fails instead of silently continuing with an empty or partial list
     */
    private List<Gene> loadSerializedGenes(String fileName) throws IOException {
        List<Gene> geneList = new ArrayList<>();

        // try-with-resources guarantees the reader is closed even when parsing a line fails
        try (BufferedReader bufferedReader = FileUtils.newBufferedReader(Paths.get(fileName))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                if (line.startsWith("#") || line.trim().isEmpty()) {
                    continue;
                }
                geneList.add(jsonObjectMapper.readValue(line, Gene.class));
            }
        }

        return geneList;
    }

}
Expand Up @@ -24,9 +24,14 @@
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.PopulationFrequency;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser;
import org.opencb.cellbase.app.cli.admin.executors.LoadCommandExecutor;
import org.opencb.cellbase.app.cli.main.CellBaseCliOptionsParser;
import org.opencb.cellbase.app.cli.main.executors.VariantAnnotationCommandExecutor;
import org.opencb.cellbase.core.variant.AnnotationBasedPhasedQueryManager;
import org.opencb.commons.datastore.core.DataStoreServerAddress;
import org.opencb.commons.datastore.mongodb.MongoDBConfiguration;
import org.opencb.commons.datastore.mongodb.MongoDataStoreManager;
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
Expand All @@ -36,6 +41,7 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

Expand All @@ -47,6 +53,7 @@
public class VariantAnnotationCommandExecutorTest {

private static final String OUTPUT_FILENAME = "/tmp/test.json.gz";
private static final String GRCH37_DBNAME = "cellbase_hsapiens_grch37_v4";
private Path resourcesFolder = Paths.get(getClass().getResource("/variant/annotation/").toURI());

private ObjectMapper jsonObjectMapper;
Expand All @@ -57,6 +64,70 @@ public VariantAnnotationCommandExecutorTest() throws URISyntaxException {
jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
}

/**
 * Annotates the same single-variant VCF twice against freshly loaded "clinical_variants" and "gene" test data:
 * first with checkAminoAcidChange left unset (no trait association expected) and then with
 * checkAminoAcidChange set to true (exactly one trait association, COSM4624460, expected).
 * Both runs write to OUTPUT_FILENAME, so the statement order within this test matters.
 */
@Test
public void proteinChangeMatchTest() throws IOException, URISyntaxException {
// Remove database content
cleanUp();
// Load test data (clinical_variants and gene) into the GRCh37 test database
AdminCliOptionsParser.LoadCommandOptions loadCommandOptions = new AdminCliOptionsParser().getLoadCommandOptions();
loadCommandOptions.commonOptions.conf = resourcesFolder.resolve("commandExecutor/configuration.json").toString();
loadCommandOptions.data = "clinical_variants,gene";
loadCommandOptions.database = GRCH37_DBNAME;
loadCommandOptions.input = resourcesFolder.resolve("commandExecutor/proteinChangeMatch").toString();
LoadCommandExecutor loadCommandExecutor = new LoadCommandExecutor(loadCommandOptions);
loadCommandExecutor.loadCellBaseConfiguration();
loadCommandExecutor.execute();
// Set up annotation CLI options: NOTE checkAminoAcidChange is NOT enabled
CellBaseCliOptionsParser.VariantAnnotationCommandOptions variantAnnotationCommandOptions
= new CellBaseCliOptionsParser().getVariantAnnotationCommandOptions();
variantAnnotationCommandOptions.assembly = "GRCh37";
variantAnnotationCommandOptions.commonOptions.conf = resourcesFolder.resolve("commandExecutor/configuration.json").toString();
variantAnnotationCommandOptions.input
= resourcesFolder.resolve("commandExecutor/proteinChangeMatch/proband.duprem.atomic.left.split.vcf.gz").toString();
variantAnnotationCommandOptions.output = OUTPUT_FILENAME;
variantAnnotationCommandOptions.local = true;
variantAnnotationCommandOptions.species = "hsapiens";
// Annotate (first run, checkAminoAcidChange not set)
VariantAnnotationCommandExecutor variantAnnotationCommandExecutor
= new VariantAnnotationCommandExecutor(variantAnnotationCommandOptions);
variantAnnotationCommandExecutor.loadCellBaseConfiguration();
variantAnnotationCommandExecutor.execute();
// Load annotated variants
List<Variant> variantList = loadResult();

// Check results
// Only one variant present in input VCF (2:170361068:G:C)
assertEquals(1, variantList.size());
// 2:170361068:G:C in the VCF file must NOT match 2:170361068:G:T variant in clinvar since checkAminoAcidChange
// is disabled in this run
Variant variant = getByVariant(variantList, new Variant("2:170361068:G:C"));
// No trait association expected
assertNotNull(variant);
assertNotNull(variant.getAnnotation());
assertNull(variant.getAnnotation().getTraitAssociation());

// Enable checkAminoAcidChange; all other options are reused from the first run
variantAnnotationCommandOptions.checkAminoAcidChange = true;
// Annotate again with a new executor built from the updated options
variantAnnotationCommandExecutor
= new VariantAnnotationCommandExecutor(variantAnnotationCommandOptions);
variantAnnotationCommandExecutor.loadCellBaseConfiguration();
variantAnnotationCommandExecutor.execute();
// Load annotated variants
variantList = loadResult();
// Check results
// Only one variant present in input VCF (2:170361068:G:C)
assertEquals(1, variantList.size());
// 2:170361068:G:C in the VCF file must now match a variant of the loaded clinical data at the protein change
// level (expected id COSM4624460, asserted below)
// NOTE(review): the previous wording of this comment referred to clinvar, but the expected id is a COSMIC
// one - confirm which source the matched record comes from
variant = getByVariant(variantList, new Variant("2:170361068:G:C"));
// Only one COSMIC trait association expected
assertNotNull(variant);
assertNotNull(variant.getAnnotation());
assertNotNull(variant.getAnnotation().getTraitAssociation());
assertEquals(1, variant.getAnnotation().getTraitAssociation().size());
assertEquals("COSM4624460", variant.getAnnotation().getTraitAssociation().get(0).getId());
}

@Test
public void indexedVariantWithoutRequiredAttributeTest() throws IOException, URISyntaxException {
cleanUp();
Expand Down Expand Up @@ -728,6 +799,16 @@ private void cleanUp() throws IOException {
.resolve("commandExecutor/additionalPopulationFrequency/chr1.2017-12-27_01_12.hgva.freq.cellbase.test.json.gz.idx").toFile());
org.apache.commons.io.FileUtils.deleteDirectory(resourcesFolder
.resolve("commandExecutor/customAnnotation/GEL_GL_6628.duprem.sites.annot.subset.atomic.left.split.test.vcf.gz.idx").toFile());

try (MongoDataStoreManager mongoManager
= new MongoDataStoreManager(Collections.singletonList(new DataStoreServerAddress("localhost",
27017)))) {
MongoDBConfiguration.Builder builder = MongoDBConfiguration.builder();
MongoDBConfiguration mongoDBConfiguration = builder.build();
mongoManager.get(GRCH37_DBNAME, mongoDBConfiguration);
mongoManager.drop(GRCH37_DBNAME);
}

}

private CellBaseCliOptionsParser.VariantAnnotationCommandOptions
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Expand Up @@ -93,7 +93,7 @@
"host": "http://mendel.stanford.edu/SidowLab/downloads/gerp/hg19.GERP_scores.tar.gz"
},
"clinvar": {
"host": "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2019-06.xml.gz"
"host": "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2020-01.xml.gz"
},
"clinvarSummary": {
"host": "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit e910cd5

Please sign in to comment.