diff --git a/test/test_cosmic_mutations.py b/test/test_cosmic_mutations.py index 67a9f96..61cd2d9 100644 --- a/test/test_cosmic_mutations.py +++ b/test/test_cosmic_mutations.py @@ -1,6 +1,7 @@ +from pyensembl import EnsemblRelease from varcode import ( - VariantAnnotator, Variant, + # transcript effects Substitution, Deletion, Insertion, @@ -9,15 +10,15 @@ FrameShiftTruncation, ) -annot = VariantAnnotator(75) +ensembl = EnsemblRelease(75) def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id): - variant = Variant(chrom, pos, dna_ref, dna_alt) - result = annot.effect(variant) - assert transcript_id in result.transcript_effects, \ + variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl) + effect_collection = variant.effects() + assert transcript_id in effect_collection.transcript_effect_dict, \ "Expected transcript ID %s for variant %s not found in %s" % ( transcript_id, variant, result) - return result.transcript_effects[transcript_id] + return effect_collection.transcript_effect_dict[transcript_id] def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt): effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id) diff --git a/test/test_dbnsfp_validation.py b/test/test_dbnsfp_validation.py index 2d990f1..07c7a14 100644 --- a/test/test_dbnsfp_validation.py +++ b/test/test_dbnsfp_validation.py @@ -13,20 +13,22 @@ # limitations under the License. import pandas as pd -from varcode import VariantAnnotator, Substitution, Variant +from pyensembl import EnsemblRelease +from varcode import Substitution, Variant -annot = VariantAnnotator(75) +ensembl = EnsemblRelease(75) def validate_transcript_mutation( ensembl_transcript, chrom, dna_position, dna_ref, dna_alt, aa_pos, aa_alt): - result = annot.effect( - Variant(chrom, dna_position, dna_ref, dna_alt)) - assert ensembl_transcript in result.transcript_effects, \ + variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl) + result = variant.effects() + + assert ensembl_transcript in result.transcript_effect_dict, \ "%s not found in %s" % (ensembl_transcript, result) - effect = result.transcript_effects[ensembl_transcript] + effect = result.transcript_effect_dict[ensembl_transcript] assert ( isinstance(effect, Substitution) and effect.aa_pos + 1 == aa_pos and diff --git a/test/test_drop_duplicates.py b/test/test_drop_duplicates.py index 6df011b..0b42208 100644 --- a/test/test_drop_duplicates.py +++ b/test/test_drop_duplicates.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pyensembl import EnsemblRelease from varcode import Variant, VariantCollection def test_drop_duplicates(): - v1 = Variant("1", 3000, "A", "G") - v1_copy = Variant("1", 3000, "A", "G") - v2 = Variant("2", 10, "G", "T") + ensembl = EnsemblRelease(78) + v1 = Variant("1", 3000, "A", "G", ensembl=ensembl) + v1_copy = Variant("1", 3000, "A", "G", ensembl=ensembl) + v2 = Variant("2", 10, "G", "T", ensembl=ensembl) collection_with_duplicates = VariantCollection( - variants=[v1, v1, v1_copy, v2], - reference_name="hg19") + variants=[v1, v1, v1_copy, v2]) assert len(collection_with_duplicates) == 4 collection_without_duplicates = collection_with_duplicates.drop_duplicates() assert len(collection_without_duplicates) == 2 diff --git a/test/test_maf.py b/test/test_maf.py index 375882b..9d76a72 100644 --- a/test/test_maf.py +++ b/test/test_maf.py @@ -1,15 +1,19 @@ -from varcode import load_variants, VariantCollection, Variant from nose.tools import eq_ +from pyensembl import EnsemblRelease +from varcode import load_maf, VariantCollection, Variant def test_maf(): - variant_collection_from_maf = load_variants("data/tcga_ov.head.maf") - eq_(variant_collection_from_maf.reference_name, "GRCh37") + ensembl = EnsemblRelease(75) + variant_collection_from_maf = load_maf("data/tcga_ov.head.maf") expected_variants = [ - Variant(1, 1650797, "A", "G"), - Variant(1, 231401797, "A", "C"), - Variant(1, 23836447, "C", "A"), - Variant(11,124617502, "C", "G"), + Variant(1, 1650797, "A", "G", ensembl), + Variant(1, 231401797, "A", "C", ensembl), + Variant(1, 23836447, "C", "A", ensembl), + Variant(11,124617502, "C", "G", ensembl), ] eq_(len(variant_collection_from_maf), len(expected_variants)) - for v1, v2 in zip(expected_variants, variant_collection_from_maf): - eq_(v1, v2) + for v_expect, v_maf in zip(expected_variants, variant_collection_from_maf): + eq_(v_expect, v_maf) + gene_name = v_maf.info['Hugo_Symbol'] + assert any(gene.name == gene_name for gene in v_maf.genes()), \ + "Expected gene name %s but got %s" % (gene_name, v_maf.genes()) diff --git a/test/test_mutation_effects.py b/test/test_mutation_effects.py index cf49b1a..9a05cc2 100644 --- a/test/test_mutation_effects.py +++ b/test/test_mutation_effects.py @@ -1,7 +1,8 @@ from varcode import ( - VariantAnnotator, Variant, - infer_transcript_effect, + # + # transcript effects + # NoncodingTranscript, IncompleteTranscript, FivePrimeUTR, @@ -17,8 +18,9 @@ FrameShift, # TODO: SpliceDonor, SpliceReceptor ) +from pyensembl import EnsemblRelease -annot = VariantAnnotator(ensembl_release=77) +ensembl = EnsemblRelease(release=78) def test_incomplete(): # transcript EGFR-009 (ENST00000450046 in Ensembl 77) @@ -27,10 +29,10 @@ def test_incomplete(): # first exon begins: ATCATTCCTTTGGGCCTAGGA # change the first nucleotide of the 5' UTR A>T - variant = Variant("7", 55109723, "A", "T") + variant = Variant("7", 55109723, "A", "T", ensembl=ensembl) - transcript = annot.ensembl.transcript_by_id("ENST00000450046") - effect = infer_transcript_effect(variant, transcript) + transcript = ensembl.transcript_by_id("ENST00000450046") + effect = variant.transcript_effect(transcript) assert isinstance(effect, IncompleteTranscript), \ "Expected %s on %s to be IncompleteTranscript, got %s" % ( variant, transcript, effect) @@ -43,9 +45,9 @@ def test_start_loss(): # which is 55,019,034 + 244 of chr7 = 55019278 # change the first nucleotide of the 5' UTR A>T # making what used to be a start codon into TTG (Leucine) - variant = Variant("7", 55019278, "A", "T") - transcript = annot.ensembl.transcript_by_id("ENST00000420316") - effect = infer_transcript_effect(variant, transcript) + variant = Variant("7", 55019278, "A", "T", ensembl=ensembl) + transcript = ensembl.transcript_by_id("ENST00000420316") + effect = variant.transcript_effect(transcript) assert isinstance(effect, StartLoss), \ "Expected StartLoss, got %s for %s on %s" % ( effect, variant, transcript, ) \ No newline at end of file diff --git a/test/test_trim_shared.py b/test/test_string_helpers.py similarity index 94% rename from test/test_trim_shared.py rename to test/test_string_helpers.py index fa06769..510b88c 100644 --- a/test/test_trim_shared.py +++ b/test/test_string_helpers.py @@ -1,6 +1,6 @@ from nose.tools import eq_ -from varcode.common import trim_shared_flanking_strings +from varcode.string_helpers import trim_shared_flanking_strings def test_trim_shared_string_endings(): # empty strings diff --git a/test/test_vcf.py b/test/test_vcf.py index 6a88f05..23e6e0c 100644 --- a/test/test_vcf.py +++ b/test/test_vcf.py @@ -12,33 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -from varcode import load_variants +from varcode import load_vcf VCF_FILENAME = "data/somatic_hg19_14muts.vcf" def test_vcf_reference_name(): - variants = load_variants(VCF_FILENAME) - # the raw reference name can be a file path to the hg19 FASTA file - assert variants.reference_path and "hg19" in variants.reference_path, \ - "Expected hg19 reference, got %s" % (variants.reference_path,) + variants = load_vcf(VCF_FILENAME) # after normalization, hg19 should be remapped to GRCh37 - assert variants.reference_name == "GRCh37" + assert variants.reference_names() == { "GRCh37" } def test_vcf_number_entries(): # there are 14 mutations listed in the VCF, make sure they are all parsed - variants = load_variants(VCF_FILENAME) + variants = load_vcf(VCF_FILENAME) assert len(variants) == 14, \ "Expected 14 mutations, got %d" % (len(variants),) -def _check_effect_gene_name(effect): - variant = effect.variant +def _check_variant_gene_name(variant): expected_gene_names = variant.info['GE'] - gene_names = [gene.name for gene in effect.genes] - assert expected_gene_names == gene_names, \ + assert variant.gene_names() == expected_gene_names, \ "Expected gene name %s for variant %s, got %s" % ( - expected_gene_name, variant, gene_names) + expected_gene_name, variant, variant.gene_names()) def test_vcf_gene_names(): - variants = load_variants(VCF_FILENAME) - for effect in variants.variant_effects(): - yield (_check_effect_gene_name, effect) + variants = load_vcf(VCF_FILENAME) + for variant in variants: + yield (_check_variant_gene_name, variant) diff --git a/varcode/__init__.py b/varcode/__init__.py index eefbeb9..0dbad7c 100644 --- a/varcode/__init__.py +++ b/varcode/__init__.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .core_logic import infer_transcript_effect +from .effects import * from .effect_ordering import effect_priority, top_priority_transcript_effect -from .load_variants import load_variants -from .transcript_mutation_effects import * +from .maf import load_maf, load_maf_dataframe from .variant import Variant -from .variant_annotator import VariantAnnotator from .variant_collection import VariantCollection +from .vcf import load_vcf \ No newline at end of file diff --git a/varcode/core_logic.py b/varcode/coding_effect.py similarity index 54% rename from varcode/core_logic.py rename to varcode/coding_effect.py index c3ebea8..b2de72b 100644 --- a/varcode/core_logic.py +++ b/varcode/coding_effect.py @@ -15,20 +15,8 @@ from __future__ import print_function, division, absolute_import import logging -import math -from collections import namedtuple -from .common import ( - reverse_complement, - trim_shared_flanking_strings, -) -from .variant import Variant -from .transcript_mutation_effects import ( - NoncodingTranscript, - IncompleteTranscript, - FivePrimeUTR, - ThreePrimeUTR, - Intronic, +from .effects import ( Silent, Insertion, Deletion, @@ -39,10 +27,9 @@ FrameShift, FrameShiftTruncation ) +from .string_helpers import trim_shared_flanking_strings from Bio.Seq import Seq, CodonTable -from pyensembl import (Transcript, find_nearest_locus) -from pyensembl.biotypes import is_coding_biotype def mutate(sequence, position, variant_ref, variant_alt): """ @@ -72,90 +59,29 @@ def mutate(sequence, position, variant_ref, variant_alt): suffix = sequence[position+n_variant_ref:] return prefix + variant_alt + suffix -def first_transcript_offset(start_pos, end_pos, transcript): - """ - Given a variant and a particular transcript, - return the start/end offsets of the variant relative to the - chromosomal positions of the transcript. - """ - # ensure that start_pos:end_pos overlap with transcript positions - assert start_pos <= end_pos, \ - "start_pos %d shouldn't be greater than end_pos %d" % ( - start_pos, end_pos) - assert start_pos <= transcript.end, \ - "Range %d:%d starts after transcript %s (%d:%d)" % ( - start_pos, - end_pos, - transcript, - transcript.start, - transcript.end) - assert end_pos >= transcript.start, \ - "Range %d:%d ends before transcript %s (%d:%d)" % ( - start_pos, - end_pos, - transcript, - transcript.start, - transcript.end) - # trim the start position to the beginning of the transcript - if start_pos < transcript.start: - start_pos = transcript.start - # trim the end position to the end of the transcript - if end_pos > transcript.end: - end_pos = transcript.end - - # offsets into the spliced transcript - offsets = [ - transcript.spliced_offset(start_pos), - transcript.spliced_offset(end_pos) - ] - - start_offset_with_utr5 = min(offsets) - - assert start_offset_with_utr5 >= 0, \ - "Position %d is before start of transcript %s" % ( - start_offset_with_utr5, transcript) - - return start_offset_with_utr5 - -def normalize_variant_fields(variant, transcript): - """ - Given a Variant (with fields `ref`, `alt`, `pos` and `end_pos`), - compute the offset the variant position relative - to the position and direction of the given transcript. - Also reverse the nucleotide strings if the transcript is on the - backwards strand and trim any shared substrings from the beginning - or end of `ref` and `alt`. - - Example: - If Variant(ref="ATGGC", alt="TCGC", pos=3000, end_pos=3005) is normalized - relative to Transcript(start=2000, end=4000) then the result of this - function will be: - ("G", "C", 1002) - where the elements of this tuple are (ref, alt, offset). - """ - if transcript.on_backward_strand: - ref = reverse_complement(variant.ref) - alt = reverse_complement(variant.alt) - else: - ref = variant.ref - alt = variant.alt - - start_offset_with_utr5 = first_transcript_offset( - variant.pos, variant.end_pos, transcript) - - # in case nucleotide strings share prefix (e.g. ref="C", alt="CC") - # bump the offsets and make the strings disjoint (ref="", alt="C") - ref, alt, prefix, _ = trim_shared_flanking_strings(ref, alt) - n_same = len(prefix) - start_offset_with_utr5 += n_same - return ref, alt, start_offset_with_utr5 - def infer_coding_effect( ref, alt, cds_offset, - variant, - transcript): + transcript, + variant): + """ + Given a minimal ref/alt nucleotide string pair and an offset into a given + transcript, determine the coding effect of this nucleotide substitution + onto the translated protein. + + Parameters + ---------- + ref : str + + alt : str + + cds_offset : int + + transcript : Transcript + + variant : Variant + """ # Don't need a pyfaidx.Sequence object here, just convert it to the an str cds_seq = str(transcript.coding_sequence) @@ -348,123 +274,3 @@ def infer_coding_effect( aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt) - -def infer_exonic_effect(variant, transcript): - ref, alt, start_offset_with_utr5 = normalize_variant_fields( - variant, transcript) - - utr5_length = min(transcript.start_codon_spliced_offsets) - - if utr5_length > start_offset_with_utr5: - # TODO: what do we do if the variant spans the beginning of - # the coding sequence? - assert utr5_length > start_offset_with_utr5 + len(ref), \ - "Variant which span the 5' UTR and CDS not yet supported: %s" % ( - variant) - return FivePrimeUTR(variant, transcript) - - utr3_offset = max(transcript.stop_codon_spliced_offsets) + 1 - - if start_offset_with_utr5 >= utr3_offset: - return ThreePrimeUTR(variant, transcript) - - cds_offset = start_offset_with_utr5 - utr5_length - return infer_coding_effect( - ref, - alt, - cds_offset, - variant, - transcript) - -def infer_intronic_effect(variant, transcript, nearest_exon): - """ - Infer effect of variant which does not overlap any exon of - the given transcript. - """ - distance_to_exon = nearest_exon.distance_to_interval( - variant.pos, variant.end_pos) - assert distance_to_exon > 0, \ - "Expected intronic effect to have distance_to_exon > 0, got %d" % ( - distance_to_exon,) - before_forward_exon = ( - transcript.strand == "+" and - variant.pos < nearest_exon.start) - before_backward_exon = ( - transcript.strand == "-" and - variant.end_pos > nearest_exon.end) - before_exon = before_forward_exon or before_backward_exon - - if distance_to_exon <= 2: - if before_exon: - # 2 last nucleotides of intron before exon are the splice acceptor - # site, typically "AG" - effect_class = SpliceAcceptor - else: - # 2 first nucleotides of intron after exon are the splice donor - # site, typically "GT" - effect_class = SpliceDonor - elif not before_exon and distance_to_exon <= 6: - # variants in nucleotides 3-6 at start of intron aren't as certain - # to cause problems as nucleotides 1-2 but still implicated in - # alternative splicing - effect_class = IntronicSpliceSite - elif before_exon and distance_to_exon <= 4: - # nucleotides -4 and -3 before exon are part of the 3' splicing motif - # but allow for more degeneracy than the -2, -1 nucleotides - effect_class = IntronicSpliceSite - else: - assert distance_to_exon > 6, \ - "Looks like we didn't cover all possible splice site mutations" - # intronic mutation unrelated to splicing - effect_class = Intronic - return effect_class( - variant=variant, - transcript=transcript, - nearest_exon=nearest_exon, - distance_to_exon=distance_to_exon) - -def infer_transcript_effect(variant, transcript): - """ - Generate a transcript effect (such as FrameShift) by applying a genomic - variant to a particular transcript. - - Parameters - ---------- - variant : Variant - Genomic variant - - transcript : Transcript - Transcript we're going to mutate - """ - - if not isinstance(variant, Variant): - raise TypeError( - "Expected %s : %s to have type Variant" % ( - variant, type(variant))) - - if not isinstance(transcript, Transcript): - raise TypeError( - "Expected %s : %s to have type Transcript" % ( - transcript, type(transcript))) - - # check for non-coding transcripts first, since - # every non-coding transcript is "incomplete". - if not is_coding_biotype(transcript.biotype): - return NoncodingTranscript(variant, transcript) - - if not transcript.complete: - return IncompleteTranscript(variant, transcript) - - distance_to_exon, nearest_exon = find_nearest_locus( - start=variant.pos, - end=variant.end_pos, - loci=transcript.exons) - - if distance_to_exon > 0: - return infer_intronic_effect( - variant=variant, - transcript=transcript, - nearest_exon=nearest_exon) - - # TODO: exonic splice site mutations - return infer_exonic_effect(variant, transcript) diff --git a/varcode/common.py b/varcode/common.py index 84136cc..04f3282 100644 --- a/varcode/common.py +++ b/varcode/common.py @@ -1,76 +1,20 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function, division, absolute_import from collections import defaultdict -import pyfaidx - -def reverse_complement(x): - """ - Reverse complement of a nucleotide string. - - Parameters - ---------- - - x : str - Original nucleotide string - """ - return pyfaidx.Sequence(seq=x).reverse.complement.seq - - -def trim_shared_prefix(ref, alt): - """ - Sometimes mutations are given with a shared prefix between the reference - and alternate strings. Examples: C>CT (nucleotides) or GYFP>G (amino acids). - - This function trims the common prefix and returns the disjoint ref - and alt strings, along with the shared prefix. - """ - n_ref = len(ref) - n_alt = len(alt) - n_min = min(n_ref, n_alt) - i = 0 - while i < n_min and ref[i] == alt[i]: - i += 1 - - # guaranteed that ref and alt agree on all the characters - # up to i'th position, so it doesn't matter which one we pull - # the prefix out of - prefix = ref[:i] - ref_suffix = ref[i:] - alt_suffix = alt[i:] - return ref_suffix, alt_suffix, prefix - -def trim_shared_suffix(ref, alt): - """ - Reuse the `trim_shared_prefix` function above to implement similar - functionality for string suffixes. - - Given ref='ABC' and alt='BC', we first revese both strings: - reverse_ref = 'CBA' - reverse_alt = 'CB' - and then the result of calling trim_shared_prefix will be: - ('A', '', 'CB') - We then reverse all three of the result strings to get back - the shared suffix and both prefixes leading up to it: - ('A', '', 'BC') - """ - reverse_ref = ref[::-1] - reverse_alt = alt[::-1] - results = trim_shared_prefix(reverse_ref, reverse_alt) - return tuple(s[::-1] for s in results) - -def trim_shared_flanking_strings(ref, alt): - """ - Given two nucleotide or amino acid strings, identify - if they have a common prefix, a common suffix, and return - their unique components along with the prefix and suffix. - - For example, if the input ref = "SYFFQGR" and alt = "SYMLLFIFQGR" - then the result will be: - ("F", "MLLFI", "SY", "FQGR") - """ - ref, alt, prefix = trim_shared_prefix(ref, alt) - ref, alt, suffix = trim_shared_suffix(ref, alt) - return ref, alt, prefix, suffix def group_by(records, field_name): """ diff --git a/varcode/effect_ordering.py b/varcode/effect_ordering.py index a58ed78..cc87da4 100644 --- a/varcode/effect_ordering.py +++ b/varcode/effect_ordering.py @@ -1,4 +1,4 @@ -from .transcript_mutation_effects import * +from .effects import * transcript_effect_priority_list = [ IncompleteTranscript, diff --git a/varcode/transcript_mutation_effects.py b/varcode/effects.py similarity index 99% rename from varcode/transcript_mutation_effects.py rename to varcode/effects.py index b1ca65c..10cfe1d 100644 --- a/varcode/transcript_mutation_effects.py +++ b/varcode/effects.py @@ -97,7 +97,7 @@ class IntronicSpliceSite(Intronic, SpliceSite): affect splicing and are given their own effect classes below. """ def __init__(self, *args, **kwargs): - Intronic.__init__(self, *self, **kwargs) + Intronic.__init__(self, *args, **kwargs) def short_description(self): return "intronic-splice-site" @@ -107,7 +107,7 @@ class SpliceDonor(IntronicSpliceSite): Mutation in the first two intron residues. """ def __init__(self, *args, **kwargs): - Intronic.__init__(self, *self, **kwargs) + Intronic.__init__(self, *args, **kwargs) def short_description(self): return "splice-donor" diff --git a/varcode/filter_variants.py b/varcode/filter_variants.py deleted file mode 100644 index 99fd760..0000000 --- a/varcode/filter_variants.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import argparse -from collections import namedtuple - -from . import maf -from .variant_annotator import VariantAnnotator -from .variant_collection import VariantCollection - -import numpy as np -import pyensembl - - -parser = argparse.ArgumentParser() - -parser.add_argument("input", - help="VCF input file") - -parser.add_argument("--output-csv", - help="Comma-separated output with chr/pos/ref/alt from VCF and annotation") - -parser.add_argument("--output-tsv", - help="Tab-separated output with chr/pos/ref/alt from VCF and annotation") - - - -if __name__ == "__main__": - # TODO: determine ensembl release from VCF metadata - annot = VariantAnnotator(ensembl_release=75) - - args = parser.parse_args() - - for record in VariantCollection(args.input): - chrom, pos = record.contig, record.pos - ref, alt = record.ref, record.alt - print(chrom, pos, ref, alt) - print("--", annot.describe_variant(chrom, pos, ref, alt)) - """ - genes_to_transcripts = {} - for transcript_name in transcript_names: - gene = ensembl.gene_name_of_transcript_name(transcript_name) - if gene in genes_to_transcripts: - genes_to_transcripts[gene].append(transcript_name) - else: - genes_to_transcripts[gene] = [transcript_name] - for (gene, transcript_names) in genes_to_transcripts.iteritems(): - print "\t", gene - for transcript_name in transcript_names: - print "\t\t", transcript_name - """ diff --git a/varcode/load_variants.py b/varcode/load_variants.py deleted file mode 100644 index 2c1aff4..0000000 --- a/varcode/load_variants.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2014. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .maf import load_maf -from .vcf import load_vcf - -def load_variants(filename): - """ - Creates VariantCollection from name of either VCF or MAF file - """ - assert isinstance(filename, str), \ - "Expected filename to be str, got %s : %s" % ( - filename, type(filename)) - - if filename.endswith(".vcf"): - return load_vcf(filename) - elif filename.endswith(".maf"): - return load_maf(filename) - else: - raise ValueError("Unrecognized file type: %s" % (filename,)) diff --git a/varcode/maf.py b/varcode/maf.py index f10c718..cf4b358 100755 --- a/varcode/maf.py +++ b/varcode/maf.py @@ -16,10 +16,15 @@ import logging from .nucleotides import normalize_nucleotide_string +from .reference_name import ( + infer_reference_name, + ensembl_release_number_for_reference_name +) from .variant import Variant from .variant_collection import VariantCollection import pandas +from pyensembl import EnsemblRelease TCGA_PATIENT_ID_LENGTH = 12 @@ -50,7 +55,11 @@ def load_maf_dataframe(filename, nrows=None, verbose=False): """ Load the guaranteed columns of a TCGA MAF file into a DataFrame """ - logging.info("Opening %s" % filename) + + if not isinstance(filename, str): + raise ValueError( + "Expected filename to be str, got %s : %s" % ( + filename, type(filename))) # skip comments and optional header with open(filename) as f: @@ -77,21 +86,7 @@ def load_maf(filename): if len(maf_df) == 0: raise ValueError("Empty MAF file %s" % filename) - ncbi_builds = maf_df.NCBI_Build.unique() - - if len(ncbi_builds) == 0: - raise ValueError("No NCBI builds for MAF file %s" % filename) - elif len(ncbi_builds) > 1: - raise ValueError( - "Multiple NCBI builds (%s) for MAF file %s" % (ncbi_builds, filename)) - - ncbi_build = ncbi_builds[0] - - if isinstance(ncbi_build, int): - reference_name = "B%d" % ncbi_build - else: - reference_name = str(ncbi_build) - + ensembl_objects = {} variants = [] for _, x in maf_df.iterrows(): contig = x.Chromosome @@ -99,21 +94,61 @@ def load_maf(filename): end_pos = x.End_Position ref = x.Reference_Allele + # it's possible in a MAF file to have multiple Ensembl releases + # mixed in a single MAF file (the genome assembly is + # specified by the NCBI_Build column) + ncbi_build = x.NCBI_Build + if ncbi_build in ensembl_objects: + ensembl = ensembl_objects[ncbi_build] + else: + if isinstance(ncbi_build, int): + reference_name = "B%d" % ncbi_build + else: + reference_name = str(ncbi_build) + + reference_name = infer_reference_name(reference_name) + ensembl_release = ensembl_release_number_for_reference_name( + reference_name) + ensembl = EnsemblRelease(release=ensembl_release) + ensembl_objects[ncbi_build] = ensembl + + # have to try both Tumor_Seq_Allele1 and Tumor_Seq_Allele2 + # to figure out which is different from the reference allele if x.Tumor_Seq_Allele1 != ref: alt = x.Tumor_Seq_Allele1 else: - assert x.Tumor_Seq_Allele2 != ref, \ - "Both tumor alleles agree with reference: %s" % (x,) + if x.Tumor_Seq_Allele2 == ref: + raise ValueError( + "Both tumor alleles agree with reference %s: %s" % ( + ref, x,)) alt = x.Tumor_Seq_Allele2 - assert end_pos == start_pos + len(ref) - 1, \ - "Expected variant %s:%s %s>%s to end at %d but got end_pos=%d" % ( + if end_pos != start_pos + len(ref) - 1: + raise ValueError( + "Expected variant %s:%s %s>%s to end at %d but got end=%d" % ( contig, start_pos, ref, alt, - start_pos + len(ref) - 1, end_pos) - - variants.append(Variant(contig, start_pos, ref, alt)) + start_pos + len(ref) - 1, end_pos)) + + # keep metadata about the variant and its TCGA annotation + info = { + 'Hugo_Symbol' : x.Hugo_Symbol, + 'Center' : x.Center, + 'Strand' : x.Strand, + 'Variant_Classification' : x.Variant_Classification, + 'Variant_Type' : x.Variant_Type, + 'dbSNP_RS' : x.dbSNP_RS, + 'dbSNP_Val_Status' : x.dbSNP_Val_Status, + 'Tumor_Sample_Barcode' : x.Tumor_Sample_Barcode, + 'Matched_Norm_Sample_Barcode' : x.Matched_Norm_Sample_Barcode, + } + + variant = Variant( + contig, start_pos, ref, alt, + ensembl=ensembl, + info=info) + + variants.append(variant) return VariantCollection( variants=variants, - original_filename=filename, - reference_name=reference_name) + original_filename=filename) diff --git a/varcode/string_helpers.py b/varcode/string_helpers.py new file mode 100644 index 0000000..41f3064 --- /dev/null +++ b/varcode/string_helpers.py @@ -0,0 +1,86 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +import pyfaidx + +def reverse_complement(x): + """ + Reverse complement of a nucleotide string. + + Parameters + ---------- + + x : str + Original nucleotide string + """ + return pyfaidx.Sequence(seq=x).reverse.complement.seq + + +def trim_shared_prefix(ref, alt): + """ + Sometimes mutations are given with a shared prefix between the reference + and alternate strings. Examples: C>CT (nucleotides) or GYFP>G (amino acids). + + This function trims the common prefix and returns the disjoint ref + and alt strings, along with the shared prefix. + """ + n_ref = len(ref) + n_alt = len(alt) + n_min = min(n_ref, n_alt) + i = 0 + while i < n_min and ref[i] == alt[i]: + i += 1 + + # guaranteed that ref and alt agree on all the characters + # up to i'th position, so it doesn't matter which one we pull + # the prefix out of + prefix = ref[:i] + ref_suffix = ref[i:] + alt_suffix = alt[i:] + return ref_suffix, alt_suffix, prefix + +def trim_shared_suffix(ref, alt): + """ + Reuse the `trim_shared_prefix` function above to implement similar + functionality for string suffixes. + + Given ref='ABC' and alt='BC', we first revese both strings: + reverse_ref = 'CBA' + reverse_alt = 'CB' + and then the result of calling trim_shared_prefix will be: + ('A', '', 'CB') + We then reverse all three of the result strings to get back + the shared suffix and both prefixes leading up to it: + ('A', '', 'BC') + """ + reverse_ref = ref[::-1] + reverse_alt = alt[::-1] + results = trim_shared_prefix(reverse_ref, reverse_alt) + return tuple(s[::-1] for s in results) + +def trim_shared_flanking_strings(ref, alt): + """ + Given two nucleotide or amino acid strings, identify + if they have a common prefix, a common suffix, and return + their unique components along with the prefix and suffix. + + For example, if the input ref = "SYFFQGR" and alt = "SYMLLFIFQGR" + then the result will be: + ("F", "MLLFI", "SY", "FQGR") + """ + ref, alt, prefix = trim_shared_prefix(ref, alt) + ref, alt, suffix = trim_shared_suffix(ref, alt) + return ref, alt, prefix, suffix diff --git a/varcode/transcript_helpers.py b/varcode/transcript_helpers.py new file mode 100644 index 0000000..7dbfb39 --- /dev/null +++ b/varcode/transcript_helpers.py @@ -0,0 +1,52 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +def interval_offset_on_transcript(start, end, transcript): + """ + Given an interval [start:end] and a particular transcript, + return the start offset of the interval relative to the + chromosomal positions of the transcript. + """ + # ensure that start_pos:end_pos overlap with transcript positions + if start > end: + raise ValueError( + "start_pos %d shouldn't be greater than end_pos %d" % ( + start, end)) + if start > transcript.end: + raise ValueError( + "Range %d:%d starts after transcript %s (%d:%d)" % ( + start, + end, + transcript, + transcript.start, + transcript.end)) + if end < transcript.start: + raise ValueError( + "Range %d:%d ends before transcript %s (%d:%d)" % ( + start, + end, + transcript, + transcript.start, + transcript.end)) + # trim the start position to the beginning of the transcript + if start < transcript.start: + start = transcript.start + # trim the end position to the end of the transcript + if end > transcript.end: + end = transcript.end + + # return earliest offset into the spliced transcript + return min(transcript.spliced_offset(start), transcript.spliced_offset(end)) \ No newline at end of file diff --git a/varcode/variant.py b/varcode/variant.py index 3390580..4c629e3 100644 --- a/varcode/variant.py +++ b/varcode/variant.py @@ -1,39 +1,106 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function, division, absolute_import +from .coding_effect import infer_coding_effect +from .common import group_by from .nucleotides import normalize_nucleotide_string +from .string_helpers import reverse_complement, trim_shared_flanking_strings +from .transcript_helpers import interval_offset_on_transcript +from .effects import ( + NoncodingTranscript, + IncompleteTranscript, + FivePrimeUTR, + ThreePrimeUTR, + Intronic, + IntronicSpliceSite, + SpliceAcceptor, + SpliceDonor, +) +from .variant_effect_collection import VariantEffectCollection -from memoized_property import memoized_property +from pyensembl import Transcript, find_nearest_locus, EnsemblRelease from pyensembl.locus import normalize_chromosome +from pyensembl.biotypes import is_coding_biotype class Variant(object): - def __init__(self, contig, pos, ref, alt, info=None): + def __init__(self, contig, pos, ref, alt, ensembl, info=None): + """ + Construct a Variant object. + + Parameters + ---------- + contig : str + Chromosome that this variant is on + + pos : int + 1-based position on the chromosome of first reference nucleotide + + ref : str + Reference nucleotide(s) + + alt : str + Alternate nucleotide(s) + + ensembl : EnsemblRelease + Ensembl object used for determining gene/transcript annotations + """ self.contig = normalize_chromosome(contig) - self.pos = int(pos) self.ref = normalize_nucleotide_string(ref) self.alt = normalize_nucleotide_string(alt) + self.pos = int(pos) + self.end = self.pos + len(self.ref) - 1 + + if not isinstance(ensembl, EnsemblRelease): + raise TypeError("Expected EnsemblRelease, got %s : %s" % ( + ensembl, type(ensembl))) + self.ensembl = ensembl + self.info = {} if info is None else info - @memoized_property - def end_pos(self): - return self.pos + len(self.ref) - 1 + @property + def reference_name(self): + return self.ensembl.reference.reference_name def __str__(self): - return "Variant(contig=%s, pos=%d, ref=%s, alt=%s)" % ( - self.contig, self.pos, self.ref, self.alt) + fields = self.fields() + return "Variant(contig=%s, pos=%d, ref=%s, alt=%s, genome=%s)" % fields def __repr__(self): return str(self) def __hash__(self): - return hash((self.contig, self.pos, self.ref, self.alt)) + return hash(self.fields()) + + def fields(self): + """ + All identifying fields of a variant (contig, pos, ref, alt, genome) + in a single tuple. This makes for cleaner printing, hashing, and + comparisons. + """ + return ( + self.contig, + self.pos, + self.ref, + self.alt, + self.reference_name) def __eq__(self, other): return ( isinstance(other, Variant) and - self.contig == other.contig and - self.pos == other.pos and - self.ref == other.ref and - self.alt == other.alt) + self.fields() == other.fields()) def short_description(self): chrom, pos, ref, alt = self.contig, self.pos, self.ref, self.alt @@ -46,3 +113,255 @@ def short_description(self): chrom, pos + len(alt), pos + len(ref), ref[len(alt):]) else: return "chr%s g.%d %s>%s" % (chrom, pos, ref, alt) + + def transcripts(self): + return self.ensembl.transcripts_at_locus( + self.contig, self.pos, self.end) + + def coding_transcripts(self): + """ + Protein coding transcripts + """ + return [ + transcript for transcript in self.transcripts() + if is_coding_biotype(transcript.biotype) + ] + + def genes(self): + """ + Return Gene object for all genes which overlap this variant. + """ + return self.ensembl.genes_at_locus( + self.contig, self.pos, self.end) + + def gene_names(self): + """ + Return names of all genes which overlap this variant. Calling + this method is significantly cheaper than calling `Variant.genes()`, + which has to issue many more queries to construct each Gene object. + """ + return self.ensembl.gene_names_at_locus( + self.contig, self.pos, self.end) + + def coding_genes(self): + """ + Protein coding transcripts + """ + return [ + gene for gene in self.genes() + if is_coding_biotype(gene.biotype) + ] + + def effects(self, only_coding_transcripts=False, raise_on_error=True): + """ + Determine the effects of a variant on any transcripts it overlaps. + Returns a VariantEffectCollection object. + """ + + overlapping_transcripts = self.transcripts() + + if only_coding_transcripts: + # remove non-coding transcripts + overlapping_transcripts = [ + transcript + for transcript + in overlapping_transcripts + if is_coding_biotype(transcript.biotype) + ] + + if len(overlapping_transcripts) == 0: + # intergenic variant + return VariantEffectCollection( + variant=self, + gene_effect_groups={}, + errors={}) + + # group transcripts by their gene ID + overlapping_transcript_groups = group_by( + overlapping_transcripts, field_name='gene_id') + + # dictionary from gene ID to list of transcript effects + gene_transcript_effects_groups = {} + + # mapping from Transcript objects to errors encountered + # while trying to annotated them + errors = {} + + for (gene_id, transcripts) in overlapping_transcript_groups.items(): + effects = [] + for transcript in transcripts: + try: + effects.append(self.transcript_effect(transcript)) + except (AssertionError, ValueError) as error: + if raise_on_error: + raise + else: + logging.warn( + "Encountered error annotating %s for %s: %s", + self, + transcript, + error) + errors[transcript] = error + gene_transcript_effects_groups[gene_id] = effects + + return VariantEffectCollection( + variant=self, + gene_effect_groups=gene_transcript_effects_groups, + errors=errors) + + def transcript_effect(self, transcript): + """ + Return the transcript effect (such as FrameShift) that results from + applying this genomic variant to a particular transcript. + + Parameters + ---------- + + transcript : Transcript + Transcript we're going to apply mutation to. + """ + + if not isinstance(transcript, Transcript): + raise TypeError( + "Expected %s : %s to have type Transcript" % ( + transcript, type(transcript))) + + # check for non-coding transcripts first, since + # every non-coding transcript is "incomplete". + if not is_coding_biotype(transcript.biotype): + return NoncodingTranscript(self, transcript) + + if not transcript.complete: + return IncompleteTranscript(self, transcript) + + distance_to_exon, nearest_exon = find_nearest_locus( + start=self.pos, + end=self.end, + loci=transcript.exons) + + if distance_to_exon > 0: + return self._intronic_transcript_effect( + transcript=transcript, + nearest_exon=nearest_exon) + + # TODO: exonic splice site mutations + return self._exonic_transcript_effect(transcript) + + + def _intronic_transcript_effect(self, transcript, nearest_exon): + """ + Infer effect of variant which does not overlap any exon of + the given transcript. + """ + distance_to_exon = nearest_exon.distance_to_interval( + self.pos, self.end) + + assert distance_to_exon > 0, \ + "Expected intronic effect to have distance_to_exon > 0, got %d" % ( + distance_to_exon,) + + before_forward_exon = ( + transcript.strand == "+" and + self.pos < nearest_exon.start) + + before_backward_exon = ( + transcript.strand == "-" and + self.end > nearest_exon.end) + + before_exon = before_forward_exon or before_backward_exon + + if distance_to_exon <= 2: + if before_exon: + # 2 last nucleotides of intron before exon are the splice acceptor + # site, typically "AG" + effect_class = SpliceAcceptor + else: + # 2 first nucleotides of intron after exon are the splice donor + # site, typically "GT" + effect_class = SpliceDonor + elif not before_exon and distance_to_exon <= 6: + # variants in nucleotides 3-6 at start of intron aren't as certain + # to cause problems as nucleotides 1-2 but still implicated in + # alternative splicing + effect_class = IntronicSpliceSite + elif before_exon and distance_to_exon <= 4: + # nucleotides -4 and -3 before exon are part of the 3' splicing motif + # but allow for more degeneracy than the -2, -1 nucleotides + effect_class = IntronicSpliceSite + else: + assert distance_to_exon > 6, \ + "Looks like we didn't cover all possible splice site mutations" + # intronic mutation unrelated to splicing + effect_class = Intronic + + return effect_class( + variant=self, + transcript=transcript, + nearest_exon=nearest_exon, + distance_to_exon=distance_to_exon) + + + def _offset_on_transcript(self, transcript): + """ + Given a Variant (with fields ref, alt, pos, and end), compute the offset + of the variant position relative to the position and direction of the + transcript. + + Returns a tuple (offset, ref, alt), where ref and alt are this variant's + ref and alt nucleotide strings with shared prefixes and suffixes removed + and the bases reversed if the transcript is on the backward strand. + + For example, + + >> variant = Variant(ref="ATGGC", alt="TCGC", pos=3000, end=3005) + >> variant.offset_on_transcript(Transcript(start=2000, end=4000)) + (1002, "G", "C") + """ + if transcript.on_backward_strand: + ref = reverse_complement(self.ref) + alt = reverse_complement(self.alt) + else: + ref = self.ref + alt = self.alt + + # in case nucleotide strings share prefix (e.g. ref="C", alt="CC") + # bump the offsets and make the strings disjoint (ref="", alt="C") + ref, alt, prefix, _ = trim_shared_flanking_strings(ref, alt) + n_same = len(prefix) + start_offset_with_utr5 = interval_offset_on_transcript( + self.pos, self.end, transcript) + offset = start_offset_with_utr5 + n_same + return offset, ref, alt + + def _exonic_transcript_effect(self, transcript): + """ + Effect of this variant on a Transcript, assuming we already know + that this variant overlaps some exon of the transcript. + """ + offset_with_utr5, ref, alt = self._offset_on_transcript(transcript) + + utr5_length = min(transcript.start_codon_spliced_offsets) + + if utr5_length > offset_with_utr5: + # TODO: what do we do if the variant spans the beginning of + # the coding sequence? + if utr5_length < offset_with_utr5 + len(ref): + raise ValueError( + "Variant which span 5' UTR and CDS not supported: %s" % ( + self,)) + return FivePrimeUTR(self, transcript) + + utr3_offset = max(transcript.stop_codon_spliced_offsets) + 1 + + if offset_with_utr5 >= utr3_offset: + return ThreePrimeUTR(self, transcript) + + cds_offset = offset_with_utr5 - utr5_length + + return infer_coding_effect( + ref, + alt, + cds_offset, + variant=self, + transcript=transcript) + diff --git a/varcode/variant_annotator.py b/varcode/variant_annotator.py deleted file mode 100644 index 63c6168..0000000 --- a/varcode/variant_annotator.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import print_function, division, absolute_import -import logging - -from .common import group_by -from .core_logic import infer_transcript_effect -from .variant import Variant -from .variant_effect import VariantEffect - -import pyensembl - -class VariantAnnotator(object): - def __init__(self, ensembl_release): - self.ensembl = pyensembl.EnsemblRelease(ensembl_release) - - def effect(self, variant, raise_on_error=True): - """ - Determine the effects of a variant on any transcripts it overlaps. - Returns a VariantEffect object. - """ - contig = variant.contig - pos = variant.pos - end_pos = variant.end_pos - ref = variant.ref - alt = variant.alt - - overlapping_transcripts = self.ensembl.transcripts_at_locus( - contig, pos, end_pos) - - if len(overlapping_transcripts) == 0: - # intergenic variant - return VariantEffect( - variant=variant, - genes=[], - gene_transcript_effects={}, - errors={}) - - # group transcripts by their gene ID - overlapping_transcript_groups = group_by( - overlapping_transcripts, field_name='gene_id') - - # dictionary from gene ID to list of transcript effects - gene_transcript_effects_groups = {} - - # mapping from Transcript objects to errors encountered - # while trying to annotated them - errors = {} - - for (gene_id, transcripts) in overlapping_transcript_groups.items(): - effects = [] - for transcript in transcripts: - try: - effects.append(infer_transcript_effect(variant, transcript)) - except (AssertionError, ValueError) as error: - if raise_on_error: - raise - else: - logging.warn( - "Encountered error annotating %s for %s: %s", - variant, - transcript, - error) - errors[transcript] = error - gene_transcript_effects_groups[gene_id] = effects - - # construct Gene objects for all the genes containing - # all the transcripts this variant overlaps - overlapping_genes = [ - self.ensembl.gene_by_id(gene_id) - for gene_id - in overlapping_transcript_groups.keys() - ] - - return VariantEffect( - variant=variant, - genes=overlapping_genes, - gene_transcript_effects=gene_transcript_effects_groups, - errors=errors) diff --git a/varcode/variant_collection.py b/varcode/variant_collection.py index 122482a..537f057 100644 --- a/varcode/variant_collection.py +++ b/varcode/variant_collection.py @@ -13,23 +13,15 @@ # limitations under the License. from __future__ import print_function, division, absolute_import +from collections import Counter + +from .effects import Substitution +from .effect_ordering import effect_priority, transcript_effect_priority_dict -from .effect_ordering import effect_priority -from .reference_name import ( - infer_reference_name, - ensembl_release_number_for_reference_name -) -from .variant_annotator import VariantAnnotator class VariantCollection(object): - def __init__( - self, - variants, - reference_path=None, - reference_name=None, - ensembl_release=None, - original_filename=None): + def __init__(self, variants, original_filename=None): """ Construct a VariantCollection from a list of Variant records and the name of a reference genome. @@ -43,39 +35,8 @@ def __init__( original_filename : str, optional File from which we loaded variants, though the current VariantCollection may only contain a subset of them. - - reference_path : str, optional - Path to reference FASTA file. - - reference_name : str, optional - Name of reference genome (e.g. "GRCh37", "hg18"). If not given - infer from reference path or from ensembl_release. - - ensembl_release : int, optional - If not specified, infer Ensembl release from reference_name """ self.variants = variants - self.reference_path = reference_path - if reference_name: - # convert from e.g. "hg19" to "GRCh37" - # - # TODO: actually handle the differences between these references - # instead of just treating them as interchangeable - self.reference_name = infer_reference_name(reference_name) - else: - if reference_path: - self.reference_name = infer_reference_name(reference_path) - else: - raise ValueError( - "Must specify one of reference_path or reference_name") - - if ensembl_release: - self.ensembl_release = ensembl_release - else: - self.ensembl_release = ensembl_release_number_for_reference_name( - self.reference_name) - - self.annot = VariantAnnotator(ensembl_release=self.ensembl_release) self.original_filename = original_filename def __len__(self): @@ -87,14 +48,13 @@ def __iter__(self): def __eq__(self, other): return ( isinstance(other, VariantCollection) and - self.reference_name == other.reference_name and len(self.variants) == len(other.variants) and all(v1 == v2 for (v1, v2) in zip(self.variants, other.variants))) def __str__(self): fields = [ ("n_variants", len(self.variants)), - ("reference", self.reference_name) + ("reference", ", ".join(self.reference_names())) ] if self.original_filename: @@ -110,83 +70,120 @@ def __str__(self): def __repr__(self): return str(self) - def clone(self, new_variants=None): + def reference_names(self): + """ + All unique reference names associated with variants in this collection. + """ + return {variant.reference_name for variant in self.variants} + + def _clone_metadata(self, new_variants): """ Create copy of VariantCollection with same metadata but possibly - different Variant entries. If no variants provided, then just make a - copy of self.variants. + different Variant entries. """ - if new_variants is None: - new_variants = self.variants return VariantCollection( - variants=list(new_variants), - original_filename=self.original_filename, - reference_path=self.reference_path, - reference_name=self.reference_name, - ensembl_release=self.ensembl_release) + variants=new_variants, + original_filename=self.original_filename) def drop_duplicates(self): """ Create a new VariantCollection without any duplicate variants. """ - return self.clone(set(self.variants)) + return self._clone_metadata(set(self.variants)) - def variant_effects(self, raise_on_error=True): + def variant_effects( + self, + high_impact=False, + only_coding_transcripts=False, + raise_on_error=True): """ - Determine the impact of each variant, return a list of - VariantEffect objects. + Returns a list containing one VariantEffectCollection object for each + variant in this VariantCollection. Parameters ---------- + high_impact : bool, optional + Only show effects which make changes to coding transcripts that + are predicted to be deleterious (amino acid sequence changes, + changes to essential splice sites, frameshifts, &c) - raise_on_error : bool, optional - Raise exception if error is encountered while annotating - transcripts, otherwise track errors in VariantEffect.errors - dictionary (default=True). + only_coding_transcripts : bool, optional + Only annotate variant effects on coding transcripts. + raise_on_error : bool, optional + If exception is raised while determining effect of variant on a + transcript, should it be raised? This default is True, meaning + errors result in raised exceptions. If raise_on_error=False then + exceptions are logged in VariantEffectCollection.errors. """ - return [ - self.annot.effect( - variant=variant, + results = [] + + if high_impact: + min_priority = transcript_effect_priority_dict[Substitution] + else: + min_priority = -1 + + for variant in self.variants: + variant_effect_collection = variant.effects( + only_coding_transcripts=only_coding_transcripts, raise_on_error=raise_on_error) - for variant - in self.variants - ] - def effects_to_string(self, *args, **kwargs): + if only_coding_transcripts and len(variant_effect_collection) == 0: + # if we only want coding transcripts, then skip all + # intergenic and non-coding gene variants + continue + + best_effect = variant_effect_collection.highest_priority_effect + # either this variant is intergenic and there's no minimum + # threshold for effect priority or the highest impact effect + # is higher priority than the min_priority + if ((best_effect is None and min_priority < 0) or + (effect_priority(best_effect) >= min_priority)): + results.append(variant_effect_collection) + return results + + def effect_summary(self, *args, **kwargs): """ Create a long string with all transcript effects for each mutation, grouped by gene (if a mutation affects multiple genes). - Arguments are passed on to variant_effects(*args, **kwargs). + Arguments are passed on to self.variant_effects(*args, **kwargs). """ lines = [] - for variant_effect in self.variant_effects(*args, **kwargs): - transcript_effect_count = 0 - lines.append("\n%s" % variant_effect.variant) - transcript_effect_lists = variant_effect.gene_transcript_effects - for gene, transcript_effects in transcript_effect_lists.iteritems(): - gene_name = self.annot.ensembl.gene_name_of_gene_id(gene) - lines.append(" Gene: %s (%s)" % (gene_name, gene)) - # print transcript effects with more significant impact + for effect_collection in self.variant_effects(*args, **kwargs): + variant = effect_collection.variant + lines.append("\n%s" % variant) + transcript_effect_lists = effect_collection.gene_effect_groups + for gene_id, effects in transcript_effect_lists.iteritems(): + gene_name = variant.ensembl.gene_name_of_gene_id(gene_id) + lines.append(" Gene: %s (%s)" % (gene_name, gene_id)) + # place transcript effects with more significant impact # on top (e.g. FrameShift should go before NoncodingTranscript) - for transcript_effect in sorted( - transcript_effects, + for effect in sorted( + effects, key=effect_priority, reverse=True): - transcript_effect_count += 1 - lines.append(" -- %s" % transcript_effect) + lines.append(" -- %s" % effect) # if we only printed one effect for this gene then # it's redundant to print it again as the highest priority effect - if transcript_effect_count > 1: - best = variant_effect.highest_priority_effect + if len(effect_collection) > 1: + best = effect_collection.highest_priority_effect lines.append(" Highest Priority Effect: %s" % best) return "\n".join(lines) - def print_effects(self, *args, **kwargs): + def reference_names(self): + """ + All distinct reference names used by Variants in this + collection. """ - Print all variants and their transcript effects (grouped by gene). + return { variant.reference_name for variant in self.variants } - Arguments are passed on to effects_to_string(*args, **kwargs). + def gene_counts(self, only_coding=False): """ - print(self.effects_to_string(*args, **kwargs)) + Count how many variants overlap each gene name. + """ + counter = Counter() + for variant in self.variants: + for gene_name in variant.gene_names(): + counter[gene_name] += 1 + return counter diff --git a/varcode/variant_effect.py b/varcode/variant_effect.py deleted file mode 100644 index c10f563..0000000 --- a/varcode/variant_effect.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import print_function, division, absolute_import - -from .effect_ordering import top_priority_transcript_effect - -from pyensembl.biotypes import is_coding_biotype - -class VariantEffect(object): - """ - A VariantEffect object is a container for all the TranscriptEffects of - a particular mutation, as well as some properties that attempt to - summarize the impact of the mutation across all genes/transcripts. - """ - - def __init__( - self, - variant, - genes, - gene_transcript_effects, - errors={}): - """ - variant : Variant - - genes : list - List of Gene objects - - gene_transcript_effects : dict - Dictionary from gene ID to list of transcript variant effects - - errors : dict, optional - Mapping from transcript objects to strings explaining error which - was encountered while trying to annotate them. - """ - self.variant = variant - self.genes = genes - self.gene_transcript_effects = gene_transcript_effects - - # dictionary mapping from transcript IDs to transcript mutation effects - self.transcript_effects = {} - for (_, transcript_effects) in self.gene_transcript_effects.items(): - for effect in transcript_effects: - self.transcript_effects[effect.transcript.id] = effect - - # if our variant overlaps any genes, then choose the highest - # priority transcript variant, otherwise call the variant "Intergenic" - if len(self.transcript_effects) > 0: - self.highest_priority_effect = top_priority_transcript_effect( - self.transcript_effects.values()) - highest_priority_class = self.highest_priority_effect.__class__ - self.variant_summary = highest_priority_class.__name__ - else: - self.highest_priority_effect = None - self.variant_summary = "Intergenic" - - self.errors = errors - - @property - def coding_genes(self): - """ - The `genes` property includes all sorts of esoteric entities - like anti-sense annotations and pseudogenes. - - This property gives you just protein coding genes and functional RNAs - """ - return [ - gene for gene in self.genes - if is_coding_biotype(gene.biotype) - ] - - def __str__(self): - fields = [ - ("variant", self.variant.short_description()), - ("genes", [gene.name for gene in self.genes]), - ("transcript_effects", self.transcript_effects) - ] - if self.errors: - fields.append( ("errors", self.errors) ) - return "VariantEffect(%s)" % ( - ", ".join(["%s=%s" % (k,v) for (k,v) in fields])) - - def __repr__(self): - return str(self) diff --git a/varcode/variant_effect_collection.py b/varcode/variant_effect_collection.py new file mode 100644 index 0000000..5516b28 --- /dev/null +++ b/varcode/variant_effect_collection.py @@ -0,0 +1,121 @@ +# Copyright (c) 2014. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +from .effect_ordering import top_priority_transcript_effect + +from pyensembl.biotypes import is_coding_biotype + +class VariantEffectCollection(object): + """ + Collection of all the TranscriptEffect objects associated with + each transcript overlapping a given variant, as well as some properties + that attempt to summarize the impact of the mutation across all + genes/transcripts. + """ + def __init__( + self, + variant, + gene_effect_groups, + errors={}): + """ + variant : Variant + + gene_effect_groups : dict + Dictionary from gene ID to list of transcript variant effects + + errors : dict, optional + Mapping from transcript objects to strings explaining error which + was encountered while trying to annotate them. + """ + self.variant = variant + self.gene_effect_groups = gene_effect_groups + + # dictionary mapping from transcript IDs to transcript mutation effects + self.transcript_effect_dict = {} + for (_, effect_list) in self.gene_effect_groups.items(): + for effect in effect_list: + self.transcript_effect_dict[effect.transcript.id] = effect + + # if our variant overlaps any genes, then choose the highest + # priority transcript variant, otherwise call the variant "Intergenic" + if len(self.transcript_effect_dict) > 0: + self.highest_priority_effect = top_priority_transcript_effect( + self.transcript_effect_dict.values()) + highest_priority_class = self.highest_priority_effect.__class__ + self.variant_summary = highest_priority_class.__name__ + else: + self.highest_priority_effect = None + self.variant_summary = "Intergenic" + + self.errors = errors + + def __str__(self): + fields = [ + ("variant", self.variant.short_description()), + ("genes", self.gene_names()), + ("effects", self.effects()) + ] + if self.errors: + fields.append( ("errors", self.errors) ) + return "VariantEffectCollection(%s)" % ( + ", ".join(["%s=%s" % (k,v) for (k,v) in fields])) + + def __repr__(self): + return str(self) + + def __len__(self): + """ + Length of a VariantEffectCollection is the number of effect objects + it contains. + """ + return len(self.transcript_effect_dict) + + def effects(self): + """ + Returns all TranscriptMutationEffect objects contained in this + collection. + """ + return self.transcript_effect_dict.values() + + def transcripts(self): + """ + Returns list of Transcript objects for all transcripts which are + annotated with effects in this collection. + """ + return [effect.transcript for effect in self.effects()] + + def genes(self): + """ + Construct Gene objects for all the genes which contain the + transcripts in this effect collection. + """ + return [ + self.variant.ensembl.gene_by_id(gene_id) + for gene_id + in self.gene_effect_groups + ] + + def gene_names(self): + """ + Fetch names of all the genes which contain the + transcripts in this effect collection. This is significantly + cheaper than constructing a Gene object and fetching its name. + """ + return [ + self.variant.ensembl.gene_name_of_gene_id(gene_id) + for gene_id + in self.gene_effect_groups + ] diff --git a/varcode/vcf.py b/varcode/vcf.py index 282278d..19e5366 100644 --- a/varcode/vcf.py +++ b/varcode/vcf.py @@ -15,13 +15,22 @@ # required so that 'import vcf' gets the global PyVCF package, # rather than our local vcf module from __future__ import absolute_import + +from .reference_name import ( + infer_reference_name, + ensembl_release_number_for_reference_name +) from .variant import Variant from .variant_collection import VariantCollection +from pyensembl import EnsemblRelease # PyVCF import vcf -def load_vcf(filename, reference_path_field='reference'): +def load_vcf( + filename, + reference_path_field='reference', + ensembl_release=None): """ Load reference name and Variant objects from the given VCF filename. Drop any entries whose FILTER field is not one of "." or "PASS". @@ -34,19 +43,39 @@ def load_vcf(filename, reference_path_field='reference'): reference_path_field : str, optional Name of metadata field which contains path to reference FASTA file (default = 'reference') + + ensembl_release : int, optional + Which release of Ensembl to use for annotation, by default inferred + from the reference path. """ + + if not isinstance(filename, str): + raise ValueError( + "Expected filename to be str, got %s : %s" % ( + filename, type(filename))) + vcf_reader = vcf.Reader(filename=filename) - reference_name = vcf_reader.metadata[reference_path_field] + + if not ensembl_release: + reference_path = vcf_reader.metadata[reference_path_field] + reference_name = infer_reference_name(reference_path) + ensembl_release = ensembl_release_number_for_reference_name( + reference_name) + + ensembl = EnsemblRelease(release=ensembl_release) variants = [ Variant( - x.CHROM, x.POS, - x.REF, x.ALT[0].sequence, - x.INFO) + contig=x.CHROM, + pos=x.POS, + ref=x.REF, + alt=x.ALT[0].sequence, + info=x.INFO, + ensembl=ensembl) for x in vcf_reader if not x.FILTER or x.FILTER == "PASS" ] + return VariantCollection( variants=variants, - original_filename=filename, - reference_path=reference_name) + original_filename=filename) \ No newline at end of file