Merge d1e7fd3 into 408b1cf

openvax · Oct 12, 2018 · 07042cf · 07042cf
2 parents 408b1cf + d1e7fd3
commit 07042cf
Show file tree

Hide file tree

Showing 13 changed files with 157 additions and 38 deletions.
diff --git a/test/test_exonic_splice_site.py b/test/test_exonic_splice_site.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2018. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from varcode import Variant
+from varcode.effects import ExonicSpliceSite, PrematureStop
+
+
+def test_STAT1_stop_gain_at_exon_boundary():
+    # top priority effect for this variant should be PrematureStop,
+    # even though it's also ExonicSpliceSite
+    stat1_variant = Variant("2", "191872291", "G", "A", "GRCh37")
+    effects = stat1_variant.effects()
+    print(effects)
+    assert any([e.__class__ is ExonicSpliceSite for e in effects])
+    top_effect = effects.top_priority_effect()
+    print(top_effect)
+    assert top_effect.__class__ is PrematureStop
diff --git a/test/test_vcf_output.py b/test/test_vcf_output.py
@@ -13,17 +13,14 @@
 # limitations under the License.
 
 from __future__ import print_function, division, absolute_import
-import os
-from nose.tools import eq_
 from varcode import load_vcf, load_maf
-from varcode import VariantCollection
+
 from varcode.vcf_output import variants_to_vcf
 from .data import data_path
 import tempfile
 
-TEST_FILENAMES = [
+TEST_FILENAMES_HUMAN = [
     'duplicates.maf',
-    'mouse_vcf_dbsnp_chr1_partial.vcf',
     'multiallelic.vcf',
     'mutect-example.vcf',
     'ov.wustle.subset5.maf',
@@ -38,13 +35,21 @@
     # 'somatic_hg19_14muts.vcf.gz',     # gzip
 ]
 
+TEST_FILENAMES_MOUSE = [
+    'mouse_vcf_dbsnp_chr1_partial.vcf',
+]
+
+TEST_FILENAMES = TEST_FILENAMES_HUMAN + TEST_FILENAMES_MOUSE
+
+
 def _merge_metadata_naive(variants):
     return {
         k: v
         for d in variants.source_to_metadata_dict.values()
         for k, v in d.items()
     }
 
+
 def _do_roundtrip_test(filenames):
 
     def load_fn(filename):
@@ -69,11 +74,11 @@ def load_variants():
 
     # `==` checks the reference genome, which won't necessarily match.
     assert all(
-            v1.contig == v2.contig and \
-            v1.start == v2.start and \
-            v1.ref == v2.ref and \
-            v1.start == v2.start \
-            for (v1, v2) in zip(variants, reparsed_variants))
+        v1.contig == v2.contig and
+        v1.start == v2.start and
+        v1.ref == v2.ref and
+        v1.start == v2.start
+        for (v1, v2) in zip(variants, reparsed_variants))
 
     return (variants, reparsed_variants)
 
@@ -91,43 +96,47 @@ def load_variants():
     # metadata (without the need to individually convert fields), we'd need to add
     # these headers to the output VCF file. See `vcf_output.py` for more info.
 
+
 def test_single_file_roundtrip_conversion():
     for filename in TEST_FILENAMES:
         yield (_do_roundtrip_test, [filename])
 
+
 def test_multiple_file_roundtrip_conversion():
     file_groups = (
         ['simple.1.vcf', 'simple.2.vcf'],  # basic multi-file test
         ['duplicates.maf', 'multiallelic.vcf'],  # dif. file formats
         ['duplicate-id.1.vcf', 'duplicate-id.2.vcf'],
-        TEST_FILENAMES,  # because why not?
+        TEST_FILENAMES_HUMAN,
     )
     for file_group in file_groups:
         yield (_do_roundtrip_test, file_group)
 
+
 def test_same_samples_produce_samples():
     """Ensures that, if a set of variants have the same samples, the reparsed
     collection will output these samples.
     """
     (variants, reparsed_variants) = _do_roundtrip_test(
-            ['same-samples.1.vcf', 'same-samples.2.vcf'])
+        ['same-samples.1.vcf', 'same-samples.2.vcf'])
 
     original_metadata = _merge_metadata_naive(variants)
     reparsed_metadata = _merge_metadata_naive(reparsed_variants)
 
     sample_names = set(list(original_metadata.values())[0]['sample_info'].keys())
     assert all(
-            set(d.get('sample_info', {}).keys()) == sample_names
-            for d in reparsed_metadata.values())
+        set(d.get('sample_info', {}).keys()) == sample_names
+        for d in reparsed_metadata.values())
+
 
 def test_different_samples_produce_no_samples():
     """Ensures that, if a set of variants have different samples, the reparsed
     collection will not output any samples.
-    
+
     See `vcf_output.py` for details as to why this is the way it's done for now.
     """
     (_, reparsed_variants) = _do_roundtrip_test(
-            ['different-samples.1.vcf', 'different-samples.2.vcf'])
+        ['different-samples.1.vcf', 'different-samples.2.vcf'])
 
     metadata = _merge_metadata_naive(reparsed_variants)
     assert all(d.get('sample_info') is None for d in metadata.values())
diff --git a/varcode/__init__.py b/varcode/__init__.py
@@ -24,7 +24,7 @@
     NonsilentCodingMutation,
 )
 
-__version__ = '0.7.1'
+__version__ = '0.8.0'
 
 __all__ = [
     # basic classes

diff --git a/varcode/effects/common.py b/varcode/effects/common.py
@@ -17,6 +17,7 @@
 from Bio.Seq import Seq
 from six import string_types, text_type
 
+
 def bio_seq_to_str(seq):
     if isinstance(seq, Seq):
         return str(seq)

diff --git a/varcode/effects/effect_classes.py b/varcode/effects/effect_classes.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
+# Copyright (c) 2016-2018. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 
 from .common import bio_seq_to_str
 
+
 class MutationEffect(Serializable):
     """
     Base class for mutation effects.
@@ -126,6 +127,7 @@ def __init__(self, variant, gene):
         MutationEffect.__init__(self, variant)
         self.gene = gene
 
+
 class TranscriptMutationEffect(Intragenic):
     def __init__(self, variant, transcript):
         Intragenic.__init__(self, variant, gene=transcript.gene)
@@ -144,25 +146,29 @@ class Failure(TranscriptMutationEffect):
     need to create a non-empty list of effects for each variant.
     """
 
+
 class NoncodingTranscript(TranscriptMutationEffect):
     """
     Any mutation to a transcript with a non-coding biotype
     """
     short_description = "non-coding-transcript"
 
+
 class IncompleteTranscript(TranscriptMutationEffect):
     """
     Any mutation to an incompletely annotated transcript with a coding biotype
     """
     short_description = "incomplete"
 
+
 class FivePrimeUTR(TranscriptMutationEffect):
     """
     Any mutation to the 5' untranslated region (before the start codon) of
     a coding transcript.
     """
     short_description = "5' UTR"
 
+
 class ThreePrimeUTR(TranscriptMutationEffect):
     """
     Any mutation to the 3' untranslated region (after the stop codon) of
@@ -182,12 +188,14 @@ def __init__(self, variant, transcript, nearest_exon, distance_to_exon):
 
     short_description = "intronic"
 
+
 class SpliceSite(object):
     """
     Parent class for all splice site mutations.
     """
     pass
 
+
 class IntronicSpliceSite(Intronic, SpliceSite):
     """
     Mutations near exon boundaries, excluding the first two and last two
@@ -200,6 +208,7 @@ def __init__(self, variant, transcript, nearest_exon, distance_to_exon):
 
     short_description = "intronic-splice-site"
 
+
 class SpliceDonor(IntronicSpliceSite):
     """
     Mutation in the first two intron residues.
@@ -210,18 +219,21 @@ def __init__(self, variant, transcript, nearest_exon, distance_to_exon):
 
     short_description = "splice-donor"
 
+
 class SpliceAcceptor(IntronicSpliceSite):
     """
     Mutation in the last two intron residues.
     """
     short_description = "splice-acceptor"
 
+
 class Exonic(TranscriptMutationEffect):
     """
     Any mutation which affects the contents of an exon (coding region or UTRs)
     """
     pass
 
+
 class ExonLoss(Exonic):
     """
     Deletion of one or more exons in a transcript.
@@ -248,6 +260,7 @@ def modifies_coding_sequence(self):
         # TODO: distinguish between exon loss in the CDS and UTRs
         return True
 
+
 class ExonicSpliceSite(Exonic, SpliceSite):
     """
     Mutation in the last three nucleotides before an intron
@@ -283,6 +296,7 @@ def modifies_protein_sequence(self):
     def modifies_coding_sequence(self):
         return self.alternate_effect.modifies_coding_sequence
 
+
 class CodingMutation(Exonic):
     """
     Base class for all mutations which result in a modified coding sequence.
@@ -305,6 +319,7 @@ def __str__(self):
     def modifies_coding_sequence(self):
         return True
 
+
 class Silent(CodingMutation):
     """Mutation to an exon of a coding region which doesn't change the
     amino acid sequence.
@@ -339,6 +354,7 @@ def __init__(
     def short_description(self):
         return "silent"
 
+
 class AlternateStartCodon(Silent):
     """Change to the start codon (e.g. ATG>CTG) but without changing the
     starting amino acid from methionine.
@@ -363,6 +379,7 @@ def short_description(self):
         return "alternate-start-codon (%s>%s)" % (
             self.ref_codon, self.alt_codon)
 
+
 class NonsilentCodingMutation(CodingMutation):
     """
     All coding mutations other than silent codon substitutions
@@ -402,11 +419,23 @@ def __init__(
     def modifies_protein_sequence(self):
         return True
 
+
 class StartLoss(NonsilentCodingMutation):
     """
     When a start codon is lost it's difficult to determine if there is
     an alternative Kozak consensus sequence (either before or after the
     original) from which an alternative start codon can be inferred.
+
+    TODO:
+        - look for downstream alternative start codon to predict
+          new coding sequence (probably also requires matching
+          pattern of preceding ~6nt)
+        - If an alternative start codon is changed to ATG then
+          we should make a StrongerStartCodon effect which is effectively
+          silent
+        - If ATG is changed to one of the two common alternative codons then
+          we should make a WeakerStartCodon effect which is also
+          effectively silent.
     """
     def __init__(
             self,
@@ -430,6 +459,7 @@ def mutant_protein_sequence(self):
     def short_description(self):
         return "p.%s1? (start-loss)" % (self.transcript.protein_sequence[0],)
 
+
 class KnownAminoAcidChange(NonsilentCodingMutation):
     """
     Coding mutations in which we can predict what the new/mutant protein
@@ -474,6 +504,7 @@ def mutant_protein_sequence(self):
         suffix = original[self.aa_mutation_start_offset + len(self.aa_ref):]
         return prefix + self.aa_alt + suffix
 
+
 class Substitution(KnownAminoAcidChange):
     """
     Single amino acid substitution, e.g. BRAF-001 V600E
@@ -499,6 +530,7 @@ def __init__(
             aa_ref=aa_ref,
             aa_alt=aa_alt)
 
+
 class ComplexSubstitution(KnownAminoAcidChange):
     """
     In-frame substitution of multiple amino acids, e.g. TP53-002 p.391FY>QQQ
@@ -525,6 +557,7 @@ def __init__(
             aa_ref=aa_ref,
             aa_alt=aa_alt)
 
+
 class Insertion(KnownAminoAcidChange):
     """
     In-frame insertion of one or more amino acids.
@@ -543,6 +576,7 @@ def __init__(
             aa_ref="",
             aa_alt=aa_alt)
 
+
 class Deletion(KnownAminoAcidChange):
     """
     In-frame deletion of one or more amino acids.
@@ -694,6 +728,7 @@ def short_description(self):
             self.aa_ref[0],
             self.aa_mutation_start_offset + 1)
 
+
 class FrameShiftTruncation(PrematureStop, FrameShift):
     """
     A frame-shift mutation which immediately introduces a stop codon.

diff --git a/varcode/effects/effect_collection.py b/varcode/effects/effect_collection.py
@@ -25,6 +25,7 @@
     transcript_effect_priority_dict
 )
 
+
 class EffectCollection(Collection):
     """
     Collection of MutationEffect objects and helpers for grouping or filtering