Skip to content

Commit

Permalink
Merge d1e7fd3 into 408b1cf
Browse files Browse the repository at this point in the history
  • Loading branch information
iskandr committed Oct 12, 2018
2 parents 408b1cf + d1e7fd3 commit 07042cf
Show file tree
Hide file tree
Showing 13 changed files with 157 additions and 38 deletions.
28 changes: 28 additions & 0 deletions test/test_exonic_splice_site.py
@@ -0,0 +1,28 @@
# Copyright (c) 2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from varcode import Variant
from varcode.effects import ExonicSpliceSite, PrematureStop


def test_STAT1_stop_gain_at_exon_boundary():
    """STAT1 G>A at 2:191872291 (GRCh37) should rank PrematureStop first.

    The variant is expected to produce an ExonicSpliceSite effect as well,
    but the top priority effect for it should be PrematureStop.
    """
    stat1_variant = Variant("2", "191872291", "G", "A", "GRCh37")
    effects = stat1_variant.effects()
    print(effects)
    # Exact class identity (not isinstance) so that subclasses of the
    # expected effect types do not satisfy these checks.
    assert any(e.__class__ is ExonicSpliceSite for e in effects), \
        "Expected an ExonicSpliceSite effect among %s" % (effects,)
    top_effect = effects.top_priority_effect()
    print(top_effect)
    assert top_effect.__class__ is PrematureStop, \
        "Expected top priority effect PrematureStop, got %s" % (top_effect,)
41 changes: 25 additions & 16 deletions test/test_vcf_output.py
Expand Up @@ -13,17 +13,14 @@
# limitations under the License.

from __future__ import print_function, division, absolute_import
import os
from nose.tools import eq_
from varcode import load_vcf, load_maf
from varcode import VariantCollection

from varcode.vcf_output import variants_to_vcf
from .data import data_path
import tempfile

TEST_FILENAMES = [
TEST_FILENAMES_HUMAN = [
'duplicates.maf',
'mouse_vcf_dbsnp_chr1_partial.vcf',
'multiallelic.vcf',
'mutect-example.vcf',
'ov.wustle.subset5.maf',
Expand All @@ -38,13 +35,21 @@
# 'somatic_hg19_14muts.vcf.gz', # gzip
]

# Mouse input files, listed separately from the human ones.
TEST_FILENAMES_MOUSE = [
'mouse_vcf_dbsnp_chr1_partial.vcf',
]

# Every test input file: human entries first, then mouse entries.
TEST_FILENAMES = TEST_FILENAMES_HUMAN + TEST_FILENAMES_MOUSE


def _merge_metadata_naive(variants):
    """Flatten the per-source metadata of `variants` into a single dict.

    Reads `variants.source_to_metadata_dict`, a mapping whose values are
    themselves mappings, and merges all of the inner mappings into one
    new dict. When the same key appears under more than one source, the
    value from the source iterated last wins.
    """
    merged = {}
    for source_metadata in variants.source_to_metadata_dict.values():
        merged.update(source_metadata.items())
    return merged


def _do_roundtrip_test(filenames):

def load_fn(filename):
Expand All @@ -69,11 +74,11 @@ def load_variants():

# `==` checks the reference genome, which won't necessarily match.
assert all(
v1.contig == v2.contig and \
v1.start == v2.start and \
v1.ref == v2.ref and \
v1.start == v2.start \
for (v1, v2) in zip(variants, reparsed_variants))
v1.contig == v2.contig and
v1.start == v2.start and
v1.ref == v2.ref and
v1.start == v2.start
for (v1, v2) in zip(variants, reparsed_variants))

return (variants, reparsed_variants)

Expand All @@ -91,43 +96,47 @@ def load_variants():
# metadata (without the need to individually convert fields), we'd need to add
# these headers to the output VCF file. See `vcf_output.py` for more info.


def test_single_file_roundtrip_conversion():
    """Yield one round-trip check per entry of `TEST_FILENAMES`.

    Written as a generator test: each yielded tuple pairs the check
    function `_do_roundtrip_test` with its argument, a single-element
    list holding one filename.
    """
    for filename in TEST_FILENAMES:
        yield (_do_roundtrip_test, [filename])


def test_multiple_file_roundtrip_conversion():
    """Yield one round-trip check per group of input files.

    Each yielded tuple pairs the check function `_do_roundtrip_test`
    with a list of filenames that are passed to it together.
    """
    file_groups = (
        ['simple.1.vcf', 'simple.2.vcf'],  # basic multi-file test
        ['duplicates.maf', 'multiallelic.vcf'],  # different file formats
        ['duplicate-id.1.vcf', 'duplicate-id.2.vcf'],
        # Human-only files; presumably the mouse VCF is left out because
        # the files of a group are processed together -- TODO confirm.
        TEST_FILENAMES_HUMAN,
    )
    for file_group in file_groups:
        yield (_do_roundtrip_test, file_group)


def test_same_samples_produce_samples():
    """Ensures that, if a set of variants have the same samples, the reparsed
    collection will output these samples.
    """
    (variants, reparsed_variants) = _do_roundtrip_test(
        ['same-samples.1.vcf', 'same-samples.2.vcf'])

    original_metadata = _merge_metadata_naive(variants)
    reparsed_metadata = _merge_metadata_naive(reparsed_variants)

    # The sample names of the first original metadata entry serve as the
    # reference that every reparsed entry has to match.
    sample_names = set(list(original_metadata.values())[0]['sample_info'].keys())
    assert all(
        set(d.get('sample_info', {}).keys()) == sample_names
        for d in reparsed_metadata.values())


def test_different_samples_produce_no_samples():
    """Ensures that, if a set of variants have different samples, the reparsed
    collection will not output any samples.

    See `vcf_output.py` for details as to why this is the way it's done for now.
    """
    (_, reparsed_variants) = _do_roundtrip_test(
        ['different-samples.1.vcf', 'different-samples.2.vcf'])

    metadata = _merge_metadata_naive(reparsed_variants)
    # No reparsed metadata entry may carry a (non-None) 'sample_info' value.
    assert all(d.get('sample_info') is None for d in metadata.values())
2 changes: 1 addition & 1 deletion varcode/__init__.py
Expand Up @@ -24,7 +24,7 @@
NonsilentCodingMutation,
)

__version__ = '0.7.1'
__version__ = '0.8.0'

__all__ = [
# basic classes
Expand Down
1 change: 1 addition & 0 deletions varcode/effects/common.py
Expand Up @@ -17,6 +17,7 @@
from Bio.Seq import Seq
from six import string_types, text_type


def bio_seq_to_str(seq):
if isinstance(seq, Seq):
return str(seq)
Expand Down
37 changes: 36 additions & 1 deletion varcode/effects/effect_classes.py
@@ -1,4 +1,4 @@
# Copyright (c) 2016. Mount Sinai School of Medicine
# Copyright (c) 2016-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@

from .common import bio_seq_to_str


class MutationEffect(Serializable):
"""
Base class for mutation effects.
Expand Down Expand Up @@ -126,6 +127,7 @@ def __init__(self, variant, gene):
MutationEffect.__init__(self, variant)
self.gene = gene


class TranscriptMutationEffect(Intragenic):
def __init__(self, variant, transcript):
Intragenic.__init__(self, variant, gene=transcript.gene)
Expand All @@ -144,25 +146,29 @@ class Failure(TranscriptMutationEffect):
need to create a non-empty list of effects for each variant.
"""


class NoncodingTranscript(TranscriptMutationEffect):
    """
    Any mutation to a transcript with a non-coding biotype.
    """
    short_description = "non-coding-transcript"


class IncompleteTranscript(TranscriptMutationEffect):
    """
    Any mutation to an incompletely annotated transcript with a coding
    biotype.
    """
    short_description = "incomplete"


class FivePrimeUTR(TranscriptMutationEffect):
    """
    Any mutation to the 5' untranslated region (before the start codon) of
    a coding transcript.
    """
    short_description = "5' UTR"


class ThreePrimeUTR(TranscriptMutationEffect):
"""
Any mutation to the 3' untranslated region (after the stop codon) of
Expand All @@ -182,12 +188,14 @@ def __init__(self, variant, transcript, nearest_exon, distance_to_exon):

short_description = "intronic"


class SpliceSite(object):
    """
    Parent class for all splice site mutations.

    Defines no attributes or methods of its own.
    """


class IntronicSpliceSite(Intronic, SpliceSite):
"""
Mutations near exon boundaries, excluding the first two and last two
Expand All @@ -200,6 +208,7 @@ def __init__(self, variant, transcript, nearest_exon, distance_to_exon):

short_description = "intronic-splice-site"


class SpliceDonor(IntronicSpliceSite):
"""
Mutation in the first two intron residues.
Expand All @@ -210,18 +219,21 @@ def __init__(self, variant, transcript, nearest_exon, distance_to_exon):

short_description = "splice-donor"


class SpliceAcceptor(IntronicSpliceSite):
    """
    Mutation in the last two intron residues.
    """
    short_description = "splice-acceptor"


class Exonic(TranscriptMutationEffect):
    """
    Any mutation which affects the contents of an exon (coding region or
    UTRs).

    Adds no attributes or methods of its own.
    """


class ExonLoss(Exonic):
"""
Deletion of one or more exons in a transcript.
Expand All @@ -248,6 +260,7 @@ def modifies_coding_sequence(self):
# TODO: distinguish between exon loss in the CDS and UTRs
return True


class ExonicSpliceSite(Exonic, SpliceSite):
"""
Mutation in the last three nucleotides before an intron
Expand Down Expand Up @@ -283,6 +296,7 @@ def modifies_protein_sequence(self):
def modifies_coding_sequence(self):
return self.alternate_effect.modifies_coding_sequence


class CodingMutation(Exonic):
"""
Base class for all mutations which result in a modified coding sequence.
Expand All @@ -305,6 +319,7 @@ def __str__(self):
def modifies_coding_sequence(self):
return True


class Silent(CodingMutation):
"""Mutation to an exon of a coding region which doesn't change the
amino acid sequence.
Expand Down Expand Up @@ -339,6 +354,7 @@ def __init__(
def short_description(self):
return "silent"


class AlternateStartCodon(Silent):
"""Change to the start codon (e.g. ATG>CTG) but without changing the
starting amino acid from methionine.
Expand All @@ -363,6 +379,7 @@ def short_description(self):
return "alternate-start-codon (%s>%s)" % (
self.ref_codon, self.alt_codon)


class NonsilentCodingMutation(CodingMutation):
"""
All coding mutations other than silent codon substitutions
Expand Down Expand Up @@ -402,11 +419,23 @@ def __init__(
def modifies_protein_sequence(self):
return True


class StartLoss(NonsilentCodingMutation):
"""
When a start codon is lost it's difficult to determine if there is
an alternative Kozak consensus sequence (either before or after the
original) from which an alternative start codon can be inferred.
TODO:
- look for downstream alternative start codon to predict
new coding sequence (probably also requires matching
pattern of preceding ~6nt)
- If an alternative start codon is changed to ATG then
we should make a StrongerStartCodon effect which is effectively
silent
- If ATG is changed to the two common alternative codons then
we should make a WeakerStartCodon effect which is also
effectively silent.
"""
def __init__(
self,
Expand All @@ -430,6 +459,7 @@ def mutant_protein_sequence(self):
def short_description(self):
return "p.%s1? (start-loss)" % (self.transcript.protein_sequence[0],)


class KnownAminoAcidChange(NonsilentCodingMutation):
"""
Coding mutations in which we can predict what the new/mutant protein
Expand Down Expand Up @@ -474,6 +504,7 @@ def mutant_protein_sequence(self):
suffix = original[self.aa_mutation_start_offset + len(self.aa_ref):]
return prefix + self.aa_alt + suffix


class Substitution(KnownAminoAcidChange):
"""
Single amino acid substitution, e.g. BRAF-001 V600E
Expand All @@ -499,6 +530,7 @@ def __init__(
aa_ref=aa_ref,
aa_alt=aa_alt)


class ComplexSubstitution(KnownAminoAcidChange):
"""
In-frame substitution of multiple amino acids, e.g. TP53-002 p.391FY>QQQ
Expand All @@ -525,6 +557,7 @@ def __init__(
aa_ref=aa_ref,
aa_alt=aa_alt)


class Insertion(KnownAminoAcidChange):
"""
In-frame insertion of one or more amino acids.
Expand All @@ -543,6 +576,7 @@ def __init__(
aa_ref="",
aa_alt=aa_alt)


class Deletion(KnownAminoAcidChange):
"""
In-frame deletion of one or more amino acids.
Expand Down Expand Up @@ -694,6 +728,7 @@ def short_description(self):
self.aa_ref[0],
self.aa_mutation_start_offset + 1)


class FrameShiftTruncation(PrematureStop, FrameShift):
"""
A frame-shift mutation which immediately introduces a stop codon.
Expand Down
1 change: 1 addition & 0 deletions varcode/effects/effect_collection.py
Expand Up @@ -25,6 +25,7 @@
transcript_effect_priority_dict
)


class EffectCollection(Collection):
"""
Collection of MutationEffect objects and helpers for grouping or filtering
Expand Down

0 comments on commit 07042cf

Please sign in to comment.