diff --git a/test/test_effect_annotation_errors.py b/test/test_effect_annotation_errors.py index 4e2a122..35a3d4d 100644 --- a/test/test_effect_annotation_errors.py +++ b/test/test_effect_annotation_errors.py @@ -242,3 +242,39 @@ def test_issue193_SNV_stop_gain_in_ZNF45_not_deletion(): effect_class=PrematureStop, modifies_coding_sequence=True, modifies_protein_sequence=True) + +def test_issue202_stoploss_deletes_two_amino_acids(): + """ + Issue: https://github.com/hammerlab/varcode/issues/202 + Variant: chr1 100484693 . TTCATCTGA CCC + Transcript: ENSMUST00000086738 + >>> + The end of that transcript looks like: + + TTC ATC TGA ACT + F I * T + and this mutation will cause the location plus downstream to become + + PTIVWSSGPLF(...) + The annotation that varcode gives is + + StopLoss + * aa_mutation_start_offset = 1292 + * aa_ref="*" + * aa_alt="PTIVWSS(...)" + It should actually be + + StopLoss + * aa_mutation_start_offset = 1290 + * aa_ref="FI" (the lost stop codon is implied, not included) + * aa_alt="PTIVWSS(...)" + """ + variant = Variant('chr1', 100484693, 'TTCATCTGA', 'CCC', 'GRCm38') + expect_effect( + variant, + transcript_id='ENSMUST00000086738', + effect_class=StopLoss, + modifies_coding_sequence=True, + modifies_protein_sequence=True, + aa_ref='FI', + aa_alt='PTIVWSSGPLFCRGFHLFFFSFF') diff --git a/varcode/effects/effect_classes.py b/varcode/effects/effect_classes.py index fbdcb76..120521c 100644 --- a/varcode/effects/effect_classes.py +++ b/varcode/effects/effect_classes.py @@ -577,12 +577,14 @@ def __init__( Insertion of premature stop codon, possibly preceded by a substitution of `aa_ref` amino acids for `aa_alt` alternative residues. 
""" - assert "*" not in aa_ref, \ - ("Unexpected aa_ref = '%s', should only include amino acids " - "before the new stop codon.") % aa_ref - assert "*" not in aa_alt, \ - ("Unexpected aa_ref = '%s', should only include amino acids " - "before the new stop codon.") % aa_alt + if "*" in aa_ref: + raise ValueError( + ("Unexpected aa_ref = '%s', should only include amino acids " + "before the new stop codon.") % aa_ref) + if "*" in aa_alt: + raise ValueError( + ("Unexpected aa_alt = '%s', should only include amino acids " + "before the new stop codon.") % aa_alt) KnownAminoAcidChange.__init__( self, variant, @@ -592,12 +594,13 @@ def __init__( aa_alt=aa_alt) self.stop_codon_offset = aa_mutation_start_offset + len(aa_alt) - assert self.stop_codon_offset < len(transcript.protein_sequence), \ - ("Premature stop codon cannot be at position %d" - " since the original protein of %s has length %d") % ( - self.stop_codon_offset, - transcript, - len(transcript.protein_sequence)) + if self.stop_codon_offset >= len(transcript.protein_sequence): + raise ValueError( + ("Premature stop codon cannot be at position %d" + " since the original protein of %s has length %d") % ( + self.stop_codon_offset, + transcript, + len(transcript.protein_sequence))) @property def short_description(self): @@ -617,23 +620,38 @@ def __init__( self, variant, transcript, - extended_protein_sequence): - aa_mutation_start_offset = len(transcript.protein_sequence) + aa_ref, + aa_alt): + # StopLoss assumes that we deleted some codons ending with a + # stop codon + if "*" in aa_ref: + raise ValueError( + "StopLoss aa_ref '%s' should not contain '*'" % ( + aa_ref,)) + if len(aa_alt) == 0: + raise ValueError( + "If no amino acids added by StopLoss then it should be Silent") + # aa_ref excludes the stop codon, so count back len(aa_ref) residues + n_ref_amino_acids = len(aa_ref) + protein_length = len(transcript.protein_sequence) + aa_mutation_start_offset = protein_length - n_ref_amino_acids KnownAminoAcidChange.__init__( self, variant, transcript, 
aa_mutation_start_offset=aa_mutation_start_offset, - aa_ref="*", - aa_alt=extended_protein_sequence) + aa_alt=aa_alt, + aa_ref=aa_ref) @property def extended_protein_sequence(self): + """Deprecated name for aa_alt""" return self.aa_alt @property def short_description(self): - return "p.*%d%s (stop-loss)" % ( + return "p.%s*%d%s (stop-loss)" % ( + self.aa_ref, self.aa_mutation_start_offset + 1, self.extended_protein_sequence) diff --git a/varcode/effects/effect_prediction_coding_frameshift.py b/varcode/effects/effect_prediction_coding_frameshift.py index 8f57472..e2955c5 100644 --- a/varcode/effects/effect_prediction_coding_frameshift.py +++ b/varcode/effects/effect_prediction_coding_frameshift.py @@ -79,12 +79,16 @@ def create_frameshift_effect( alt=mutant_protein_suffix) n_unchanged_amino_acids = len(unchanged_amino_acids) offset_to_first_different_amino_acid = mutated_codon_index + n_unchanged_amino_acids + # reference residues are the last n_unchanged_amino_acids of the original + # protein; special-case 0 because seq[-0:] would return the whole sequence + if n_unchanged_amino_acids == 0: + aa_ref = "" + else: + aa_ref = original_protein_sequence[-n_unchanged_amino_acids:] if offset_to_first_different_amino_acid >= original_protein_length: # frameshift is either extending the protein or leaving it unchanged if len(mutant_protein_suffix) == 0: - # miraculously, this frameshift left the protein unchanged, - # most likely by turning one stop codon into another stop codon - aa_ref = original_protein_sequence[-n_unchanged_amino_acids:] + # protein is unchanged, e.g. one stop codon became another return Silent( variant=variant, transcript=transcript, @@ -97,7 +101,8 @@ def create_frameshift_effect( return StopLoss( variant=variant, transcript=transcript, - extended_protein_sequence=mutant_protein_suffix) + aa_ref=aa_ref, + aa_alt=mutant_protein_suffix) # original amino acid at the mutated codon before the frameshift occurred aa_ref = original_protein_sequence[offset_to_first_different_amino_acid] diff --git 
a/varcode/effects/effect_prediction_coding_in_frame.py b/varcode/effects/effect_prediction_coding_in_frame.py index 056646a..3dc286d 100644 --- a/varcode/effects/effect_prediction_coding_in_frame.py +++ b/varcode/effects/effect_prediction_coding_in_frame.py @@ -233,7 +233,8 @@ def predict_in_frame_coding_effect( return StopLoss( variant, transcript, - extended_protein_sequence=aa_alt) + aa_ref=aa_ref, + aa_alt=aa_alt) elif n_aa_alt == 0: return Deletion( variant,