In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import numpy as np
import pandas as pd

import SequenceDataORM as sqd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Engine for the SQLite file (path is relative to the working directory);
# echo=False keeps SQLAlchemy from logging every SQL statement it emits.
engine = create_engine('sqlite:///NS001_evolved_mutations_copy2.db', echo=False)

# Every ORM query in this notebook goes through this session, which is bound to the engine.
Session = sessionmaker(bind=engine)
session = Session()

# How many mutations created a new stop codon?

Breseq denotes stop codons with the `*` character.

In [3]:
# SNP mutations whose new amino acid is a stop ('*') while the reference amino acid
# is not, i.e. substitutions that introduce a stop codon.
stop_codon_gained = (
    sqd.SNP_Mutation.new_aa == '*',
    sqd.SNP_Mutation.ref_aa != '*',
)
stop_mutations = session.query(sqd.SNP_Mutation).filter(*stop_codon_gained).all()

In [4]:
# Number of SNP_Mutation records returned by the stop-codon query.
len(stop_mutations)

26

In [5]:
# Number of SNP mutations whose intergenic_left column is NULL, which this notebook
# treats as "inside a coding region".
# `.is_(None)` emits the same IS NULL test as `== None` without the comparison-to-None
# lint smell, and `.all()` already returns a list, so no comprehension is needed.
coding_mutation_query = (
    session.query(sqd.SNP_Mutation)
    .filter(sqd.SNP_Mutation.intergenic_left.is_(None))
)
len(coding_mutation_query.all())

823

There are a decent number of stop mutations, though perhaps fewer than expected. Assuming every codon is equally likely to result from a mutation, you'd expect about 3/64 of the 823 coding-region mutations to create a stop codon: 3/64 * 823 = 38.6, compared with the 26 observed.

In [6]:
# Every SNP_Evidence row joined to a stop-codon-introducing SNP_Mutation,
# sorted by chromosome position.
stop_ev = (
    session.query(sqd.SNP_Evidence)
    .join(sqd.SNP_Mutation)
    .filter(sqd.SNP_Mutation.new_aa == '*', sqd.SNP_Mutation.ref_aa != '*')
    .order_by(sqd.SNP_Evidence.chr_position)
    .all()
)

In [7]:
# Number of SNP_Evidence records linked to a stop-codon-introducing mutation.
len(stop_ev)

35

In [8]:
# Sample names to exclude; judging by the names these are the ancestor sequencing runs.
ancestor_samples = [
    'Aggregate_NS001_Ancestors',
    'Ancestor_S1',
    'Ancestor_S2',
    'Ancestor_S3',
]

# `.is_(None)` emits the same IS NULL test as `== None` without the comparison-to-None
# lint smell, and `.all()` already returns a list, so no comprehension is needed.
coding_evidence_query = (
    session.query(sqd.SNP_Evidence)
    .join(sqd.SNP_Mutation)
    # ignore mutations that aren't in a coding region
    .filter(sqd.SNP_Mutation.intergenic_left.is_(None))
    # drop detections whose sample is one of the ancestor samples
    .filter(~sqd.SNP_Evidence.sample.in_(ancestor_samples))
)
len(coding_evidence_query.all())

1143

In [1]:
# Expected number of stop-codon detections if every codon were equally likely after a
# mutation: 3 of the 64 codons are stops, applied to the 1143 coding-region detections.
stop_codon_fraction = 3 / 64
stop_codon_fraction * 1143

53.578125

Looking at the evidence, it again looks like there may be fewer stop codon mutations than expected: 3/64 * 1143 = 53.6, compared with the 35 detections observed.

In [9]:
# Print each stop-codon detection followed by the frequency recorded for it.
for evidence in stop_ev:
    print(evidence, evidence.frequency)

<SNP_Evidence(sample=LoMid1t1_S1, chr_position=26063, ref_base=C, new_base=T)> 0.0555009842
<SNP_Evidence(sample=Lo1t1_S1, chr_position=170054, ref_base=C, new_base=T)> 0.144515038
<SNP_Evidence(sample=Mid1t1_S1, chr_position=302668, ref_base=G, new_base=T)> 0.0911693573
<SNP_Evidence(sample=Hi4t2_S1, chr_position=622512, ref_base=T, new_base=A)> 0.136445999
<SNP_Evidence(sample=HiMid1t1_S1, chr_position=883902, ref_base=C, new_base=T)> 0.194756508
<SNP_Evidence(sample=Lo3t1_S1, chr_position=1688871, ref_base=C, new_base=T)> 0.10782671
<SNP_Evidence(sample=HiMid2t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0799436569
<SNP_Evidence(sample=Lo2t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0587720871
<SNP_Evidence(sample=Lo2t2_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.800232887
<SNP_Evidence(sample=Lo4t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0683875084
<SNP_Evidence(sample=Lo4t2_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.150085926
<S

Inspecting the whole list of strains with a mutation to a stop codon, we see that the G-to-A mutation at chromosome position 1760934 was detected 9 times. There are 35 detections but only 26 unique mutations, so 9 detections are repeats of a mutation already seen in another sample, and this one mutation accounts for 8 of those 9. Of the 9 times this mutation was detected, 8 were in 4 wells at both time 1 and time 2: wells Lo2, Lo4, LoMid4, and Mid2. Interestingly, in well Lo2 it almost spread to fixation, but in Lo4 and LoMid4 it only increased in frequency moderately. In Mid2 it didn't change in frequency, and in HiMid2 it appeared at time 1 but vanished at time 2.

The only other mutation detected more than once was at chromosome position 2271475, from G to A, in well HiMid4 at both times 1 and 2. It may have increased slightly in frequency, but I'm guessing that a change from 0.07 to 0.12 is within noise.

No mutations into stop codons were detected in the ancestor.

# How many mutations changed a stop codon into something else?

In [10]:
# SNP mutations whose reference amino acid is a stop ('*') while the new amino acid
# is not, i.e. substitutions that remove an existing stop codon.
stop_codon_lost = (
    sqd.SNP_Mutation.new_aa != '*',
    sqd.SNP_Mutation.ref_aa == '*',
)
nonstop_mutations = session.query(sqd.SNP_Mutation).filter(*stop_codon_lost).all()

In [11]:
# Number of SNP_Mutation records that turn a stop codon into an amino acid.
len(nonstop_mutations)

1

In [12]:
# Every SNP_Evidence row joined to a stop-codon-removing SNP_Mutation,
# sorted by chromosome position.
nonstop_ev = (
    session.query(sqd.SNP_Evidence)
    .join(sqd.SNP_Mutation)
    .filter(sqd.SNP_Mutation.new_aa != '*', sqd.SNP_Mutation.ref_aa == '*')
    .order_by(sqd.SNP_Evidence.chr_position)
    .all()
)

In [13]:
# Number of SNP_Evidence records linked to a stop-codon-removing mutation.
len(nonstop_ev)

1

In [14]:
# Print each stop-codon-removing detection followed by the frequency recorded for it.
for evidence in nonstop_ev:
    print(evidence, evidence.frequency)

<SNP_Evidence(sample=Mid3t2_S1, chr_position=2746434, ref_base=A, new_base=G)> 0.0561490059


Stop codons occur only at the very end of a gene, so, perhaps unsurprisingly, a mutation that removed a stop codon was detected only once across all of the sequencing. So there's not much to say about this.