In [1]:
import math
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import SequenceDataORM as sqd

%matplotlib inline

In [2]:
# Connect to the SQLite database file and open one ORM session for all queries below.
db_url = 'sqlite:///NS001_evolved_mutations_copy2.db'
engine = create_engine(db_url, echo=False)  # our database connection
session_factory = sessionmaker(bind=engine)
session = session_factory()  # the session object is how we make queries through sqlalchemy

# How many times do we sequence a mutation per gene?

In other words, do any genes show an unusually large number of mutations? If so, they may be targets of selection, which would make mutations in them sweep more often.

In [3]:
# Count joined mutation/evidence rows per gene label, largest count first.
# NOTE(review): the join matches on chr_position only -- confirm that position
# alone identifies a mutation (ref/new base are not part of the join key).
gene_counts_sql = ('SELECT gene, COUNT(1) AS X '
                   'FROM snp_mutations '
                   'JOIN snp_evidence '
                   'ON snp_mutations.chr_position = snp_evidence.chr_position '
                   'GROUP BY gene ORDER BY X DESC')
df = pd.read_sql_query(gene_counts_sql, con=engine)

In [4]:
# Show the per-gene row counts (column X) returned by the SQL query.
df

Unnamed: 0,gene,X
0,gltP/yjcO,117
1,amiB/mutL,37
2,murC|ddlB,27
3,trkA,27
4,yhiI,23
5,htrE,20
6,fdrA,18
7,brnQ,17
8,ffh,17
9,pncA/ydjE,17


Most of the genes where a mutation was sequenced many times are cases where the mutation occurred in the ancestor. I need to filter those out before aggregating my counts. Doing this directly in SQL may be harder than using the ORM, because any mutation that occurred in an ancestor must also be excluded from every descendant sample that carries the same mutation.

In [5]:
# Keep only the SNP mutations whose `samples` mention none of the ancestor samples.
ancestor_sample_names = ('Aggregate_NS001_Ancestors',
                         'Ancestor_S1',
                         'Ancestor_S2',
                         'Ancestor_S3')
nonancestral_mutations = []
for snp in session.query(sqd.SNP_Mutation):
    if not any(name in snp.samples for name in ancestor_sample_names):
        nonancestral_mutations.append(snp)

In [6]:
# Number of SNP mutations that passed the ancestor filter.
len(nonancestral_mutations)

874

In [7]:
# Peek at the first record to see what a SNP_Mutation looks like.
nonancestral_mutations[0]

<SNP_Mutation(chr_position=105395, ref_base=C, new_base=T, gene=ftsZ)>

In [8]:
# Tally the number of distinct non-ancestral mutations per gene label.
count_dict = {}
for snp in nonancestral_mutations:
    count_dict[snp.gene] = count_dict.get(snp.gene, 0) + 1

In [9]:
# Number of distinct gene labels carrying at least one non-ancestral mutation.
len(count_dict)

739

In [10]:
# Per-gene mutation counts, most-mutated genes first.
pd.Series(count_dict).sort_values(ascending=False)

sufB         8
gabT         7
rne          6
ftsK         6
yieF/adeP    6
yfjF         5
rhsD         5
ptsP         4
yneO         4
ynjC         4
cas3         3
yeeJ         3
mzrA/yqjC    3
glgX         3
flu          3
rodZ/rlmN    3
pheT         3
ydbA         3
rpoB         3
eco/mqo      3
etk          3
yahH         3
sufC         3
yjjV/yjjW    3
lpxD         2
glpB         2
parC         2
murF         2
yniB         2
phnG         2
            ..
rapA         1
proP/pmrR    1
metH         1
yqhD         1
mscS         1
serA         1
xanQ         1
katE         1
ydjZ         1
creD         1
gspG         1
lptF         1
tamA         1
aspA         1
yshB/glnG    1
yidL         1
malS         1
bcsA         1
tusB         1
gltB         1
araG         1
kduI         1
argA         1
cysN         1
dgcN         1
truA         1
nuoG         1
gatD         1
yeeJ/yeeL    1
ftsZ         1
Length: 739, dtype: int64

Some of these genes occur a lot more than twice. These are distinct mutations with different sequence changes. I also need to investigate how many times each of these mutations was found while sequencing but avoid double counting mutations found at both time t1 and time t2.

But roughly speaking, the chance that a given gene is found mutated X times follows a Poisson distribution (not exactly true, but the chance of detecting any particular mutation by sequencing is low and we detect many mutations, so it is probably close enough). There are ~4,000 genes in *E. coli*, so since we found 739 genes with mutations, we'll take the mean number of times a gene is found mutated to be ~0.2.

Then the probability of finding exactly 6 mutations in a given gene would be:

In [11]:
# Poisson pmf: P(X = k) = lam**k * exp(-lam) / k!
# math.factorial replaces np.math.factorial: `np.math` was an undocumented
# alias of the stdlib module, deprecated in NumPy 1.25 and removed in 2.0.
mean_mutations_per_gene = .2   # assumed mean number of mutations per gene
n_distinct_mutations = 6       # k: number of distinct mutations in one gene
prob_n_mutations = (mean_mutations_per_gene**n_distinct_mutations
                    * np.exp(-mean_mutations_per_gene)
                    / math.factorial(n_distinct_mutations))
prob_n_mutations

7.277606694026507e-08

Unless I'm thinking this through wrong, I shouldn't have found any genes with 6 or more distinct mutations. At the very least, it should have been highly unlikely: across ~4,000 genes the expected number of genes with 6 mutations is about 4,000 × 7.3e-8 ≈ 3e-4, so the odds against finding a gene with so many mutations even once are on the order of a few thousand to one.

Of course, this neglects that some genes may sequence poorly. This probably happens and could undermine the estimate. So the thing to check is the frequency of these detected mutations. If mutations are high frequency and high coverage, then the gene probably sequences OK, provided we didn't detect that same mutation over and over. There is probably also some sort of Bonferroni-type correction that needs to be applied.

Let's see how well this count holds up if we ignore synonymous mutations.

In [12]:
# Same per-gene tally as before, but skipping mutations flagged as synonymous.
nonsyn_count_dict = {}
for snp in nonancestral_mutations:
    if snp.synonymous:
        continue
    nonsyn_count_dict[snp.gene] = nonsyn_count_dict.get(snp.gene, 0) + 1

In [13]:
# Per-gene nonsynonymous mutation counts, most-mutated genes first.
pd.Series(nonsyn_count_dict).sort_values(ascending=False)

sufB    8
rne     6
rhsD    5
gabT    5
ptsP    4
ynjC    4
etk     3
sufC    3
yahH    3
glgX    3
yeeJ    3
rpoB    3
yfjF    3
ftsK    3
pykF    2
rhsC    2
sufS    2
phnG    2
mhpF    2
menD    2
ompT    2
yjjB    2
dtpD    2
dnaE    2
murF    2
psuG    2
rcsC    2
parC    2
ynfF    2
cobS    2
       ..
nrdG    1
gluQ    1
sbcC    1
narW    1
yciW    1
dksA    1
ydjZ    1
serA    1
mscS    1
yqhD    1
rapA    1
yadC    1
gcl     1
bioF    1
minC    1
sad     1
ydfR    1
nuoG    1
lptF    1
truA    1
tmcA    1
dgcN    1
cysN    1
gltB    1
tusB    1
malS    1
yidL    1
aspA    1
tamA    1
ftsZ    1
Length: 448, dtype: int64

## Investigating the sequencing evidence for the 14 genes with the most nonsynonymous mutations

In [14]:
# For every gene with a nonsynonymous mutation, collect the SNP_Evidence rows
# joined to that gene's mutations, ordered by chromosome position then sample.
# NOTE(review): the filter is on gene only, so evidence for synonymous or
# ancestral mutations in the same gene is presumably included too -- confirm.
mutation_ev_by_gene = {
    gene: list(session.query(sqd.SNP_Evidence)
                      .join(sqd.SNP_Mutation)
                      .filter(sqd.SNP_Mutation.gene == gene)
                      .order_by(sqd.SNP_Evidence.chr_position,
                                sqd.SNP_Evidence.sample))
    for gene in nonsyn_count_dict
}

In [15]:
# Print each evidence row recorded for sufB together with its frequency.
for evidence in mutation_ev_by_gene['sufB']:
    print(evidence, evidence.frequency)

<SNP_Evidence(sample=Hi4t1_S1, chr_position=1761840, ref_base=C, new_base=T)> 0.872701645
<SNP_Evidence(sample=Hi4t2_S1, chr_position=1761840, ref_base=C, new_base=T)> 0.18745327
<SNP_Evidence(sample=Hi3t1_S1, chr_position=1762024, ref_base=G, new_base=A)> 0.0707144737
<SNP_Evidence(sample=Hi1t1_S1, chr_position=1762267, ref_base=A, new_base=G)> 0.407216549
<SNP_Evidence(sample=Hi1t2_S1, chr_position=1762291, ref_base=A, new_base=G)> 0.193992615
<SNP_Evidence(sample=HiMid2t2_S1, chr_position=1762299, ref_base=C, new_base=T)> 0.357156754
<SNP_Evidence(sample=Hi3t1_S1, chr_position=1762315, ref_base=A, new_base=G)> 0.337581158
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1762497, ref_base=C, new_base=T)> 0.945244312
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=1762497, ref_base=C, new_base=T)> 0.837644577
<SNP_Evidence(sample=HiMid4t1_S1, chr_position=1762620, ref_base=T, new_base=C)> 0.104299068


In [16]:
# Print each evidence row recorded for rne together with its frequency.
for evidence in mutation_ev_by_gene['rne']:
    print(evidence, evidence.frequency)

<SNP_Evidence(sample=Mid4t1_S1, chr_position=1140822, ref_base=C, new_base=T)> 0.145970345
<SNP_Evidence(sample=Hi1t2_S1, chr_position=1142094, ref_base=A, new_base=G)> 0.113417149
<SNP_Evidence(sample=Mid2t1_S1, chr_position=1142379, ref_base=G, new_base=A)> 0.0553503036
<SNP_Evidence(sample=Hi2t1_S1, chr_position=1142477, ref_base=G, new_base=A)> 0.913094044
<SNP_Evidence(sample=Hi2t2_S1, chr_position=1142477, ref_base=G, new_base=A)> 0.554895401
<SNP_Evidence(sample=Hi3t1_S1, chr_position=1142816, ref_base=T, new_base=C)> 0.263482571
<SNP_Evidence(sample=Hi3t2_S1, chr_position=1142816, ref_base=T, new_base=C)> 0.808775902
<SNP_Evidence(sample=Hi1t2_S1, chr_position=1143263, ref_base=C, new_base=T)> 0.114051342


In [17]:
# Print each evidence row recorded for rhsD together with its frequency.
for evidence in mutation_ev_by_gene['rhsD']:
    print(evidence, evidence.frequency)

<SNP_Evidence(sample=Mid1t2_S1, chr_position=522758, ref_base=T, new_base=C)> 0.059360981
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=522818, ref_base=T, new_base=C)> 0.0547037125
<SNP_Evidence(sample=Lo4t1_S1, chr_position=523100, ref_base=A, new_base=C)> 0.0583357811
<SNP_Evidence(sample=Lo2t1_S1, chr_position=524183, ref_base=T, new_base=C)> 0.0863161087
<SNP_Evidence(sample=Lo2t2_S1, chr_position=524183, ref_base=T, new_base=C)> 0.843327999
<SNP_Evidence(sample=Lo4t1_S1, chr_position=524183, ref_base=T, new_base=C)> 0.177276611
<SNP_Evidence(sample=Lo4t2_S1, chr_position=524183, ref_base=T, new_base=C)> 0.106831551
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=524183, ref_base=T, new_base=C)> 0.31117487
<SNP_Evidence(sample=Mid2t1_S1, chr_position=524183, ref_base=T, new_base=C)> 0.141783714
<SNP_Evidence(sample=Mid2t2_S1, chr_position=524183, ref_base=T, new_base=C)> 0.147058487
<SNP_Evidence(sample=LoMid4t1_S1, chr_position=524492, ref_base=A, new_base=G)> 0.275894642
<SNP_Ev

In [18]:
# Print each gabT evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['gabT']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793107, ref_base=C, new_base=T)> 0.0549821854 2/2 34/39
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793110, ref_base=C, new_base=G)> 0.0548300743 2/2 34/39
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793111, ref_base=A, new_base=T)> 0.0547680855 2/2 34/39
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793112, ref_base=A, new_base=T)> 0.0548152924 2/2 34/39
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793113, ref_base=A, new_base=T)> 0.054792881 2/2 34/39
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793114, ref_base=C, new_base=G)> 0.0540738106 2/2 35/39
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2793117, ref_base=A, new_base=G)> 0.0555243492 2/2 34/38


All the gabT mutations come from a single sample, are very close together, and are low coverage. This could just be PCR or sequencing on a segment of genomic DNA gone horribly wrong once. It's probably best to ignore the gabT mutations then.

In [19]:
# Print each evidence row recorded for ptsP together with its frequency.
for evidence in mutation_ev_by_gene['ptsP']:
    print(evidence, evidence.frequency)

<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2966385, ref_base=C, new_base=T)> 0.130889416
<SNP_Evidence(sample=Hi1t2_S1, chr_position=2967681, ref_base=T, new_base=C)> 0.25425005
<SNP_Evidence(sample=Hi3t2_S1, chr_position=2967940, ref_base=G, new_base=A)> 0.339678288
<SNP_Evidence(sample=Hi3t2_S1, chr_position=2968045, ref_base=G, new_base=A)> 0.0741372108


In [20]:
# Print each ynjC evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['ynjC']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837451, ref_base=G, new_base=T)> 0.0506176949 2/2 40/38
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837452, ref_base=G, new_base=C)> 0.0584836006 3/2 40/38
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837453, ref_base=G, new_base=C)> 0.0514974594 2/2 40/38
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837454, ref_base=G, new_base=A)> 0.0513763428 2/2 40/38


All the ynjC mutations occur in a single sample, close together (in fact in a row), and at low coverage. Probably best to ignore.

In [21]:
# Print each etk evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['etk']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1041585, ref_base=A, new_base=G)> 0.0505862236 2/2 40/39
<SNP_Evidence(sample=Hi3t2_S1, chr_position=1042617, ref_base=T, new_base=C)> 0.238503456 8/9 38/35
<SNP_Evidence(sample=LoMid1t1_S1, chr_position=1042617, ref_base=T, new_base=C)> 0.263982296 10/7 33/32
<SNP_Evidence(sample=Mid1t1_S1, chr_position=1042617, ref_base=T, new_base=C)> 0.654597282 16/18 26/26
<SNP_Evidence(sample=Mid1t2_S1, chr_position=1042617, ref_base=T, new_base=C)> 1.0 32/30 32/30
<SNP_Evidence(sample=Mid2t1_S1, chr_position=1042617, ref_base=T, new_base=C)> 0.191437244 5/4 23/24
<SNP_Evidence(sample=Mid2t2_S1, chr_position=1042617, ref_base=T, new_base=C)> 0.11360836 4/6 44/44
<SNP_Evidence(sample=Mid1t2_S1, chr_position=1043334, ref_base=A, new_base=T)> 0.059237957 2/2 32/30


In [22]:
# Print each sufC evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['sufC']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi4t2_S1, chr_position=1761133, ref_base=T, new_base=C)> 0.151769638 2/3 17/16
<SNP_Evidence(sample=HiMid1t2_S1, chr_position=1761219, ref_base=T, new_base=C)> 0.0727200508 2/2 27/28
<SNP_Evidence(sample=Lo1t1_S1, chr_position=1761225, ref_base=T, new_base=C)> 0.122981071 4/4 33/32
<SNP_Evidence(sample=Lo1t2_S1, chr_position=1761225, ref_base=T, new_base=C)> 0.112802982 4/3 32/30


In [23]:
# Print each yahH evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['yahH']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid2t2_S1, chr_position=339066, ref_base=T, new_base=C)> 0.111017227 2/2 16/20
<SNP_Evidence(sample=HiMid2t2_S1, chr_position=339072, ref_base=T, new_base=C)> 0.114326 2/2 16/19
<SNP_Evidence(sample=HiMid2t2_S1, chr_position=339089, ref_base=A, new_base=G)> 0.114393234 2/2 16/19


All yahH mutations occur in a single sample, close together (although not super close, ~10 bases apart) and at low coverage likely from the same 2 reads.

In [24]:
# Print each glgX evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['glgX']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid1t2_S1, chr_position=3569502, ref_base=G, new_base=A)> 0.0570073128 2/2 36/34
<SNP_Evidence(sample=Mid2t1_S1, chr_position=3569852, ref_base=A, new_base=G)> 0.0625596046 2/2 31/32
<SNP_Evidence(sample=Lo4t1_S1, chr_position=3570116, ref_base=C, new_base=T)> 0.0657372475 2/2 28/33


In [25]:
# Print each yeeJ evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['yeeJ']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid2t2_S1, chr_position=2046004, ref_base=G, new_base=T)> 0.065507412 2/2 26/26
<SNP_Evidence(sample=HiMid3t2_S1, chr_position=2050483, ref_base=G, new_base=A)> 0.0539302826 2/2 36/38
<SNP_Evidence(sample=LoMid2t1_S1, chr_position=2051027, ref_base=A, new_base=T)> 0.0853638649 2/3 24/33


In [26]:
# Print each rpoB evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['rpoB']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid2t2_S1, chr_position=4182689, ref_base=G, new_base=A)> 0.400649071 5/5 12/13
<SNP_Evidence(sample=LoMid3t1_S1, chr_position=4182888, ref_base=A, new_base=G)> 0.866933823 43/43 52/49
<SNP_Evidence(sample=LoMid3t2_S1, chr_position=4182888, ref_base=A, new_base=G)> 1.0 48/37 50/37
<SNP_Evidence(sample=LoMid1t1_S1, chr_position=4182957, ref_base=T, new_base=A)> 0.408305168 26/23 61/60
<SNP_Evidence(sample=LoMid1t2_S1, chr_position=4182957, ref_base=T, new_base=A)> 0.895558834 48/44 54/49


In [27]:
# Print each yfjF evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['yfjF']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=LoMid1t2_S1, chr_position=2754230, ref_base=T, new_base=A)> 0.077067852 2/2 28/24
<SNP_Evidence(sample=LoMid1t2_S1, chr_position=2754231, ref_base=C, new_base=G)> 0.096098423 2/3 27/24
<SNP_Evidence(sample=LoMid1t2_S1, chr_position=2754232, ref_base=A, new_base=T)> 0.0786390305 2/2 27/24
<SNP_Evidence(sample=LoMid1t2_S1, chr_position=2754233, ref_base=C, new_base=G)> 0.0785579681 2/2 27/24
<SNP_Evidence(sample=LoMid1t2_S1, chr_position=2754234, ref_base=T, new_base=A)> 0.0785822868 2/2 27/24


All yfjF mutations occur in a single sample, adjacent to each other, and at low coverage, likely from the same 2 reads.

In [28]:
# Print each ftsK evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['ftsK']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi3t1_S1, chr_position=933928, ref_base=A, new_base=G)> 0.0600500107 2/2 36/31
<SNP_Evidence(sample=Hi3t2_S1, chr_position=933928, ref_base=A, new_base=G)> 0.408807278 11/11 26/28
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=934688, ref_base=A, new_base=G)> 0.0585708618 2/2 36/32
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=934691, ref_base=G, new_base=A)> 0.0622596741 2/2 35/31
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=934721, ref_base=A, new_base=G)> 0.0611114502 2/2 35/31
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=934725, ref_base=C, new_base=G)> 0.0665922165 2/3 35/31
<SNP_Evidence(sample=LoMid2t1_S1, chr_position=936210, ref_base=G, new_base=C)> 0.0628123283 3/2 36/34


Of the 14 genes with the most distinct mutations, 4 of them are suspicious in the sense that all the mutations come from one sample at low coverage for the new mutation. It's possible that these are correlated mutations, but it's also possible these are just PCR or sequencing going badly for a bit (PCR once maybe? or sequencing calls twice).

The mutations in sufB and sufC are related to iron-sulfur transport and seem promising for a story. I can also see that sufS got hit twice in the list, so let's see how many 'suf' genes had nonsynonymous mutations.

In [29]:
# List every gene label with a nonsynonymous mutation that contains 'suf'.
suf_genes = [gene for gene in nonsyn_count_dict if 'suf' in gene]
for gene in suf_genes:
    print(gene)

sufB
sufE
sufS
sufD
sufC


In [30]:
# Print each sufS evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['sufS']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Lo4t1_S1, chr_position=1759215, ref_base=C, new_base=T)> 0.0832509995 2/2 25/23
<SNP_Evidence(sample=LoMid2t1_S1, chr_position=1759215, ref_base=C, new_base=T)> 0.637475014 17/23 28/35
<SNP_Evidence(sample=LoMid2t2_S1, chr_position=1759215, ref_base=C, new_base=T)> 1.0 14/10 14/13
<SNP_Evidence(sample=Hi3t1_S1, chr_position=1759381, ref_base=G, new_base=A)> 0.0653042793 2/2 29/32


In [31]:
# Print each sufE evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['sufE']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi2t1_S1, chr_position=1758364, ref_base=A, new_base=G)> 0.829994202 21/28 26/33
<SNP_Evidence(sample=Hi2t2_S1, chr_position=1758364, ref_base=A, new_base=G)> 0.624610901 13/13 22/21


In [32]:
# Print each sufD evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['sufD']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid2t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0799436569 2/3 32/32
<SNP_Evidence(sample=Lo2t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0587720871 2/2 32/36
<SNP_Evidence(sample=Lo2t2_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.800232887 32/32 39/42
<SNP_Evidence(sample=Lo4t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0683875084 3/2 37/36
<SNP_Evidence(sample=Lo4t2_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.150085926 7/9 52/53
<SNP_Evidence(sample=LoMid4t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.0925116539 3/3 33/32
<SNP_Evidence(sample=LoMid4t2_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.258062363 12/13 49/48
<SNP_Evidence(sample=Mid2t1_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.106333256 3/3 30/27
<SNP_Evidence(sample=Mid2t2_S1, chr_position=1760934, ref_base=G, new_base=A)> 0.114959717 4/4 33/37


There are some high frequency mutations here too.

## Ribosomal proteins and polymerases

The rpoB mutation is interesting because it's detectable, and has been found in cultures lingering in stationary phase before. Just spitballing, but how many genes start with 'rp' and have nonsynonymous mutations?

In [33]:
# List every gene label with a nonsynonymous mutation that starts with 'rp'.
for gene in nonsyn_count_dict:
    if gene.startswith('rp'):
        print(gene)

rplO
rplB
rplS
rpoB
rplQ
rplU


In [34]:
# Print each rplO evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['rplO']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi4t1_S1, chr_position=3444453, ref_base=T, new_base=C)> 0.0504918098 2/2 37/42


In [35]:
# Print each rplB evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['rplB']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid2t1_S1, chr_position=3451013, ref_base=T, new_base=C)> 0.164884567 7/7 44/39


In [36]:
# Print each rplS evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['rplS']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid4t1_S1, chr_position=2744372, ref_base=G, new_base=A)> 0.106283188 4/4 37/35
<SNP_Evidence(sample=HiMid4t2_S1, chr_position=2744372, ref_base=G, new_base=A)> 0.103843689 4/4 39/38


In [37]:
# Print each rplQ evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['rplQ']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi2t2_S1, chr_position=3439964, ref_base=G, new_base=A)> 0.0534105301 2/2 33/44


In [38]:
# Print each rplU evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['rplU']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Mid1t2_S1, chr_position=3333447, ref_base=T, new_base=C)> 0.0512113571 2/2 35/43


These are all low frequency and there are a lot of rp* genes so this may mean nothing. I need to control for the total fraction of the genome covered by a set of genes.

# Other genes in pathways associated with the sufA operon (or at least with sufS, which is part of the sufA operon)

The associated genes/operons were found using the EcoCyc database by manually looking for genes associated with the metabolic pathways sufS is part of (the other suf genes have no known pathways listed).

In [39]:
# Genes starting with 'thi' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('thi'):
        print(gene, n_nonsyn)

In [40]:
# Genes starting with 'isc' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('isc'):
        print(gene, n_nonsyn)

In [41]:
# Genes starting with 'hsc' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('hsc'):
        print(gene, n_nonsyn)

In [42]:
# Genes starting with 'moe' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('moe'):
        print(gene, n_nonsyn)

moeB 1


In [43]:
# Print each moeB evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['moeB']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Mid1t1_S1, chr_position=863618, ref_base=C, new_base=T)> 0.739523888 18/19 25/25
<SNP_Evidence(sample=Mid1t2_S1, chr_position=863618, ref_base=C, new_base=T)> 1.0 31/37 31/37


In [44]:
# Genes starting with 'moa' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('moa'):
        print(gene, n_nonsyn)

In [45]:
# Genes starting with 'ynj' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('ynj'):
        print(gene, n_nonsyn)

ynjC 4


In [46]:
# Print each ynjC evidence row with its frequency, new_cov and total_cov values.
# NOTE(review): this repeats the ynjC listing printed earlier in the notebook.
for evidence in mutation_ev_by_gene['ynjC']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837451, ref_base=G, new_base=T)> 0.0506176949 2/2 40/38
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837452, ref_base=G, new_base=C)> 0.0584836006 3/2 40/38
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837453, ref_base=G, new_base=C)> 0.0514974594 2/2 40/38
<SNP_Evidence(sample=HiMid3t1_S1, chr_position=1837454, ref_base=G, new_base=A)> 0.0513763428 2/2 40/38


In [47]:
# Genes starting with 'yfh' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('yfh'):
        print(gene, n_nonsyn)

yfhH 1


In [48]:
# Print each yfhH evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['yfhH']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi3t2_S1, chr_position=2698580, ref_base=T, new_base=A)> 0.0722632408 2/2 28/27


In [49]:
# Genes starting with 'yhe' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('yhe'):
        print(gene, n_nonsyn)

In [50]:
# Genes starting with 'tus' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('tus'):
        print(gene, n_nonsyn)

tusB 1


In [51]:
# Print each tusB evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['tusB']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi1t2_S1, chr_position=3474803, ref_base=A, new_base=G)> 0.0510034561 3/2 46/48


In [52]:
# Genes starting with 'mnm' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('mnm'):
        print(gene, n_nonsyn)

mnmE 1


In [53]:
# Print each mnmE evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['mnmE']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Hi2t1_S1, chr_position=3888161, ref_base=A, new_base=G)> 0.0675330162 2/2 26/33


In [54]:
# Genes starting with 'ybb' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('ybb'):
        print(gene, n_nonsyn)

In [55]:
# Genes starting with 'yaj' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('yaj'):
        print(gene, n_nonsyn)

In [56]:
# Genes starting with 'isp' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('isp'):
        print(gene, n_nonsyn)

In [57]:
# Genes starting with 'dxs' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('dxs'):
        print(gene, n_nonsyn)

dxs 1


In [58]:
# Print each dxs evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['dxs']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=Lo4t1_S1, chr_position=438688, ref_base=A, new_base=T)> 0.0625696182 2/2 29/29


In [59]:
# Genes starting with 'nuv' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('nuv'):
        print(gene, n_nonsyn)

In [60]:
# Genes starting with 'nif' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('nif'):
        print(gene, n_nonsyn)

In [61]:
# Genes starting with 'chl' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('chl'):
        print(gene, n_nonsyn)

In [62]:
# Genes starting with 'bis' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('bis'):
        print(gene, n_nonsyn)

In [63]:
# Genes starting with 'mog' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('mog'):
        print(gene, n_nonsyn)

In [64]:
# Genes starting with 'yhh' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('yhh'):
        print(gene, n_nonsyn)

yhhS 1


In [65]:
# Print each yhhS evidence row with its frequency, new_cov and total_cov values.
for evidence in mutation_ev_by_gene['yhhS']:
    print(evidence, evidence.frequency, evidence.new_cov, evidence.total_cov)

<SNP_Evidence(sample=HiMid1t2_S1, chr_position=3611575, ref_base=A, new_base=G)> 0.0633859634 2/2 27/36


In [66]:
# Genes starting with 'ybb' and their nonsynonymous mutation counts.
# NOTE(review): the same 'ybb' prefix was already checked earlier in the notebook.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('ybb'):
        print(gene, n_nonsyn)

In [67]:
# Genes starting with 'sir' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('sir'):
        print(gene, n_nonsyn)

In [68]:
# Genes starting with 'trm' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('trm'):
        print(gene, n_nonsyn)

In [69]:
# Genes starting with 'thd' and their nonsynonymous mutation counts.
for gene, n_nonsyn in nonsyn_count_dict.items():
    if gene.startswith('thd'):
        print(gene, n_nonsyn)