In [1]:
import pandas as pd
from Bio import SeqIO
import glob
from itertools import combinations, product
from collections import defaultdict
import numpy as np

def count_divs(s1, s2):
    """Return the proportion of positions at which two aligned sequences differ.

    The sequences are compared position by position with plain ``!=``, so the
    comparison is case-sensitive and every character (including gap or
    ambiguity symbols, if present) is treated like any other symbol.

    Parameters
    ----------
    s1, s2 : sequences of equal, non-zero length (e.g. ``str``)
        The two aligned sequences to compare.

    Returns
    -------
    float
        Number of differing positions divided by the sequence length.

    Raises
    ------
    ValueError
        If the sequences differ in length (positions would no longer
        correspond, and ``zip`` would silently drop the tail of the longer
        one), or if they are empty (the proportion is undefined).
    """
    length = len(s1)
    if length != len(s2):
        raise ValueError(
            'sequences must have the same length, got %d and %d'
            % (length, len(s2))
        )
    if length == 0:
        raise ValueError('cannot compute divergence of empty sequences')

    mismatches = sum(1 for b1, b2 in zip(s1, s2) if b1 != b2)
    return mismatches * 1.0 / length

# Table of strains with their population label; the 'strain' column of the
# rows carrying each label becomes one population (labels '0', '0-1', '0-2').
df = pd.read_table('pop_groups.txt')
pop0 = df.loc[df['pop'] == '0', 'strain']
pop1 = df.loc[df['pop'] == '0-1', 'strain']
pop2 = df.loc[df['pop'] == '0-2', 'strain']

# Build one concatenated sequence per record id: every FASTA file matching
# '*.fasta' in the working directory is parsed, and each record's sequence is
# appended to whatever has already been collected under that id.
seqs = defaultdict(str)
for fasta_path in glob.glob('*.fasta'):
    for record in SeqIO.parse(fasta_path, 'fasta'):
        seqs[record.id] = seqs[record.id] + str(record.seq)

In [2]:
# Nucleotide diversity for population I: average pairwise divergence over
# every unordered pair of pop0 strains. Pairs where either strain has no
# entry in `seqs` are skipped.
pop0aves = [
    count_divs(seqs[first], seqs[second])
    for first, second in combinations(pop0, 2)
    if first in seqs and second in seqs
]
print(np.average(pop0aves))

0.005211226007986305


In [3]:
# Nucleotide diversity for population II: average pairwise divergence over
# every unordered pair of pop1 strains. Pairs where either strain has no
# entry in `seqs` are skipped.
pop1aves = [
    count_divs(seqs[first], seqs[second])
    for first, second in combinations(pop1, 2)
    if first in seqs and second in seqs
]
print(np.average(pop1aves))

0.013827213986866016


In [4]:
# Nucleotide diversity for population III: average pairwise divergence over
# every unordered pair of pop2 strains. Pairs where either strain has no
# entry in `seqs` are skipped.
pop2aves = [
    count_divs(seqs[first], seqs[second])
    for first, second in combinations(pop2, 2)
    if first in seqs and second in seqs
]
print(np.average(pop2aves))

0.02343396299783631


In [8]:
# Nucleotide diversity between populations I and II: average divergence over
# every (pop0 strain, pop1 strain) pair for which both strains have an entry
# in `seqs`.
# The result is kept under its own name so that it does not overwrite
# `pop2aves`, which holds the within-population III values.
pop0_pop1_aves = []
for strain1, strain2 in product(pop0, pop1):
    # Membership is tested explicitly because `seqs` is a defaultdict:
    # indexing a missing strain would silently insert an empty sequence.
    if strain1 in seqs and strain2 in seqs:
        pop0_pop1_aves.append(count_divs(seqs[strain1], seqs[strain2]))
print(np.average(pop0_pop1_aves))

0.018270053640144713


In [2]:
# Nucleotide diversity between all different populations: divergences of all
# cross-population strain pairs are pooled into one list and averaged.
# The population pairs are visited in the order (II, III), (II, I), (III, I).
aves = []
for left_pop, right_pop in ((pop1, pop2), (pop1, pop0), (pop2, pop0)):
    for strain1, strain2 in product(left_pop, right_pop):
        # Skip pairs where either strain has no entry in `seqs`.
        if not (strain1 in seqs and strain2 in seqs):
            continue
        aves.append(count_divs(seqs[strain1], seqs[strain2]))
print(np.average(aves))

0.019124413595420418
