In [1]:
from Bio import SeqIO
import numpy as np
import pandas as pd
import glob
from collections import defaultdict
from itertools import combinations, product

# Settings recognised in phybreak_parameters.txt. Each one keeps its
# empty-string default when the file does not set it.
_params = {
    "project_dir": "",
    "contig_extension": "",
    "strain_list_file": "",
    "output_prefix": "",
    "pop_infile_name": "",
}

# Each useful line has the form "<name> = <value> #optional comment".
with open("phybreak_parameters.txt", "r") as parameter_file:
    for raw_line in parameter_file:
        fields = raw_line.strip().split(" = ")
        if len(fields) > 1 and fields[0] in _params:
            # Drop a trailing " #comment" and any stray whitespace around the value.
            _params[fields[0]] = fields[1].split(" #")[0].strip()

project_dir = _params["project_dir"]
contig_extension = _params["contig_extension"]
strain_list_file = _params["strain_list_file"]
output_prefix = _params["output_prefix"]
pop_infile_name = _params["pop_infile_name"]

# NOTE(review): input_dir was referenced below but never defined anywhere in
# this notebook, so a fresh kernel failed with NameError. It is presumably the
# alignment directory under project_dir -- TODO confirm this path.
input_dir = project_dir + "align/"

phy_split_dir = input_dir + "phy_split/"
phy_prefix = output_prefix + ".core"

# Define some functions
def count_divs(s1, s2):
    '''
    Fraction of positions at which two sequences differ.

    Characters are paired with zip, so only the first
    min(len(s1), len(s2)) positions are compared. The number of
    mismatches is then divided by len(s1) and returned as a float.
    Raises ZeroDivisionError when s1 is empty.
    '''
    mismatches = sum(1 for left, right in zip(s1, s2) if left != right)
    return mismatches * 1.0 / len(s1)

def calc_pop_div(pop, s_dict):
    '''
    Given a list of strains and a dictionary of sequences, calculates pi,
    the average nucleotide diversity between pairs of strains.

    pop    -- list of strain names making up the population of interest
    s_dict -- dictionary mapping strain/genome names to sequences

    Pairs in which either strain has no entry in s_dict are skipped.
    Returns the mean of count_divs over the remaining pairs, or NaN when
    no pair can be compared (e.g. fewer than two strains are present).
    '''
    pair_divs = [
        count_divs(s_dict[strain1], s_dict[strain2])
        for strain1, strain2 in combinations(pop, 2)
        if strain1 in s_dict and strain2 in s_dict
    ]
    if not pair_divs:
        # np.average([]) also yields NaN but emits RuntimeWarnings on the way;
        # return the same value explicitly and quietly.
        return np.nan
    return np.average(pair_divs)

def calc_inter_div(pop_1, pop_2, s_dict):
    '''
    Given two collections of strains representing two populations
    and a dictionary of sequences, calculate pi between the
    two populations.

    pop_1, pop_2 -- iterables of strain names
    s_dict       -- dictionary mapping strain/genome names to sequences

    Every (strain from pop_1, strain from pop_2) pair is considered; pairs
    in which either strain has no entry in s_dict are skipped. Returns the
    mean of count_divs over the remaining pairs, or NaN when no pair can
    be compared.
    '''
    pair_divs = [
        count_divs(s_dict[strain1], s_dict[strain2])
        for strain1, strain2 in product(pop_1, pop_2)
        if strain1 in s_dict and strain2 in s_dict
    ]
    if not pair_divs:
        # np.average([]) also yields NaN but emits RuntimeWarnings on the way;
        # return the same value explicitly and quietly.
        return np.nan
    return np.average(pair_divs)


# inputs

# Table linking each tree to its position in the alignment
info_file = "".join((input_dir, output_prefix, '.core.phyml_tree_info.txt'))

# Population (cluster) assignment table, given as a hard-coded relative path.
# NOTE(review): pop_infile_name is parsed from the parameter file earlier in
# this cell but is not used here -- confirm the hard-coded path is intended.
pop_file = '../../Data/Clusters/rumino_0.000355362.txt.cluster.tab.txt'

# One alignment file per tree, collected by glob (order is whatever glob returns)
individual_alignments = "".join((phy_split_dir, "*window.phy"))
tree_aligns = glob.glob(individual_alignments)

In [None]:

def _pop_label(pop):
    '''Column label for a population: "Pop" + the text after the last "." of its ID.'''
    return 'Pop' + pop.split('.')[-1]


def _read_alignment(path):
    '''
    Read one alignment file into a {strain: sequence} dictionary.

    Lines that start with a space are skipped, as are lines with fewer than
    two whitespace-separated fields (e.g. blank lines). For every other line
    the key is the first field truncated at its first "_" and the value is
    the second field.
    '''
    seqs = {}
    with open(path) as handle:
        for line in handle:
            fields = line.split()
            if line.startswith(' ') or len(fields) < 2:
                continue
            seqs[fields[0].split('_')[0]] = fields[1]
    return seqs


info = pd.read_table(info_file, index_col='tree')
pops = pd.read_table(pop_file, index_col='Strain')

# Have to rename the strains so that they match the sequence files:
# keep the part before "--" and remove every "." and "_".
pop_assignments = {
    str(pop): [strain.split('--')[0].replace('.', '').replace('_', '')
               for strain in pop_df.index]
    for pop, pop_df in pops.groupby('Cluster_ID')
}

all_strains = [strain for strain_list in pop_assignments.values()
               for strain in strain_list]

# Strains outside each population. This does not depend on the tree, so it is
# computed once; sorting makes the comparison order reproducible.
other_strains = {pop: sorted(set(all_strains) - set(strains))
                 for pop, strains in pop_assignments.items()}

pop_pairs = list(combinations(pop_assignments, 2))

# Column layout: for each population its within-population diversity and its
# diversity against all other strains, then one column per population pair.
columns = []
for pop in pop_assignments:
    columns.append(_pop_label(pop))
    columns.append(_pop_label(pop) + ' v All')
for pop1, pop2 in pop_pairs:
    columns.append(_pop_label(pop1) + ' v ' + _pop_label(pop2))

# One row per alignment file, indexed 1..N.
pi_df = pd.DataFrame(index=range(1, len(tree_aligns) + 1), columns=columns)

for tree_align in tree_aligns:
    # Tree number = text between the last "_" and the first following ".phy".
    # NOTE(review): tree_aligns is globbed with a pattern ending in
    # "window.phy"; confirm the file names really yield an integer here.
    treenum = int(tree_align.split('_')[-1].split('.phy')[0])

    all_seqs = _read_alignment(tree_align)

    for pop, strains in pop_assignments.items():
        popID = _pop_label(pop)
        # Diversity within a population
        pi_df.loc[treenum, popID] = calc_pop_div(strains, all_seqs)
        # Diversity between focus population and all other strains
        pi_df.loc[treenum, popID + ' v All'] = calc_inter_div(
            strains, other_strains[pop], all_seqs)

    for pop1, pop2 in pop_pairs:
        # Diversity between each pair of populations
        inter_div = calc_inter_div(
            pop_assignments[pop1], pop_assignments[pop2], all_seqs)
        pi_df.loc[treenum, _pop_label(pop1) + ' v ' + _pop_label(pop2)] = inter_div

pi_df.to_csv('pi_per_block.csv')
outdf = pi_df.join(info)
outdf.index.name = 'tree'
outdf.to_csv('pi_join_info.csv')