In [1]:
import simuOpt
simuOpt.setOptions(quiet=True, optimized=True, numThreads=4)
import simuPOP as sim
import os, numpy as np, pandas as pd, collections as col
from saegus import analyze, simulate, parameters, breed, operators
from scipy import stats
import random
np.set_printoptions(suppress=True, precision=3)

In [2]:
tuson = sim.loadPopulation('tuson.pop')

In [3]:
artemis = analyze.Study('artemis')

In [4]:
sim.stat(tuson, numOfSegSites=sim.ALL_AVAIL, vars=['numOfSegSites', 'segSites', 'numOfFixedSites', 'fixedSites'])
parameters.randomly_convert_fixed_sites(tuson, tuson.dvars().fixedSites)
sim.stat(tuson, numOfSegSites=sim.ALL_AVAIL, vars=['numOfSegSites', 'segSites', 'numOfFixedSites', 'fixedSites'])

In [5]:
tuson.dvars().numOfFixedSites

0

In [6]:
sim.stat(tuson, alleleFreq=sim.ALL_AVAIL)

In [7]:
sim.stat(tuson, homoFreq=sim.ALL_AVAIL)

In [8]:
sim.stat(tuson, heteroFreq=sim.ALL_AVAIL)

In [9]:
alleles  = np.array([list(tuson.dvars().alleleFreq[locus].keys()) for locus in range(tuson.totNumLoci())], dtype=np.int8)

In [12]:
alleles.shape

(44445, 2)

## Alleles for Fixed Sites chosen at random.

In [11]:
np.savetxt('alleles_of_tuson_founders.txt', alleles, fmt='%d', delimiter='\t')

In [13]:
af = analyze.allele_data(tuson, alleles, range(tuson.totNumLoci()))

In [14]:
minor_alleles = np.array(af['minor_allele'])

In [15]:
minor_alleles

array([ 1.,  2.,  3., ...,  2.,  3.,  1.])

In [None]:
def expanded_allele_data(pop, allele_data_structure):
    sim.stat(pop, heteroFreq=sim.ALL_AVAIL)
    hetero_frqs = np.array(list(pop.dvars().heteroFreq.values()))
    hetero_column = pd.DataFrame(hetero_frqs, columns=['heterozygote_frequency'])
    return allele_data_structure.join(hetero_column)

In [None]:
eaf = expanded_allele_data(tuson, af)

In [None]:
eaf.to_csv('example_allele_frequency_table.txt', sep='\t')

In [None]:
eaf.to_csv('expanded_tuson_founder_allele_frqs.txt', sep='\t')

In [16]:
tuson.addInfoFields(['generation', 'replicate', 'g', 'p'])

In [None]:
tuson.save('working_tuson.pop')

In [None]:
tuson.asPedigree()

In [None]:
tuson.save("tuson_pedigree.txt", infoFields=['g', 'p'], loci=sim.ALL_AVAIL)

In [17]:
tuson.popSize()

105

In [18]:
tuson.asPopulation()

AttributeError: 'Population' object has no attribute 'asPopulation'

## The Tuson Genetic Map

In [19]:
def parse_recombination_rates(genetic_map_filename):
    """
    Returns a list of crossover probabilities from a genetic map measured in centimorgans.
    """
    genetic_map = pd.read_csv(genetic_map_filename, sep='\t', index_col=None)
    genetic_map.drop(['locus', 'agpv2', 'namZmPRDA', 'namZmPRDS'], axis=1, inplace=True)
    genetic_map = np.array(genetic_map)
    recombination_rates = col.OrderedDict()
    for i in range(1, len(genetic_map), 1):
        if genetic_map[i-1][0] == genetic_map[i][0]:
            recombination_rates[i] = np.divide(np.abs(genetic_map[i][1] - genetic_map[i-1][1]), 100)
        elif genetic_map[i-1][0] != genetic_map[i][0]:
            recombination_rates[i] = 0.0
    recombination_rates[len(genetic_map)] = 0.0
    return list(recombination_rates.values())


In [20]:
recom_rates = parse_recombination_rates('raw_genetic_map.txt')

### Using the parameters.PopulationStructure class

In [21]:
popst = parameters.PopulationStructure(tuson, 'population_structure_matrix.xlsx', 0.01, 1.0)

In [22]:
struct_mating_probs = popst.generate_population_structure()

In [23]:
def format_mating_pmfs(population_structure_dict):
    mating_pmfs = {}
    for ind, probabilities in population_structure_dict.items():
        for i, prob in enumerate(probabilities):
            values = []
            probabilites = []
            for i, prob in enumerate(struct_mating_probs[ind]):
                values.append(i)
                probabilites.append(prob)
            pmf_values = (values, probabilites)
            mating_pmfs[ind] = stats.rv_discrete(values=pmf_values)
    return mating_pmfs

In [24]:
formed_mating_pmfs = format_mating_pmfs(struct_mating_probs)

In [25]:
def assign_primary_subpopulation(pop, struct_mating_probabilities):
    primary_subpop = {}
    for ind_id, inheritance_proportions in struct_mating_probabilities.items():
        primary_subpop[ind_id] = float(np.argmax(inheritance_proportions))
    for ind in pop.individuals():
        ind.primary = primary_subpop[ind.ind_id]

In [26]:
assign_primary_subpopulation(tuson, struct_mating_probs)

In [27]:
tuson.dvars().mating_pmfs = formed_mating_pmfs

In [28]:
pop_struct_expansion = breed.ForcedPopulationStructureParentChooser(10000, formed_mating_pmfs)

In [29]:
primary_subpopulation_splitter = sim.InfoSplitter(field='primary',
                                                  values=[0.0, 1.0, 2.0, 3.0,
                                                          4.0, 5.0])
tuson.setVirtualSplitter(primary_subpopulation_splitter)


In [30]:
tuson.numVirtualSubPop()

6

In [None]:
columns=['rep', 'gen']

In [31]:
sim.tagID(tuson, reset=False)

In [32]:
multi_son = sim.Simulator(tuson, rep=5)

In [33]:
multi_son.evolve(
    matingScheme=sim.HomoMating(
        sim.PyParentsChooser(pop_struct_expansion.forced_structure_parent_chooser),
        sim.OffspringGenerator(ops=[sim.IdTagger(), sim.ParentsTagger(), sim.PedigreeTagger(),
                                   sim.Recombinator(recom_rates)], numOffspring=1),
            subPopSize=1000),
    gen=1
)

(1, 1, 1, 1, 1)

In [34]:
qtl = tuple(sorted(random.sample(range(44445), 30)))

In [35]:
qtl

(27,
 601,
 3153,
 3654,
 4569,
 5510,
 9744,
 12734,
 13180,
 13190,
 13251,
 14123,
 17073,
 18635,
 21380,
 22018,
 26310,
 27162,
 27387,
 27478,
 27923,
 29785,
 30515,
 32793,
 34578,
 34766,
 34910,
 35997,
 36467,
 41835)

In [36]:
additive_trait = parameters.Trait()

In [37]:
allele_effects = additive_trait.assign_allele_effects(alleles, qtl, random.expovariate, 1, multiplicity=1)

In [None]:
import importlib as imp
imp.reload(simulate)
imp.reload(operators)

In [38]:
sampling_generations = [i for i in range(2, 10, 2)]

In [39]:
sampling_generations

[2, 4, 6, 8]

In [40]:
sample_sizes = {i: 100 for i in range(11)}

In [41]:
meta_populations = {rep: [] for rep in range(5)}

In [42]:
trun = simulate.Truncation(10, 1, 1000, 0.05, 0.50, 5, 0.7, sample_sizes, 1)

In [None]:
def print_pop_sizes(multi_pop):
    for pop in multi_pop.populations():
        print(pop.popSize())

In [None]:
print_pop_sizes(multi_son)

In [None]:
sample_sizes

In [43]:
trun.replicate_selection(multi_son, meta_populations, qtl, allele_effects, recom_rates)

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Initial: Sampled 100 individuals from generation 0 Replicate: 1.
Initial: Sampled 100 individuals from generation 0 Replicate: 2.
Initial: Sampled 100 individuals from generation 0 Replicate: 3.
Initial: Sampled 100 individuals from generation 0 Replicate: 4.
Generation: 0
Generation: 0
Generation: 0
Generation: 0
Generation: 0
Generation: 1
Generation: 1
Generation: 1
Generation: 1
Generation: 1
Generation: 2
Generation: 2
Generation: 2
Generation: 2
Generation: 2
Generation: 3
Generation: 3
Generation: 3
Generation: 3
Generation: 3
Generation: 4
Generation: 4
Generation: 4
Generation: 4
Generation: 4
Generation: 5
Generation: 5
Generation: 5
Generation: 5
Generation: 5
Generation: 6
Generation: 6
Generation: 6
Generation: 6
Generation: 6
Generation: 7
Generation: 7
Generation: 7
Generation: 7
Generation: 7
Generation: 8
Generation: 8
Generation: 8
Generation: 8
Generation: 8
Generation: 9
Generation: 9
Generation: 9
Gen

In [44]:
meta_populations

{0: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 1: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 2: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 3: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 4: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>]}

In [None]:
sample_pop = meta_populations[0][-1]

In [None]:
sample_pop.sortIndividuals('p')

In [46]:
datar = []

In [47]:
for rep in range(5):
    for gen in range(6):
        sample_pop = meta_populations[rep][gen]
        g_and_p = np.array((sample_pop.indInfo('ind_id'),
                            sample_pop.indInfo('replicate'),
                            sample_pop.indInfo('generation'),
                            sample_pop.indInfo('g'), 
                            sample_pop.indInfo('p'))).T
        datar.append(g_and_p)
        generation_set = set(sample_pop.indInfo('generation'))
        generation = generation_set.pop()
        name = 'R'+str(rep)+'_'+'G'+str(generation) + '_tuson_selection.txt'
        g_and_p_out = pd.DataFrame(g_and_p, columns=['ind_id', 'replicate', 'generation', 'g', 'p'])
        g_and_p_out.to_csv(name, sep='\t', float_format='%.4f')

In [48]:
concatted = np.concatenate((datar[0], datar[1]), axis=0)

In [49]:
len(datar)

30

In [50]:
first_concatted = np.concatenate((datar[0], datar[1]), axis=0)
for i in range(2, 30):
    first_concatted = np.concatenate((first_concatted, datar[i]))

In [51]:
first_concatted.shape

(3000, 5)

In [52]:
selection_g_and_p = pd.DataFrame(first_concatted, columns=['ind_id', 'rep', 'generation', 'g', 'p'])

In [54]:
selection_g_and_p.to_csv('first_tuson_selection_g_and_p.txt', sep='\t', index=False)

In [None]:
val = set(sample_pop.indInfo('generation'))

In [None]:
val.pop()

In [None]:
g_and_p = np.array((sample_pop.indInfo('ind_id'),
                    sample_pop.indInfo('replicate'),
                    sample_pop.indInfo('generation'),
                    sample_pop.indInfo('g'), 
                    sample_pop.indInfo('p'))).T

In [None]:
example_output=pd.DataFrame(g_and_p, columns=['ind_id', 'rep', 'generation', 'g', 'p'])

In [None]:
example_output

In [None]:
selected_inds = pd.DataFrame(g_and_p, columns=['ind_id', 'replicate', 'generation', 'g', 'p'])

In [None]:
for rep in range(5):
    for i in range(6):
        sim.stat(meta_populations[rep][i], alleleFreq=sim.ALL_AVAIL)

In [None]:
meta_populations[0][0].dvars().alleleFreq

In [None]:
def insert_allele_frequencies_into_aggregate_matrix(number_of_reps,
                                                    meta_population,
                                                    minor_allele):
    """
    number_of_reps: integer specifying number of replicates to include
    inside of one allele frequency matrix.

    meta_population: Multiple replicate meta population.
    minor_allele_list: List or numpy.array of minor alleles.
    :param number_of_reps: Replicates of the population
    :type number_of_reps: int
    :param meta_population: Multi-replicate population container
    :type meta_population: sim.Simulator
    :param minor_allele: Locus: Minor Allele key-value pairs
    :type minor_allele: dict
    :return: Minor allele frequencies of multiple replicates
    :rtype: np.array
    """
    number_of_rows = 6 * number_of_reps
    aggregate_frequency_matrix = np.zeros((number_of_rows, 44447))
    row_indices = list(range(number_of_rows))
    print(
        'Calculating allele frequencies for {number_reps} replicates and writing them to an aggregate matrix.'.format(
            number_reps=number_of_reps))
    for rep in number_of_reps:
        for pop in meta_populations[rep]:
            sim.stat(pop)
    for replicate in meta_population.populations():
        print("Replicate: {rep_id}".format(rep_id=replicate.dvars().rep))
        sim.stat(replicate, alleleFreq=sim.ALL_AVAIL,
                 vars=['alleleFreq_sp'])
        subpops_and_gens = [(1, 0), (2, 2), (3, 4), (4, 6), (5, 8),
                            (6, 10)]
        for sp, gen in subpops_and_gens:
            row_index = row_indices.pop(0)
            print("Row index: {row_index}".format(row_index=row_index))
            rep = replicate.dvars().rep
            aggregate_frequency_matrix[row_index, 0] = gen
            aggregate_frequency_matrix[row_index, 1] = rep
            aggregate_frequency_matrix[row_index, 2:] = [
                replicate.dvars(sp).alleleFreq[locus][ma] for ma, locus in
                zip(minor_allele.values(), range(44445))]
    return aggregate_frequency_matrix


In [68]:
mafrqs = np.zeros((30, 44447))

In [69]:
row_indices = list(range(6*5))

In [70]:
for rep in range(5):
    for gen in range(6):
        rowdex = row_indices.pop(0)
        sim.stat(meta_populations[rep][gen], alleleFreq=sim.ALL_AVAIL)
        generation_set = set(meta_populations[rep][gen].indInfo('generation'))
        generation = generation_set.pop()
        mafrqs[rowdex, 0] = rep
        mafrqs[rowdex, 1] = generation
        mafrqs[rowdex, 2:] = [meta_populations[rep][gen].dvars().alleleFreq[locus][minor_allele] 
                            for locus, minor_allele in enumerate(minor_alleles)]

In [74]:
af_cols = [ 'rep', 'generation'] + list(range(44445))

In [75]:
minor_allele_frqs_data = pd.DataFrame(mafrqs, columns=af_cols)

In [77]:
minor_allele_frqs_data.to_csv('tuson_selection_mafrqs.txt', sep='\t')

In [78]:
hetero_frqs = np.zeros((30, 44447))
row_indices = list(range(6*5))

In [79]:
for rep in range(5):
    for gen in range(6):
        rowdex = row_indices.pop(0)
        sim.stat(meta_populations[rep][gen], heteroFreq=sim.ALL_AVAIL)
        generation_set = set(meta_populations[rep][gen].indInfo('generation'))
        generation = generation_set.pop()
        hetero_frqs[rowdex, 0] = rep
        hetero_frqs[rowdex, 1] = generation
        hetero_frqs[rowdex, 2:] = [meta_populations[rep][gen].dvars().heteroFreq[locus] 
                            for locus, minor_allele in enumerate(minor_alleles)]

In [81]:
hetero_frqs

array([[  0.  ,   0.  ,   0.36, ...,   0.43,   0.32,   0.34],
       [  0.  ,   2.  ,   0.31, ...,   0.48,   0.3 ,   0.36],
       [  0.  ,   4.  ,   0.44, ...,   0.48,   0.24,   0.41],
       ..., 
       [  4.  ,   6.  ,   0.13, ...,   0.52,   0.55,   0.53],
       [  4.  ,   8.  ,   0.02, ...,   0.42,   0.44,   0.48],
       [  4.  ,  10.  ,   0.  , ...,   0.53,   0.47,   0.58]])

In [82]:
hetero_columns = ['rep', 'generation'] + list(range(44445))

In [83]:
minor_alleles

array([ 1.,  2.,  3., ...,  2.,  3.,  1.])

In [84]:
hetero_frq_data = pd.DataFrame(hetero_frqs, columns=hetero_columns)

In [86]:
hetero_frq_data.to_csv('heterozygote_frq_selection.txt', sep='\t')

In [87]:
allele_effects

{27: {2: 2.4563428307058226, 3: 0.902496124727694},
 601: {1: 0.5886817538590416, 3: 0.1285805135499849},
 3153: {1: 2.0446427412933477, 3: 0.05871532001842488},
 3654: {1: 0.44688507279980877, 3: 0.05030852627016928},
 4569: {1: 0.15412168275436025, 3: 0.1586014792679026},
 5510: {1: 0.4048141023881617, 3: 3.9183884081671514},
 9744: {1: 2.6215914152128983, 3: 3.304347811025092},
 12734: {1: 1.0272802470164655, 3: 2.0284785297572134},
 13180: {1: 1.0797354997413735, 2: 0.06932846723783206},
 13190: {1: 0.10199452875256881, 3: 2.672262639036609},
 13251: {1: 1.5412144231442957, 3: 2.492018927589699},
 14123: {1: 0.7455961703850555, 3: 0.11842416265426248},
 17073: {1: 0.2967362392955534, 3: 0.605355189946292},
 18635: {1: 0.5453505705076177, 3: 3.73238438019367},
 21380: {1: 0.6169528965995916, 2: 2.7270417156963833},
 22018: {1: 2.3845380826723264, 3: 2.875438373751781},
 26310: {1: 1.0196529670861978, 3: 1.808619880154679},
 27162: {1: 0.4306684508602656, 3: 0.7907402838826632},
 273

In [89]:
import shelve

In [90]:
allelefx = shelve.open('tuson_allele_effects')
allelefx['first_run'] = allele_effects
allelefx.close()

In [None]:
sim.stat(tuson, alleleFreq=sim.ALL_AVAIL)

In [None]:
tuson.popSize()

In [None]:
tuson.dvars()

In [None]:
tuson.dvars().alleleFreq

In [None]:
sorted(qtl)

In [None]:
qtl