In [1]:
import simuOpt
simuOpt.setOptions(quiet=True, optimized=True, numThreads=4)
import simuPOP as sim
import os, numpy as np, pandas as pd, collections as col
from saegus import analyze, simulate, parameters, breed, operators
from scipy import stats
import random
np.set_printoptions(suppress=True, precision=3)

In [2]:
tuson = sim.loadPopulation('tuson.pop')

In [3]:
# NOTE(review): `artemis` is not referenced by any later cell visible in this notebook — confirm it is still needed.
artemis = analyze.Study('artemis')

In [4]:
# Tally segregating and fixed sites over all loci, pass the fixed sites to
# parameters.randomly_convert_fixed_sites, then recompute the same statistics so the
# population variables describe the population after that call.
# NOTE(review): presumably the helper rewrites alleles at the fixed loci so that none remain fixed — confirm in saegus.parameters.
sim.stat(tuson, numOfSegSites=sim.ALL_AVAIL, vars=['numOfSegSites', 'segSites', 'numOfFixedSites', 'fixedSites'])
parameters.randomly_convert_fixed_sites(tuson, tuson.dvars().fixedSites)
sim.stat(tuson, numOfSegSites=sim.ALL_AVAIL, vars=['numOfSegSites', 'segSites', 'numOfFixedSites', 'fixedSites'])

In [5]:
tuson.dvars().numOfFixedSites

0

In [6]:
sim.stat(tuson, alleleFreq=sim.ALL_AVAIL)

In [7]:
sim.stat(tuson, homoFreq=sim.ALL_AVAIL)

In [8]:
sim.stat(tuson, heteroFreq=sim.ALL_AVAIL)

In [9]:
# One row per locus listing the allele states found in the population's alleleFreq variable
# (sim.stat(..., alleleFreq=...) must already have been run on `tuson`).
# NOTE(review): building a rectangular int8 array assumes every locus reports the same number of alleles — presumably two; verify.
alleles  = np.array([list(tuson.dvars().alleleFreq[locus].keys()) for locus in range(tuson.totNumLoci())], dtype=np.int8)

In [10]:
alleles.shape

(44445, 2)

## Alleles for Fixed Sites chosen at random.

In [11]:
np.savetxt('alleles_of_tuson_founders.txt', alleles, fmt='%d', delimiter='\t')

In [12]:
af = analyze.allele_data(tuson, alleles, range(tuson.totNumLoci()))

In [15]:
def expanded_allele_data(pop, allele_data_structure):
    """
    Append a heterozygote frequency column to an allele frequency table.

    Heterozygote frequencies are recomputed for all loci of ``pop`` with
    ``sim.stat`` (which refreshes the population's ``heteroFreq`` variable)
    and joined onto ``allele_data_structure`` by index.

    Parameters
    ----------
    pop
        Population whose heterozygote frequencies are computed.
    allele_data_structure : pandas.DataFrame
        Table to extend. The values are attached positionally (row 0 gets the
        first heterozygote frequency, and so on), so this presumably expects
        one row per locus with a 0..n-1 index -- verify against the caller.

    Returns
    -------
    pandas.DataFrame
        The joined table with one extra column, ``heterozygote_frequency``.

    Example
    -------
    Before
          minor_allele    minor_frequency    major_allele    major_frequency
    0     1               0.319048           2               0.680952

    After
        minor_allele    minor_frequency  major_allele    major_frequency     heterozygote_frequency
    0   1               0.319048         2               0.680952            0.371429
    """
    sim.stat(pop, heteroFreq=sim.ALL_AVAIL)
    per_locus_hetero = pop.dvars().heteroFreq
    hetero_column = pd.DataFrame(
        {'heterozygote_frequency': np.array(list(per_locus_hetero.values()))}
    )
    return allele_data_structure.join(hetero_column)

In [16]:
eaf = expanded_allele_data(tuson, af)

In [18]:
eaf

Unnamed: 0,minor_allele,minor_frequency,major_allele,major_frequency,heterozygote_frequency
0,1,0.319048,2,0.680952,0.371429
1,2,0.219048,3,0.780952,0.266667
2,3,0.061905,2,0.938095,0.104762
3,1,0.061905,3,0.938095,0.104762
4,3,0.309524,1,0.690476,0.619048
5,3,0.052381,1,0.947619,0.085714
6,1,0.204762,3,0.795238,0.314286
7,1,0.128571,3,0.871429,0.200000
8,1,0.133333,3,0.866667,0.209524
9,3,0.180952,2,0.819048,0.266667


In [19]:
eaf.to_csv('expanded_tuson_founder_allele_frqs.txt', sep='\t')

In [20]:
# Register the per-individual information fields 'generation', 'replicate', 'g' and 'p'.
# NOTE(review): later cells also read 'ind_id' and write 'primary'; presumably those fields already exist in the loaded population — confirm.
tuson.addInfoFields(['generation', 'replicate', 'g', 'p'])

In [None]:
tuson.save('working_tuson.pop')

In [None]:
tuson.asPedigree()

In [None]:
tuson.save("tuson_pedigree.txt", infoFields=['g', 'p'], loci=sim.ALL_AVAIL)

In [None]:
tuson.popSize()

In [None]:
tuson.asPopulation()

## The Tuson Genetic Map

In [21]:
def parse_recombination_rates(genetic_map_filename):
    """
    Convert a tab-separated genetic map into a list of recombination rates.

    The columns 'locus', 'agpv2', 'namZmPRDA' and 'namZmPRDS' are discarded.
    Of the remaining columns, the first is compared between neighbouring rows
    (presumably the chromosome -- verify against the map file) and the second
    is differenced (the map position, in centimorgans).

    Parameters
    ----------
    genetic_map_filename : str
        Path of the tab-separated genetic map.

    Returns
    -------
    list
        One entry per pair of neighbouring rows: ``abs(difference) / 100``
        when both rows share the same first-column value, ``0.0`` otherwise.
        A final ``0.0`` is always appended after the last row.
    """
    map_table = pd.read_csv(genetic_map_filename, sep='\t', index_col=None)
    map_table = map_table.drop(columns=['locus', 'agpv2', 'namZmPRDA', 'namZmPRDS'])
    map_rows = np.array(map_table)

    rates = []
    # Walk over neighbouring (previous, current) row pairs.
    for previous, current in zip(map_rows[:-1], map_rows[1:]):
        if previous[0] == current[0]:
            rates.append(np.divide(np.abs(current[1] - previous[1]), 100))
        elif previous[0] != current[0]:
            rates.append(0.0)
    # Trailing entry for the last row of the map.
    rates.append(0.0)
    return rates


In [22]:
recom_rates = parse_recombination_rates('raw_genetic_map.txt')

### Using the parameters.PopulationStructure class

In [23]:
popst = parameters.PopulationStructure(tuson, 'population_structure_matrix.xlsx', 0.01, 1.0)

In [24]:
struct_mating_probs = popst.generate_population_structure()

In [25]:
def format_mating_pmfs(population_structure_dict):
    """
    Turn per-individual mating probabilities into discrete distributions.

    Parameters
    ----------
    population_structure_dict : dict
        Maps an individual identifier to a sequence of probabilities. The
        probability at position ``k`` is attached to the outcome ``k``.

    Returns
    -------
    dict
        Maps each identifier to a ``scipy.stats.rv_discrete`` whose support is
        ``0 .. len(probabilities) - 1``. Identifiers with an empty probability
        sequence are left out.

    Notes
    -----
    The probabilities are taken from the argument itself; no notebook-level
    variable is consulted. ``rv_discrete`` expects them to sum to 1.
    """
    mating_pmfs = {}
    for ind, probabilities in population_structure_dict.items():
        if len(probabilities) == 0:
            # Nothing to build a distribution from; skip this individual.
            continue
        outcomes = list(range(len(probabilities)))
        mating_pmfs[ind] = stats.rv_discrete(values=(outcomes, list(probabilities)))
    return mating_pmfs

In [26]:
formed_mating_pmfs = format_mating_pmfs(struct_mating_probs)

In [27]:
def assign_primary_subpopulation(pop, struct_mating_probabilities):
    """
    Store each individual's dominant subpopulation index in ``primary``.

    Parameters
    ----------
    pop
        Object whose ``individuals()`` yields individuals exposing ``ind_id``
        and accepting assignment to ``primary``.
    struct_mating_probabilities : dict
        Maps an individual's ``ind_id`` to a sequence of proportions.

    Returns
    -------
    None
        Individuals are modified in place: ``primary`` is set to the position
        of the largest proportion, stored as a float.
    """
    dominant_by_id = {
        individual_id: float(np.argmax(proportions))
        for individual_id, proportions in struct_mating_probabilities.items()
    }
    for member in pop.individuals():
        member.primary = dominant_by_id[member.ind_id]

In [28]:
assign_primary_subpopulation(tuson, struct_mating_probs)

In [29]:
tuson.dvars().mating_pmfs = formed_mating_pmfs

In [30]:
pop_struct_expansion = breed.ForcedPopulationStructureParentChooser(1000, formed_mating_pmfs)

In [31]:
# Split on the 'primary' info field, with one virtual subpopulation per value 0.0 through 5.0.
primary_subpopulation_splitter = sim.InfoSplitter(
    field='primary',
    values=[float(index) for index in range(6)],
)
tuson.setVirtualSplitter(primary_subpopulation_splitter)


In [32]:
tuson.numVirtualSubPop()

6

In [33]:
columns=['rep', 'gen']

In [34]:
sim.tagID(tuson, reset=False)

In [35]:
multi_son = sim.Simulator(tuson, rep=5)

In [36]:
# Advance the simulator by one generation (gen=1). Parents are supplied through
# pop_struct_expansion.forced_structure_parent_chooser, each mating produces one
# offspring (numOffspring=1), offspring pass through the ID, parents and pedigree
# taggers and a Recombinator built from recom_rates, and subPopSize is 1000.
multi_son.evolve(
    matingScheme=sim.HomoMating(
        sim.PyParentsChooser(pop_struct_expansion.forced_structure_parent_chooser),
        sim.OffspringGenerator(ops=[sim.IdTagger(), sim.ParentsTagger(), sim.PedigreeTagger(),
                                   sim.Recombinator(recom_rates)], numOffspring=1),
            subPopSize=1000),
    gen=1
)

(1, 1, 1, 1, 1)

In [164]:
# Treat every locus as a QTL. The count comes from the population instead of the
# hard-coded 44445, so this cell keeps working if the founder data changes.
qtl = tuple(range(tuson.totNumLoci()))

In [165]:
additive_trait = parameters.Trait()

In [166]:
# NOTE(review): no random.seed(...) call is visible before this cell, so the drawn effects differ between runs — seed the generator if the printed values must be reproducible.
# presumably each allele at each QTL receives an effect drawn with random.expovariate and argument 1 — confirm in parameters.Trait.assign_allele_effects.
allele_effects = additive_trait.assign_allele_effects(alleles, qtl, random.expovariate, 1, multiplicity=1)

In [167]:
allele_effects

{0: {1: 1.1061100950544054, 2: 0.5453590292999076},
 1: {2: 0.003196264534612611, 3: 0.3798904691436801},
 2: {2: 1.4423837728472835, 3: 0.6830554994191108},
 3: {1: 1.5536717982109969, 3: 3.1161710546117645},
 4: {1: 2.2080466309696707, 3: 0.9351550451058428},
 5: {1: 2.861992802818136, 3: 1.1107325812689615},
 6: {1: 3.5758766116473373, 3: 1.9297703779217266},
 7: {1: 0.9688846631374538, 3: 1.4948652034558256},
 8: {1: 0.26653718805497006, 3: 3.0332790687687767},
 9: {2: 1.1287950537722449, 3: 0.7982748983009149},
 10: {1: 1.1267052179622414, 3: 1.2204617478091424},
 11: {1: 0.3523670323445758, 2: 1.1684490457517074},
 12: {1: 3.257076088835305, 3: 0.9113721718843124},
 13: {1: 0.016550637519060458, 3: 1.2077783344146387},
 14: {1: 0.21011529260853695, 3: 0.8329316335054213},
 15: {2: 0.05023503265970157, 3: 0.6219057038171083},
 16: {1: 1.0867618660803555, 2: 2.256068920087688},
 17: {1: 1.5649726716708625, 2: 3.0851324880050695},
 18: {2: 0.9552739674527352, 3: 1.033245205619355},


In [168]:
# Even generations from 2 up to (but excluding) 10.
sampling_generations = list(range(2, 10, 2))

In [169]:
sampling_generations

[2, 4, 6, 8]

In [43]:
# The same sample size (100) for each of the keys 0 through 10.
sample_sizes = dict.fromkeys(range(11), 100)

In [44]:
# One independent, initially empty list of samples per replicate index 0 through 4.
meta_populations = dict((rep, []) for rep in range(5))

In [45]:
# NOTE(review): the nine positional arguments are opaque at the call site; check simulate.Truncation's signature and prefer keyword arguments so each value's meaning is visible.
trun = simulate.Truncation(10, 1, 1000, 0.05, 0.50, 5, 0.7, sample_sizes, 1)

In [46]:
trun.replicate_selection(multi_son, meta_populations, qtl, allele_effects, recom_rates)

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Initial: Sampled 100 individuals from generation 0 Replicate: 1.
Initial: Sampled 100 individuals from generation 0 Replicate: 2.
Initial: Sampled 100 individuals from generation 0 Replicate: 3.
Initial: Sampled 100 individuals from generation 0 Replicate: 4.
Generation: 0
Generation: 0
Generation: 0
Generation: 0
Generation: 0
Generation: 1
Generation: 1
Generation: 1
Generation: 1
Generation: 1
Generation: 2
Generation: 2
Generation: 2
Generation: 2
Generation: 2
Generation: 3
Generation: 3
Generation: 3
Generation: 3
Generation: 3
Generation: 4
Generation: 4
Generation: 4
Generation: 4
Generation: 4
Generation: 5
Generation: 5
Generation: 5
Generation: 5
Generation: 5
Generation: 6
Generation: 6
Generation: 6
Generation: 6
Generation: 6
Generation: 7
Generation: 7
Generation: 7
Generation: 7
Generation: 7
Generation: 8
Generation: 8
Generation: 8
Generation: 8
Generation: 8
Generation: 9
Generation: 9
Generation: 9
Gen

In [47]:
meta_populations

{0: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 1: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 2: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 3: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>],
 4: [<simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>,
  <simuPOP.Population>]}

In [48]:
sample_pop = meta_populations[0][-1]

In [49]:
sample_pop.sortIndividuals('p')

## Genotype & Phenotype

In [225]:
def collect_genotype_phenotype_data(meta_population_library):
    """
    Gather per-individual identifiers, genotype and phenotype values from a
    multiple-replicate, multiple-sample population dictionary.

    Parameters
    ----------
    meta_population_library : dict
        Maps a replicate identifier to a list of sampled populations. Each
        population must provide ``indInfo(field)`` and
        ``setIndInfo(value, field)``.

    Returns
    -------
    pandas.DataFrame
        One row per individual with columns ``ind_id``, ``rep``, ``gen``,
        ``g`` and ``p``, read from the info fields 'ind_id', 'replicate',
        'generation', 'g' and 'p' respectively.

    Notes
    -----
    Side effect: the 'replicate' info field of every sampled population is
    overwritten with its replicate identifier before the data is read.
    Samples without individuals contribute no rows.
    """
    info_fields = ('ind_id', 'replicate', 'generation', 'g', 'p')
    rows = []
    for rep, samples in meta_population_library.items():
        for sample in samples:
            sample.setIndInfo(rep, 'replicate')
            # Stack the fields as rows, then transpose to one row per individual.
            per_individual = np.asarray([sample.indInfo(field) for field in info_fields]).T
            rows.extend(per_individual)
    return pd.DataFrame(rows, columns=['ind_id', 'rep', 'gen', 'g', 'p'])

## Allele Frequency

In [None]:
# Fill one row of `mafrqs` per (replicate, sample): column 0 holds the replicate, column 1 the
# generation, and the remaining columns the frequency of each locus's minor allele.
# NOTE(review): `row_indices`, `mafrqs` and `minor_alleles` are not defined in any visible cell and
# this cell has no execution count — on a fresh kernel it fails unless they are created first.
for rep in range(5):
    for gen in range(6):
        rowdex = row_indices.pop(0)
        sim.stat(meta_populations[rep][gen], alleleFreq=sim.ALL_AVAIL)
        generation_set = set(meta_populations[rep][gen].indInfo('generation'))
        # set.pop() returns an arbitrary element; presumably every individual in a sample shares one generation value — verify.
        generation = generation_set.pop()
        mafrqs[rowdex, 0] = rep
        mafrqs[rowdex, 1] = generation
        mafrqs[rowdex, 2:] = [meta_populations[rep][gen].dvars().alleleFreq[locus][minor_allele] 
                            for locus, minor_allele in enumerate(minor_alleles)]

In [271]:
def collect_allele_frequency_data(meta_population_library, minor_allele_list=None):
    """
    Collects minor allele frequency data of a multiple generation
    population library.

    Parameters
    ----------
    meta_population_library : dict
        Maps a replicate identifier to a list of sampled populations. Allele
        frequencies must already be present in each population's variables
        (``dvars().alleleFreq``).
    minor_allele_list : sequence, optional
        The minor allele of each locus, in locus order. When omitted, the
        notebook-level ``minor_alleles`` variable is used.

    Example
    -------

    meta_population_library
    {
        0: [<simuPOP.Population>, ..., <simuPOP.Population>],
        1: [<simuPOP.Population>, ..., <simuPOP.Population>],
        ...,
    }

    Returns
    -------
    numpy.ndarray
        One row per sampled population: replicate, generation (the largest
        'generation' info value in the sample), then the frequency of the
        minor allele at each locus.

    collect_allele_frequency_data(meta_population_library)
    array([[  0.   ,   0.   ,   0.319, ...,   0.467,   0.262,   0.267],
           ...,
           [  4.   ,  10.   ,   0.319, ...,   0.467,   0.262,   0.267]])

    """
    if minor_allele_list is None:
        # Backward-compatible fallback to the notebook-level variable.
        minor_allele_list = minor_alleles
    rows = []
    for rep, samples in meta_population_library.items():
        for sample in samples:
            gen_id = int(max(sample.indInfo('generation')))
            # Read frequencies from the sample being iterated, not an outside variable.
            allele_freq = sample.dvars().alleleFreq
            frequencies = [allele_freq[locus][allele]
                           for locus, allele in enumerate(minor_allele_list)]
            rows.append(np.asarray([rep, gen_id] + frequencies))
    return np.asarray(rows)

In [258]:
# Use the name the collector is defined under in this notebook; no definition of
# collect_multigeneration_frequency_data exists in the visible cells.
dr = collect_allele_frequency_data(meta_populations)

In [259]:
dr[0]

array([ 0.   ,  0.   ,  0.319, ...,  0.467,  0.262,  0.267])

In [275]:
def collect_heterozygote_frequency_data(meta_population_library):
    """
    Collects heterozygote frequency data of a multiple generation
    population library.

    Heterozygote frequencies are read from each population's variables
    (``dvars().heteroFreq``), so they must have been computed beforehand.

    Example
    -------

    meta_population_library
    {
        0: [<simuPOP.Population>, ..., <simuPOP.Population>],
        1: [<simuPOP.Population>, ..., <simuPOP.Population>],
        ...,
    }

    Returns
    -------
    numpy.ndarray
        One row per sampled population: replicate, generation (the largest
        'generation' info value in the sample), then the stored heterozygote
        frequencies in the order the ``heteroFreq`` mapping yields them.

    collect_heterozygote_frequency_data(meta_population_library)
    array([[  0.  ,   0.  ,   0.37, ...,   0.45,   0.3 ,   0.37],
           ...,
           [  4.  ,  10.  ,   0.27, ...,   0.49,   0.55,   0.53]])

    """
    rows = [
        np.asarray(
            [rep_id, int(max(sample.indInfo('generation')))]
            + list(sample.dvars().heteroFreq.values())
        )
        for rep_id, sample_list in meta_population_library.items()
        for sample in sample_list
    ]
    return np.asarray(rows)

In [278]:
# Refresh per-locus allele and heterozygote frequencies on every sampled population;
# the collector functions read them from each population's variables.
for rep_id, samples in meta_populations.items():
    for samp in samples:
        sim.stat(samp, alleleFreq=sim.ALL_AVAIL, heteroFreq=sim.ALL_AVAIL)

In [279]:
collect_heterozygote_frequency_data(meta_populations)[0]

array([ 0.  ,  0.  ,  0.37, ...,  0.45,  0.3 ,  0.37])