# Simulating a Population for Use in GWAS #

### Population Parameters ###

In [1]:
%matplotlib inline
import shelve
import simuOpt
simuOpt.setOptions(alleleType='short', numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
from saegus import breed, operators, selection, helpers, parser, parameterizer, selection
import random
#random.seed(1337)
import numpy as np
np.set_printoptions(suppress=True, precision=3)
import matplotlib.pyplot as plt

In [2]:
hapmap = pd.read_csv('clean_hapmap.txt')
genetic_map = hapmap.ix[:, :'cM_pos']
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                         sep='\t')

chr_cM_positions = {}
for i in range(1, 11):
    chr_cM_positions[i] = []

for idx in range(len(genetic_map)):
    chr_cM_positions[int(genetic_map.iloc[idx]['chr'])].append(
    float(genetic_map.iloc[idx]['cM_pos']))


cM_positions = []
for i in range(1, 11):
    cM_positions.append(chr_cM_positions[i])


snp_to_integer = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-':4, '+':5}
allele_names = ['A', 'C', 'T', 'G', 'D', 'I']
integer_to_snp = {0: 'A', 1:'C', 2: 'G', 3: 'T', 4: '-', 5: '+'}


integral_valued_loci = []
relative_integral_valued_loci = {}
for idx in range(len(genetic_map)):
    if str(genetic_map.iloc[idx]['cM_pos'])[-2:] == '.0':
        integral_valued_loci.append(idx)
        relative_integral_valued_loci[idx] = (genetic_map.iloc[idx]['chr'], genetic_map.iloc[idx]['cM_pos'])

alleles = {i: (snp_to_integer[hapmap.ix[i, 'alleles'][0]], 
               snp_to_integer[hapmap.ix[i, 'alleles'][-1]]) for i in
          range(len(hapmap))}

recombination_rates = []
for chromosome in cM_positions:
    for cM in chromosome:
        if str(cM)[-2:] == '.6':
            recombination_rates.append(0.01)
        else:
            recombination_rates.append(0.0)

allele_names = ['A', 'C', 'T', 'G', 'D', 'I']

flat_cM_positions = []
for cMs in cM_positions:
    flat_cM_positions.extend(cMs)


nam = sim.loadPopulation('nam_prefounders.pop')
nam.setSubPopName('prefounders', 0)
sample_sizes = {i: 100 for i in range(0, 21, 2)}

genetic_structure = {}
genetic_structure['cM_positions'] = cM_positions
genetic_structure['chr_cM_positions'] = chr_cM_positions
genetic_structure['allele_names'] = allele_names
genetic_structure['integral_valued_loci'] = integral_valued_loci
genetic_structure['relative_integral_valued_loci'] = relative_integral_valued_loci
genetic_structure['alleles'] = alleles
genetic_structure['recombination_rates'] = recombination_rates

In [3]:
sim_params = {
                'generations_of_selection': 10,
                'generations_of_drift': 10,
                'generations_of_random_mating': 3,
                'operating_population_size': 2000,
                'proportion_of_individuals_saved': 0.05,
                'overshoot_as_proportion': 0.50,
                'individuals_per_breeding_subpop': 5,
                'heritability': 0.7,
                'meta_pop_sample_sizes': sample_sizes,
                'number_of_replicates': 1,
                'prefounder_file_name': 'nam_prefounders.pop',
                'founders': [(3,18), (2, 13), (7, 14), (1, 19),
                            (14, 17), (1, 20), (17, 21), (9, 22)]
    }

In [4]:
qtl_params = {
                'qtl': 10,
                'allele_effects': 1,
}
selection_statistics = {
    'aggregate': {},
    'selected': {},
    'non-selected': {}
}
drift_statistics = {
    'aggregate': {},
    'selected': {},
    'non-selected': {}
}

In [5]:
s = selection.Truncation(sim_params['generations_of_selection'],
                       sim_params['generations_of_random_mating'],
                       sim_params['operating_population_size'],
                       sim_params['proportion_of_individuals_saved'],
                       sim_params['overshoot_as_proportion'],
                       sim_params['individuals_per_breeding_subpop'],
                       sim_params['heritability'],
                       sim_params['meta_pop_sample_sizes'],
                       sim_params['number_of_replicates'])

d = selection.Drift(sim_params['generations_of_drift'],
                       sim_params['generations_of_random_mating'],
                       sim_params['operating_population_size'],
                       sim_params['proportion_of_individuals_saved'],
                       sim_params['overshoot_as_proportion'],
                       sim_params['individuals_per_breeding_subpop'],
                       sim_params['heritability'],
                       sim_params['meta_pop_sample_sizes'],
                       sim_params['number_of_replicates'])

In [6]:
sim.tagID(nam, reset=True)

founders = sim_params['founders']
replicated_nam = sim.Simulator(nam, rep=3)
pop = replicated_nam.extract(0)
#pop.dvars().statistics = population_statistics
selection_meta = replicated_nam.extract(0)
drift_meta = replicated_nam.extract(0)
#meta.removeSubPops(0)

### Simulated Breeding Scenario ###

In [7]:
s.generate_f_one(pop, recombination_rates, sim_params['founders'])

Creating the F_one population from selected founders.
Generation: 0


In [8]:
s.recombinatorial_convergence(pop, recombination_rates)

Generation: 1	popSize: 8
Generation: 2	popSize: 4
Generation: 3	popSize: 2


In [9]:
s.expand_by_selfing(pop, recombination_rates)
s.interim_random_mating(pop, recombination_rates)
sim.stat(pop, numOfSegSites=integral_valued_loci, vars=['numOfSegSites', 'segSites'])

Creating the F_two population.
Generation: 4
Initiating interim random mating for 3 generations.
Generation: 5
Generation: 6
Generation: 7


In [10]:
pop.popSize()

2000

In [11]:
run_id = "run_11_"

## Choose QTL and Assign Effects ##

In [12]:
qtl = parameterizer.seg_qtl_chooser(pop, integral_valued_loci, qtl_params['qtl'])

triplet_qtl = []
for locus in qtl:
    triplet_qtl.append(locus-1)
    triplet_qtl.append(locus)
    triplet_qtl.append(locus+1)
triplet_qtl = sorted(triplet_qtl)

qtl_params['triplet_qtl'] = triplet_qtl

allele_effects = {locus: {} for locus in triplet_qtl}
for tqtl in triplet_qtl:
    for allele in alleles[tqtl]:
        allele_effects[tqtl][allele] = random.expovariate(qtl_params['allele_effects'])

qtl_params['allele_effects'] = allele_effects
        


# Write parameter sets to a 'shelf'.
import shelve
with shelve.open(run_id+"quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

rsparams = shelve.open("RS_Parameter_Sets")
rsparams['truncation'] = s
rsparams['drift'] = d
rsparams['seg_sites_after_rmating'] = list(pop.dvars().segSites)
rsparams['qtl_parameters'] = qtl_params
rsparams['simulation_parameters'] = sim_params
    
pop.dvars().qtl = qtl
pop.dvars().triplet_qtl = triplet_qtl
pop.dvars().allele_effects = allele_effects

selection_plus_drift_replicates = sim.Simulator(pop, rep=2)
selection_pop = selection_plus_drift_replicates.extract(0)
drift_pop = selection_plus_drift_replicates.extract(0)

In [13]:
selection_pop.dvars().statistics = selection_statistics
drift_pop.dvars().statistics = drift_statistics

In [14]:
s.recurrent_truncation_selection(selection_pop, selection_meta, triplet_qtl, allele_effects,
                                recombination_rates)

d.recurrent_drift_selection(drift_pop, drift_meta, triplet_qtl, allele_effects, 
                            recombination_rates)
                                
selection_meta.removeSubPops(0)
drift_meta.removeSubPops(0)

selection_qtd = helpers.Frq(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtd = helpers.Frq(drift_meta, triplet_qtl, alleles, allele_effects)


selection_af = selection_qtd.allele_frequencies(selection_meta, range(selection_meta.totNumLoci()))
drift_af = drift_qtd.allele_frequencies(drift_meta, range(drift_meta.totNumLoci()))
selection_qtalleles = selection_qtd.rank_allele_effects(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtalleles = drift_qtd.rank_allele_effects(drift_meta, triplet_qtl, alleles, allele_effects)
selection_ties = [locus for locus in range(selection_meta.totNumLoci()) 
                  if selection_af['minor', 'alleles'][locus] == selection_af['major', 'alleles'][locus]]
drift_ties = [locus for locus in range(drift_meta.totNumLoci())
                  if drift_af['minor', 'alleles'][locus] == drift_af['major', 'alleles'][locus]]

for st in selection_ties:
    selection_af['major', 'alleles'][st] = list(selection_meta.dvars().alleleFreq[st])[0]
    selection_af['major', 'alleles'][st] = list(selection_meta.dvars().alleleFreq[st])[1]
for dt in drift_ties:
    drift_af['minor', 'alleles'][dt] = list(drift_meta.dvars().alleleFreq[dt])[0]
    drift_af['major', 'alleles'][dt] = list(drift_meta.dvars().alleleFreq[dt])[1]
sum(np.equal(list(selection_af['minor', 'alleles'].values()), list(selection_af['major', 'alleles'].values())))
sum(np.equal(list(drift_af['minor', 'alleles'].values()), list(drift_af['major', 'alleles'].values())))

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Final: Sampled 100 individuals from generation 10
Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Final: Sampled 100 individuals from generation 10


0

In [15]:
selection_pop.popSize()

2000

In [16]:
drift_pop.popSize()

2000

## Gather Data for Use in GWAS ##

## Analyzing TASSEL GWAS Results ##

In [None]:
selection_meta.dvars(3).haploFreq

In [43]:
htypes['loci'] = {}



for k, i in enumerate(range(0, len(triplet_qtl), 3)):
    htypes['loci'][k] = (triplet_qtl[i], triplet_qtl[i+1], triplet_qtl[i+2])


htypes['alleles'] = {}
htypes['effect'] = {}
htypes['frequency'] = {}
htypes['frequency']['accumulated'] = {}
for loci in htypes['loci'].values():
    htypes['frequency'][loci] = {}
    for sp in range(selection_meta.numSubPop()):
        htypes['frequency'][loci][sp] = {}

In [55]:
sim.stat(selection_meta, haploFreq=list(htypes['loci'].values()), vars=['haploFreq', 'haploFreq_sp'])

In [108]:
for sp in range(selection_meta.numSubPop()):
    for loci, triplet in htypes['alleles'].items():
        for alleles in triplet:
            htypes['frequency'][loci][sp] = selection_meta.dvars(sp).haploFreq[loci][alleles]

for loci, triplet in htypes['alleles'].items():
    htypes['frequency']['accumulated'][loci] = {}
    for alleles in triplet:
        htypes['frequency']['accumulated'][loci] = selection_meta.dvars().haploFreq[loci][alleles]


In [126]:
htypes['frequency']['accumulated']

{(1335, 1336, 1337): 0.4633333333333333,
 (1475, 1476, 1477): 0.23083333333333333,
 (2275, 2276, 2277): 0.23,
 (2685, 2686, 2687): 0.855,
 (3045, 3046, 3047): 0.56,
 (4215, 4216, 4217): 0.2,
 (4335, 4336, 4337): 0.6875,
 (4847, 4848, 4849): 0.14916666666666667,
 (6202, 6203, 6204): 0.38,
 (6227, 6228, 6229): 0.21833333333333332}

In [112]:

htypes['frequency']['accumulated']

{}

In [72]:
for loci in htypes['loci'].values():
    for sp in range(selection_meta.numSubPop()):
        htypes['frequency'][loci][sp] = selection_meta.dvars(sp).haploFreq[loci]

In [20]:
for sp in range(selection_meta.numSubPop()):
    htypes[sp] = {}

In [None]:
sele

In [17]:
hz = helpers.haplotype_data(selection_pop, selection_meta, allele_effects, triplet_qtl)

KeyError: 0

In [None]:
selection_meta.dvars().haploFreq

In [None]:
['frequency'][5611, 5612, 5613]

In [None]:
gens

In [None]:
hz['frequency'][loci][haplotype]

In [None]:
alleles

In [None]:
selection_statistics

In [None]:
store_toy_parameter_set()

In [None]:
tps = load_toy_parameter_set()

In [None]:
hz = tps['haplotype']

In [None]:

generations

In [None]:
hz['frequency']

In [None]:
haplotypez = []
for locus in hz['loci'].values():
    haplotypez.extend(hz['alleles'][locus])

In [None]:
len(haplotypez)

In [None]:
data_columns = ['loci', 'haplotype', 'effect'] + generations

In [None]:
data_columns

In [None]:
pop.chromLocusPair(383)

In [None]:
snp_to_integer

In [None]:
integer_to_snp

In [None]:
selection_meta.numLoci()

In [None]:
selection_meta.chromLocusPair(1391)[0]

In [None]:
ht.ix[:, 'effect':]

In [None]:
def plot_haplotype_effect_vs_frequency(pop, meta_pop, haplo_data, figure_filename):
    """Plots haplotype vs frequency in """
    
    haplotypez = []
    for locus in hz['loci'].values():
        haplotypez.extend(haplo_data['alleles'][locus])
    distinct_haplotypes = len(haplotypez)
    
    fx_vs_frq = {}
    fx_vs_frq[99] = np.zeros((2, distinct_haplotypes))
    segregating_effects = []
    haplotype_frequencies = []
    for htype, triplets in haplo_data['alleles'].items():
        for trip in triplets:
            segregating_effects.append(haplo_data['effect'][htype][trip])
            haplotype_frequencies.append(haplo_data['frequency'][htype][trip])
    fx_vs_frq[99][0] = haplotype_frequencies
    fx_vs_frq[99][1] = segregating_effects
    for sp in range(meta.numSubPop()):
        fx_vs_frq[sp] = np.zeros((2, distinct_haplotypes))
        segregating_effects = []
        haplotype_frequencies = []
        for htype, triplets in haplo_data['alleles'].items():
            for trip in triplets:
                segregating_effects.append(haplo_data['effect'][htype][trip])
                haplotype_frequencies.append(haplo_data['frequency'][sp][htype][trip])
        fx_vs_frq[sp][0] = haplotype_frequencies
        fx_vs_frq[sp][1] = segregating_effects
    generations = ['G_'+str(i) for i in range(0, pop.dvars().gen+1, 2)]
    
    f, ax = plt.subplots(7, 1, figsize=(10,30))
    ax[6].scatter(fx_vs_frq[99][0], fx_vs_frq[99][1], c='red')
    ax[6].grid(True)
    ax[6].set_title("Aggregate Generation")
    generations = [0, 2, 4, 6, 8, 10]
    for i in range(6):
        ax[i].scatter(fx_vs_frq[i][0], fx_vs_frq[i][1], c=np.random.rand(3, 1))
        ax[i].set_xlim(-0.1, 1.1)
        ax[i].set_title("Generation {}".format(generations[i]))
        ax[i].grid(True)
    
    f.savefig(figure_filename, dpi=300)

In [None]:
sim.stat(meta, varOfInfo=['g', 'p'], vars=['varOfInfo', 'varOfInfo_sp'])

In [None]:
meta.dvars().meanOfInfo

In [None]:
pop.dvars(1).meanOfInfo

In [None]:
for i in range(6):
    print(meta.dvars(i).meanOfInfo['p'])

In [None]:
meta_means

In [None]:
plot_means_and_variances(meta, output_prefix+"Meta-Population_Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
meta_gens = [0, 2, 4, 6, 8, 10]
meta_means = [meta.dvars(i).meanOfInfo['p'] for i in range(6)]
meta_vars = [meta.dvars(i).varOfInfo['p'] for i in range(6)]

In [None]:
meta_plot = np.array([meta_gens, meta_means, meta_vars])

In [None]:
f, ax = plt.subplots()

In [None]:
plt.show()

In [None]:
metainfo = np.zeros(())

In [None]:
meta_means

In [None]:
meta_means

In [None]:
qtl_params['triplet_qtl'] = triplet_qtl
qtl_params['allele_effects'] = allele_effects

In [None]:
rw = parameterizer.ReadWrite()

In [None]:
rw.write_trunc_selection_parameters(sim_params, 'truncsel.json', qtl_params,
                                    'qtlparams.json', genetic_structure,
                                   'genstructure.json')

In [None]:
run_id = id_generator()

In [None]:
from itertools import islice

In [None]:
list(islice(list(selection_pop.indInfo('ind_id')),0, 20, 2))

In [None]:
import itertools

In [None]:
list(itertools.combinations(selection_pop.indInfo('ind_id'), 2))