# Simulating a Population for Use in GWAS #

### Population Parameters ###

In [1]:
%matplotlib inline
import shelve
import simuOpt
simuOpt.setOptions(alleleType='short', numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
from saegus import breed, operators, selection, helpers, parser, parameterizer, selection
import random
#random.seed(1337)
import numpy as np
np.set_printoptions(suppress=True, precision=3)
import matplotlib.pyplot as plt

In [2]:
# --- Marker metadata -------------------------------------------------------
# `hapmap` supplies the per-locus allele strings; the genetic map (chromosome
# and cM position of every locus) is read from its own tab-separated file.
hapmap = pd.read_csv('clean_hapmap.txt')
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                         sep='\t')

# cM positions grouped by chromosome number (1..10), in genetic-map row order.
chr_cM_positions = {i: [] for i in range(1, 11)}
for idx in range(len(genetic_map)):
    row = genetic_map.iloc[idx]
    chr_cM_positions[int(row['chr'])].append(float(row['cM_pos']))

# The same per-chromosome lists, arranged as a list ordered by chromosome.
cM_positions = [chr_cM_positions[i] for i in range(1, 11)]

# Integer encoding of the allele symbols found in the hapmap file.
snp_to_integer = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-':4, '+':5}
integer_to_snp = {0: 'A', 1:'C', 2: 'G', 3: 'T', 4: '-', 5: '+'}
# NOTE(review): allele_names lists 'T' before 'G', whereas the two mappings
# above encode G=2 and T=3 -- confirm which order the consumers of
# genetic_structure['allele_names'] expect.
allele_names = ['A', 'C', 'T', 'G', 'D', 'I']

# Loci whose cM position prints with a trailing '.0' (i.e. a whole number).
# `integral_valued_loci` holds their absolute row indices;
# `relative_integral_valued_loci` maps each index to (chromosome, cM position).
integral_valued_loci = []
relative_integral_valued_loci = {}
for idx in range(len(genetic_map)):
    row = genetic_map.iloc[idx]
    if str(row['cM_pos'])[-2:] == '.0':
        integral_valued_loci.append(idx)
        relative_integral_valued_loci[idx] = (row['chr'], row['cM_pos'])

# For every locus: (first allele, last allele) of the hapmap 'alleles' string,
# integer-encoded.  `hapmap` keeps the default 0..n-1 index from read_csv, so
# enumerating the column matches the original label-based `.ix[i, 'alleles']`
# lookup (`.ix` no longer exists in current pandas).
alleles = {i: (snp_to_integer[allele_string[0]],
               snp_to_integer[allele_string[-1]])
           for i, allele_string in enumerate(hapmap['alleles'])}

# One recombination rate per locus, in chromosome order: 0.01 at loci whose
# cM position prints with a trailing '.6', 0.0 everywhere else.
recombination_rates = [0.01 if str(cM)[-2:] == '.6' else 0.0
                       for chromosome in cM_positions
                       for cM in chromosome]

# All cM positions as one flat list, chromosome after chromosome.
flat_cM_positions = [cM for cMs in cM_positions for cM in cMs]


# --- Prefounder population -------------------------------------------------
nam = sim.loadPopulation('nam_prefounders.pop')
nam.setSubPopName('prefounders', 0)
# 100 individuals for each even-numbered key 0, 2, ..., 20.
sample_sizes = {i: 100 for i in range(0, 21, 2)}

# Bundle of everything derived above, keyed by name.
genetic_structure = {
    'cM_positions': cM_positions,
    'chr_cM_positions': chr_cM_positions,
    'allele_names': allele_names,
    'integral_valued_loci': integral_valued_loci,
    'relative_integral_valued_loci': relative_integral_valued_loci,
    'alleles': alleles,
    'recombination_rates': recombination_rates,
}

In [3]:
# Run-level settings for the breeding simulation.  The founders entry is a
# list of integer pairs; the sample sizes come from the setup cell.
sim_params = dict(
    generations_of_selection=10,
    generations_of_drift=10,
    generations_of_random_mating=3,
    operating_population_size=2000,
    proportion_of_individuals_saved=0.05,
    overshoot_as_proportion=0.50,
    individuals_per_breeding_subpop=5,
    heritability=0.7,
    meta_pop_sample_sizes=sample_sizes,
    number_of_replicates=1,
    prefounder_file_name='nam_prefounders.pop',
    founders=[(3, 18), (2, 13), (7, 14), (1, 19),
              (14, 17), (1, 20), (17, 21), (9, 22)],
)

In [4]:
# Number of QTL to choose and the parameter later passed to
# random.expovariate when allele effects are drawn.
qtl_params = dict(qtl=10, allele_effects=1)

# Empty per-category containers, one independent set per scenario.
selection_statistics = {category: {}
                        for category in ('aggregate', 'selected', 'non-selected')}
drift_statistics = {category: {}
                    for category in ('aggregate', 'selected', 'non-selected')}

In [5]:
# Both scenario objects take the same trailing positional arguments; only the
# leading generation count differs (selection vs. drift).
_shared_args = tuple(sim_params[key] for key in (
    'generations_of_random_mating',
    'operating_population_size',
    'proportion_of_individuals_saved',
    'overshoot_as_proportion',
    'individuals_per_breeding_subpop',
    'heritability',
    'meta_pop_sample_sizes',
    'number_of_replicates',
))

s = selection.Truncation(sim_params['generations_of_selection'], *_shared_args)

d = selection.Drift(sim_params['generations_of_drift'], *_shared_args)

In [6]:
# Tag the individuals of the prefounder population with IDs
# (reset=True presumably restarts the ID counter -- confirm in simuPOP docs).
sim.tagID(nam, reset=True)

founders = sim_params['founders']
# Build a simulator holding three replicate copies of `nam`, then pull out
# three populations: the working population and the two meta-populations that
# the selection and drift runs later receive.
# NOTE(review): every call uses index 0; this relies on extract() removing the
# extracted replicate from the simulator so each call yields a different copy
# -- confirm against the simuPOP Simulator.extract documentation.
replicated_nam = sim.Simulator(nam, rep=3)
pop = replicated_nam.extract(0)
#pop.dvars().statistics = population_statistics
selection_meta = replicated_nam.extract(0)
drift_meta = replicated_nam.extract(0)
#meta.removeSubPops(0)

### Simulated Breeding Scenario ###

In [7]:
# Create the F_one population in `pop` from the founder pairs listed in
# sim_params['founders'] (the logged output below reports generation 0).
s.generate_f_one(pop, recombination_rates, sim_params['founders'])

Creating the F_one population from selected founders.
Generation: 0


In [8]:
# Converge the crosses over successive generations; the logged output below
# shows the population size going 8 -> 4 -> 2 across generations 1-3.
s.recombinatorial_convergence(pop, recombination_rates)

Generation: 1	popSize: 8
Generation: 2	popSize: 4
Generation: 3	popSize: 2


In [9]:
# Expand the population by selfing (logged as "Creating the F_two population"),
# then run the interim random-mating generations (3, per the log below).
s.expand_by_selfing(pop, recombination_rates)
s.interim_random_mating(pop, recombination_rates)
# Count segregating sites among the integral-valued loci; the results are stored
# in pop.dvars() as numOfSegSites / segSites and read by later cells.
sim.stat(pop, numOfSegSites=integral_valued_loci, vars=['numOfSegSites', 'segSites'])

Creating the F_two population.
Generation: 4
Initiating interim random mating for 3 generations.
Generation: 5
Generation: 6
Generation: 7


In [10]:
# Prefix for this run's output; used below to name the parameter shelf file.
run_id = "run_11_"

## Choose QTL and Assign Effects ##

In [11]:
# Choose qtl_params['qtl'] loci from the integral-valued candidates.
qtl = parameterizer.seg_qtl_chooser(pop, integral_valued_loci, qtl_params['qtl'])

# Expand every chosen locus into a triplet: the locus plus its two immediate
# neighbours by absolute index.
# NOTE(review): locus - 1 / locus + 1 are not checked against the ends of the
# genome or chromosome boundaries -- confirm the chooser cannot return an
# edge locus.
triplet_qtl = sorted(neighbour
                     for locus in qtl
                     for neighbour in (locus - 1, locus, locus + 1))

qtl_params['triplet_qtl'] = triplet_qtl

# Draw one exponentially distributed effect per allele at every triplet locus.
# NOTE(review): qtl_params['allele_effects'] starts out as the expovariate
# parameter and is overwritten with the effects dict below, so re-running this
# cell without re-running the cell that defines qtl_params will fail.
effect_rate = qtl_params['allele_effects']
allele_effects = {locus: {} for locus in triplet_qtl}
for tqtl in triplet_qtl:
    for allele in alleles[tqtl]:
        allele_effects[tqtl][allele] = random.expovariate(effect_rate)

qtl_params['allele_effects'] = allele_effects


# Write parameter sets to a 'shelf'.
with shelve.open(run_id+"quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

# Second shelf: opened as a context manager so it is flushed and closed once
# everything has been written (it was previously left open).
with shelve.open("RS_Parameter_Sets") as rsparams:
    rsparams['truncation'] = s
    rsparams['drift'] = d
    rsparams['seg_sites_after_rmating'] = list(pop.dvars().segSites)
    rsparams['qtl_parameters'] = qtl_params
    rsparams['simulation_parameters'] = sim_params

# Attach the QTL description to the population's variable namespace.
pop.dvars().qtl = qtl
pop.dvars().triplet_qtl = triplet_qtl
pop.dvars().allele_effects = allele_effects

# Two replicate copies of the parameterised population: one for the selection
# run and one for the drift run.
selection_plus_drift_replicates = sim.Simulator(pop, rep=2)
selection_pop = selection_plus_drift_replicates.extract(0)
drift_pop = selection_plus_drift_replicates.extract(0)

In [12]:
# Give each scenario's population its own (initially empty) statistics
# container, stored in the population's variable namespace.
selection_pop.dvars().statistics = selection_statistics
drift_pop.dvars().statistics = drift_statistics

In [13]:
# Run both scenarios from identical starting populations; each run samples
# individuals into its meta-population (see the logged output below).
s.recurrent_truncation_selection(selection_pop, selection_meta, triplet_qtl, allele_effects,
                                recombination_rates)

d.recurrent_drift_selection(drift_pop, drift_meta, triplet_qtl, allele_effects,
                            recombination_rates)

# Drop sub-population 0 of each meta-population.  Both were copied from `nam`,
# whose sub-population 0 is the prefounders -- presumably leaving only the
# sampled generations; confirm.
selection_meta.removeSubPops(0)
drift_meta.removeSubPops(0)

selection_qtd = helpers.Frq(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtd = helpers.Frq(drift_meta, triplet_qtl, alleles, allele_effects)


# Allele frequencies over every locus, and the ranked allele effects at the
# triplet loci, for each scenario.
selection_af = selection_qtd.allele_frequencies(selection_meta, range(selection_meta.totNumLoci()))
drift_af = drift_qtd.allele_frequencies(drift_meta, range(drift_meta.totNumLoci()))
selection_qtalleles = selection_qtd.rank_allele_effects(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtalleles = drift_qtd.rank_allele_effects(drift_meta, triplet_qtl, alleles, allele_effects)

# Loci where the same allele was reported as both minor and major.
selection_ties = [locus for locus in range(selection_meta.totNumLoci())
                  if selection_af['minor', 'alleles'][locus] == selection_af['major', 'alleles'][locus]]
drift_ties = [locus for locus in range(drift_meta.totNumLoci())
                  if drift_af['minor', 'alleles'][locus] == drift_af['major', 'alleles'][locus]]

# Break each tie by taking the first two alleles recorded in alleleFreq at
# that locus: the first becomes the minor allele, the second the major.
# (The selection loop previously assigned 'major' twice and never reset
# 'minor'; it now mirrors the drift loop.)
for st in selection_ties:
    tied_alleles = list(selection_meta.dvars().alleleFreq[st])
    selection_af['minor', 'alleles'][st] = tied_alleles[0]
    selection_af['major', 'alleles'][st] = tied_alleles[1]
for dt in drift_ties:
    tied_alleles = list(drift_meta.dvars().alleleFreq[dt])
    drift_af['minor', 'alleles'][dt] = tied_alleles[0]
    drift_af['major', 'alleles'][dt] = tied_alleles[1]

# Sanity check: number of loci still tied after the fix-up.  Only the last
# expression (the drift count) is rendered as the cell's output.
sum(np.equal(list(selection_af['minor', 'alleles'].values()), list(selection_af['major', 'alleles'].values())))
sum(np.equal(list(drift_af['minor', 'alleles'].values()), list(drift_af['major', 'alleles'].values())))

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Final: Sampled 100 individuals from generation 10
Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Final: Sampled 100 individuals from generation 10


0

## Haplotype Data ##

### Population Subjected to Selection ###

In [14]:
# Collect haplotype data at the triplet loci for the selection meta-population.
# The result (displayed in the next cell) is a dict with 'alleles' and 'effect'
# entries keyed by locus triplets.
sel_htypes = helpers.collect_haplotype_data(selection_meta, allele_effects, triplet_qtl)

In [15]:
# Display the collected haplotype dictionary for inspection.
sel_htypes

{'alleles': {(358, 359, 360): [(0, 2, 0), (2, 1, 0)],
  (1700, 1701, 1702): [(1, 2, 1), (1, 1, 1)],
  (1950, 1951, 1952): [(4, 1, 2), (4, 3, 2)],
  (2345, 2346, 2347): [(1, 3, 2), (1, 1, 2)],
  (2915, 2916, 2917): [(2, 0, 0), (1, 1, 2)],
  (3530, 3531, 3532): [(1, 0, 3), (1, 2, 3)],
  (4140, 4141, 4142): [(1, 1, 0), (3, 3, 0)],
  (5726, 5727, 5728): [(3, 3, 2), (1, 1, 2)],
  (6916, 6917, 6918): [(0, 1, 1), (0, 3, 3)],
  (7106, 7107, 7108): [(0, 2, 3), (3, 0, 3)]},
 'effect': {(358, 359, 360): {(0, 2, 0): 1.6601883621410407,
   (2, 1, 0): 0.5810797489427847},
  (1700, 1701, 1702): {(1, 1, 1): 0.9607279333910945,
   (1, 2, 1): 1.4074113804958848},
  (1950, 1951, 1952): {(4, 1, 2): 2.1068187653217225,
   (4, 3, 2): 0.8829805874715089},
  (2345, 2346, 2347): {(1, 1, 2): 1.1953892021708832,
   (1, 3, 2): 1.694970424792654},
  (2915, 2916, 2917): {(1, 1, 2): 2.65850614885613,
   (2, 0, 0): 2.5882654248934207},
  (3530, 3531, 3532): {(1, 0, 3): 4.845959177354244,
   (1, 2, 3): 4.3939873633901

In [16]:
# Tabulate the selection haplotypes: per the displayed table, one row per
# haplotype with its effect and a frequency column per sampled generation (G_0 ... G_10).
sel_haplotype_table = helpers.generate_haplotype_data_table(selection_meta, sel_htypes)

In [17]:
# Display the haplotype frequency table for the population under selection.
sel_haplotype_table

Unnamed: 0,centered_on,relative_position,chromosome,haplotype,effect,G_0,G_2,G_4,G_6,G_8,G_10
0,359,359,1,AGA,1.660188,0.47,0.805,0.925,0.94,1.0,1.0
1,359,359,1,GCA,0.58108,0.53,0.195,0.075,0.06,0.0,0.0
2,1701,648,2,CGC,1.407411,0.47,0.68,0.855,0.885,0.915,0.925
3,1701,648,2,CCC,0.960728,0.53,0.32,0.145,0.115,0.085,0.075
4,1951,91,3,+CG,2.106819,0.555,0.89,1.0,1.0,1.0,1.0
5,1951,91,3,+TG,0.882981,0.445,0.11,0.0,0.0,0.0,0.0
6,2346,486,3,CTG,1.69497,0.425,0.535,0.705,0.845,0.875,0.97
7,2346,486,3,CCG,1.195389,0.575,0.465,0.295,0.155,0.125,0.03
8,2916,238,4,GAA,2.588265,0.515,0.4,0.41,0.45,0.445,0.465
9,2916,238,4,CCG,2.658506,0.485,0.6,0.59,0.55,0.555,0.535


### Population Subjected to Drift ###

In [18]:
# Collect haplotype data at the same triplet loci, this time for the
# drift meta-population.
drift_htypes = helpers.collect_haplotype_data(drift_meta, allele_effects, triplet_qtl)

In [19]:
# Tabulate the drift haplotypes in the same layout as the selection table.
drift_haplotype_table = helpers.generate_haplotype_data_table(drift_meta, drift_htypes)

In [20]:
# Display the haplotype frequency table for the population under drift.
drift_haplotype_table

Unnamed: 0,centered_on,relative_position,chromosome,haplotype,effect,G_0,G_2,G_4,G_6,G_8,G_10
0,359,359,1,AGA,1.660188,0.465,0.445,0.475,0.525,0.415,0.345
1,359,359,1,GCA,0.58108,0.535,0.555,0.525,0.475,0.585,0.655
2,1701,648,2,CGC,1.407411,0.52,0.57,0.645,0.555,0.57,0.625
3,1701,648,2,CCC,0.960728,0.48,0.43,0.355,0.445,0.43,0.375
4,1951,91,3,+CG,2.106819,0.435,0.475,0.395,0.44,0.38,0.31
5,1951,91,3,+TG,0.882981,0.565,0.525,0.605,0.56,0.62,0.69
6,2346,486,3,CTG,1.69497,0.485,0.535,0.495,0.47,0.48,0.455
7,2346,486,3,CCG,1.195389,0.515,0.465,0.505,0.53,0.52,0.545
8,2916,238,4,GAA,2.588265,0.455,0.535,0.49,0.545,0.535,0.49
9,2916,238,4,CCG,2.658506,0.545,0.465,0.51,0.455,0.465,0.51


In [95]:
# Development aid: re-import saegus.helpers so edits to the module on disk take
# effect without restarting the kernel.
# NOTE(review): the execution counter jumps from 20 to 95 here, so this cell was
# run out of sequence; it is not needed for a clean top-to-bottom run.
import importlib as imp
imp.reload(helpers)

<module 'saegus.helpers' from 'c:\\Anaconda3\\lib\\site-packages\\saegus\\helpers.py'>

In [97]:
# Plot haplotype frequency against effect for the selection scenario; the last
# two arguments are the plot title and, judging by the name, the output PDF file.
sel_plot_data = helpers.plot_frequency_vs_effect(selection_meta, sel_haplotype_table, 
                                                 'Haplotype Frequencies Under Selection',
                                                 'Selection_Haplotypes.pdf')

In [98]:
# Same plot for the drift scenario; the last two arguments are the plot title
# and, judging by the name, the output PDF file.
drift_plot_data = helpers.plot_frequency_vs_effect(drift_meta, drift_haplotype_table, 
                                                   'Haplotype Frequencies Under Drift',
                                                   'Drift_Haplotypes.pdf')

In [101]:
# Column layout for a table of chromosome segments (start/end plus drawing
# attributes); used to create the empty frame in the next cell.
df_columns = ['chrom', 'start', 'end', 'color', 'width']

In [102]:
# Empty frame with the segment columns; no rows are added in this part of the notebook.
df = pd.DataFrame(columns=df_columns)

In [103]:
# Display the (still empty) frame to confirm its column headers.
df

Unnamed: 0,chrom,start,end,color,width
