# Simulating a Population for Use in GWAS #

### Population Parameters ###

In [1]:
# Notebook setup: plotting backend, imports and global display options.
%matplotlib inline
import shelve
import simuOpt
# NOTE(review): setOptions is called before `import simuPOP`, presumably because
# simuPOP picks these options up at import time -- keep this ordering.
simuOpt.setOptions(alleleType='short', numThreads=4, optimized=True, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
# NOTE(review): `selection` is listed twice in this import; the repeat is harmless.
from wgs import breed, operators, selection, helpers, parser, parameterizer, selection
import random
# Seeding is disabled, so the random.expovariate draws used later for allele
# effects differ from run to run; uncomment for a repeatable run.
#random.seed(1337)
import numpy as np
# Print arrays with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)
import matplotlib.pyplot as plt

In [2]:


# Marker table; its 'alleles' column is decoded into integer allele codes further down.
hapmap = pd.read_csv('clean_hapmap.txt')
# Tab-separated genetic map; the cells below read its 'chr' and 'cM_pos' columns.
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                          sep='\t')

# Marker positions (cM) grouped by chromosome; chromosomes are numbered 1 through 10.
chr_cM_positions = {chromosome: [] for chromosome in range(1, 11)}
for chromosome, cM in zip(genetic_map['chr'], genetic_map['cM_pos']):
    chr_cM_positions[int(chromosome)].append(float(cM))

# The same per-chromosome lists ordered by chromosome number
# (these are the list objects stored in chr_cM_positions, not copies).
cM_positions = [chr_cM_positions[chromosome] for chromosome in range(1, 11)]


# Integer code for each allele symbol ('-' and '+' presumably denote deletion
# and insertion calls) and, below, the reverse lookup.
snp_to_integer = dict(zip('ACGT-+', range(6)))
# NOTE(review): this list is ordered A, C, T, G while the integer codes are
# ordered A, C, G, T -- confirm which order the consumers of allele_names expect.
allele_names = ['A', 'C', 'T', 'G', 'D', 'I']
integer_to_snp = {code: symbol for symbol, code in snp_to_integer.items()}


# Loci whose cM position prints with a trailing '.0' (whole-number positions).
# integral_valued_loci keeps their absolute row indices;
# relative_integral_valued_loci maps each index to its (chr, cM_pos) pair.
integral_valued_loci = []
relative_integral_valued_loci = {}
for idx in range(len(genetic_map)):
    marker = genetic_map.iloc[idx]
    if str(marker['cM_pos']).endswith('.0'):
        integral_valued_loci.append(idx)
        relative_integral_valued_loci[idx] = (marker['chr'], marker['cM_pos'])

# Locus index -> (code of first allele, code of last allele).  The first and
# last characters of each 'alleles' string are decoded (presumably strings such
# as 'A/G' -- confirm against clean_hapmap.txt).
# `.loc` does the label-based lookup that the removed `.ix` indexer performed
# on this integer-indexed frame.
allele_strings = hapmap['alleles']
alleles = {i: (snp_to_integer[allele_strings.loc[i][0]],
               snp_to_integer[allele_strings.loc[i][-1]])
           for i in range(len(hapmap))}

# One recombination rate per locus in genome order: 0.01 where the cM position
# prints with a trailing '.6', 0.0 everywhere else.
recombination_rates = [
    0.01 if str(cM)[-2:] == '.6' else 0.0
    for chromosome in cM_positions
    for cM in chromosome
]

allele_names = ['A', 'C', 'T', 'G', 'D', 'I']

# Every cM position as a single genome-ordered list.
flat_cM_positions = [cM for cMs in cM_positions for cM in cMs]


# Load the saved prefounder population and name its subpopulation 0.
nam = sim.loadPopulation('nam_prefounders.pop')
nam.setSubPopName('prefounders', 0)
# Sample size of 100 for every second generation from 0 through 20.
sample_sizes = dict.fromkeys(range(0, 21, 2), 100)

# Everything describing the genome layout, bundled for later serialisation.
genetic_structure = {
    'cM_positions': cM_positions,
    'chr_cM_positions': chr_cM_positions,
    'allele_names': allele_names,
    'integral_valued_loci': integral_valued_loci,
    'relative_integral_valued_loci': relative_integral_valued_loci,
    'alleles': alleles,
    'recombination_rates': recombination_rates,
}


# Settings for the breeding simulation; the selection and drift scenarios
# below are both constructed from these values.
sim_params = dict(
    generations_of_selection=20,
    generations_of_drift=20,
    generations_of_random_mating=3,
    operating_population_size=2000,
    proportion_of_individuals_saved=0.05,
    overshoot_as_proportion=0.50,
    individuals_per_breeding_subpop=5,
    heritability=0.7,
    meta_pop_sample_sizes=sample_sizes,
    number_of_replicates=1,
    prefounder_file_name='nam_prefounders.pop',
    # Pairs of prefounder identifiers that are crossed to form the F1.
    founders=[(3, 18), (2, 13), (7, 14), (1, 19),
              (14, 17), (1, 20), (17, 21), (9, 22)],
)

# 'qtl' is the number of QTL requested from the chooser; 'allele_effects' is
# the rate handed to random.expovariate when the effects are drawn.
qtl_params = dict(qtl=10, allele_effects=1)

# Separate, initially empty statistics containers for the two scenarios.
selection_statistics = {group: {} for group in ('aggregate', 'selected', 'non-selected')}
drift_statistics = {group: {} for group in ('aggregate', 'selected', 'non-selected')}


# Positional arguments shared by both scenarios, in constructor order.
_common_breeding_args = tuple(sim_params[key] for key in (
    'generations_of_random_mating',
    'operating_population_size',
    'proportion_of_individuals_saved',
    'overshoot_as_proportion',
    'individuals_per_breeding_subpop',
    'heritability',
    'meta_pop_sample_sizes',
    'number_of_replicates',
))

# The scenarios differ only in their class and in the generation count passed first.
s = selection.Truncation(sim_params['generations_of_selection'], *_common_breeding_args)

d = selection.Drift(sim_params['generations_of_drift'], *_common_breeding_args)


# Re-tag the individuals of the prefounder population
# (NOTE(review): presumably restarts ind_id numbering -- confirm in the simuPOP docs).
sim.tagID(nam, reset=True)

founders = sim_params['founders']
# Three replicates of the prefounders: the working population plus one
# meta-population each for the selection and drift scenarios.
replicated_nam = sim.Simulator(nam, rep=3)
# extract(0) is called three times in a row, once per target name.
# NOTE(review): simuPOP documents extract as removing the replicate from the
# simulator, so the three results should be distinct populations -- confirm.
pop, selection_meta, drift_meta = (replicated_nam.extract(0) for _ in range(3))

### Simulated Breeding Scenario ###

In [3]:
# Create the F1 from the founder pairs in sim_params['founders']; the call
# reports "Creating the F_one population from selected founders." when it runs.
s.generate_f_one(pop, recombination_rates, sim_params['founders'])

Creating the F_one population from selected founders.
Generation: 0


In [4]:
# Advance `pop` in place through the remaining pre-selection steps.  Going by
# the printed log these are: creation of the F2, "recombinatorial convergence",
# then 3 generations of interim random mating.  The order of the calls matters.
s.expand_by_selfing(pop, recombination_rates)
s.mate_and_merge(pop, recombination_rates)
s.interim_random_mating(pop, recombination_rates)

# Compute segregating-site statistics over the integral-valued loci; the
# result is read back later through pop.dvars().segSites.
sim.stat(pop, numOfSegSites=integral_valued_loci, vars=['numOfSegSites', 'segSites'])

Creating the F_two population.
Generation: 1
Initiating recombinatorial convergence at generation: 2
Generation: 2
Generation: 3
Initiating interim random mating for 3 generations.
Generation: 4
Generation: 5
Generation: 6


In [5]:
# Prefix identifying this run; it is prepended to the name of the parameter
# shelf written by the cells below.
run_id = "run_10_"

## Choose QTL and Assign Effects ##

In [6]:
# Ask the chooser for qtl_params['qtl'] loci taken from the integral-valued candidates.
# NOTE(review): the name suggests only loci still segregating in `pop` are
# eligible -- confirm in parameterizer.seg_qtl_chooser.
qtl = parameterizer.seg_qtl_chooser(pop, integral_valued_loci, qtl_params['qtl'])

# Every chosen locus together with its immediate left and right neighbour, ascending.
# NOTE(review): no bounds check, so a QTL at the first or last locus yields an
# index outside the genome.
triplet_qtl = sorted(locus + offset for locus in qtl for offset in (-1, 0, 1))

qtl_params['triplet_qtl'] = triplet_qtl

# One exponentially distributed effect per allele at each triplet locus.
effect_rate = qtl_params['allele_effects']
allele_effects = {
    tqtl: {allele: random.expovariate(effect_rate) for allele in alleles[tqtl]}
    for tqtl in triplet_qtl
}

# NOTE(review): this replaces the rate stored under 'allele_effects' with the
# effects table, so re-running this cell would hand a dict to random.expovariate.
qtl_params['allele_effects'] = allele_effects
        


# Write parameter sets to a 'shelf'.
import shelve
with shelve.open(run_id+"quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

# Second shelf: the scenario objects, the segregating sites found after random
# mating, and the parameter dictionaries.  Opened with a context manager so the
# shelf is closed -- and its contents written to disk -- when the block ends.
with shelve.open("RS_Parameter_Sets") as rsparams:
    rsparams['truncation'] = s
    rsparams['drift'] = d
    rsparams['seg_sites_after_rmating'] = list(pop.dvars().segSites)
    rsparams['qtl_parameters'] = qtl_params
    rsparams['simulation_parameters'] = sim_params
    
# Record the QTL choices in the population's own variable namespace.
pop.dvars().qtl = qtl
pop.dvars().triplet_qtl = triplet_qtl
pop.dvars().allele_effects = allele_effects

# Two replicates of the prepared population, one per scenario; extract(0) is
# called once for each target name, selection first.
selection_plus_drift_replicates = sim.Simulator(pop, rep=2)
selection_pop, drift_pop = (
    selection_plus_drift_replicates.extract(0) for _ in range(2)
)

In [7]:
# Attach the (initially empty) statistics dictionaries to each population.
# The same dict objects stay reachable as selection_statistics / drift_statistics,
# which is presumably how those names show collected values after the runs.
selection_pop.dvars().statistics = selection_statistics
drift_pop.dvars().statistics = drift_statistics

In [8]:
# Run the truncation-selection scenario; samples go into selection_meta
# (the log reports 100 individuals sampled per recorded generation).
s.recurrent_truncation_selection(selection_pop, selection_meta, triplet_qtl, allele_effects,
                                recombination_rates)

# Run the drift scenario with the same QTL and effects, sampling into drift_meta.
d.recurrent_drift_selection(drift_pop, drift_meta, triplet_qtl, allele_effects, 
                            recombination_rates)
                                
# Drop subpopulation 0 from each meta-population.
# NOTE(review): presumably the prefounder copy the meta-populations started
# from, leaving only the sampled generations -- confirm.
selection_meta.removeSubPops(0)
drift_meta.removeSubPops(0)

# Frequency helpers, one per meta-population.
selection_qtd = helpers.Frq(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtd = helpers.Frq(drift_meta, triplet_qtl, alleles, allele_effects)


# Allele-frequency tables over every locus.  The results are indexed below with
# tuple keys such as ('minor', 'alleles') and ('major', 'alleles').
selection_af = selection_qtd.allele_frequencies(selection_meta, range(selection_meta.totNumLoci()))
drift_af = drift_qtd.allele_frequencies(drift_meta, range(drift_meta.totNumLoci()))
# Ranking of the alleles at the triplet loci by effect (as named by the helper).
selection_qtalleles = selection_qtd.rank_allele_effects(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtalleles = drift_qtd.rank_allele_effects(drift_meta, triplet_qtl, alleles, allele_effects)
# Loci where the same allele was recorded as both minor and major.
selection_ties = [locus for locus in range(selection_meta.totNumLoci()) 
                  if selection_af['minor', 'alleles'][locus] == selection_af['major', 'alleles'][locus]]
drift_ties = [locus for locus in range(drift_meta.totNumLoci())
                  if drift_af['minor', 'alleles'][locus] == drift_af['major', 'alleles'][locus]]

# Tied loci carry the same allele in both the 'minor' and the 'major' slot.
# Break each tie by taking the first allele listed in the meta-population's
# alleleFreq entry as minor and the second one as major.
for st in selection_ties:
    tied_alleles = list(selection_meta.dvars().alleleFreq[st])
    # Both slots are written: assigning 'major' twice would leave the minor
    # allele untouched and the tie potentially unresolved.
    selection_af['minor', 'alleles'][st] = tied_alleles[0]
    selection_af['major', 'alleles'][st] = tied_alleles[1]
for dt in drift_ties:
    tied_alleles = list(drift_meta.dvars().alleleFreq[dt])
    drift_af['minor', 'alleles'][dt] = tied_alleles[0]
    drift_af['major', 'alleles'][dt] = tied_alleles[1]
# Number of loci whose minor and major alleles still coincide (0 expected).
# Only the second expression is displayed as the cell output.
sum(np.equal(list(selection_af['minor', 'alleles'].values()), list(selection_af['major', 'alleles'].values())))
sum(np.equal(list(drift_af['minor', 'alleles'].values()), list(drift_af['major', 'alleles'].values())))

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Generation: 10
Generation: 11
Generation: 12
Generation: 13
Generation: 14
Generation: 15
Generation: 16
Generation: 17
Generation: 18
Generation: 19
Final: Sampled 100 individuals from generation 20
Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Generation: 10
Generation: 11
Generation: 12
Generation: 13
Generation: 14
Generation: 15
Generation: 16
Generation: 17
Generation: 18
Generation: 19
Final: Sampled 100 individuals from generation 20


0

## Gather Data for Use in GWAS ##

In [None]:
# Population-structure analysis on the selection meta-population: minor-allele
# count matrix (also written to 'sim_minor_allele_count.txt'), its SVD, and a
# test statistic computed from the eigenvalues.
pca = helpers.PCA(selection_meta, range(selection_meta.totNumLoci()), selection_af)
minor_ac = pca.calculate_count_matrix(
    selection_meta,
    selection_af['minor', 'alleles'],
    'sim_minor_allele_count.txt',
)
eigendata = pca.svd(selection_meta, minor_ac)
ts = pca.test_statistic(selection_meta, eigendata['values'])

# Marker names and physical positions taken from the raw hapmap file.
raw_hmap = pd.read_csv('hapmap3.txt', sep='\t', index_col=0)
locus_names = list(raw_hmap['nearest.site'])
pos_column = list(raw_hmap['agp_pos'])
# ind_id -> label of the form RS_R1_G<generation>_I<ind_id>.
individual_names = {
    ind.ind_id: 'RS_R1_G{}_I{}'.format(int(ind.generation), int(ind.ind_id))
    for ind in selection_meta.individuals()
}

In [14]:
# Display the full allele-frequency table of the selection meta-population
# (one entry per locus under each tuple key, so the output is very long).
selection_af

{('frequencies',
  3): OrderedDict([(0, [1.0, 0]),
              (1, [1.0, 0]),
              (2, [1.0, 0]),
              (3, [1.0, 0]),
              (4, [1.0, 0]),
              (5, [1.0, 0]),
              (6, [1.0, 0]),
              (7, [0.83, 0.17]),
              (8, [0.85, 0.15]),
              (9, [0.89, 0.11]),
              (10, [0.675, 0.325]),
              (11, [0.865, 0.135]),
              (12, [0.81, 0.19]),
              (13, [0.85, 0.15]),
              (14, [1.0, 0]),
              (15, [1.0, 0]),
              (16, [1.0, 0]),
              (17, [0.96, 0.04]),
              (18, [0.96, 0.04]),
              (19, [0.86, 0.14]),
              (20, [1.0, 0]),
              (21, [0.875, 0.125]),
              (22, [1.0, 0]),
              (23, [0.725, 0.275]),
              (24, [1.0, 0]),
              (25, [0.91, 0.09]),
              (26, [0.405, 0.595]),
              (27, [0.835, 0.165]),
              (28, [0.695, 0.305]),
              (29, [1.0, 0]),
          

In [None]:
# Write the GWAS input files (hapmap, population structure, trait vector,
# kinship matrix) and the per-generation statistics table.
# NOTE(review): `input_prefix` and `run_id_prefix` are not assigned in any
# visible cell (only `run_id` is), so this cell fails on a fresh kernel.
# NOTE(review): `cols_for_hapmap` is built but not used in the visible cells.
cols_for_hapmap = {'locus_names': locus_names, 'pos_column': pos_column}
gwas = helpers.GWAS(selection_meta, individual_names, locus_names, pos_column)
hmap = gwas.hapmap_formatter(integer_to_snp, input_prefix + run_id_prefix + 'sim_hapmap.txt')
popstruct = gwas.population_structure_formatter(eigendata, input_prefix + run_id_prefix + "sim_structure.txt")
phenos = gwas.trait_formatter(input_prefix + run_id_prefix + "sim_trait_vector.txt")
kinship_matrix = gwas.calc_kinship_matrix(minor_ac, selection_af, input_prefix + run_id_prefix + "sim_kinship.txt")
# Statistics collected on selection_pop, saved as a tab-separated table.
pd.DataFrame(
    selection_pop.dvars().statistics).to_csv(input_prefix
                                             + run_id_prefix
                                             + "population_statistics.csv", sep='\t', index=True, header=True)


In [None]:
# Table describing the alleles at the triplet QTL, built from the genetic map,
# the frequency table, the recombination rates and the ranked allele effects.
quantrait_allele_table = selection_qtd.qt_allele_table(selection_meta, genetic_map, selection_af, 
                                                       triplet_qtl, recombination_rates, selection_qtalleles, 
                                             allele_effects, )

In [None]:
# Two-way lookup between absolute locus index and the marker name held in the
# 'rs' column of the formatted hapmap.
# NOTE(review): the names read as if swapped -- marker_absindex_conversion maps
# index -> marker and absindex_marker_conversion maps marker -> index.
rs_markers = hmap['rs']
marker_absindex_conversion = {idx: rs_markers[idx] for idx in rs_markers.index}
absindex_marker_conversion = {
    marker: idx for idx, marker in marker_absindex_conversion.items()
}

## Analyzing TASSEL GWAS Results ##

In [9]:
# Haplotype summary for the triplet QTL; the result is a dict with the keys
# 'loci', 'alleles', 'effect' and 'frequency'.
hz = helpers.haplotype_data(selection_pop, selection_meta, allele_effects, triplet_qtl)

In [13]:
# Haplotype frequencies stored under 'G_0' (presumably the generation-0 sample)
# for one locus triplet.  NOTE(review): the triplet is hard-coded from one
# particular QTL draw and will be missing after a different draw.
hz['frequency'][5951, 5952, 5953]['G_0']

{(0, 1, 2): 0.13, (2, 1, 2): 0.53, (2, 2, 2): 0.34}

In [17]:
# Top-level sections of the haplotype summary.
hz.keys()

dict_keys(['loci', 'alleles', 'effect', 'frequency'])

In [20]:
# NOTE(review): `gens` is not assigned in any visible cell and the saved output
# is a NameError -- define it or delete this scratch cell.
gens

NameError: name 'gens' is not defined

In [38]:
# NOTE(review): the saved output is "TypeError: unhashable type: 'list'", i.e.
# one of `loci` / `haplotype` was a list when this ran.  Both names depend on
# whichever cell last assigned them -- scratch cell, fix or delete.
hz['frequency'][loci][haplotype]

TypeError: unhashable type: 'list'

In [43]:
# Display the locus index -> (allele code, allele code) table; one entry per
# marker, so the output is very long.
alleles

{0: (0, 2),
 1: (2, 0),
 2: (2, 0),
 3: (1, 3),
 4: (1, 2),
 5: (5, 4),
 6: (2, 0),
 7: (1, 3),
 8: (2, 0),
 9: (1, 3),
 10: (1, 3),
 11: (0, 1),
 12: (3, 1),
 13: (2, 0),
 14: (3, 1),
 15: (1, 3),
 16: (1, 3),
 17: (2, 0),
 18: (2, 0),
 19: (0, 2),
 20: (1, 3),
 21: (1, 3),
 22: (4, 5),
 23: (1, 2),
 24: (2, 0),
 25: (3, 0),
 26: (3, 1),
 27: (2, 0),
 28: (4, 5),
 29: (0, 2),
 30: (2, 1),
 31: (0, 3),
 32: (0, 3),
 33: (2, 0),
 34: (0, 2),
 35: (3, 1),
 36: (3, 1),
 37: (2, 0),
 38: (0, 2),
 39: (3, 1),
 40: (1, 0),
 41: (2, 0),
 42: (1, 3),
 43: (4, 5),
 44: (0, 2),
 45: (1, 0),
 46: (2, 1),
 47: (0, 2),
 48: (3, 0),
 49: (1, 3),
 50: (1, 2),
 51: (3, 1),
 52: (1, 2),
 53: (1, 3),
 54: (1, 3),
 55: (0, 2),
 56: (1, 3),
 57: (2, 0),
 58: (1, 3),
 59: (1, 3),
 60: (2, 3),
 61: (1, 3),
 62: (4, 5),
 63: (4, 5),
 64: (3, 1),
 65: (3, 1),
 66: (1, 3),
 67: (2, 3),
 68: (1, 3),
 69: (2, 0),
 70: (1, 3),
 71: (4, 5),
 72: (1, 3),
 73: (0, 2),
 74: (3, 0),
 75: (1, 2),
 76: (2, 1),
 77: (1, 

In [46]:
# Display the statistics gathered for the selection scenario; the 'aggregate'
# entries are keyed by tuples such as ('mean', 'g', 0) and ('var', 'g', 2).
selection_statistics

{'aggregate': {('mean', 'g', 0): 72.95112954604305,
  ('mean', 'g', 2): 80.60278226862975,
  ('mean', 'g', 4): 84.99047321241609,
  ('mean', 'g', 6): 87.67373514076549,
  ('mean', 'g', 8): 88.90437833423123,
  ('mean', 'g', 10): 89.76302119581302,
  ('mean', 'g', 12): 90.20003277902514,
  ('mean', 'g', 14): 90.46019745294032,
  ('mean', 'g', 16): 90.90899777568896,
  ('mean', 'g', 18): 90.98932621130848,
  ('mean', 'g', 20): 91.13973989489836,
  ('mean', 'p', 0): 72.77687769854369,
  ('mean', 'p', 2): 80.50492692631919,
  ('mean', 'p', 4): 84.88780030348603,
  ('mean', 'p', 6): 87.51551436928898,
  ('mean', 'p', 8): 88.90726446825978,
  ('mean', 'p', 10): 89.72051553634802,
  ('mean', 'p', 12): 90.15417254457243,
  ('mean', 'p', 14): 90.44769815409846,
  ('mean', 'p', 16): 90.93071910085176,
  ('mean', 'p', 18): 91.01407067291291,
  ('mean', 'p', 20): 91.17143567284491,
  ('var', 'g', 0): 12.341933798812713,
  ('var', 'g', 2): 8.22655014002909,
  ('var', 'g', 4): 4.951047934908578,
  (

In [47]:
def store_toy_parameter_set(shelf_name='toy_parameter_set'):
    """Save the current run's allele data and statistics to a shelf.

    Reads the notebook-level names ``selection_af``, ``drift_af``,
    ``triplet_qtl``, ``allele_effects``, ``alleles``, ``haplotype``,
    ``selection_pop``, ``selection_statistics`` and ``drift_statistics``.

    Parameters
    ----------
    shelf_name : str, optional
        Path handed to ``shelve.open``.  Defaults to ``'toy_parameter_set'``.

    Raises
    ------
    NameError
        If one of the notebook-level names is not defined.  Nothing is written
        in that case.
    """
    # Resolve every value before the shelf is opened so that a missing
    # notebook variable cannot leave a half-written shelf behind.
    # NOTE(review): `haplotype` is only bound as the loop variable of the cell
    # that prints hz['frequency'], so it holds whatever the last iteration left
    # there -- confirm this is the object that should be stored.
    parameter_set = {
        'selection_af': selection_af,
        'drift_af': drift_af,
        'triplet_qtl': triplet_qtl,
        'allele_effects': allele_effects,
        'alleles': alleles,
        'haplotype': haplotype,
        'epsilon': selection_pop.dvars().epsilon,
        'selection_statistics': selection_statistics,
        'drift_statistics': drift_statistics,
    }
    with shelve.open(shelf_name) as tpset:
        for key, value in parameter_set.items():
            tpset[key] = value


In [42]:
# Print every entry of the haplotype-frequency table.
# NOTE(review): the loop variables `loci` and `haplotype` stay bound at notebook
# level afterwards; store_toy_parameter_set reads `haplotype`, so it depends on
# this cell having been run first.
for loci, haplotype in hz['frequency'].items():
    print(loci, haplotype)

(5951, 5952, 5953) {(2, 2, 2): 0.26681818181818184, 'G_18': {(0, 1, 2): 0, (2, 2, 2): 0.365, (2, 1, 2): 0.635}, 'G_0': {(0, 1, 2): 0.13, (2, 2, 2): 0.34, (2, 1, 2): 0.53}, 'G_14': {(0, 1, 2): 0, (2, 2, 2): 0.305, (2, 1, 2): 0.695}, 'G_16': {(0, 1, 2): 0, (2, 2, 2): 0.35, (2, 1, 2): 0.65}, (0, 1, 2): 0.03272727272727273, 'G_6': {(0, 1, 2): 0.025, (2, 2, 2): 0.2, (2, 1, 2): 0.775}, 'G_4': {(0, 1, 2): 0.1, (2, 2, 2): 0.175, (2, 1, 2): 0.725}, 'G_10': {(0, 1, 2): 0, (2, 2, 2): 0.14, (2, 1, 2): 0.86}, 'G_20': {(0, 1, 2): 0, (2, 2, 2): 0.4, (2, 1, 2): 0.6}, 'G_2': {(0, 1, 2): 0.105, (2, 2, 2): 0.25, (2, 1, 2): 0.645}, (2, 1, 2): 0.7004545454545454, 'G_12': {(0, 1, 2): 0, (2, 2, 2): 0.27, (2, 1, 2): 0.73}, 'G_8': {(0, 1, 2): 0, (2, 2, 2): 0.14, (2, 1, 2): 0.86}}
G_18 {}
G_0 {}
(1390, 1391, 1392) {(1, 2, 1): 0.08818181818181818, 'G_18': {(1, 2, 1): 0.01, (0, 2, 1): 0.035, (1, 1, 1): 0.955}, 'G_0': {(1, 2, 1): 0.245, (0, 2, 1): 0.12, (1, 1, 1): 0.635}, 'G_14': {(1, 2, 1): 0.025, (0, 2, 1): 0.06

In [None]:
# Persist the haplotype summary under the fixed key 'test_run'; a later run
# writing to the same shelf overwrites it.
with shelve.open("Haplotype_Data") as hdata:
    hdata['test_run'] = hz

In [None]:
# NOTE(review): same write as the one performed right after the allele effects
# are drawn; only needed if qtl_params / sim_params changed in between.
with shelve.open(run_id+"quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

In [None]:
# Display the whole haplotype-frequency table.
hz['frequency']

In [None]:
# Plot means and variances for both scenarios into a PDF.
# NOTE(review): the file name hard-codes the prefix 'run_7_' although run_id is
# set to "run_10_" in this notebook -- confirm which one is intended.
from wgs import visualization
visualization.plot_means_and_variances(selection_pop, selection_meta, drift_pop, drift_meta, 'run_7_Means_and_Variances_Meta-population.pdf')

In [None]:
# NOTE(review): neither a bare `plot_means_and_variances` nor `output_prefix` is
# defined in any visible cell; the imported visualization.plot_means_and_variances
# is called with five arguments elsewhere.  Stale cell -- update or delete.
plot_means_and_variances(pop, output_prefix+"Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
# NOTE(review): `meta`, `output_prefix` and a bare `plot_means_and_variances`
# are not defined in any visible cell (the meta-populations here are named
# selection_meta and drift_meta).  Stale cell -- update or delete.
plot_means_and_variances(meta, output_prefix+"Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
# Mean of the 'g' and 'p' information fields, overall and per subpopulation.
# NOTE(review): `meta` is not defined in any visible cell.
sim.stat(meta, meanOfInfo=['g', 'p'], vars=['meanOfInfo', 'meanOfInfo_sp'])

In [None]:
def plot_haplotype_effect_vs_frequency(haplotype_data, figure_filename, meta_pop=None):
    """Scatter haplotype effects against haplotype frequencies and save the figure.

    Seven stacked panels are drawn: one for each of the first six
    sub-populations of the meta-population (titled generations 0, 2, ..., 10)
    and a final red panel for the aggregate frequencies.

    Parameters
    ----------
    haplotype_data : dict
        Mapping with 'alleles', 'effect' and 'frequency' entries.  Effects are
        read as ``effect[htype][trip]``, aggregate frequencies as
        ``frequency[htype][trip]`` and per-sub-population frequencies as
        ``frequency[sp][htype][trip]`` with an integer ``sp``.
        NOTE(review): the ``hz`` structure displayed in this notebook keys the
        per-generation tables by strings such as 'G_0' -- confirm that the
        lookups here still match what helpers.haplotype_data returns.
    figure_filename : str
        Path the figure is saved to (300 dpi).
    meta_pop : optional
        Population whose ``numSubPop()`` gives the number of sub-populations to
        collect.  Defaults to the notebook-level ``meta``.
    """
    if meta_pop is None:
        # Backward-compatible fallback for callers that rely on the global.
        meta_pop = meta

    def _effect_frequency_rows(frequency_table):
        """Return a 2 x n array: row 0 holds frequencies, row 1 holds effects."""
        frequencies = []
        effects = []
        for htype, triplets in haplotype_data['alleles'].items():
            for trip in triplets:
                effects.append(haplotype_data['effect'][htype][trip])
                frequencies.append(frequency_table[htype][trip])
        # Sized from the data rather than a fixed width of 32 columns.
        return np.array([frequencies, effects], dtype=float)

    # Key 99 holds the aggregate table; integer keys hold the sub-populations.
    fx_vs_frq = {99: _effect_frequency_rows(haplotype_data['frequency'])}
    for sp in range(meta_pop.numSubPop()):
        fx_vs_frq[sp] = _effect_frequency_rows(haplotype_data['frequency'][sp])

    f, ax = plt.subplots(7, 1, figsize=(10, 30))
    ax[6].scatter(fx_vs_frq[99][0], fx_vs_frq[99][1], c='red')
    ax[6].grid(True)
    ax[6].set_title("Aggregate Generation")
    # NOTE(review): the panel count and generation labels are fixed, so at
    # least six sub-populations are required.
    generations = [0, 2, 4, 6, 8, 10]
    for i in range(6):
        # One random RGB colour per panel; a single-row (1, 3) array is applied
        # to every point, whereas a (3, 1) array is not a valid colour spec.
        ax[i].scatter(fx_vs_frq[i][0], fx_vs_frq[i][1], c=np.random.rand(1, 3))
        ax[i].set_xlim(-0.1, 1.1)
        ax[i].set_title("Generation {}".format(generations[i]))
        ax[i].grid(True)

    f.savefig(figure_filename, dpi=300)

In [None]:
# Variance of the 'g' and 'p' information fields, overall and per subpopulation.
# NOTE(review): `meta` is not defined in any visible cell.
sim.stat(meta, varOfInfo=['g', 'p'], vars=['varOfInfo', 'varOfInfo_sp'])

In [None]:
# Display the population-level meanOfInfo statistic.
# NOTE(review): `meta` is not defined in any visible cell.
meta.dvars().meanOfInfo

In [None]:
# Display meanOfInfo from the variables of subpopulation 1 of `pop`.
# NOTE(review): no visible cell computes meanOfInfo on `pop` -- confirm it exists.
pop.dvars(1).meanOfInfo

In [None]:
# Print the mean of the 'p' field for the first six subpopulations.
# NOTE(review): `meta` is not defined in any visible cell.
for i in range(6):
    print(meta.dvars(i).meanOfInfo['p'])

In [None]:
# NOTE(review): `meta_means` is displayed here but only assigned in a later
# cell, so this fails when the notebook is run top to bottom.
meta_means

In [None]:
# NOTE(review): `meta`, `output_prefix` and a bare `plot_means_and_variances`
# are not defined in any visible cell.  Stale cell -- update or delete.
plot_means_and_variances(meta, output_prefix+"Meta-Population_Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
# Sampled generations and, for the matching subpopulations 0-5, the mean and
# variance of the 'p' information field.
# NOTE(review): `meta` is not defined in any visible cell.
meta_gens = list(range(0, 12, 2))
meta_means = [meta.dvars(sub_pop).meanOfInfo['p'] for sub_pop in range(len(meta_gens))]
meta_vars = [meta.dvars(sub_pop).varOfInfo['p'] for sub_pop in range(len(meta_gens))]

In [None]:
# Stack generations, means and variances as the three rows of one array.
meta_plot = np.array([meta_gens, meta_means, meta_vars])

In [None]:
# Create an empty figure with a single axes; nothing is drawn on it in the
# visible cells.
f, ax = plt.subplots()

In [None]:
# Render any open figures.
plt.show()

In [None]:
# Zero-dimensional float array holding 0.0; it is not used in the visible cells.
metainfo = np.zeros(shape=())

In [None]:
# Display the per-generation phenotype means.
meta_means

In [None]:
# Display the per-generation phenotype means (repeat of the previous cell).
meta_means

In [None]:
# NOTE(review): both keys were already set to these objects in the cell that
# draws the allele effects; this cell repeats those assignments.
qtl_params['triplet_qtl'] = triplet_qtl
qtl_params['allele_effects'] = allele_effects

In [None]:
# Helper object used in the next cell to write the parameter sets to JSON files.
rw = parameterizer.ReadWrite()

In [None]:
# Write the simulation, QTL and genetic-structure parameters to three JSON files.
# NOTE(review): these dicts hold integer keys and tuple values (e.g. `alleles`);
# confirm the writer converts them into JSON-compatible forms.
rw.write_trunc_selection_parameters(sim_params, 'truncsel.json', qtl_params,
                                    'qtlparams.json', genetic_structure,
                                   'genstructure.json')

In [None]:
# NOTE(review): `id_generator` is not defined in any visible cell, and rebinding
# run_id here happens after files were already written under the earlier prefix.
run_id = id_generator()

In [None]:
from itertools import islice

In [None]:
# Every second ind_id among the first twenty individuals of selection_pop.
list(selection_pop.indInfo('ind_id'))[0:20:2]

In [None]:
import itertools

In [None]:
# All unordered pairs of ind_id values.
# NOTE(review): this materialises and displays n*(n-1)/2 pairs -- close to two
# million if selection_pop holds the configured 2000 individuals.
list(itertools.combinations(selection_pop.indInfo('ind_id'), 2))