# Generate Data for TASSEL GWAS

In [1]:
%matplotlib inline
import shelve
# simuOpt.setOptions has to be called before simuPOP is imported (directly or
# through saegus): the options select which simuPOP module variant is loaded.
import simuOpt
simuOpt.setOptions(alleleType='short', optimized=True, numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
# NOTE(review): `selection` appears twice in this import list; harmless but redundant.
from saegus import breed, operators, selection, helpers, parser, parameterizer, selection
import random
# NOTE(review): the seed is disabled, so the random.expovariate allele effects
# drawn later in this notebook differ from run to run.
#random.seed(1337)
import numpy as np
np.set_printoptions(suppress=True, precision=3)
import matplotlib.pyplot as plt

In [2]:
# Cleaned HapMap table; its 'alleles' column is read later in the notebook.
hapmap = pd.read_csv('clean_hapmap.txt')

# Genetic map of the prefounders (tab-separated); the 'chr' and 'cM_pos'
# columns are read by the following cells.
# The former `genetic_map = hapmap.ix[:, :'cM_pos']` was overwritten by this
# read straight away and relied on `.ix`, which was removed in pandas 1.0.
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                          sep='\t')

# Raw HapMap file, tab-delimited with the first column as index. The earlier
# comma-separated read of the same file was discarded immediately, so it is gone.
raw_hmap = pd.read_csv('hapmap3.txt', delimiter='\t', index_col=0)
locus_names = list(raw_hmap['nearest.site'])
pos_column = list(raw_hmap['agp_pos'])

# Group the cM position of every genetic-map row by chromosome (keys 1..10),
# keeping the rows in file order.
chr_cM_positions = {chromosome: [] for chromosome in range(1, 11)}
for chromosome, cM in zip(genetic_map['chr'], genetic_map['cM_pos']):
    chr_cM_positions[int(chromosome)].append(float(cM))

# The same per-chromosome lists (shared objects), ordered chromosome 1 to 10.
cM_positions = [chr_cM_positions[chromosome] for chromosome in range(1, 11)]

In [3]:
# Integer code for each HapMap allele symbol ('-' and '+' presumably denote
# deletion and insertion -- confirm against the data source).
snp_to_integer = dict(zip('ACGT-+', range(6)))
# NOTE(review): this list has 'T' before 'G', whereas the integer codes map
# 2 -> 'G' and 3 -> 'T'. Confirm which order consumers of allele_names expect.
allele_names = list('ACTGDI')
# Inverse lookup of snp_to_integer.
integer_to_snp = {code: symbol for symbol, code in snp_to_integer.items()}


# Row indices of genetic-map loci whose cM position is a whole number, plus a
# lookup from each such index to its (chromosome, cM position) pair.
integral_valued_loci = []
relative_integral_valued_loci = {}
for idx, (chromosome, cM) in enumerate(zip(genetic_map['chr'], genetic_map['cM_pos'])):
    # float.is_integer() states the intent directly; the previous test on the
    # last two characters of str(value) depended on how the number is printed.
    if float(cM).is_integer():
        integral_valued_loci.append(idx)
        relative_integral_valued_loci[idx] = (chromosome, cM)

# Map each hapmap row number to a pair of integer allele codes taken from the
# first and last character of that row's 'alleles' string.
# Iterating the column replaces `hapmap.ix[i, 'alleles']`; `.ix` was removed in
# pandas 1.0. With the default integer index of `hapmap` both give the same rows.
alleles = {locus: (snp_to_integer[allele_pair[0]], snp_to_integer[allele_pair[-1]])
           for locus, allele_pair in enumerate(hapmap['alleles'])}

# One rate per locus, chromosome by chromosome: 0.01 where the printed cM
# position ends in '.6', otherwise 0.0.
recombination_rates = [
    0.01 if str(cM)[-2:] == '.6' else 0.0
    for chromosome in cM_positions
    for cM in chromosome
]

# Same value as the allele_names assignment earlier in this cell.
allele_names = ['A', 'C', 'T', 'G', 'D', 'I']

# All cM positions in a single list, chromosome after chromosome.
flat_cM_positions = [cM for cMs in cM_positions for cM in cMs]

In [4]:
# Load the saved prefounder population, re-tag its individuals' IDs and name
# subpopulation 0 'prefounders'.
nam = sim.loadPopulation('nam_prefounders.pop')
sim.tagID(nam, reset=True)
nam.setSubPopName('prefounders', 0)

# A sample size of 100 for every even key 0, 2, ..., 20.
sample_sizes = dict.fromkeys(range(0, 21, 2), 100)

# Bundle of the genetic-map derived structures built in the previous cells.
genetic_structure = {
    'cM_positions': cM_positions,
    'chr_cM_positions': chr_cM_positions,
    'allele_names': allele_names,
    'integral_valued_loci': integral_valued_loci,
    'relative_integral_valued_loci': relative_integral_valued_loci,
    'alleles': alleles,
    'recombination_rates': recombination_rates,
}

In [5]:
# Settings passed to the selection/drift schemes and stored with the results.
sim_params = dict(
    generations_of_selection=10,
    generations_of_drift=10,
    generations_of_random_mating=3,
    operating_population_size=2000,
    proportion_of_individuals_saved=0.05,
    overshoot_as_proportion=0.50,
    individuals_per_breeding_subpop=5,
    heritability=0.7,
    meta_pop_sample_sizes=sample_sizes,
    number_of_replicates=1,
    prefounder_file_name='nam_prefounders.pop',
    # Pairs handed to generate_f_one as the founder crosses.
    founders=[(3, 18), (2, 13), (7, 14), (1, 19),
              (14, 17), (1, 20), (17, 21), (9, 22)],
)

In [6]:
# 'qtl' is the number of QTL requested from seg_qtl_chooser; 'allele_effects'
# is the value passed to random.expovariate when effects are drawn.
qtl_params = dict(qtl=10, allele_effects=1)

# Separate, initially empty result containers for the two scenarios.
selection_statistics = {group: {} for group in ('aggregate', 'selected', 'non-selected')}
drift_statistics = {group: {} for group in ('aggregate', 'selected', 'non-selected')}

In [7]:
# Positional arguments that follow the generation count; both schemes receive
# exactly the same values.
scheme_args = tuple(sim_params[key] for key in (
    'generations_of_random_mating',
    'operating_population_size',
    'proportion_of_individuals_saved',
    'overshoot_as_proportion',
    'individuals_per_breeding_subpop',
    'heritability',
    'meta_pop_sample_sizes',
    'number_of_replicates',
))

# Truncation-selection scenario and its drift counterpart.
s = selection.Truncation(sim_params['generations_of_selection'], *scheme_args)
d = selection.Drift(sim_params['generations_of_drift'], *scheme_args)

In [8]:
founders = sim_params['founders']

# Three copies of the prefounders; stealPops=False keeps `nam` itself usable.
replicated_nam = sim.Simulator(nam, rep=3, stealPops=False)

# Each extract(0) takes the first population still held by the simulator, so
# the three names receive three separate copies.
pop, selection_meta, drift_meta = (replicated_nam.extract(0) for _ in range(3))

### Run MAGIC Mating Scheme

In [9]:
# F1 crosses between the founder pairs come first ...
s.generate_f_one(pop, recombination_rates, sim_params['founders'])

# ... followed, in this order, by the remaining mating stages, which all take
# the population and the per-locus recombination rates.
for mating_stage in (s.recombinatorial_convergence,
                     s.expand_by_selfing,
                     s.interim_random_mating):
    mating_stage(pop, recombination_rates)

Creating the F_one population from selected founders.
Generation: 0
Generation: 1	popSize: 8
Generation: 2	popSize: 4
Generation: 3	popSize: 2
Creating the F_two population.
Generation: 4
Initiating interim random mating for 3 generations.
Generation: 5
Generation: 6
Generation: 7


In [10]:
# Compute segregating-site statistics over the integral-cM loci, then pick
# qtl_params['qtl'] loci with parameterizer.seg_qtl_chooser
# (presumably restricted to segregating loci -- confirm in saegus).
sim.stat(pop, numOfSegSites=integral_valued_loci, vars=['numOfSegSites', 'segSites'])
qtl = parameterizer.seg_qtl_chooser(pop, integral_valued_loci, qtl_params['qtl'])

# Every QTL contributes itself and its two flanking loci.
# NOTE(review): a QTL at the first or last locus index yields a neighbour
# outside the locus range, which fails at `alleles[tqtl]` below.
triplet_qtl = sorted(neighbour
                     for locus in qtl
                     for neighbour in (locus - 1, locus, locus + 1))

qtl_params['triplet_qtl'] = triplet_qtl

# 'allele_effects' starts out as the exponential rate and is replaced by the
# drawn effects at the end of this cell. Remember the rate under its own key
# so that re-running the cell does not pass a dict to random.expovariate.
effect_rate = qtl_params.setdefault('allele_effects_rate', qtl_params['allele_effects'])

# One exponentially distributed effect per allele at each triplet locus.
allele_effects = {locus: {} for locus in triplet_qtl}
for tqtl in triplet_qtl:
    for allele in alleles[tqtl]:
        allele_effects[tqtl][allele] = random.expovariate(effect_rate)

qtl_params['allele_effects'] = allele_effects

simuPOP.simuPOP_op.Simulator

In [11]:
# Attach the trait architecture to the population before it is replicated.
pop.dvars().qtl = qtl
pop.dvars().triplet_qtl = triplet_qtl
pop.dvars().allele_effects = allele_effects

# stealPops=False (as for the prefounders) copies `pop` instead of taking over
# its contents: the shelve cell later reads pop.dvars().segSites, which would
# no longer be there if the simulator had stolen the population.
selection_plus_drift_replicates = sim.Simulator(pop, rep=2, stealPops=False)
# Each extract(0) takes the first population still held by the simulator.
selection_pop = selection_plus_drift_replicates.extract(0)
drift_pop = selection_plus_drift_replicates.extract(0)

selection_pop.dvars().statistics = selection_statistics
drift_pop.dvars().statistics = drift_statistics

# Run both scenarios; the *_meta populations are passed alongside and are the
# ones analysed in the following cells.
s.recurrent_truncation_selection(selection_pop, selection_meta, triplet_qtl, allele_effects,
                                 recombination_rates)
d.recurrent_drift_selection(drift_pop, drift_meta, triplet_qtl, allele_effects,
                            recombination_rates)

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Final: Sampled 100 individuals from generation 10
Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 2
Generation: 3
Generation: 4
Generation: 5
Generation: 6
Generation: 7
Generation: 8
Generation: 9
Final: Sampled 100 individuals from generation 10


In [17]:
# removeSubPops(0) is not idempotent, so refuse to run this cell a second time.
assert selection_meta.numSubPop() == 7, "Empty subpopulation has already been removed."
selection_meta.removeSubPops(0)
drift_meta.removeSubPops(0)

# Allele frequencies over all loci and ranked allele effects at the triplet
# loci, computed separately for the selection and drift meta-populations.
selection_qtd = helpers.Frq(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtd = helpers.Frq(drift_meta, triplet_qtl, alleles, allele_effects)
selection_af = selection_qtd.allele_frequencies(selection_meta, range(selection_meta.totNumLoci()))
drift_af = drift_qtd.allele_frequencies(drift_meta, range(drift_meta.totNumLoci()))
selection_qtalleles = selection_qtd.rank_allele_effects(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtalleles = drift_qtd.rank_allele_effects(drift_meta, triplet_qtl, alleles, allele_effects)

# Loci where the same allele was reported as both minor and major.
selection_ties = [locus for locus in range(selection_meta.totNumLoci())
                  if selection_af['minor', 'alleles'][locus] == selection_af['major', 'alleles'][locus]]
drift_ties = [locus for locus in range(drift_meta.totNumLoci())
              if drift_af['minor', 'alleles'][locus] == drift_af['major', 'alleles'][locus]]

# Break each tie by taking the first two alleles recorded in alleleFreq as
# minor and major. The selection branch used to assign 'major' twice, which
# left 'minor' untouched and the tie unresolved.
for st in selection_ties:
    tied_alleles = list(selection_meta.dvars().alleleFreq[st])
    selection_af['minor', 'alleles'][st] = tied_alleles[0]
    selection_af['major', 'alleles'][st] = tied_alleles[1]
for dt in drift_ties:
    tied_alleles = list(drift_meta.dvars().alleleFreq[dt])
    drift_af['minor', 'alleles'][dt] = tied_alleles[0]
    drift_af['major', 'alleles'][dt] = tied_alleles[1]

# Count the loci that are still tied. The selection count used to be computed
# and thrown away; both are now checked, and the drift count stays the cell's
# displayed value.
selection_ties_left = sum(np.equal(list(selection_af['minor', 'alleles'].values()),
                                   list(selection_af['major', 'alleles'].values())))
drift_ties_left = sum(np.equal(list(drift_af['minor', 'alleles'].values()),
                               list(drift_af['major', 'alleles'].values())))
assert selection_ties_left == 0, "Unresolved minor/major ties in the selection meta-population."
assert drift_ties_left == 0, "Unresolved minor/major ties in the drift meta-population."
drift_ties_left

0

In [31]:
# Scratch check: prints 'true' whenever the population's `rep` variable is
# falsy -- note that a value of 0 is falsy as well.
if not selection_pop.dvars().rep:
    print('true')

true


In [22]:
# Show the `rep` variable stored with the selection population.
selection_pop.dvars().rep

0

In [23]:
# Show the `rep` variable stored with the drift population.
drift_pop.dvars().rep

0

In [None]:
# PCA helper over every locus of the selection meta-population.
pca = helpers.PCA(selection_meta, range(selection_meta.totNumLoci()), selection_qtd)
# Count matrix built from the minor-allele table; the third argument is a file
# name (presumably the matrix is also written there -- confirm in saegus).
minor_ac = pca.calculate_count_matrix(selection_meta, selection_af['minor', 'alleles'], 
                                      'minor_allele_count.txt')
# Decomposition of the count matrix via the helper's svd method.
eigendata = pca.svd(selection_meta, minor_ac)

In [None]:
# Label per individual: 'RS_R<replicate>_G<generation>_I<id>'.
# NOTE(review): the replicate number is fixed to 1 here -- confirm that this
# is intended, given that the populations' `rep` variable is 0.
individual_names = {
    ind.ind_id: ''.join(['RS_R', str(1),
                         '_G', str(int(ind.generation)),
                         '_I', str(int(ind.ind_id))])
    for ind in selection_meta.individuals()
}

# Produce the GWAS inputs; each call is given the name of its output file.
gwas = helpers.GWAS(selection_meta, individual_names, locus_names, pos_column)
hmap = gwas.hapmap_formatter(integer_to_snp, 'rs_simulated_hapmap.txt')
phenos = gwas.trait_formatter('rs_phenotype_vector.txt')
kinship_matrix = gwas.calc_kinship_matrix(minor_ac, selection_af, 'rs_kinship_matrix.txt')

In [None]:
# Inspect the statistics dict that was attached to the selection population.
selection_pop.dvars().statistics

In [None]:
# Inspect the `epsilon` variable of the selection population
# (not set in this notebook; presumably written by the selection routine -- confirm).
selection_pop.dvars().epsilon

In [None]:
# Number of individuals in the selection population.
selection_pop.popSize()

In [None]:
# Number of individuals in the selection meta-population.
selection_meta.popSize()

In [None]:
# Names of all variables stored with the selection meta-population.
selection_meta.vars().keys()

### Storing Data in Shelves

In [None]:
# `shelve` is already imported in the notebook's first cell.

# NOTE(review): `run_id` is not assigned anywhere in this notebook, so this
# cell failed with NameError on a fresh kernel. Fall back to an empty prefix
# unless a run_id has been defined; set it explicitly to tag the output file.
run_id = globals().get('run_id', '')

# Parameters of this run, stored under a file name prefixed with run_id.
with shelve.open(run_id + "quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

# Scheme objects, segregating sites and parameters. The context manager closes
# the shelf so the entries are written out; it was previously left open.
with shelve.open("RS_Parameter_Sets") as rsparams:
    rsparams['truncation'] = s
    rsparams['drift'] = d
    rsparams['seg_sites_after_rmating'] = list(pop.dvars().segSites)
    rsparams['qtl_parameters'] = qtl_params
    rsparams['simulation_parameters'] = sim_params

In [None]:
# Haplotype data at the triplet loci for each meta-population.
sel_htypes = helpers.collect_haplotype_data(selection_meta, allele_effects, triplet_qtl)
drift_htypes = helpers.collect_haplotype_data(drift_meta, allele_effects, triplet_qtl)
# Tabulate the collected haplotype data per meta-population.
drift_haplotype_table = helpers.generate_haplotype_data_table(drift_meta, drift_htypes)
sel_haplotype_table = helpers.generate_haplotype_data_table(selection_meta, sel_htypes)
# Frequency-versus-effect plots; the last two arguments are the plot title and
# a PDF file name (presumably the figure is saved there -- confirm in saegus).
sel_plot_data = helpers.plot_frequency_vs_effect(selection_meta, sel_haplotype_table, 
                                                 'Haplotype Frequencies Under Selection',
                                                 'Selection_Haplotypes.pdf')
drift_plot_data = helpers.plot_frequency_vs_effect(drift_meta, drift_haplotype_table, 
                                                   'Haplotype Frequencies Under Drift',
                                                   'Drift_Haplotypes.pdf')