# Simulating a Population for Use in GWAS #

### Population Parameters ###

In [1]:
%matplotlib inline
import shelve
import simuOpt
# simuOpt.setOptions is called before `import simuPOP` on purpose: the options
# (allele type, thread count, optimized build, quiet start-up) select which
# simuPOP module variant gets loaded, per simuPOP's documented usage.
simuOpt.setOptions(alleleType='short', numThreads=4, optimized=True, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
# NOTE(review): `selection` is listed twice in this import; harmless but redundant.
from wgs import breed, operators, selection, helpers, parser, parameterizer, selection
import random
import numpy as np
# Print numpy arrays without scientific notation, three decimals.
np.set_printoptions(suppress=True, precision=3)
import matplotlib.pyplot as plt

In [2]:


# Marker table; its 'alleles' column is read further down to build the
# integer allele encoding per locus.
hapmap = pd.read_csv('clean_hapmap.txt')
# Genetic map of the NAM prefounders (tab-separated). Its 'chr' and 'cM_pos'
# columns drive the per-chromosome position lists built below.
# The former `genetic_map = hapmap.ix[:, :'cM_pos']` assignment was dropped:
# it was immediately overwritten by this read and relied on the `.ix`
# indexer, which no longer exists in current pandas.
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                         sep='\t')

# cM positions grouped by chromosome number (1..10).
chr_cM_positions = {chromosome: [] for chromosome in range(1, 11)}

# Walk the two columns in lockstep instead of building a row Series per locus
# with `.iloc[idx]` (twice per row), which is very slow on a large map.
for chromosome, cM_pos in zip(genetic_map['chr'], genetic_map['cM_pos']):
    chr_cM_positions[int(chromosome)].append(float(cM_pos))


# The same positions as a list of lists ordered by chromosome number.
cM_positions = [chr_cM_positions[chromosome] for chromosome in range(1, 11)]


# Integer encoding of the allele symbols found in the hapmap 'alleles' column
# and its inverse.
snp_to_integer = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-':4, '+':5}
integer_to_snp = {0: 'A', 1:'C', 2: 'G', 3: 'T', 4: '-', 5: '+'}
# Allele names listed by integer code. The order follows integer_to_snp
# (G is 2, T is 3); 'D' and 'I' stand in for '-' and '+'.
# Defined once here -- it used to be assigned twice with G and T swapped.
allele_names = ['A', 'C', 'G', 'T', 'D', 'I']


# Loci whose cM position prints with a trailing '.0' (i.e. integral positions),
# plus a lookup from absolute locus index to its (chr, cM_pos) pair.
integral_valued_loci = []
relative_integral_valued_loci = {}
for idx in range(len(genetic_map)):
    locus_row = genetic_map.iloc[idx]  # fetch the row once instead of three times
    if str(locus_row['cM_pos'])[-2:] == '.0':
        integral_valued_loci.append(idx)
        relative_integral_valued_loci[idx] = (locus_row['chr'], locus_row['cM_pos'])

# Per-locus pair of integer-coded alleles, taken from the first and last
# character of the hapmap 'alleles' string. `hapmap` is read with the default
# 0..n-1 index, so enumerating the column matches the old `.ix[i, 'alleles']`
# lookups without using the removed `.ix` indexer.
alleles = {i: (snp_to_integer[allele_pair[0]], snp_to_integer[allele_pair[-1]])
           for i, allele_pair in enumerate(hapmap['alleles'])}

# One rate per locus in chromosome order: 0.01 where the cM position prints
# with a trailing '.6', otherwise 0.0.
recombination_rates = [0.01 if str(cM)[-2:] == '.6' else 0.0
                       for chromosome in cM_positions
                       for cM in chromosome]

# All cM positions concatenated in chromosome order.
flat_cM_positions = [cM for cMs in cM_positions for cM in cMs]


# Load the stored prefounder population and name its sub-population 0.
nam = sim.loadPopulation('nam_prefounders.pop')
nam.setSubPopName('prefounders', 0)
# 100 for every even key 0, 2, ..., 20; passed on as 'meta_pop_sample_sizes'.
sample_sizes = dict.fromkeys(range(0, 21, 2), 100)

# Bundle of the derived genetic-map structures under one name.
genetic_structure = {
    'cM_positions': cM_positions,
    'chr_cM_positions': chr_cM_positions,
    'allele_names': allele_names,
    'integral_valued_loci': integral_valued_loci,
    'relative_integral_valued_loci': relative_integral_valued_loci,
    'alleles': alleles,
    'recombination_rates': recombination_rates,
}


# Parameters of the breeding simulation; the selection.Truncation and
# selection.Drift constructors below are fed from this dict.
sim_params = dict(
    generations_of_selection=20,
    generations_of_drift=20,
    generations_of_random_mating=3,
    operating_population_size=2000,
    proportion_of_individuals_saved=0.05,
    overshoot_as_proportion=0.50,
    individuals_per_breeding_subpop=5,
    heritability=0.7,
    meta_pop_sample_sizes=sample_sizes,
    number_of_replicates=1,
    prefounder_file_name='nam_prefounders.pop',
    founders=[(3,18), (2, 13), (7, 14), (1, 19),
              (14, 17), (1, 20), (17, 21), (9, 22)],
)

# 'qtl' is the number of QTL to choose; 'allele_effects' is the value later
# handed to random.expovariate when effects are drawn.
qtl_params = dict(qtl=10, allele_effects=1)

# Independent, initially empty containers for the two breeding schemes.
selection_statistics = {group: {} for group in ('aggregate', 'selected', 'non-selected')}
drift_statistics = {group: {} for group in ('aggregate', 'selected', 'non-selected')}


# Everything after the generation count is shared by both schemes and is
# passed positionally in the same order.
_shared_scheme_args = tuple(
    sim_params[key] for key in (
        'generations_of_random_mating',
        'operating_population_size',
        'proportion_of_individuals_saved',
        'overshoot_as_proportion',
        'individuals_per_breeding_subpop',
        'heritability',
        'meta_pop_sample_sizes',
        'number_of_replicates',
    )
)

# Truncation selection runs for 'generations_of_selection' generations,
# drift for 'generations_of_drift'.
s = selection.Truncation(sim_params['generations_of_selection'], *_shared_scheme_args)

d = selection.Drift(sim_params['generations_of_drift'], *_shared_scheme_args)


# Assign individual IDs to the prefounders (reset=True).
sim.tagID(nam, reset=True)

founders = sim_params['founders']
# Three replicates of the prefounders; extract(0) is called once for each of
# the working population and the two meta-populations, in that order.
replicated_nam = sim.Simulator(nam, rep=3)
pop, selection_meta, drift_meta = (replicated_nam.extract(0) for _ in range(3))

### Simulated Breeding Scenario ###

In [3]:
# Create the F_one population in `pop` from the founder pairs listed in
# sim_params['founders'] (the captured output reports "Generation: 0").
s.generate_f_one(pop, recombination_rates, sim_params['founders'])

Creating the F_one population from selected founders.
Generation: 0
Creating the F_one population from selected founders.
Generation: 0


In [4]:
# Remaining pre-selection breeding steps, applied to `pop` in sequence. The
# captured output reports F_two creation, "recombinatorial convergence" and
# three generations of interim random mating.
s.expand_by_selfing(pop, recombination_rates)
s.mate_and_merge(pop, recombination_rates)
s.interim_random_mating(pop, recombination_rates)

# Segregating-site statistics restricted to the integral-cM loci; the results
# are stored in pop.dvars() and `segSites` is read back when parameters are
# shelved in a later cell.
sim.stat(pop, numOfSegSites=integral_valued_loci, vars=['numOfSegSites', 'segSites'])

Creating the F_two population.
Generation: 1
Initiating recombinatorial convergence at generation: 2Creating the F_two population.
Generation: 1
Initiating recombinatorial convergence at generation: 2
Generation: 2
G
Generation: 2
Generation: 3
Initiating interim random mating for 3 generations.eneration: 3
Initiating interim random mating for 3 generations.
Generation: 4
G
Generation: 4
Generation: 5
Generation: 5
Generation: 6
eneration: 6


In [5]:
# Prefix prepended to the file names written for this run (shelf and figure
# names below are built as run_id + "...").
run_id = "run_10_"

## Choose QTL and Assign Effects ##

In [6]:
# Pick qtl_params['qtl'] loci from the integral-cM candidates in `pop`.
qtl = parameterizer.seg_qtl_chooser(pop, integral_valued_loci, qtl_params['qtl'])

# Every chosen QTL is expanded to the triplet (locus-1, locus, locus+1); the
# flat list is kept sorted.
triplet_qtl = sorted(flanked
                     for locus in qtl
                     for flanked in (locus - 1, locus, locus + 1))

qtl_params['triplet_qtl'] = triplet_qtl

# qtl_params['allele_effects'] starts out as the `lambd` argument for
# random.expovariate and is replaced further down by the per-locus effects
# dict. Remember the original value under its own key the first time through,
# so re-running this cell does not pass a dict to expovariate.
effect_rate = qtl_params.setdefault('allele_effect_rate', qtl_params['allele_effects'])

# One exponentially distributed effect per allele at every triplet locus.
allele_effects = {locus: {} for locus in triplet_qtl}
for tqtl in triplet_qtl:
    for allele in alleles[tqtl]:
        allele_effects[tqtl][allele] = random.expovariate(effect_rate)

qtl_params['allele_effects'] = allele_effects


# Write parameter sets to a 'shelf'.
with shelve.open(run_id+"quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

# This shelf is intentionally left open: later cells add
# 'selection_statistics' / 'drift_statistics' and then call rsparams.close().
rsparams = shelve.open("RS_Parameter_Sets")
rsparams['truncation'] = s
rsparams['drift'] = d
rsparams['seg_sites_after_rmating'] = list(pop.dvars().segSites)
rsparams['qtl_parameters'] = qtl_params
rsparams['simulation_parameters'] = sim_params

# Make the QTL description available from the population's own namespace.
pop.dvars().qtl = qtl
pop.dvars().triplet_qtl = triplet_qtl
pop.dvars().allele_effects = allele_effects

# Two replicates of the prepared population: one for truncation selection,
# one for drift.
selection_plus_drift_replicates = sim.Simulator(pop, rep=2)
selection_pop = selection_plus_drift_replicates.extract(0)
drift_pop = selection_plus_drift_replicates.extract(0)

In [7]:
# Attach the (initially empty) statistics dicts to each population's variable
# namespace; the same dict objects are shelved after the runs.
selection_pop.dvars().statistics = selection_statistics
drift_pop.dvars().statistics = drift_statistics

In [8]:
# Run both breeding schemes on their own population / meta-population pair.
# The captured output shows 100 individuals being sampled per recorded
# generation; presumably those samples end up in the meta-populations.
s.recurrent_truncation_selection(selection_pop, selection_meta, triplet_qtl, allele_effects,
                                recombination_rates)

d.recurrent_drift_selection(drift_pop, drift_meta, triplet_qtl, allele_effects, 
                            recombination_rates)

# Drop sub-population 0 from each meta-population. Both started as replicates
# of `nam`, whose sub-population 0 was named 'prefounders'.
selection_meta.removeSubPops(0)
drift_meta.removeSubPops(0)

selection_qtd = helpers.Frq(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtd = helpers.Frq(drift_meta, triplet_qtl, alleles, allele_effects)


selection_af = selection_qtd.allele_frequencies(selection_meta, range(selection_meta.totNumLoci()))
drift_af = drift_qtd.allele_frequencies(drift_meta, range(drift_meta.totNumLoci()))
selection_qtalleles = selection_qtd.rank_allele_effects(selection_meta, triplet_qtl, alleles, allele_effects)
drift_qtalleles = drift_qtd.rank_allele_effects(drift_meta, triplet_qtl, alleles, allele_effects)

# Loci where the same allele is reported as both minor and major.
selection_ties = [locus for locus in range(selection_meta.totNumLoci()) 
                  if selection_af['minor', 'alleles'][locus] == selection_af['major', 'alleles'][locus]]
drift_ties = [locus for locus in range(drift_meta.totNumLoci())
                  if drift_af['minor', 'alleles'][locus] == drift_af['major', 'alleles'][locus]]

# Break each tie by taking the first two alleles listed in alleleFreq: the
# first becomes the minor allele, the second the major allele.
# Fix: the selection branch used to write the 'major' entry twice, so the
# 'minor' entry was never updated; it now mirrors the drift branch.
for st in selection_ties:
    observed_alleles = list(selection_meta.dvars().alleleFreq[st])
    selection_af['minor', 'alleles'][st] = observed_alleles[0]
    selection_af['major', 'alleles'][st] = observed_alleles[1]
for dt in drift_ties:
    observed_alleles = list(drift_meta.dvars().alleleFreq[dt])
    drift_af['minor', 'alleles'][dt] = observed_alleles[0]
    drift_af['major', 'alleles'][dt] = observed_alleles[1]

# Sanity check: number of loci still tied after the fix-up (expected 0).
sum(np.equal(list(selection_af['minor', 'alleles'].values()), list(selection_af['major', 'alleles'].values())))
sum(np.equal(list(drift_af['minor', 'alleles'].values()), list(drift_af['major', 'alleles'].values())))

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
GInitial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 1
Generation: 2
Generation: 2
Generation: 3
Generation: 3
Generation: 4
Generation: 4
Generation: 5
Generation: 5
Generation: 6
Generation: 6
Generation: 7
Generation: 7
Generation: 8
Generation: 8
Generation: 9
Generation: 9
Generation: 10
Generation: 10
Generation: 11
Generation: 11
Generation: 12
Generation: 12
Generation: 13
Generation: 13
Generation: 14
Generation: 14
Generation: 15
Generation: 15
Generation: 16
Generation: 16
Generation: 17
Generation: 17
Generation: 18
Generation: 18
Generation: 19
Feneration: 19
Final: Sampled 100 individuals from generation 20
Iinal: Sampled 100 individuals from generation 20
Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Gnitial: Sampled 100 individuals from generation 0 Replicate: 0.
Generation: 0
Generation: 1
Generation: 1
Gen

0

inal: Sampled 100 individuals from generation 20


0

In [None]:
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from wgs import visualization
# Writes the means/variances figure for both schemes to a PDF whose name is
# prefixed with run_id.
visualization.plot_means_and_variances(selection_pop, selection_meta, drift_pop, drift_meta, run_id+'Means_and_Variances_Meta-population.pdf')

In [9]:
# Requires the `rsparams` shelf opened in the QTL cell to still be open.
rsparams['selection_statistics'] = selection_statistics
rsparams['drift_statistics'] = drift_statistics

In [None]:
# Close the "RS_Parameter_Sets" shelf; no rsparams[...] writes are valid after this.
rsparams.close()

In [10]:
run_id

'run_10_'

'run_10_'

## Gather Data for Use in GWAS ##

In [None]:
# NOTE(review): absolute, machine-specific Windows paths; the notebook will
# not run elsewhere without editing these. Both end in a path separator so
# file names can be appended by plain string concatenation.
input_prefix = "C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\input\\"
output_prefix = "C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\output\\"
# File-name prefix for the GWAS input files; starts out equal to run_id.
run_id_prefix = run_id

In [None]:
run_id_prefix

In [11]:
# Principal-component inputs computed over every locus of the selection
# meta-population.
all_loci = range(selection_meta.totNumLoci())
pca = helpers.PCA(selection_meta, all_loci, selection_af)
minor_ac = pca.calculate_count_matrix(
    selection_meta,
    selection_af['minor', 'alleles'],
    run_id_prefix+'sim_minor_allele_count.txt',
)
eigendata = pca.svd(selection_meta, minor_ac)
ts = pca.test_statistic(selection_meta, eigendata['values'])

# Locus names and positions come from the raw hapmap file (tab-separated,
# first column used as the index).
raw_hmap = pd.read_csv('hapmap3.txt', delimiter='\t', index_col=0)
locus_names = list(raw_hmap['nearest.site'])
pos_column = list(raw_hmap['agp_pos'])

# Label per individual, keyed by ind_id: RS_R1_G<generation>_I<ind_id>.
individual_names = {}
for ind in selection_meta.individuals():
    generation_tag = '_G'+str(int(ind.generation))
    individual_tag = '_I'+str(int(ind.ind_id))
    individual_names[ind.ind_id] = 'RS_R'+str(1) + generation_tag + individual_tag

NameError: name 'run_id_prefix' is not defined

NameError: name 'run_id_prefix' is not defined

In [13]:
selection_qtd

<wgs.helpers.Frq at 0x8d8a320>

<wgs.helpers.Frq at 0x8d8a320>

In [None]:
# NOTE(review): cols_for_hapmap is built but not used anywhere in the visible notebook.
cols_for_hapmap = {'locus_names': locus_names, 
                   'pos_column': pos_column}
# Write the four GWAS input files (hapmap, population structure, trait
# vector, kinship) under input_prefix, each named with run_id_prefix.
gwas = helpers.GWAS(selection_meta, individual_names, 
                    locus_names, pos_column)
hmap = gwas.hapmap_formatter(integer_to_snp, 
                             input_prefix + run_id_prefix + 'sim_hapmap.txt')
popstruct = gwas.population_structure_formatter(eigendata, 
                                                input_prefix + run_id_prefix + "sim_structure.txt")
phenos = gwas.trait_formatter(input_prefix + run_id_prefix + "sim_trait_vector.txt")
kinship_matrix = gwas.calc_kinship_matrix(minor_ac, selection_af, 
                                          input_prefix + run_id_prefix + "sim_kinship.txt")

In [14]:
# NOTE(review): the recorded output of this cell is a TypeError -- with these
# two arguments qt_allele_table() still reports 'allele_effects' and
# 'qtrait_filename' as missing, so the method takes four positional arguments.
# The correct first two arguments cannot be determined from this notebook;
# check wgs.helpers.Frq.qt_allele_table before re-running.
quantrait_allele_table = selection_qtd.qt_allele_table(selection_af, allele_effects)

TypeError: qt_allele_table() missing 2 required positional arguments: 'allele_effects' and 'qtrait_filename'

TypeError: qt_allele_table() missing 2 required positional arguments: 'allele_effects' and 'qtrait_filename'

In [None]:
# Two-way lookup between the index labels of hmap['rs'] and the marker names
# stored in that column.
rs_column = hmap['rs']
marker_absindex_conversion = {}
absindex_marker_conversion = {}
for idx in rs_column.index:
    marker_name = hmap['rs'][idx]
    marker_absindex_conversion[idx] = marker_name
    absindex_marker_conversion[marker_name] = idx

## Analyzing TASSEL GWAS Results ##

In [None]:
run_id_prefix

In [None]:
import subprocess

# Drive a cmd.exe session through its stdin: the batch text below is sent as
# if typed, and everything the shell prints is collected from stdout.
cmdline = ["cmd", "/q", "/k", "echo on"]
cmd = subprocess.Popen(cmdline, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
# NOTE(review): the run_6 file names and absolute paths are hard-coded and do
# not follow run_id / input_prefix / output_prefix.
# Every backslash is doubled because this is a non-raw bytes literal. Two path
# separators were previously left single: "\D" (an invalid escape) and "\r" in
# "input\run_6_sim_kinship.txt", which put a carriage return into the path.
batch = b"""\
set Pathname="C:\\GWAS\\tassel-5-standalone\"
pushd %Pathname%
run_pipeline.bat -fork1 -h C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\input\\run_6_sim_hapmap.txt -fork2 -t C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\input\\run_6_sim_trait_vector.txt  -fork3 -q C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\input\\run_6_sim_structure.txt -fork4 -k C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\input\\run_6_sim_kinship.txt -combine5 -input1 -input2 -input3 -intersect -combine6 -input5 -input4 -mlm -mlmCompressionLevel None -export C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\output\\sim_run_6 -runfork1 -runfork2 -runfork3 -runfork4
exit
"""
# communicate() writes the batch, closes stdin, reads stdout to EOF and waits
# for the shell to exit (the trailing `exit` ends the /k session).
result, _ = cmd.communicate(batch)
print(result.decode())

In [None]:
%%cmd
cd C:\GWAS\tassel-5-standalone
run_pipeline.bat -fork1 -h C:\Users\DoubleDanks\Dropbox\wgs-and-beyond\gwas\simulator_results\input\run_6_sim_hapmap.txt -fork2 -t C:\Users\DoubleDanks\Dropbox\wgs-and-beyond\gwas\simulator_results\input\run_6_sim_trait_vector.txt  -fork3 -q C:\Users\DoubleDanks\Dropbox\wgs-and-beyond\gwas\simulator_results\input\run_6_sim_structure.txt -fork4 -k C:\Users\DoubleDanks\Dropbox\wgs-and-beyond\gwas\simulator_results\input\run_6_sim_kinship.txt -combine5 -input1 -input2 -input3 -intersect -combine6 -input5 -input4 -mlm -mlmCompressionLevel None -export C:\Users\DoubleDanks\Dropbox\wgs-and-beyond\gwas\simulator_results\output\sim_run_6_ -runfork1 -runfork2 -runfork3 -runfork4

In [None]:
# NOTE(review): overrides the earlier `run_id_prefix = run_id`; unlike run_id
# ("run_10_") this value has no trailing underscore.
run_id_prefix = 'run_7'

In [None]:
# NOTE(review): hard-coded absolute path to one specific TASSEL result file;
# it is not derived from output_prefix or run_id.
gwasout_name = "C:\\Users\\DoubleDanks\\Dropbox\\wgs-and-beyond\\gwas\\simulator_results\\output\\sim_run_62.txt"

In [None]:
# Load the tab-separated GWAS result table; the 'p' column is filtered on below.
gwasout = pd.read_csv(gwasout_name, sep='\t')

In [None]:
gwasout

In [None]:
# Rows of the GWAS output whose 'p' value is below 0.05.
is_significant = gwasout['p'] < 0.05
sig_results = gwasout[is_significant]

In [None]:
# Save the complete result table (tab-separated, with index and header).
gwasout.to_csv(output_prefix+'full_gwas_results.txt', sep='\t', index=True, header=True)

In [None]:
# Save only the rows with p < 0.05 (tab-separated, with index and header).
sig_results.to_csv(output_prefix+'seemingly_sig_results.txt', sep='\t', index=True, header=True)

In [None]:
sig_results

In [None]:
triplet_qtl

In [None]:
# NOTE(review): no callable named `haplotype_data` is defined or imported in
# the visible notebook (a later cell even rebinds the name to an empty dict),
# so this raises NameError on a fresh kernel. Presumably a helper from the
# wgs package was intended -- confirm and qualify the call.
hz = haplotype_data(selection_meta, allele_effects, triplet_qtl)

In [None]:
hz

In [None]:
qtl

In [None]:
# Loci triplets stored in hz['loci'], in sorted order.
sorted(hz['loci'].values())

In [None]:
# NOTE(review): rebinds `haplotype_data` -- the name an earlier cell calls as
# a function -- to an empty dict.
haplotype_data = {}

In [None]:
# NOTE(review): repeats the shelf write already done in the QTL cell;
# re-running it stores the current qtl_params / sim_params under the same keys.
import shelve
with shelve.open(run_id+"quantitative_trait_simulation_params") as qtdb:
    qtdb['qtl_params'] = qtl_params
    qtdb['sim_params'] = sim_params

In [None]:
selection_meta.dvars().haploFreq

In [None]:
selection_meta.numSubPop()

In [None]:
selection_meta.subPopSizes()

In [None]:
# Column labels 'G_0', 'G_2', ... for every second generation up to the
# current generation of selection_pop.
last_generation = selection_pop.dvars().gen
gen_labels = []
for generation in range(0, last_generation+1, 2):
    gen_labels.append('G_'+str(generation))

In [None]:
gen_labels

In [None]:
# Fixed descriptive columns followed by one column per generation label.
hapldata_columns = ['centered_on', 'loci_triplet', 'haplotype', 'effect']
hapldata_columns = hapldata_columns + gen_labels

In [None]:
hapldata_columns

In [None]:
# NOTE(review): the previous content of this cell was not valid Python (an
# unterminated list comprehension followed by an indented body, plus the
# misspelled name `hapltype_data`). It is reconstructed here from the names
# it used: one entry per (loci triplet, allele triplet) pair holding the
# centre locus, the triplets themselves, the haplotype effect, and the
# frequency taken from hz['frequency'][0]. Confirm this matches the intended
# table layout.
haplotype_data = {column: [] for column in
                  ('centered_on', 'loci_triplet', 'haplotype', 'effect', 'frequency')}
for loci_triplet in sorted(hz['loci'].values()):
    for allele_triplet in hz['alleles'][loci_triplet]:
        triplet_effect = hz['effect'][loci_triplet][allele_triplet]
        triplet_frequency = hz['frequency'][0][loci_triplet][allele_triplet]
        haplotype_data['centered_on'].append(loci_triplet[1])
        haplotype_data['loci_triplet'].append(loci_triplet)
        haplotype_data['haplotype'].append(allele_triplet)
        haplotype_data['effect'].append(triplet_effect)
        haplotype_data['frequency'].append(triplet_frequency)

In [None]:
# Persist the haplotype summary.
# NOTE(review): the key 'sim_run_6' is hard-coded and does not follow run_id.
with shelve.open("Haplotype_Data") as hdata:
    hdata['sim_run_6'] = hz

In [None]:
len(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'])

In [None]:
def plot_haplotype_effect_vs_frequency(haplotype_data, figure_filename, meta_population=None):
    """Scatter haplotype effect against haplotype frequency and save the figure.

    One panel is drawn for each of the first six sub-populations (titled
    generation 0, 2, ..., 10) plus a seventh "Aggregate Generation" panel.

    Parameters
    ----------
    haplotype_data : dict
        Must provide ``'alleles'`` (haplotype -> iterable of allele triplets),
        ``'effect'`` (``[haplotype][triplet]`` -> effect) and ``'frequency'``,
        which is indexed both as ``[haplotype][triplet]`` for the aggregate
        panel and as ``[subpop][haplotype][triplet]`` per sub-population.
    figure_filename : str
        Path the figure is written to (300 dpi).
    meta_population : optional
        Object whose ``numSubPop()`` gives the number of sub-populations to
        collect. When omitted, the notebook-level name ``meta`` is used, as
        before.
    """
    if meta_population is None:
        # Backward-compatible fallback to the global the function used to read.
        meta_population = meta

    # (haplotype, triplet) pairs in one fixed order shared by effects and
    # frequencies, so that index k refers to the same haplotype in both rows.
    haplotype_triplets = [(htype, trip)
                          for htype, triplets in haplotype_data['alleles'].items()
                          for trip in triplets]
    segregating_effects = [haplotype_data['effect'][htype][trip]
                           for htype, trip in haplotype_triplets]

    def _frequency_effect_rows(frequency_table):
        """Return a (2, n) float array: row 0 frequencies, row 1 effects."""
        haplotype_frequencies = [frequency_table[htype][trip]
                                 for htype, trip in haplotype_triplets]
        # Sized from the data instead of a fixed (2, 32) buffer, so any number
        # of haplotypes works.
        return np.array([haplotype_frequencies, segregating_effects], dtype=float)

    # Key 99 holds the aggregate data; integer keys hold each sub-population.
    fx_vs_frq = {99: _frequency_effect_rows(haplotype_data['frequency'])}
    for sp in range(meta_population.numSubPop()):
        fx_vs_frq[sp] = _frequency_effect_rows(haplotype_data['frequency'][sp])

    f, ax = plt.subplots(7, 1, figsize=(10,30))
    ax[6].scatter(fx_vs_frq[99][0], fx_vs_frq[99][1], c='red')
    ax[6].grid(True)
    ax[6].set_title("Aggregate Generation")
    generations = [0, 2, 4, 6, 8, 10]
    for i in range(6):
        # A single random RGB colour per panel. scatter() expects a one-row 2-D
        # array for that; a (3, 1) array is not a valid colour specification.
        ax[i].scatter(fx_vs_frq[i][0], fx_vs_frq[i][1], c=np.random.rand(1, 3))
        ax[i].set_xlim(-0.1, 1.1)
        ax[i].set_title("Generation {}".format(generations[i]))
        ax[i].grid(True)

    f.savefig(figure_filename, dpi=300)

In [None]:
# NOTE(review): same call as the earlier means/variances cell, but with a
# hard-coded 'run_7_' file-name prefix instead of run_id.
from wgs import visualization
visualization.plot_means_and_variances(selection_pop, selection_meta, drift_pop, drift_meta, 'run_7_Means_and_Variances_Meta-population.pdf')

In [None]:
plot_haplotype_effect_vs_frequency(hz, 'test_htypevsfrq.pdf')

In [None]:
# NOTE(review): bare `plot_means_and_variances` is not defined in the visible
# notebook (only visualization.plot_means_and_variances is used elsewhere, with
# five arguments); this cell fails on a fresh kernel.
plot_means_and_variances(pop, output_prefix+"Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
# NOTE(review): neither `plot_means_and_variances` nor `meta` is defined in the
# visible notebook; writes to the same file name as the previous cell.
plot_means_and_variances(meta, output_prefix+"Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
# Mean of information fields 'g' and 'p', overall and per sub-population.
# NOTE(review): `meta` is not assigned anywhere in the visible notebook.
sim.stat(meta, meanOfInfo=['g', 'p'], vars=['meanOfInfo', 'meanOfInfo_sp'])

In [None]:
# Variance of information fields 'g' and 'p', overall and per sub-population.
# NOTE(review): `meta` is not assigned anywhere in the visible notebook.
sim.stat(meta, varOfInfo=['g', 'p'], vars=['varOfInfo', 'varOfInfo_sp'])

In [None]:
meta.dvars().meanOfInfo

In [None]:
pop.dvars(1).meanOfInfo

In [None]:
# Print the mean of 'p' for sub-populations 0..5 of `meta` (needs the
# meanOfInfo_sp statistics computed by the sim.stat cell above).
for i in range(6):
    print(meta.dvars(i).meanOfInfo['p'])

In [None]:
meta_means

In [None]:
# NOTE(review): neither `plot_means_and_variances` nor `meta` is defined in the
# visible notebook; this cell fails on a fresh kernel.
plot_means_and_variances(meta, output_prefix+"Meta-Population_Means_and_Variances_of_Phenotype_Over_Time.pdf")

In [None]:
# Generation numbers paired with the mean and variance of 'p' in
# sub-populations 0..5 of `meta`.
meta_gens = list(range(0, 11, 2))
meta_means = list(map(lambda sp: meta.dvars(sp).meanOfInfo['p'], range(6)))
meta_vars = list(map(lambda sp: meta.dvars(sp).varOfInfo['p'], range(6)))

In [None]:
# Stack generations, means and variances as the three rows of one array.
meta_plot = np.array([meta_gens, meta_means, meta_vars])

In [None]:
f, ax = plt.subplots()

In [None]:
plt.show()

In [None]:
# NOTE(review): an empty shape tuple yields a 0-dimensional array holding a
# single 0.0; `metainfo` is not used elsewhere in the visible notebook.
metainfo = np.zeros(())

In [None]:
meta_means

In [None]:
meta_means

In [None]:
# Re-store the triplet loci and per-allele effects in qtl_params (the QTL cell
# already performs the same two assignments).
qtl_params['triplet_qtl'] = triplet_qtl
qtl_params['allele_effects'] = allele_effects

In [None]:
# Helper object used in the next cell to write the parameter sets to JSON files.
rw = parameterizer.ReadWrite()

In [None]:
# Arguments alternate between a parameter set and the file name it is paired
# with: simulation parameters, QTL parameters, genetic structure.
rw.write_trunc_selection_parameters(sim_params, 'truncsel.json', qtl_params,
                                    'qtlparams.json', genetic_structure,
                                   'genstructure.json')

In [None]:
import shelve

# Copy every entry of the "RS_Parameter_Sets" shelf into a plain dict so the
# values remain usable after the shelf is closed.
with shelve.open('RS_Parameter_Sets') as loaded:
    stored_parameters = dict(loaded.items())

stored_parameters.keys()