## Fourth Gen Simulation Script

In [1]:
pwd

'C:\\Users\\DoubleDanks\\BISB\\wisser\\code\\rjwlab-scripts\\saegus_project\\devel\\jupyternbs'

In [2]:
cd C:\Users\DoubleDanks\BISB\wisser\code\rjwlab-scripts\saegus_project\devel\data_dump\fourth_generation_simulated_gwas\rs_L10_H07

C:\Users\DoubleDanks\BISB\wisser\code\rjwlab-scripts\saegus_project\devel\data_dump\fourth_generation_simulated_gwas\rs_L10_H07


In [3]:
cd simulation_parameters

C:\Users\DoubleDanks\BISB\wisser\code\rjwlab-scripts\saegus_project\devel\data_dump\fourth_generation_simulated_gwas\rs_L10_H07\simulation_parameters


### Import Modules and Load Parameters

In [4]:
import simuOpt
simuOpt.setOptions(alleleType='short', optimized=True, numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
from saegus import breed, operators, simulate, analyze, parse, parameters
import random
import copy
import yaml
import numpy as np
import os
np.set_printoptions(suppress=True, precision=3)


# Tab-separated genetic map of the NAM prefounders, read from the current
# working directory.
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                          sep='\t')


# yaml.load() requires an explicit Loader on PyYAML >= 6 (and warns on 5.x).
# yaml.Loader is the loader older PyYAML releases used by default, so files
# that loaded before still load.
# NOTE(security): yaml.Loader can construct arbitrary Python objects; only use
# it on parameter files you generated yourself. Switch to yaml.safe_load if
# the files contain plain YAML types only.
with open('rs_L10_H07_universal_parameters.yaml', 'r') as uparms:
    u_parameters = yaml.load(uparms, Loader=yaml.Loader)

with open('general_genetic_map_parameters.yaml', 'r') as ggmap:
    general_genetic_map_params = yaml.load(ggmap, Loader=yaml.Loader)




### Assign Parameters

In [5]:
# Unpack the run-wide ("universal") parameters into notebook-level names.
(locus_names,
 pos_column,
 allele_names,
 snp_to_integer,
 integer_to_snp) = (
    u_parameters[key]
    for key in ('locus_names', 'pos_column', 'allele_names',
                'snp_to_integer', 'integer_to_snp')
)

# Unpack the genetic-map parameters the same way.
(alleles,
 chr_cM_positions,
 cM_positions,
 integral_valued_loci,
 relative_integral_valued_loci,
 recombination_rates) = (
    general_genetic_map_params[key]
    for key in ('alleles', 'chr_cM_positions', 'cM_positions',
                'integral_valued_loci', 'relative_integral_valued_loci',
                'recombination_rates')
)

### Direct Input/Output

In [6]:
# Root directory of all fourth-generation simulated GWAS output and the
# sub-directory belonging to this run.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
proto_prefix = 'C:\\Users\\DoubleDanks\\BISB\\wisser\\code\\rjwlab-scripts\\saegus_project\\devel\\data_dump\\fourth_generation_simulated_gwas\\'
run_prefix = 'rs_L10_H07\\'
run_dir = proto_prefix + run_prefix

# One directory per kind of artefact produced or consumed by the run.
tassel_input_dir_prefix = run_dir + 'tassel_input\\'
tassel_output_dir_prefix = run_dir + 'tassel_output\\'
tassel_config_prefix = run_dir + 'tassel_config_files\\'
various_simulation_info_prefix = run_dir + 'simulation_data\\'
populations_prefix = run_dir + 'populations\\'
parameter_prefix = run_dir + 'simulation_parameters\\'
ind_names_prefix = run_dir + 'ind_names\\'

# All four TASSEL input files live in the same directory.
hapmap_file_dir = tassel_input_dir_prefix
phenos_file_dir = tassel_input_dir_prefix
kinship_file_dir = tassel_input_dir_prefix
pop_struct_file_dir = tassel_input_dir_prefix

# Replicate-0 file names. The kinship matrix uses the '.txt' extension so the
# recorded name matches the file the analysis loop writes
# (prefix + 'kinship_matrix.txt'); it was previously recorded as '.pop'.
hapmap_file_name = tassel_input_dir_prefix + 'rs_L10_H07_R0_simulated_hapmap.txt'
phenos_file_name = tassel_input_dir_prefix + 'rs_L10_H07_R0_phenotype_vector.txt'
kinship_file_name = tassel_input_dir_prefix + 'rs_L10_H07_R0_kinship_matrix.txt'
pop_struct_file_name = tassel_input_dir_prefix + 'rs_L10_H07_R0_structure_matrix.txt'

# Fix: 'tassel_input_dir_prefix' used to be filled with the *output* directory.
# NOTE(review): various_simulation_info_prefix is not recorded here - confirm
# whether consumers of io_parameters.yaml need it.
io_parameters = dict(proto_prefix=proto_prefix,
                     run_prefix=run_prefix,
                     tassel_input_dir_prefix=tassel_input_dir_prefix,
                     tassel_output_dir_prefix=tassel_output_dir_prefix,
                     tassel_config_prefix=tassel_config_prefix,
                     populations_prefix=populations_prefix,
                     parameter_prefix=parameter_prefix,
                     ind_names_prefix=ind_names_prefix,
                     hapmap_file_dir=hapmap_file_dir,
                     phenos_file_dir=phenos_file_dir,
                     kinship_file_dir=kinship_file_dir,
                     pop_struct_file_dir=pop_struct_file_dir,
                     hapmap_file_name=hapmap_file_name,
                     phenos_file_name=phenos_file_name,
                     kinship_file_name=kinship_file_name,
                     pop_struct_file_name=pop_struct_file_name
                    )

# Written relative to the current working directory.
with open('io_parameters.yaml', 'w') as tiop:
    yaml.dump(io_parameters, tiop)

In [7]:
# Load the prefounder population named in the universal parameters, restart
# individual ID tagging, and label its subpopulation 0.
prefounder_file_name = u_parameters['prefounder_file_name']
nam = sim.loadPopulation(prefounder_file_name)
sim.tagID(nam, reset=True)
nam.setSubPopName('maize_nam_prefounders', 0)

In [8]:
# Template for per-replicate statistics: one empty dict per category.
selection_statistics = {
    category: {} for category in ('aggregate', 'selected', 'non-selected')
}

# One (initially empty) name mapping per replicate.
ind_names_for_gwas = {
    rep: {} for rep in range(u_parameters['number_of_replicates'])
}

# Sampling schedule: every second generation (0, 2, ...) up to and including
# the last generation of selection maps to a sample size of 100.
u_parameters['meta_pop_sample_sizes'] = dict.fromkeys(
    range(0, u_parameters['generations_of_selection'] + 1, 2), 100)

# Positional arguments of simulate.Truncation, in the order it expects them.
truncation_argument_keys = (
    'generations_of_selection',
    'generations_of_random_mating',
    'operating_population_size',
    'proportion_of_individuals_saved',
    'overshoot_as_proportion',
    'individuals_per_breeding_subpop',
    'heritability',
    'meta_pop_sample_sizes',
    'number_of_replicates',
)
s = simulate.Truncation(*(u_parameters[key] for key in truncation_argument_keys))

In [9]:
# Start from a fresh, empty name mapping for every replicate index.
ind_names_for_gwas = {
    rep: {} for rep in range(u_parameters['number_of_replicates'])
}

In [10]:
founders = u_parameters['founders']

# Build a two-replicate simulator from `nam` (stealPops=False) and take
# replicate 0 as the working population.
# NOTE(review): only replicate 0 is extracted - confirm rep=2 is needed.
replicated_nam = sim.Simulator(nam, rep=2, stealPops=False)
pop = replicated_nam.extract(0)

# Breeding pipeline run on the working population, in order: F1 from the
# founders, recombinatorial convergence, expansion by selfing, and interim
# random mating.
s.generate_f_one(pop, recombination_rates, founders)
s.recombinatorial_convergence(pop, recombination_rates)
s.expand_by_selfing(pop, recombination_rates)
s.interim_random_mating(pop, recombination_rates)

Generation: 0
Generation: 1	popSize: 8
Generation: 2	popSize: 4
Generation: 3	popSize: 2
Generation: 4
Initiating interim random mating for 3 generations.
Generation: 5
Generation: 6
Generation: 7


In [11]:
# Two simulators with the same number of replicates: `multipop` is built from
# the base population, `multi_meta` from the prefounders.
multipop = sim.Simulator(pop, u_parameters['number_of_replicates'])
multi_meta = sim.Simulator(nam, u_parameters['number_of_replicates'],
                           stealPops=False)

# QTL and allele effects for the replicates, indexed by replicate below.
triplet_qtl, allele_effects = parameters.assign_identical_qtl_parameters(
    multipop,
    alleles,
    integral_valued_loci,
    u_parameters['number_of_qtl'],
    u_parameters['allele_effect_parameters'],
)

# Guard against the two return values being unpacked in the wrong order.
assert type(triplet_qtl[0]) is list, "Variables are flip-flopped in return."

# Each replicate gets its own independent copy of the statistics template.
for pop_rep in multipop.populations():
    pop_rep.dvars().statistics = copy.deepcopy(selection_statistics)

s.replicate_selection(multipop, multi_meta, triplet_qtl, allele_effects,
                      recombination_rates)

Initial: Sampled 100 individuals from generation 0 Replicate: 0.
Initial: Sampled 100 individuals from generation 0 Replicate: 1.
Initial: Sampled 100 individuals from generation 0 Replicate: 2.
Initial: Sampled 100 individuals from generation 0 Replicate: 3.
Initial: Sampled 100 individuals from generation 0 Replicate: 4.
Generation: 0
Generation: 0
Generation: 0
Generation: 0
Generation: 0
Generation: 1
Generation: 1
Generation: 1
Generation: 1
Generation: 1
Generation: 2
Generation: 2
Generation: 2
Generation: 2
Generation: 2
Generation: 3
Generation: 3
Generation: 3
Generation: 3
Generation: 3
Generation: 4
Generation: 4
Generation: 4
Generation: 4
Generation: 4
Generation: 5
Generation: 5
Generation: 5
Generation: 5
Generation: 5
Generation: 6
Generation: 6
Generation: 6
Generation: 6
Generation: 6
Generation: 7
Generation: 7
Generation: 7
Generation: 7
Generation: 7
Generation: 8
Generation: 8
Generation: 8
Generation: 8
Generation: 8
Generation: 9
Generation: 9
Generation: 9
Gen

In [12]:
# Drop subpopulation 0 (the dummy population) from every meta-population
# replicate, checking the subpopulation count before and after.
# NOTE: not idempotent - running this cell a second time fails the first check.
for meta_rep in multi_meta.populations():
    subpops_before = meta_rep.numSubPop()
    assert subpops_before == 7, "Correct number subpopulations before removal of the dummy population"
    meta_rep.removeSubPops(0)
    subpops_after = meta_rep.numSubPop()
    assert subpops_after == 6, "Correct number after removal"

In [14]:
# Development aid: re-import saegus.analyze so edits made to the module on disk
# take effect without restarting the kernel.
# NOTE(review): not needed on a fresh Restart & Run All; consider moving the
# import to the top import cell or removing this cell from the final notebook.
import importlib as imp
imp.reload(analyze)

<module 'saegus.analyze' from 'C:\\Anaconda3\\lib\\site-packages\\saegus\\analyze.py'>

In [16]:
# For every meta-population replicate: compute allele frequencies, write the
# frequency/QTL tables, run PCA, and write the TASSEL inputs (hapmap,
# phenotypes, kinship, population structure) plus the TASSEL config file.
for i, meta_rep in enumerate(multi_meta.populations()):

    # Replicate id as stored in the population's variables; its string form is
    # embedded in every file name and individual label.
    meta_rep_id = str(meta_rep.dvars().rep)
    prefix = 'rs_L10_H07_R' + meta_rep_id + '_'

    meta_rep.dvars().triplet_qtl = triplet_qtl[i]
    meta_rep.dvars().allele_effects = allele_effects[i]
    frq = analyze.Frq(meta_rep, triplet_qtl[i], alleles, allele_effects[i])
    af = frq.allele_frequencies(meta_rep, range(meta_rep.totNumLoci()))
    qtalleles = frq.rank_allele_effects(meta_rep, triplet_qtl[i], alleles, allele_effects[i])

    # Loci where the same allele was reported as both minor and major.
    ties = [locus for locus in range(meta_rep.totNumLoci())
            if af['minor', 'alleles'][locus] == af['major', 'alleles'][locus]]

    # Break each tie by taking the first two alleles listed in alleleFreq.
    for st in ties:
        tied_alleles = list(meta_rep.dvars().alleleFreq[st])
        af['major', 'alleles'][st] = tied_alleles[0]
        af['minor', 'alleles'][st] = tied_alleles[1]
    major_minor_allele_conflicts = sum(np.equal(list(af['minor', 'alleles'].values()),
                                                list(af['major', 'alleles'].values())))

    assert major_minor_allele_conflicts == 0, "There is a tie in at least one locus."

    af_table = frq.allele_frq_table(meta_rep, meta_rep.numSubPop(), af,
                                    recombination_rates, genetic_map)
    qtaf_table = frq.qt_allele_table(qtalleles, allele_effects[i])

    af_table.to_csv(various_simulation_info_prefix + prefix + 'allele_frequency_table.txt', sep=',', index=False)
    qtaf_table.to_csv(various_simulation_info_prefix + prefix + 'qt_allele_info.txt', sep=',', index=False)

    pca = analyze.PCA(meta_rep, range(meta_rep.totNumLoci()), frq)

    minor_ac = pca.calculate_count_matrix(meta_rep, af['minor', 'alleles'],
                                          various_simulation_info_prefix + prefix + 'minor_allele_count.txt')

    eigendata = pca.svd(meta_rep, minor_ac)

    # Label per individual: <run>_R<replicate>_G<generation>_I<individual id>.
    individual_names = {ind.ind_id: prefix + 'G' +
                        str(int(ind.generation)) +
                        '_I' + str(int(ind.ind_id))
                        for ind in meta_rep.individuals()}

    # Fix: ind_names_for_gwas is initialised with integer replicate keys, so
    # store under the integer id. The string id used before left the integer
    # slots empty and added stray string keys.
    ind_names_for_gwas[int(meta_rep_id)] = individual_names

    #meta_rep.save(populations_prefix + prefix + 'metapopulation.pop')

    names_filename = ind_names_prefix + prefix + 'individual_names.yaml'
    with open(names_filename, 'w') as name_stream:
        yaml.dump(individual_names, name_stream)

    gwas = analyze.GWAS(meta_rep, individual_names, locus_names, pos_column)
    hmap = gwas.hapmap_formatter(integer_to_snp, tassel_input_dir_prefix + prefix + 'simulated_hapmap.txt')
    phenos = gwas.trait_formatter(tassel_input_dir_prefix + prefix + 'phenotype_vector.txt')
    kinship_matrix = gwas.calc_kinship_matrix(minor_ac, af, tassel_input_dir_prefix + prefix + 'kinship_matrix.txt')
    pop_struct_matrix = gwas.population_structure_formatter(eigendata, tassel_input_dir_prefix + prefix + 'structure_matrix.txt')

    # Statistics collected on the matching replicate of `multipop`.
    pd.DataFrame(multipop.population(i).dvars().statistics).to_csv(various_simulation_info_prefix + prefix + 'means_and_vars.txt', sep='\t')
    analyze.generate_tassel_gwas_configs(tassel_input_dir_prefix,
                                         tassel_output_dir_prefix,
                                         tassel_config_prefix,
                                         prefix,
                                         'sim_mlm_gwas_pipeline.xml')

