## Run: daoko_girl

In [1]:
import pytest
import simuOpt
simuOpt.setOptions(alleleType='short', numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
from saegus import breed, operators, simulate, analyze, parse, parameters
import shelve
import numpy as np
import random
np.set_printoptions(suppress=True, precision=3)

In [None]:
# Re-execute the already-imported saegus `analyze` module so that edits made
# to its source are picked up without restarting the kernel.
# NOTE(review): the alias `imp` is also the name of a (deprecated) stdlib
# module; here it refers to importlib.
import importlib as imp
imp.reload(analyze)

### Top Level Definitions

In [None]:
# Peek at the keys already stored in the trait-parameter shelf.
# The shelf is opened in a `with` block so the handle is closed straight
# away: the same file is re-opened for writing in a later cell, and the name
# `trait_parameter_set` is rebound to a plain string in the next cell, which
# previously left this handle open with no remaining reference to close it.
with shelve.open('daoko_girl_trait_parameters') as stored_trait_parameters:
    stored_trait_keys = list(stored_trait_parameters)
stored_trait_keys

In [None]:
# Identifier of this run; the shelf file names below are derived from it.
run_id = 'daoko_girl'
trait_parameter_set = '{}_trait_parameters'.format(run_id)
analysis_parameter_set = '{}_analysis_parameters'.format(run_id)

In [None]:
# Open the persistent analysis-parameter shelf for this run and record the
# scenario settings in it. The shelf stays open; it is closed in a later cell.
analysis_parameters = shelve.open(analysis_parameter_set)
analysis_parameters.update({
    'population_name': run_id,
    'scenario': 'random_mating',
    'generations': 3,
    'run_identifier': run_id,
    'operating_population_size': 2000,
})

In [None]:
# Open the persistent trait-parameter shelf and store the description of the
# quantitative trait. Only the *name* of the sampling function is stored
# (a string), not the function object itself.
trait = shelve.open(trait_parameter_set)
trait_settings = (
    ('allele_effect_distribution', random.expovariate.__name__),
    ('distribution_parameters', 1),
    ('multiplicity', 3),
    ('heritability', 0.7),
)
for setting_name, setting_value in trait_settings:
    trait[setting_name] = setting_value

### File Names

In [None]:
# Relative, Windows-style paths to the input files (raw strings, so the
# backslashes are taken literally).
base_population_file_name = r"populations\magic_1478.pop"
genetic_map_file_name = r"parameters\genetic_map_1478.hdf"
allele_file_name = r"parameters\alleles_at_1478_loci.hdf"

### Genotype Data

In [None]:
# Load the genetic map and the allele table from their HDF files.
genetic_map = pd.read_hdf(genetic_map_file_name)
# Converted to a plain NumPy array; layout is whatever the HDF file holds.
alleles = np.array(pd.read_hdf(allele_file_name))
# The 'recom_rate' column of the genetic map as a NumPy array.
recombination_rates = np.array(list(genetic_map['recom_rate']))
# Number of loci sampled as quantitative trait loci further down.
number_qt_loci = 10

In [None]:
# Load the saved simuPOP population that every later cell operates on.
base_population = sim.loadPopulation(base_population_file_name)

In [None]:
# Name subpopulation 0 after the run identifier.
base_population.setSubPopName(run_id, 0)

In [None]:
# Passes locus indices 0..1477 explicitly; 1478 is hard-coded here and matches
# the number embedded in the input file names.
# NOTE(review): what allele_data returns/records is defined in saegus.analyze,
# not in this notebook — the result is only displayed, not stored.
analyze.allele_data(base_population, alleles, list(range(1478)))

### Create Analysis Population

In [None]:
# Assign individual IDs with simuPOP's tagID; reset=False leaves the running
# ID counter as it is instead of restarting it.
sim.tagID(base_population, reset=False)

In [None]:
# Build the saegus breeding helper from the base population and the
# per-locus recombination rates read from the genetic map.
random_mater = breed.MAGIC(base_population, recombination_rates)

In [None]:
# Run the interim random-mating step using the generation count and the
# operating population size stored in the analysis-parameter shelf.
generations_of_mating = analysis_parameters['generations']
mating_population_size = analysis_parameters['operating_population_size']
random_mater.interim_random_mating(generations_of_mating, mating_population_size)

In [None]:
# Compute allele frequencies at all available loci; results are stored in the
# population's variables and read back later through dvars().alleleFreq.
sim.stat(base_population, alleleFreq=sim.ALL_AVAIL)
# Record which loci are segregating and how many; read back later through
# dvars().segSites.
sim.stat(base_population, numOfSegSites=sim.ALL_AVAIL, vars=['segSites', 'numOfSegSites'])

### Quantitative Trait

In [None]:
# Draw number_qt_loci distinct loci from the segregating sites and keep them
# in ascending order.
# NOTE(review): `random` is not seeded in any visible cell, so the chosen QTL
# differ between runs — confirm whether that is intended.
qtl = sorted(random.sample(base_population.dvars().segSites, number_qt_loci))

In [None]:
# saegus Trait helper used in the next cell to assign allele effects.
additive_trait = parameters.Trait()

In [None]:
# Assign allele effects at the chosen QTL using random.expovariate with
# parameter 1 and multiplicity 3.
# NOTE(review): these literals duplicate the values written to the `trait`
# shelf earlier ('distribution_parameters' = 1, 'multiplicity' = 3); the two
# must be kept in sync by hand.
aes = additive_trait.assign_allele_effects(alleles, qtl, random.expovariate, 1, multiplicity=3)

In [None]:
aes

In [None]:
allele_effects_table

In [None]:
# NOTE(review): `alle` is not assigned in any visible cell, so this raises
# NameError when run; it looks like a truncated name (perhaps `alleles` or
# `allele_effects_table`) — confirm or remove the cell.
alle

In [None]:
# Heritability passed to the population sample analyzer.
heritability = 0.7
# The misspelled name is kept as an alias because other cells still pass
# `heratability`; new code should use `heritability`.
heratability = heritability

In [None]:
# Apply the allele effects at the QTL to the population via the saegus
# operator. NOTE(review): presumably writes each individual's additive
# genotypic value into the population — confirm against saegus.operators.
operators.assign_additive_g(base_population, qtl, aes)

In [None]:
# Re-execute the saegus `analyze` and `parameters` modules so source edits
# are picked up. Objects created before the reload (e.g. `additive_trait`)
# are not re-created by this cell.
import importlib as imp
imp.reload(analyze)
imp.reload(parameters)

# Main Analysis Engine

In [None]:
segregating_loci

In [None]:
# Analyze samples of 100, 200, ..., 2000 individuals. The two result names
# are rebound on every pass, so after the loop they hold the results of the
# last sample size only.
sample_sizes = range(100, 2100, 100)
for size in sample_sizes:
    analysis_result = analyze.population_sample_analyzer(
        base_population, size, qtl, alleles, aes, heratability)
    segregating_loci, allele_effects_table = analysis_result

In [None]:
# Analyze a single sample of 200 individuals. This rebinds `segregating_loci`
# and `allele_effects_table`, replacing whatever an earlier run of the
# sample-size loop left in them.
segregating_loci, allele_effects_table = analyze.population_sample_analyzer(base_population, 200, qtl, 
                                                  alleles, aes, heratability)

In [None]:
# Re-index the allele effects table with the saegus -> TASSEL locus mapping.
# NOTE(review): `saegus_to_tassel_loci` is only built in a later cell, so that
# cell has to be executed before this one.
expanded_ae_table = analyze.remap_ae_table_loci(allele_effects_table, saegus_to_tassel_loci)

In [None]:
allele_effects_table

In [None]:
# Write the remapped table to an HDF file in the working directory under the
# key "exp".
expanded_ae_table.to_hdf("expanded_allele_effects.hdf", "exp")

In [None]:
# Write the allele effects table to an HDF file in the working directory under
# the key "allele_effects".
allele_effects_table.to_hdf("allele_effects.hdf", "allele_effects")

In [None]:
#synthesis_parameters['founders'] = simulation_parameters['founders']
#synthesis_parameters['operating_population_size'] = 2000
#synthesis_parameters['snp_to_integer'] = simulation_parameters['snp_to_integer']
#synthesis_parameters['integer_to_snp'] = simulation_parameters['integer_to_snp']
#synthesis_parameters['prefounder_file_name'] = 'prefounders_1478.pop'
#synthesis_parameters['mating_scheme'] = 'MAGIC'

In [None]:
# Two-way lookup between a segregating locus and its position in
# `segregating_loci`: locus -> position and position -> locus.
saegus_to_tassel_loci = {
    saegus_locus: tassel_index
    for tassel_index, saegus_locus in enumerate(segregating_loci)
}
tassel_to_saegus_loci = dict(enumerate(segregating_loci))

In [None]:
# Allele frequencies (as computed by sim.stat) at each segregating locus, in
# the same order as `segregating_loci`.
locus_frequencies = base_population.dvars().alleleFreq
segregating_frqs = [locus_frequencies[locus_index] for locus_index in segregating_loci]

In [None]:
allele_effects_table

In [None]:
# NOTE(review): `aeframe` is not assigned in any visible cell, and `indir` is
# only assigned in a later cell; as written this cell depends on state created
# elsewhere or in a different execution order — confirm.
aeframe.to_hdf(indir+'daoko_girl_allele_effects_table.hdf', 'aeframe')

In [None]:
# Persist the sample's segregating loci, its allele frequencies (copied into a
# plain dict) and both locus index mappings in the analysis-parameter shelf.
# NOTE(review): `rm_sample` is not assigned in any visible cell — confirm
# where the sampled population comes from.
analysis_parameters['sample_segregating_loci'] = rm_sample.dvars().segSites
analysis_parameters['sample_allele_frequencies'] = dict(rm_sample.dvars().alleleFreq)
analysis_parameters['saegus_to_tassel_loci'] = saegus_to_tassel_loci
analysis_parameters['tassel_to_saegus_loci'] = tassel_to_saegus_loci

In [None]:
# Flush and close the analysis-parameter shelf; it must be re-opened before
# any further reads or writes.
analysis_parameters.close()

In [None]:
# Re-execute the saegus `analyze` module to pick up source edits.
import importlib as imp
imp.reload(analyze)

In [None]:
# Generate the TASSEL GWAS configuration through the saegus `gwas` module.
# The template path now escapes every backslash: the original literal
# contained "\D", an invalid escape sequence that Python warns about. The
# resulting string is unchanged.
# NOTE(review): `gwas` is not among the saegus modules imported at the top of
# the notebook, and this call passes four positional arguments whereas the
# generate_tassel_gwas_configs defined in this notebook takes seven — confirm
# which implementation this cell is meant to call.
gwas.generate_tassel_gwas_configs("C:\\tassel\\bin\\daoko_girl_",
                                  "C:\\tassel\\input\\daoko_girl_",
                                  "C:\\tassel\\output\\daoko_girl_",
                                  "C:\\Users\\DoubleDanks\\BISB\\wisser\\code\\rjwlab-scripts\\saegus_project\\devel\\magic\\1478\\daoko_girl_gwas_pipeline.xml")

In [None]:
def generate_tassel_gwas_configs(sample_size,
                                 hapmap_file_name,
                                 kinship_file_name,
                                 phenotype_file_name,
                                 structure_file_name,
                                 output_file_name,
                                 config_file_template):
    """
    Write a TASSEL pipeline XML file for one sample size from a template.

    The TASSEL command line interface requires a considerable number of
    options to run GWAS, which is impractical to type by hand for every
    replicate of a simulated study. TASSEL also accepts the same options as
    an XML file; this function fills an existing template with the file
    names for one replicate and writes the result to ``output_file_name``.

    The template must contain the elements ``fork1/h``, ``fork2/t``,
    ``fork3/q``, ``fork4/k`` and ``combine6/export`` (paths relative to the
    root element); their text is replaced, everything else is kept.

    :param sample_size: Sample size of the replicate; used as the prefix of
        the export name, ``"<sample_size>_daoko_girl_out_"``.
    :param hapmap_file_name: Text written to ``fork1/h``.
    :param kinship_file_name: Text written to ``fork4/k``.
    :param phenotype_file_name: Text written to ``fork2/t``.
    :param structure_file_name: Text written to ``fork3/q``.
    :param output_file_name: Path of the XML file to write.
    :param config_file_template: Path of the XML template already set up for
        the desired kind of GWAS.
    :raises AttributeError: If the template lacks one of the required
        elements (same exception type as before, now naming the element).
    :return: None; the configuration is written to ``output_file_name``.
    """
    import xml.etree.ElementTree as ET
    import lxml.etree as etree

    # The template is read with the standard-library parser and re-parsed by
    # lxml, whose writer provides pretty_print.
    # NOTE(review): parsing directly with lxml would be simpler, but may keep
    # comments that the standard-library parser drops — left as is so the
    # written file does not change.
    tree = ET.parse(config_file_template)
    lxml_tree = etree.fromstring(ET.tostring(tree.getroot()))
    lxml_root = lxml_tree.getroottree()

    element_texts = (
        ('fork1/h', hapmap_file_name),
        ('fork2/t', phenotype_file_name),
        ('fork3/q', structure_file_name),
        ('fork4/k', kinship_file_name),
        ('combine6/export', str(sample_size) + "_daoko_girl_out_"),
    )
    for element_path, element_text in element_texts:
        element = lxml_root.find(element_path)
        if element is None:
            raise AttributeError(
                "template %r has no element %r"
                % (config_file_template, element_path))
        element.text = element_text

    # NOTE(review): standalone='' is passed through exactly as before; confirm
    # how lxml interprets an empty string here.
    lxml_root.write(output_file_name,
                    encoding="UTF-8",
                    method="xml", xml_declaration=True, standalone='',
                    pretty_print=True)


In [None]:
# Directory holding the TASSEL input files of every sample size.
indir = "C:\\tassel\\input\\"
# XML template shared by all sample sizes. Every backslash is escaped: the
# original literal contained "\D", an invalid escape sequence that Python
# warns about. The resulting string is unchanged.
config_template = ("C:\\Users\\DoubleDanks\\BISB\\wisser\\code\\rjwlab-scripts\\"
                   "saegus_project\\devel\\magic\\1478\\daoko_girl_gwas_pipeline.xml")

# One configuration file per analyzed sample size. The range now ends at 2100
# so that it covers 100..2000, matching the sample-size loop of the analysis
# cell; the previous bound of 2000 produced no configuration for the
# 2000-individual sample.
for sample_size in range(100, 2100, 100):
    input_prefix = indir + str(sample_size)
    generate_tassel_gwas_configs(
        sample_size,
        input_prefix + '_daoko_girl_simulated_hapmap.txt',
        input_prefix + '_daoko_girl_kinship_matrix.txt',
        input_prefix + '_daoko_girl_phenotype_vector.txt',
        input_prefix + '_daoko_girl_structure_matrix.txt',
        "C:\\tassel\\bin\\daoko_girl_" + str(sample_size) + "_sim_gwas_pipeline.xml",
        config_template)


In [2]:
# Re-open the analysis-parameter shelf written earlier to inspect its stored
# contents. NOTE(review): no matching close() is visible after this cell.
analysis_parameters = shelve.open('daoko_girl_analysis_parameters')

In [3]:
list(analysis_parameters)

['saegus_to_tassel_loci',
 'sampled_ind_ids',
 'run_identifier',
 'tassel_to_saegus_loci',
 'scenario',
 'operating_population_size',
 'sample_allele_frequencies',
 'pos_names',
 'sample_segregating_loci',
 'population_name',
 'sample_size',
 'generations']

In [5]:
analysis_parameters['saegus_to_tassel_loci']

{1: 0,
 2: 1,
 4: 2,
 5: 3,
 6: 4,
 8: 5,
 10: 6,
 12: 7,
 13: 8,
 15: 9,
 16: 10,
 20: 11,
 21: 12,
 23: 13,
 24: 14,
 25: 15,
 26: 16,
 28: 17,
 29: 18,
 31: 19,
 32: 20,
 34: 21,
 36: 22,
 38: 23,
 40: 24,
 42: 25,
 44: 26,
 45: 27,
 46: 28,
 47: 29,
 49: 30,
 50: 31,
 54: 32,
 55: 33,
 56: 34,
 58: 35,
 60: 36,
 61: 37,
 62: 38,
 68: 39,
 70: 40,
 74: 41,
 77: 42,
 82: 43,
 85: 44,
 86: 45,
 94: 46,
 95: 47,
 99: 48,
 100: 49,
 102: 50,
 103: 51,
 105: 52,
 106: 53,
 107: 54,
 108: 55,
 109: 56,
 114: 57,
 115: 58,
 116: 59,
 119: 60,
 120: 61,
 121: 62,
 122: 63,
 123: 64,
 124: 65,
 126: 66,
 127: 67,
 128: 68,
 130: 69,
 132: 70,
 134: 71,
 137: 72,
 138: 73,
 139: 74,
 140: 75,
 141: 76,
 144: 77,
 145: 78,
 146: 79,
 147: 80,
 149: 81,
 152: 82,
 153: 83,
 155: 84,
 156: 85,
 158: 86,
 159: 87,
 163: 88,
 165: 89,
 168: 90,
 171: 91,
 173: 92,
 174: 93,
 176: 94,
 178: 95,
 184: 96,
 185: 97,
 186: 98,
 187: 99,
 189: 100,
 191: 101,
 192: 102,
 193: 103,
 194: 104,
 195: 105,