## Run: daoko_girl

In [1]:
import pytest
import simuOpt
simuOpt.setOptions(alleleType='short', numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
from saegus import breed, operators, simulate, analyze, parse, parameters
import shelve
import numpy as np
import random
np.set_printoptions(suppress=True, precision=3)

In [None]:
import importlib as imp
imp.reload(analyze)

### Top Level Definitions

In [2]:
# Peek at the keys already stored in the trait-parameter shelf.
# The shelf is opened in a `with` block so the handle is closed right away;
# `trait_parameter_set` is rebound to a plain file-name string in a later
# cell, which previously left this handle open with no way to close it.
with shelve.open('daoko_girl_trait_parameters') as trait_parameter_set:
    trait_parameter_keys = list(trait_parameter_set)
trait_parameter_keys

['number_of_qtl',
 'qtl',
 'distribution_parameters',
 'allele_effects',
 'heritability',
 'allele_effect_distribution',
 'multiplicity',
 'epsilon']

In [3]:
# Identifier of this run; the two shelf file names are derived from it.
run_id = 'daoko_girl'
trait_parameter_set, analysis_parameter_set = (
    run_id + suffix
    for suffix in ('_trait_parameters', '_analysis_parameters')
)

In [4]:
# Open the analysis-parameter shelf for this run and record the settings
# that the later cells read back from it.
analysis_parameters = shelve.open(analysis_parameter_set)
analysis_parameters.update({
    'population_name': run_id,
    'scenario': 'random_mating',
    'generations': 3,
    'run_identifier': run_id,
    'operating_population_size': 2000,
})

In [5]:
# Record the quantitative-trait settings of this run in the trait shelf.
trait = shelve.open(trait_parameter_set)
# Only the function's name is stored, i.e. the string 'expovariate'.
trait['allele_effect_distribution'] = random.expovariate.__name__
trait['distribution_parameters'] = 1
trait['multiplicity'] = 3
trait['heritability'] = 0.7
# No visible cell ever closes `trait`, so flush the entries to disk now.
# The shelf stays open for any later use; call trait.close() when finished.
trait.sync()

### File Names

In [6]:
# Input files, relative to the notebook's working directory.
# Raw strings keep the Windows path separators readable.
base_population_file_name = r"populations\magic_1478.pop"
genetic_map_file_name = r"parameters\genetic_map_1478.hdf"
allele_file_name = r"parameters\alleles_at_1478_loci.hdf"

### Genotype Data

In [7]:
# Number of loci that will be sampled as quantitative trait loci.
number_qt_loci = 10

# Genetic map table; its 'recom_rate' column supplies the recombination rates.
genetic_map = pd.read_hdf(genetic_map_file_name)
recombination_rates = np.array(genetic_map['recom_rate'].tolist())

# Allele table, converted to a plain numpy array.
alleles = np.array(pd.read_hdf(allele_file_name))

### Quantitative Trait

In [8]:
base_population = sim.loadPopulation(base_population_file_name)

In [9]:
base_population.setSubPopName(run_id, 0)

In [119]:
# Tabulate allele data for loci 0..1477 of the base population.
# NOTE(review): 1478 is presumably the total number of loci (it matches the
# "1478" in the input file names) -- confirm before reusing with other data.
analyze.allele_data(base_population, alleles, list(range(1478)))

Unnamed: 0,minor_allele,minor_frequency,major_allele,major_frequency
0,2,0.00000,1,1.00000
1,3,0.11675,1,0.88325
2,1,0.05850,3,0.94150
3,2,0.00000,0,1.00000
4,0,0.05275,2,0.94725
5,2,0.25400,0,0.74600
6,2,0.12550,0,0.87450
7,1,0.00000,3,1.00000
8,2,0.23750,0,0.76250
9,3,0.00000,1,1.00000


### Create Analysis Population

In [10]:
sim.tagID(base_population, reset=False)

In [11]:
random_mater = breed.MAGIC(base_population, recombination_rates)

In [12]:
# Random-mate the population for the number of generations and at the
# population size recorded in the analysis-parameter shelf.
random_mater.interim_random_mating(
    analysis_parameters['generations'],
    analysis_parameters['operating_population_size'],
)

Initiating interim random mating for 3 generations.
Generation: 3
Generation: 4
Generation: 5


In [13]:
# Compute allele frequencies at all available loci; later cells read them
# back through base_population.dvars().alleleFreq.
sim.stat(base_population, alleleFreq=sim.ALL_AVAIL)
# Compute the segregating sites; only 'segSites' and 'numOfSegSites' are
# requested as output variables (read later via base_population.dvars()).
sim.stat(base_population, numOfSegSites=sim.ALL_AVAIL, vars=['segSites', 'numOfSegSites'])

In [14]:
# Draw `number_qt_loci` distinct loci from the segregating sites and keep
# them in ascending order; these are the quantitative trait loci.
# NOTE(review): no random seed is set in any visible cell, so this draw (and
# everything derived from it) differs between kernel runs.
qtl = sorted(random.sample(base_population.dvars().segSites, number_qt_loci))

In [15]:
additive_trait = parameters.Trait()

In [16]:
# Assign an effect to each allele at every QTL, using random.expovariate.
# The literals 1 and multiplicity=3 repeat the 'distribution_parameters' and
# 'multiplicity' values written to the trait shelf in an earlier cell.
aes = additive_trait.assign_allele_effects(
    alleles,
    qtl,
    random.expovariate,
    1,
    multiplicity=3,
)

In [103]:
aes

{42: {1: 1.393701239159606, 2: 3.184295015795534},
 127: {1: 5.367228389288934, 3: 1.3078061708495794},
 449: {1: 8.035747688940756, 3: 0.6976499336874429},
 611: {0: 2.519242596821685, 2: 0.6305183368377753},
 621: {1: 4.386457733737859, 3: 4.030080017077876},
 705: {0: 2.556492893713342, 2: 2.4893106817937616},
 714: {1: 2.09885237770004, 2: 4.887870521417468},
 930: {1: 2.0592992333167435, 2: 3.567773234808219},
 1018: {1: 3.0368174994984773, 3: 5.546120917748954},
 1255: {1: 1.3620001555727523, 3: 0.9369738103146801}}

In [121]:
allele_effects_table

Unnamed: 0,locus,alpha_allele,alpha_effect,beta_allele,beta_effect
0,42,2,3.184295,1,1.393701
1,127,3,1.307806,1,5.367228
2,449,3,0.69765,1,8.035748
3,611,2,0.630518,0,2.519243
4,621,3,4.03008,1,4.386458
5,705,2,2.489311,0,2.556493
6,714,1,2.098852,2,4.887871
7,930,2,3.567773,1,2.059299
8,1018,3,5.546121,1,3.036817
9,1255,1,1.362,3,0.936974


In [117]:
alle

NameError: name 'af' is not defined

In [27]:
# Heritability passed to the sample analyzer.
heritability = 0.7
# Misspelled alias kept on purpose: the analysis cells still refer to
# `heratability`, so removing it would break them.
heratability = heritability

In [None]:
operators.assign_additive_g(base_population, qtl, aes)

In [99]:
import importlib as imp
imp.reload(analyze)
imp.reload(parameters)

<module 'saegus.parameters' from 'c:\\Anaconda3\\lib\\site-packages\\saegus\\parameters.py'>

# Main Analysis Engine

In [102]:
segregating_loci

[1,
 2,
 4,
 5,
 6,
 8,
 10,
 12,
 13,
 15,
 16,
 20,
 21,
 23,
 24,
 25,
 26,
 28,
 29,
 31,
 32,
 34,
 36,
 38,
 40,
 42,
 44,
 45,
 46,
 47,
 49,
 50,
 54,
 55,
 56,
 58,
 60,
 61,
 62,
 68,
 70,
 74,
 77,
 82,
 85,
 86,
 94,
 95,
 99,
 100,
 102,
 103,
 105,
 106,
 107,
 108,
 109,
 114,
 115,
 116,
 119,
 120,
 121,
 122,
 123,
 124,
 126,
 127,
 128,
 130,
 132,
 134,
 137,
 138,
 139,
 140,
 141,
 144,
 145,
 146,
 147,
 149,
 152,
 153,
 155,
 156,
 158,
 159,
 163,
 165,
 168,
 171,
 173,
 174,
 176,
 178,
 184,
 185,
 186,
 187,
 189,
 191,
 192,
 193,
 194,
 195,
 196,
 202,
 203,
 208,
 211,
 212,
 213,
 214,
 215,
 220,
 222,
 223,
 225,
 226,
 227,
 229,
 232,
 234,
 235,
 240,
 241,
 242,
 244,
 245,
 247,
 252,
 256,
 257,
 258,
 259,
 261,
 262,
 264,
 265,
 266,
 268,
 272,
 273,
 274,
 275,
 277,
 281,
 282,
 283,
 284,
 285,
 288,
 290,
 291,
 292,
 293,
 294,
 297,
 299,
 300,
 301,
 302,
 303,
 305,
 306,
 307,
 310,
 312,
 314,
 315,
 316,
 317,
 318,
 319,
 320,

In [120]:
# Run the sample analyzer for sample sizes 100, 200, ..., 2000.
# The two result names are overwritten on every pass, so after the loop they
# hold the values returned for the last sample size only.
for size in range(100, 2100, 100):
    sample_results = analyze.population_sample_analyzer(
        base_population, size, qtl, alleles, aes, heratability)
    segregating_loci, allele_effects_table = sample_results

In [95]:
segregating_loci, allele_effects_table = analyze.population_sample_analyzer(base_population, 200, qtl, 
                                                  alleles, aes, heratability)

In [126]:
# Re-express the allele effects table using the saegus -> TASSEL locus map.
# NOTE(review): `saegus_to_tassel_loci` is only built in a cell that appears
# further down in the notebook; on a fresh top-to-bottom run this line raises
# NameError unless that cell is executed first.
expanded_ae_table = analyze.remap_ae_table_loci(allele_effects_table, saegus_to_tassel_loci)

In [128]:
allele_effects_table

Unnamed: 0_level_0,locus,alpha_allele,alpha_effect,beta_allele,beta_effect,difference
locus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25,25,2,3.184295,1,1.393701,1.790594
67,67,3,1.307806,1,5.367228,4.059422
251,251,3,0.69765,1,8.035748,7.338098
347,347,2,0.630518,0,2.519243,1.888724
353,353,3,4.03008,1,4.386458,0.356378
410,410,2,2.489311,0,2.556493,0.067182
416,416,1,2.098852,2,4.887871,2.789018
536,536,2,3.567773,1,2.059299,1.508474
590,590,3,5.546121,1,3.036817,2.509303
729,729,1,1.362,3,0.936974,0.425026


In [129]:
expanded_ae_table.to_hdf("expanded_allele_effects.hdf", "exp")

In [130]:
allele_effects_table.to_hdf("allele_effects.hdf", "allele_effects")

In [None]:
#synthesis_parameters['founders'] = simulation_parameters['founders']
#synthesis_parameters['operating_population_size'] = 2000
#synthesis_parameters['snp_to_integer'] = simulation_parameters['snp_to_integer']
#synthesis_parameters['integer_to_snp'] = simulation_parameters['integer_to_snp']
#synthesis_parameters['prefounder_file_name'] = 'prefounders_1478.pop'
#synthesis_parameters['mating_scheme'] = 'MAGIC'

In [122]:
# Two-way lookup between a segregating locus and its position in
# `segregating_loci` (the position is used as the TASSEL locus index).
tassel_to_saegus_loci = dict(enumerate(segregating_loci))
saegus_to_tassel_loci = {
    locus: idx for idx, locus in tassel_to_saegus_loci.items()
}

In [123]:
# Allele frequencies of the segregating loci, in the order of `segregating_loci`.
allele_frequencies = base_population.dvars().alleleFreq
segregating_frqs = [allele_frequencies[seg_loc] for seg_loc in segregating_loci]

In [125]:
allele_effects_table

Unnamed: 0,locus,alpha_allele,alpha_effect,beta_allele,beta_effect
0,42,2,3.184295,1,1.393701
1,127,3,1.307806,1,5.367228
2,449,3,0.69765,1,8.035748
3,611,2,0.630518,0,2.519243
4,621,3,4.03008,1,4.386458
5,705,2,2.489311,0,2.556493
6,714,1,2.098852,2,4.887871
7,930,2,3.567773,1,2.059299
8,1018,3,5.546121,1,3.036817
9,1255,1,1.362,3,0.936974


In [None]:
# Write `aeframe` to an HDF file inside `indir` under the key 'aeframe'.
# NOTE(review): `aeframe` is not defined in any visible cell and `indir` is
# only assigned in a later cell, so this cell fails on a fresh run.
aeframe.to_hdf(indir+'daoko_girl_allele_effects_table.hdf', 'aeframe')

In [None]:
# Store the sample's segregating loci, its allele frequencies and the two
# locus index maps in the analysis-parameter shelf.
# NOTE(review): `rm_sample` is not defined in any visible cell -- confirm
# which population sample this is meant to refer to.
analysis_parameters['sample_segregating_loci'] = rm_sample.dvars().segSites
# alleleFreq is copied into a plain dict before it is stored.
analysis_parameters['sample_allele_frequencies'] = dict(rm_sample.dvars().alleleFreq)
analysis_parameters['saegus_to_tassel_loci'] = saegus_to_tassel_loci
analysis_parameters['tassel_to_saegus_loci'] = tassel_to_saegus_loci

In [None]:
analysis_parameters.close()

In [None]:
import importlib as imp
imp.reload(analyze)

In [None]:
# NOTE(review): stale cell. `gwas` is never imported in the visible cells, and
# the generate_tassel_gwas_configs defined in this notebook takes seven
# required arguments rather than four. Confirm whether this call is still
# needed.
# The only change made here is "\D" -> "\\D" in the last path: "\D" is an
# invalid escape sequence. The resulting string is unchanged.
gwas.generate_tassel_gwas_configs("C:\\tassel\\bin\\daoko_girl_",
                                  "C:\\tassel\\input\\daoko_girl_",
                                  "C:\\tassel\\output\\daoko_girl_",
                                  "C:\\Users\\DoubleDanks\\BISB\\wisser\\code\\rjwlab-scripts\\saegus_project\\devel\\magic\\1478\\daoko_girl_gwas_pipeline.xml")

In [9]:
def generate_tassel_gwas_configs(sample_size,
                                 hapmap_file_name,
                                 kinship_file_name,
                                 phenotype_file_name,
                                 structure_file_name,
                                 output_file_name,
                                 config_file_template,
                                 run_identifier='daoko_girl'):
    """
    Creates an xml file to run TASSEL using a mixed linear model approach.
    Assumes use of hapmap, kinship, phenotype and population structure files.

    The TASSEL command line interface requires a considerable number of
    options to run GWAS. It is impractical to run the command line manually
    for the number of replications in a simulated study. The TASSEL command
    line interface allows the user to input a .xml file with the same
    information which is used in the terminal.

    The template is copied and the text of five of its elements is replaced:
    ``fork1/h`` (hapmap), ``fork2/t`` (phenotype), ``fork3/q`` (structure),
    ``fork4/k`` (kinship) and ``combine6/export`` (output prefix, built as
    ``<sample_size>_<run_identifier>_out_``).

    :param sample_size: Sample size of the replicate; used in the export prefix.
    :param hapmap_file_name: Path written into the ``fork1/h`` element.
    :param kinship_file_name: Path written into the ``fork4/k`` element.
    :param phenotype_file_name: Path written into the ``fork2/t`` element.
    :param structure_file_name: Path written into the ``fork3/q`` element.
    :param output_file_name: Path of the XML configuration file to write.
    :param config_file_template: XML file already set up for running a
    specific kind of GWAS.
    :param run_identifier: Run name placed in the export prefix. Defaults to
    'daoko_girl', the value that used to be hard-coded.
    :return: None. The configuration is written to ``output_file_name``.
    :raises ValueError: If the template lacks one of the elements above.
    """

    import xml.etree.ElementTree as ET
    import lxml.etree as etree

    # The template is read with the standard-library parser and then handed
    # to lxml; lxml is needed for the pretty-printing writer used below.
    template_root = ET.parse(config_file_template).getroot()
    config_tree = etree.fromstring(ET.tostring(template_root)).getroottree()

    export_prefix = str(sample_size) + "_" + run_identifier + "_out_"
    element_text_by_path = (
        ('fork1/h', hapmap_file_name),
        ('fork2/t', phenotype_file_name),
        ('fork3/q', structure_file_name),
        ('fork4/k', kinship_file_name),
        ('combine6/export', export_prefix),
    )

    for element_path, element_text in element_text_by_path:
        element = config_tree.find(element_path)
        if element is None:
            # find() returns None for a missing element; fail with a message
            # that names the template and the path instead of an
            # AttributeError on None.
            raise ValueError(
                "Template {!r} has no element at path {!r}".format(
                    config_file_template, element_path))
        element.text = element_text

    # standalone=False replaces the original standalone='': lxml only checks
    # the value's truthiness, so the written declaration is the same.
    config_tree.write(output_file_name,
                      encoding="UTF-8",
                      method="xml",
                      xml_declaration=True,
                      standalone=False,
                      pretty_print=True)


In [11]:
# Directory holding the TASSEL input files of every sample size.
indir = "C:\\tassel\\input\\"

# Template shared by every generated configuration (hoisted out of the loop).
# "\\D" replaces the invalid escape sequence "\D"; the string is unchanged.
gwas_config_template = ("C:\\Users\\DoubleDanks\\BISB\\wisser\\code\\rjwlab-scripts\\"
                        "saegus_project\\devel\\magic\\1478\\daoko_girl_gwas_pipeline.xml")

# Write one TASSEL configuration per sample size: 100, 200, ..., 2000.
# The upper bound is 2100 to match the analysis loop, which also covers a
# sample size of 2000; the earlier bound of 2000 stopped at 1900.
for sample_size in range(100, 2100, 100):
    sample_prefix = indir + str(sample_size)
    generate_tassel_gwas_configs(
        sample_size,
        sample_prefix + '_daoko_girl_simulated_hapmap.txt',
        sample_prefix + '_daoko_girl_kinship_matrix.txt',
        sample_prefix + '_daoko_girl_phenotype_vector.txt',
        sample_prefix + '_daoko_girl_structure_matrix.txt',
        "C:\\tassel\\bin\\daoko_girl_" + str(sample_size) + "_sim_gwas_pipeline.xml",
        gwas_config_template)
