# Generating Data for GWAS with TASSEL

In [1]:
%matplotlib inline
import shelve
import simuOpt
simuOpt.setOptions(alleleType='short', optimized=True, numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
from saegus import breed, operators, selection, helpers, parser, parameterizer, selection
import random
import copy
import yaml
#random.seed(1337)
import numpy as np
np.set_printoptions(suppress=True, precision=3)
import matplotlib.pyplot as plt

In [2]:
# Marker table; its 'alleles' column is read later when the per-locus
# allele pairs are built.
hapmap = pd.read_csv('clean_hapmap.txt')

# Tab-separated genetic map; the 'chr' and 'cM_pos' columns are used below.
# (The earlier slice of `hapmap` that was immediately overwritten by this read
# has been dropped: it relied on `.ix`, which no longer exists in pandas >= 1.0.)
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                          sep='\t')

# Raw hapmap supplies the locus names and positions passed to helpers.GWAS.
# It is read once, with the tab delimiter; the previous comma-delimited read
# of the same file was discarded before use.
raw_hmap = pd.read_csv('hapmap3.txt', delimiter='\t', index_col=0)
locus_names = list(raw_hmap['nearest.site'])
pos_column = list(raw_hmap['agp_pos'])

# cM positions grouped by chromosome number (1..10), in genetic-map row order.
chr_cM_positions = {chromosome: [] for chromosome in range(1, 11)}
for idx in range(len(genetic_map)):
    row = genetic_map.iloc[idx]  # fetch the row once instead of twice
    chr_cM_positions[int(row['chr'])].append(row['cM_pos'])

# Same data as a list of per-chromosome lists, ordered chromosome 1..10.
cM_positions = [chr_cM_positions[chromosome] for chromosome in range(1, 11)]

In [3]:
# Integer coding of nucleotide / indel states and its inverse.
snp_to_integer = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-': 4, '+': 5}
integer_to_snp = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: '+'}

# NOTE(review): positions 2 and 3 here are 'T', 'G' while the integer coding
# above maps 2 -> 'G' and 3 -> 'T'. Left unchanged because the saved
# population may rely on this order -- confirm which one is intended.
allele_names = ['A', 'C', 'T', 'G', 'D', 'I']

# Loci whose cM position prints with a trailing '.0' (integral positions).
# `integral_valued_loci` holds their absolute row indices;
# `relative_integral_valued_loci` maps each index to (chr, cM_pos).
integral_valued_loci = []
relative_integral_valued_loci = {}
for idx in range(len(genetic_map)):
    row = genetic_map.iloc[idx]
    if str(row['cM_pos'])[-2:] == '.0':
        integral_valued_loci.append(idx)
        relative_integral_valued_loci[idx] = (row['chr'], row['cM_pos'])

# Per-locus allele pair: first and last character of the 'alleles' string,
# translated to integers. `hapmap` is read with the default positional index,
# so enumerating the column matches the former label-based `.ix[i, 'alleles']`
# lookup without using the removed `.ix` indexer.
alleles = {i: (snp_to_integer[allele_string[0]],
               snp_to_integer[allele_string[-1]])
           for i, allele_string in enumerate(hapmap['alleles'])}

# One rate per locus, chromosome by chromosome: 0.01 where the cM position
# prints with a trailing '.6', otherwise 0.0.
recombination_rates = [0.01 if str(cM)[-2:] == '.6' else 0.0
                       for chromosome in cM_positions
                       for cM in chromosome]

# All cM positions in a single flat list (chromosome 1 first).
flat_cM_positions = [cM for cMs in cM_positions for cM in cMs]

In [4]:
# Load the saved prefounder population, reset its individual IDs and give
# subpopulation 0 a readable name.
nam = sim.loadPopulation('nam_prefounders.pop')
sim.tagID(nam, reset=True)
nam.setSubPopName('prefounders', 0)

# 100 individuals for every even key from 0 through 20.
sample_sizes = dict.fromkeys(range(0, 21, 2), 100)

# Bundle of the per-locus structures computed in the previous cells.
genetic_structure = {
    # 'cM_positions': cM_positions,          (intentionally not stored)
    # 'chr_cM_positions': chr_cM_positions,  (intentionally not stored)
    'allele_names': allele_names,
    'integral_valued_loci': integral_valued_loci,
    'relative_integral_valued_loci': relative_integral_valued_loci,
    'alleles': alleles,
    'recombination_rates': recombination_rates,
}

In [5]:
# Simulation settings. Most entries are passed to the selection.Truncation /
# selection.Drift constructors; 'founders' is handed to s.generate_f_one.
sim_params = dict(
    generations_of_selection=10,
    generations_of_drift=10,
    generations_of_random_mating=3,
    number_of_replicates=10,
    operating_population_size=500,
    proportion_of_individuals_saved=0.05,
    overshoot_as_proportion=0.50,
    individuals_per_breeding_subpop=5,
    heritability=0.7,
    meta_pop_sample_sizes=sample_sizes,
    prefounder_file_name='nam_prefounders.pop',
    founders=[
        (3, 18), (2, 13), (7, 14), (1, 19),
        (14, 17), (1, 20), (17, 21), (9, 22),
    ],
)

In [6]:
# Number of QTL to choose per replicate, plus an 'allele_effects' setting.
qtl_params = dict(qtl=10, allele_effects=1)

# Empty result templates: one independent dict per category. The selection
# template is deep-copied into every replicate in a later cell.
selection_statistics = {
    category: {} for category in ('aggregate', 'selected', 'non-selected')
}
drift_statistics = {
    category: {} for category in ('aggregate', 'selected', 'non-selected')
}

In [7]:
# One initially empty name table per replicate, keyed by integer replicate index.
ind_names_for_gwas = {}
for rep_index in range(sim_params['number_of_replicates']):
    ind_names_for_gwas[rep_index] = {}

In [8]:
# The truncation-selection and drift schemes take identical constructor
# arguments apart from the leading number of generations, so the shared tail
# is assembled once.
shared_scheme_args = tuple(
    sim_params[key] for key in (
        'generations_of_random_mating',
        'operating_population_size',
        'proportion_of_individuals_saved',
        'overshoot_as_proportion',
        'individuals_per_breeding_subpop',
        'heritability',
        'meta_pop_sample_sizes',
        'number_of_replicates',
    )
)

s = selection.Truncation(sim_params['generations_of_selection'],
                         *shared_scheme_args)

d = selection.Drift(sim_params['generations_of_drift'],
                    *shared_scheme_args)

In [9]:
# Build a three-replicate simulator from `nam` (stealPops=False is passed so
# that `nam` remains usable afterwards) and pull replicate 0 out as the
# working population for the mating scheme.
replicated_nam = sim.Simulator(nam, rep=3, stealPops=False)
pop = replicated_nam.extract(0)

# Founder pairs, as configured in sim_params.
founders = sim_params['founders']
#selection_meta = replicated_nam.extract(0)
#drift_meta = replicated_nam.extract(0)

### Run MAGIC Mating Scheme

In [10]:
# Stage 1: F1 generation from the configured founder pairs.
s.generate_f_one(pop, recombination_rates, sim_params['founders'])

# Stages 2-4 share one signature and must run in exactly this order.
for mating_stage in (s.recombinatorial_convergence,
                     s.expand_by_selfing,
                     s.interim_random_mating):
    mating_stage(pop, recombination_rates)

Initiating interim random mating for 3 generations.
Generation: 5
Generation: 6
Generation: 7


## Adapting QTL and Allele Effects to Multiple Replicate Case

In [11]:
n_replicates = sim_params['number_of_replicates']

# Replicates of the mated population (evolved under selection later) ...
multipop = sim.Simulator(pop, rep=n_replicates)
# ... and matching replicates built from `nam`, which stays usable because
# stealPops=False is passed.
multi_meta = sim.Simulator(nam, rep=n_replicates, stealPops=False)

In [12]:
# Compute segregating-site statistics over the integral-valued loci for
# every replicate; results are stored in each replicate's variables.
requested_stats = ['numOfSegSites', 'segSites']
for replicate in multipop.populations():
    sim.stat(replicate, numOfSegSites=integral_valued_loci, vars=requested_stats)

In [13]:
# Display the number of replicates held by `multipop`; it should equal
# sim_params['number_of_replicates'].
multipop.numRep()

10

In [14]:
# For every replicate, choose QTL among the integral-valued loci and expand
# each chosen locus into the triplet (locus - 1, locus, locus + 1).
# triplet_qtl[rep] is the flat list of those triplets, in chooser order.
triplet_qtl = {}
for rep_index, replicate in enumerate(multipop.populations()):
    qtl = parameterizer.seg_qtl_chooser(replicate, integral_valued_loci,
                                        qtl_params['qtl'])
    triplet_qtl[rep_index] = [locus + offset
                              for locus in qtl
                              for offset in (-1, 0, 1)]

#qtl_params['triplet_qtl'] = triplet_qtl

In [15]:
# allele_effects[rep][locus][allele] -> effect drawn from an exponential
# distribution with rate 1. Draws happen replicate by replicate, locus by
# locus (in triplet order), allele by allele, so the sequence of random
# numbers consumed is unchanged.
allele_effects = {
    rep_id: {
        tqtl: {allele: random.expovariate(1) for allele in alleles[tqtl]}
        for tqtl in triplet_qtl[rep_id]
    }
    for rep_id in range(sim_params['number_of_replicates'])
}

#qtl_params['allele_effects'] = allele_effects

In [16]:
# Attach each replicate's own QTL, triplets, allele effects and a fresh
# statistics container to its population variables.
for repid, pop_rep in enumerate(multipop.populations()):
    # The module-level `qtl` only holds the list chosen for the LAST
    # replicate, so `qtl[repid]` stored a single locus from the wrong
    # replicate. Each triplet is (locus - 1, locus, locus + 1), so the middle
    # element of every triplet recovers this replicate's own QTL list.
    pop_rep.dvars().qtl = triplet_qtl[repid][1::3]
    pop_rep.dvars().triplet_qtl = triplet_qtl[repid]
    pop_rep.dvars().allele_effects = allele_effects[repid]
    # Deep copy so replicates never share the nested statistics dicts.
    pop_rep.dvars().statistics = copy.deepcopy(selection_statistics)


#selection_plus_drift_replicates = sim.Simulator(pop, rep=2)
#selection_pop = multipop.extract(0)
#drift_pop = multipop.extract(0)

In [None]:
# Run the selection scheme on every replicate, passing the matching
# meta-population simulator and the per-replicate QTL and effect tables.
s.replicate_selection(
    multipop,
    multi_meta,
    triplet_qtl,
    allele_effects,
    recombination_rates,
)

In [None]:
# Drop subpopulation 0 (the "dummy" population) from every meta-population
# replicate, checking the subpopulation count before and after. The messages
# now describe the failure and report the observed count.
for meta_rep in multi_meta.populations():
    subpops_before = meta_rep.numSubPop()
    assert subpops_before == 7, (
        "Expected 7 subpopulations before removal of the dummy population, "
        "found {}".format(subpops_before))
    meta_rep.removeSubPops(0)
    subpops_after = meta_rep.numSubPop()
    assert subpops_after == 6, (
        "Expected 6 subpopulations after removal of the dummy population, "
        "found {}".format(subpops_after))

In [None]:
# For every meta-population replicate: compute allele frequencies, resolve
# major/minor ties, then write the files used for the GWAS run (minor allele
# counts, saved population, individual names, hapmap, phenotypes, kinship,
# population structure, and the per-replicate means and variances).
for i, meta_rep in enumerate(multi_meta.populations()):
    all_loci = range(meta_rep.totNumLoci())

    selection_qtd = helpers.Frq(meta_rep, triplet_qtl[i], alleles, allele_effects[i])
    selection_af = selection_qtd.allele_frequencies(meta_rep, all_loci)
    selection_qtalleles = selection_qtd.rank_allele_effects(meta_rep, triplet_qtl[i],
                                                            alleles, allele_effects[i])

    # Loci where the same allele was reported as both minor and major.
    selection_ties = [locus for locus in all_loci
                      if selection_af['minor', 'alleles'][locus]
                      == selection_af['major', 'alleles'][locus]]

    # Break each tie using the first two alleles recorded for the locus.
    # NOTE(review): assumes at least two alleles are present at a tied
    # locus; otherwise the second lookup raises IndexError -- confirm.
    for st in selection_ties:
        observed_alleles = list(meta_rep.dvars().alleleFreq[st])
        selection_af['major', 'alleles'][st] = observed_alleles[0]
        selection_af['minor', 'alleles'][st] = observed_alleles[1]

    major_minor_allele_conflicts = sum(
        np.equal(list(selection_af['minor', 'alleles'].values()),
                 list(selection_af['major', 'alleles'].values())))

    assert major_minor_allele_conflicts == 0, "There is a tie in at least one locus."

    pca = helpers.PCA(meta_rep, all_loci, selection_qtd)
    meta_rep_id = str(meta_rep.dvars().rep)
    prefix = 'rs_rep_' + meta_rep_id + '_'

    minor_ac = pca.calculate_count_matrix(meta_rep, selection_af['minor', 'alleles'],
                                          prefix + 'minor_allele_count.txt')
    eigendata = pca.svd(meta_rep, minor_ac)

    # Names have the form RS_R<rep>_G<generation>_I<id>, keyed by ind_id.
    individual_names = {ind.ind_id: 'RS_R' + meta_rep_id + '_G' +
                        str(int(ind.generation)) +
                        '_I' + str(int(ind.ind_id))
                        for ind in meta_rep.individuals()}

    # ind_names_for_gwas was initialised with integer replicate indices as
    # keys. Storing under the string id added new keys and left the prepared
    # entries empty, so the integer index is used here.
    ind_names_for_gwas[i] = individual_names

    meta_rep.save(prefix + 'metapopulation.pop')

    names_filename = prefix + 'individual_names.yaml'
    with open(names_filename, 'w') as name_stream:
        yaml.dump(individual_names, name_stream)

    gwas = helpers.GWAS(meta_rep, individual_names, locus_names, pos_column)
    hmap = gwas.hapmap_formatter(integer_to_snp, prefix + 'simulated_hapmap.txt')
    phenos = gwas.trait_formatter(prefix + 'phenotype_vector.txt')
    kinship_matrix = gwas.calc_kinship_matrix(minor_ac, selection_af,
                                              prefix + 'kinship_matrix.txt')
    pop_struct_matrix = gwas.population_structure_formatter(
        eigendata, prefix + 'structure_matrix.txt')

    # Statistics were stored on the matching replicate of `multipop`.
    pd.DataFrame(multipop.population(i).dvars().statistics).to_csv(
        prefix + 'means_and_vars.txt', sep='\t')

### Haplotype Data and Plots

In [None]:
# Collect haplotype data for the selection and drift meta-populations, build
# a table for each, and plot haplotype frequency against effect to PDF files.
#
# NOTE(review): `selection_meta` and `drift_meta` are not assigned in any
# visible cell (their only assignments are commented out above), so this cell
# raises NameError on a fresh kernel.
# NOTE(review): `allele_effects` and `triplet_qtl` are dicts keyed by
# replicate index at this point; the helpers may expect the tables of a
# single replicate (e.g. allele_effects[i], triplet_qtl[i]) -- confirm
# against the helpers module before re-enabling this cell.
sel_htypes = helpers.collect_haplotype_data(selection_meta, allele_effects, triplet_qtl)
drift_htypes = helpers.collect_haplotype_data(drift_meta, allele_effects, triplet_qtl)
drift_haplotype_table = helpers.generate_haplotype_data_table(drift_meta, drift_htypes)
sel_haplotype_table = helpers.generate_haplotype_data_table(selection_meta, sel_htypes)
sel_plot_data = helpers.plot_frequency_vs_effect(selection_meta, sel_haplotype_table, 
                                                 'Haplotype Frequencies Under Selection',
                                                 'Selection_Haplotypes.pdf')
drift_plot_data = helpers.plot_frequency_vs_effect(drift_meta, drift_haplotype_table, 
                                                   'Haplotype Frequencies Under Drift',
                                                   'Drift_Haplotypes.pdf')

The TASSEL command line interface requires a considerable number of
options to run GWAS, so typing the command manually is impractical
for the number of replicates in a simulated study. Instead, the TASSEL
command line interface accepts an .xml configuration file that carries
the same options that would otherwise be typed in the terminal.