# Simulating a Population for Use in GWAS #

### Population Parameters ###

In [1]:
import simuOpt
simuOpt.setOptions(alleleType='short', optimized=True, quiet=True)
import simuPOP as sim
import pandas as pd
import collections as col
from wgs import breed, operators, selection, helpers, parser, parameterizer, selection
import random
import numpy as np
np.set_printoptions(suppress=True, precision=3)
import shelve
import matplotlib.pyplot as plt



# Marker table; its 'alleles' column is read below when building `alleles`.
hapmap = pd.read_csv('clean_hapmap.txt')
# Genetic map whose 'chr' and 'cM_pos' columns drive everything below.
# The earlier slice of `hapmap` through the `DataFrame.ix` indexer was
# overwritten by this read before it was ever used, and `.ix` no longer
# exists in current pandas, so only this assignment is kept.
genetic_map = pd.read_csv('nam_prefounders_genetic_map.txt', index_col=None,
                          sep='\t')

# Per-chromosome lists of cM positions, keyed by chromosome number 1..10.
chr_cm_positions = col.OrderedDict((chrom, []) for chrom in range(1, 11))

# Walk the two columns in lockstep instead of building a row Series through
# `iloc` twice for every marker.
for chrom, cM in zip(genetic_map['chr'], genetic_map['cM_pos']):
    chr_cm_positions[int(chrom)].append(float(cM))


# The same list objects, in chromosome order, without their keys.
cM_positions = list(chr_cm_positions.values())


# Integer code for each allele symbol found in the hapmap 'alleles' column;
# '-' and '+' take codes 4 and 5.
snp_to_integer = {'A':0, 'C':1, 'G':2, 'T':3, '-':4, '+':5}

# Absolute indices of the markers whose cM position prints with a trailing
# '.0', plus a lookup from each such index to that marker's (chr, cM_pos).
integral_valued_loci = []
relative_integral_valued_loci = {}
for idx in range(len(genetic_map)):
    marker = genetic_map.iloc[idx]
    if not str(marker['cM_pos']).endswith('.0'):
        continue
    integral_valued_loci.append(idx)
    relative_integral_valued_loci[idx] = (marker['chr'], marker['cM_pos'])






# Map each locus index to the integer codes of the first and last character
# of its 'alleles' entry. Iterating the column replaces the per-cell lookup
# through `DataFrame.ix`, which has been removed from pandas.
# NOTE(review): positions equal labels here because `hapmap` is read without
# an index column — confirm if that ever changes.
alleles = {
    locus: (snp_to_integer[pair[0]], snp_to_integer[pair[-1]])
    for locus, pair in enumerate(hapmap['alleles'])
}

# One rate per locus, in chromosome order: 0.01 where the cM position prints
# with a trailing '.6', 0.0 everywhere else.
recombination_rates = [
    0.01 if str(cM).endswith('.6') else 0.0
    for chromosome in cM_positions
    for cM in chromosome
]

# Allele labels listed by integer code, so position i names the allele coded i
# in `snp_to_integer` (A=0, C=1, G=2, T=3); 'D' and 'I' stand at codes 4 and 5,
# the slots of '-' and '+'.
allele_names = ['A', 'C', 'G', 'T', 'D', 'I']

# All cM positions as a single flat list, chromosome after chromosome.
flat_cM_positions = [
    cM for chromosome_cMs in cM_positions for cM in chromosome_cMs
]


# Load the stored prefounder population and name its first sub-population.
nam = sim.loadPopulation('nam_prefounders.pop')
nam.setSubPopName('prefounders', 0)
# 100 for every even key from 0 to 20 inclusive.
sample_sizes = dict.fromkeys(range(0, 21, 2), 100)

# Settings for the simulated scenario, gathered in one dict.
# Read by later cells: 'founders' is passed to s.generate_f_one and 'qtl' to
# parameterizer.seg_qtl_chooser.
# NOTE(review): a later cell reads sim_params['allele_effects'], but that key
# is not set in this literal — confirm where it is supposed to come from.
sim_params = {
                'gens_selection': 10,
                'gens_random_mating': 3,
                'main_pop_size': 2000,
                'proportion_saved': 0.05,
                'overshoot': 0.50,
                'breeding_inds_per_sp': 5,
                'heritability': 0.7,
                'sample_sizes': sample_sizes,
                'replicates': 1,
                'prefounder_file': 'nam_prefounders.pop',  # same file loaded into `nam`
                'qtl': 30,
                'founders': [(1,5), (7, 8), (3, 4), (10, 11)],
}

In [None]:
import shelve
# Copy every entry of the 'RS_Parameter_Sets' shelf into a plain dict so the
# values stay available after the shelf is closed.
stored_parameters = {}
with shelve.open('RS_Parameter_Sets') as loaded:
    stored_parameters.update(loaded.items())
# Echo the names of the stored parameter sets.
stored_parameters.keys()

### Simulated Breeding Scenario ###

In [None]:
# Run the breeding steps on `pop` in order; every step receives the per-locus
# recombination rates, and the first one also receives the founder pairs.
# NOTE(review): neither `s` nor `pop` is assigned in the visible part of this
# notebook — confirm which cell creates them before running this one.
s.generate_f_one(pop, recombination_rates, sim_params['founders'])
s.generate_f_two(pop, recombination_rates)
s.mate_and_merge(pop, recombination_rates)
s.interim_random_mating(pop, recombination_rates)

# Ask simuPOP for segregating-site statistics, restricted to the loci listed
# in integral_valued_loci.
sim.stat(pop, numOfSegSites=integral_valued_loci, vars=['numOfSegSites', 'segSites'])

## Choose QTL and Assign Effects ##

In [None]:
# Pick sim_params['qtl'] loci out of the integral-valued candidates.
qtl = parameterizer.seg_qtl_chooser(pop, integral_valued_loci, sim_params['qtl'])

# Every chosen locus together with its immediate left and right neighbours,
# in ascending order.
triplet_qtl = sorted(
    locus + offset for locus in qtl for offset in (-1, 0, 1)
)

# One exponentially distributed effect per (locus, allele) pair, drawn in
# ascending locus order.
# NOTE(review): 'allele_effects' is not among the keys of the sim_params
# literal defined earlier in this notebook — confirm it is set before this runs.
allele_effects = {
    (tqtl, allele): random.expovariate(sim_params['allele_effects'])
    for tqtl in triplet_qtl
    for allele in alleles[tqtl]
}

# Keep the choices on the population's variable namespace.
pop.dvars().qtl = qtl
pop.dvars().triplet_qtl = triplet_qtl
pop.dvars().allele_effects = allele_effects

In [None]:
# Run the selection scheme; `meta` is handed in alongside `pop`.
s.recurrent_truncation_selection(
    pop, meta, triplet_qtl, allele_effects, recombination_rates
)

# Drop the first sub-population of `meta` before computing anything on it.
meta.removeSubPops(0)

qtd = helpers.Frq(meta, triplet_qtl, alleles, allele_effects)
qtalleles = qtd.rank_allele_effects(meta, triplet_qtl, alleles, allele_effects)
af = qtd.allele_frequencies(meta, range(meta.totNumLoci()))

# Loci at which the reported minor and major allele are the same allele.
ties = []
for locus in range(meta.totNumLoci()):
    if af['minor', 'alleles'][locus] == af['major', 'alleles'][locus]:
        ties.append(locus)

# Break each tie with the first two alleles listed for that locus in the
# population's alleleFreq variable.
for t in ties:
    observed = list(meta.dvars().alleleFreq[t])
    af['major', 'alleles'][t] = observed[0]
    af['minor', 'alleles'][t] = observed[1]

# Number of loci where minor and major still coincide after the tie-break.
minor_codes = list(af['minor', 'alleles'].values())
major_codes = list(af['major', 'alleles'].values())
sum(np.equal(minor_codes, major_codes))

## Gather Data for Use in GWAS ##

In [None]:
# Build the minor-allele count matrix, decompose it, and compute a test
# statistic from the resulting 'values' entry. The count-matrix call is given
# the output file name 'sim_minor_allele_count.txt'.
pca = helpers.PCA(meta, range(meta.totNumLoci()), af)
minor_ac = pca.calculate_count_matrix(meta, af['minor', 'alleles'], 'sim_minor_allele_count.txt')
eigendata = pca.svd(meta, minor_ac)
ts = pca.test_statistic(meta, eigendata['values'])

# Inverse of snp_to_integer: integer allele code -> symbol.
integer_to_snp = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: '+'}
# Locus names and positions come from the raw hapmap file (tab-delimited,
# first column used as the index).
raw_hmap = pd.read_csv('hapmap3.txt', delimiter='\t', index_col=0)
locus_names = list(raw_hmap['nearest.site'])
pos_column = list(raw_hmap['agp_pos'])
# Label per individual: 'RS_R1_G<generation>_I<ind_id>', keyed by ind_id.
individual_names = {ind.ind_id: 'RS_R'+str(1)+'_G'+str(int(ind.generation)) + '_I'+str(int(ind.ind_id))
                   for ind in meta.individuals()}

# NOTE(review): cols_for_hapmap is not read anywhere in the visible source.
cols_for_hapmap = {'locus_names': locus_names, 'pos_column': pos_column}

# Hand everything to the GWAS helper; each formatter below is given the name
# of the file it is associated with.
gwas = helpers.GWAS(meta, individual_names, locus_names, pos_column)
hmap = gwas.hapmap_formatter(integer_to_snp, 'sim_hapmap.txt')
popstruct = gwas.population_structure_formatter(eigendata, 'sim_structure.txt')
phenos = gwas.trait_formatter('sim_trait_vector.txt')
kinship_matrix = gwas.calc_kinship_matrix(minor_ac, af, 'sim_kinship.txt')


In [None]:
def qt_allele_data(qtl: list, allele_effects:dict):
    """Creates a data table for quantitative trait alleles.
    Can be further utilized for downstream analysis and visualization.

    Output is suitable for storage in an SQL/NoSQL type database with
    use of pandas package.

    NOTE(review): the body is unfinished. It only defines the intended
    column names; nothing is built from ``qtl`` or ``allele_effects`` and
    the function returns ``None``.
    """
    # One name per element: adjacent string literals without a separating
    # comma are concatenated by Python, which would fuse 'unfav_effect' and
    # 'effect_difference' into a single bogus column name.
    qt_allele_columns = [
        'locus', 'favorable', 'fav_effect', 'unfavorable', 'unfav_effect',
        'effect_difference', 'G_0', 'G_2', 'G_4', 'G_6',
        'G_8', 'G_10', 'aggregate',
    ]


In [None]:
def haplotype_diagram(pop, recom_rates):
    """Makes a very simple diagram of the special triplet-qtl
    simulator.

    Args:
        pop: population object; only ``pop.totNumLoci()`` is used.
        recom_rates: per-locus recombination rates, indexable by absolute
            locus index and at least ``pop.totNumLoci()`` long.

    Returns:
        A list with one symbol per locus: ``'*'`` for the locus directly
        after one whose rate equals 0.01, ``'|'`` otherwise. A rate of 0.01
        on the very last locus has no following locus and is ignored.
    """
    num_loci = pop.totNumLoci()
    diagram = ["|"] * num_loci

    for locus in range(num_loci):
        following = locus + 1
        # The mark goes on the locus *after* the breakpoint; skip the one
        # that would fall past the end of the genome.
        if recom_rates[locus] == 0.01 and following < num_loci:
            diagram[following] = '*'
    return diagram

In [None]:
# Echo the sorted list of QTL loci and their immediate neighbours.
triplet_qtl

In [None]:
# One 'o' placeholder per locus of `meta`.
qt = ['o']*meta.totNumLoci()

In [None]:
# Scratch check: mark a single hard-coded position (129) with 'x'.
qt[129] = 'x'

In [None]:
# Echo the marker stored at position 129.
qt[129]

In [None]:
for t in range(meta.totNumLoci()):
    # TODO: the per-locus work for this loop was never written; `pass` keeps
    # the cell syntactically valid until it is.
    pass

In [None]:
# Bundle the per-locus placeholder diagram with the chosen QTL and their
# neighbour-expanded list.
loci_data = dict(
    diagram=['o'] * meta.totNumLoci(),
    qtl=qtl,
    triplet_qtl=triplet_qtl,
)


In [None]:
# Mark every position of `qt` that belongs to triplet_qtl with 'x'.
# The loop runs over positions, not over the 'o'/'x' strings stored in `qt`:
# those strings can never be members of a list of locus indices and are not
# valid list indices either.
triplet_positions = set(triplet_qtl)
for position in range(len(qt)):
    if position in triplet_positions:
        qt[position] = 'x'

In [None]:
# Echo the full per-locus marker list.
qt

In [None]:
def allele_frq_table(pop: "sim.Population", allele_frq_data: dict, genetic_map,
                     halpotype_diagram, database_name='', recom_rates=None):
    """
    Generates a large table which centralizes all allele frequency data.
    The data is inserted into a pandas DataFrame object.
    Useful for downstream analysis and insertion into a database.

    Allele frequency data is first built up in a regular *dict* object
    then inserted into a pandas DataFrame.

    Args:
        pop: population supplying ``totNumLoci()``, ``chromLocusPair()`` and
            ``numSubPop()``.
        allele_frq_data: mapping indexed as ``['minor', 'alleles'][locus]``,
            ``['major', 'alleles'][locus]``, ``['minor', 'frequency'][locus]``
            and ``['minor', 'frequency', subpop][locus]``.
        genetic_map: object whose ``'cM_pos'`` entry holds one cM position
            per locus.
        halpotype_diagram: per-locus diagram symbols stored in column ``v``
            (the parameter name keeps its original spelling so keyword
            callers continue to work).
        database_name: currently unused.
        recom_rates: per-locus recombination rates for column
            ``recom_rate``; when omitted, the module-level
            ``recombination_rates`` is used.

    Returns:
        A DataFrame with one row per locus. Columns for which no data is
        collected (``qtl`` and any unused ``G_*`` column) are left empty.
    """
    data_columns = ['abs_index', 'chrom', 'locus', 'recom_rate', 'cM', 'v',
                    'qtl', 'minor', 'major', 'G_0', 'G_2', 'G_4', 'G_6',
                    'G_8', 'G_10', 'aggregate']

    if recom_rates is None:
        recom_rates = recombination_rates

    loci = range(pop.totNumLoci())
    chrom_locus_pairs = [pop.chromLocusPair(locus) for locus in loci]

    data = {
        'abs_index': list(loci),
        # chromLocusPair numbers chromosomes from 0; the table counts from 1.
        'chrom': [pair[0] + 1 for pair in chrom_locus_pairs],
        'locus': [pair[1] for pair in chrom_locus_pairs],
        'recom_rate': recom_rates,
        'cM': genetic_map['cM_pos'],
        'v': halpotype_diagram,
        'minor': [allele_frq_data['minor', 'alleles'][locus] for locus in loci],
        'major': [allele_frq_data['major', 'alleles'][locus] for locus in loci],
        'aggregate': [allele_frq_data['minor', 'frequency'][locus] for locus in loci],
    }

    # One G_* column per sub-population, filled in order.
    # NOTE(review): assumes sub-population i holds the sample belonging to the
    # i-th G_* column — confirm against how `meta` is assembled. Any
    # sub-populations beyond the available G_* columns are not tabulated.
    generation_columns = [name for name in data_columns if name.startswith('G_')]
    for subpop, column in zip(range(pop.numSubPop()), generation_columns):
        data[column] = [allele_frq_data['minor', 'frequency', subpop][locus]
                        for locus in loci]

    return pd.DataFrame(data, columns=data_columns)

In [None]:
# Build the per-locus table, indexed by absolute locus index.
# NOTE(review): in the visible source `data` and `data_columns` are only
# assigned inside allele_frq_table, not at notebook level — confirm they
# exist here, or build `df` from that function's return value instead.
df = pd.DataFrame(data, index=data['abs_index'], columns=data_columns)

In [None]:
import sqlite3
# Open a connection to the SQLite file 'gwas.db'.
conn = sqlite3.connect('gwas.db')

In [None]:
# Write the table through the open connection in batches of 1000 rows.
# NOTE(review): the first argument of to_sql is the table name, so the table
# itself ends up being called 'gwas.db' — confirm that is intended.
df.to_sql('gwas.db', conn, chunksize=1000)

In [None]:
# Commit the pending transaction on the SQLite connection.
conn.commit()

In [None]:
# Close the SQLite connection; `conn` cannot be used after this cell.
conn.close()

In [None]:
# Turn the `statistics` variable stored on the population into a DataFrame.
# NOTE(review): nothing in the visible source assigns pop.dvars().statistics —
# presumably one of the `s.*` steps records it; confirm.
statistics = pd.DataFrame(pop.dvars().statistics)

In [None]:
# Echo the row labels of the statistics table.
statistics.index

In [None]:
# Plot the statistics table with matplotlib.
plt.plot(statistics)

In [None]:
# Render the current figure.
plt.show()

In [None]:
# Switch matplotlib to the 'ggplot' style sheet. (A stray trailing backtick
# made this line a syntax error.)
plt.style.use('ggplot')

In [None]:
# Turn on matplotlib's interactive mode.
plt.ion()

In [None]:
# Echo how many QTL were chosen.
len(qtl)

In [None]:
# Dump the statistics table, with header row and index column, to a CSV file
# for inspection.
statistics.to_csv('how_is_it_arranged.csv', header=True, index=True)