# Small

In [1]:
import simuOpt
simuOpt.setOptions(alleleType='short', numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
from saegus import breed, operators, simulate, analyze, parse, parameters
import shelve
import numpy as np
import random
import h5py
import collections as col
np.set_printoptions(suppress=True, precision=3)

In [2]:
small = analyze.Study('small')

In [3]:
run_id = 'small'
generations_of_random_mating = 10
number_of_qtl = 20
number_of_replicates = 10
founders = [[2, 26], [3, 25], [4, 24], [5, 23]]
os_per_pair = 500
recombination_rates = [0.01]*1478

In [4]:
prefounders = sim.loadPopulation('bia_prefounders.pop')

In [5]:
prefounders.infoFields()

('ind_id',
 'father_id',
 'mother_id',
 'fitness',
 'p',
 'g',
 'generation',
 'replicate')

In [6]:
sim.tagID(prefounders, reset=True)

In [7]:
prefounders.popSize()

26

In [8]:
multi_prefounders = sim.Simulator(prefounders, 10, stealPops=False)

In [9]:
magic = breed.MAGIC(multi_prefounders, founders, recombination_rates)

In [10]:
magic.generate_f_one(founders, os_per_pair)

In [11]:
mrc = breed.MultiRandomCross(multi_prefounders, 4, 500)

In [12]:
mother_choices, father_choices = mrc.determine_random_cross()

In [13]:
multi_snd_ord_chooser = breed.MultiSecondOrderPairIDChooser(
    mother_choices, father_choices)

In [14]:
multi_prefounders.evolve(
    matingScheme=sim.HomoMating(
        sim.PyParentsChooser(multi_snd_ord_chooser.snd_ord_id_pairs),
        sim.OffspringGenerator(ops=[
            sim.IdTagger(),
            sim.PedigreeTagger(),
            sim.Recombinator(rates=0.01)
        ],
            numOffspring=1),
        subPopSize=[2000],
    ),
    gen=1,
)

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

In [15]:
final_mrc = breed.MultiRandomCross(multi_prefounders, 2, 1000)

In [16]:
final_mothers, final_fathers = final_mrc.determine_random_cross()

In [17]:
final_multi_snd_ord_chooser = breed.MultiSecondOrderPairIDChooser(
    final_mothers, final_fathers)

In [18]:
multi_prefounders.evolve(
    matingScheme=sim.HomoMating(
        sim.PyParentsChooser(final_multi_snd_ord_chooser.snd_ord_id_pairs),
        sim.OffspringGenerator(ops=[
            sim.IdTagger(),
            sim.PedigreeTagger(),
            sim.Recombinator(rates=0.01)
        ],
            numOffspring=1),
        subPopSize=[2000],
    ),
    gen=1,
)

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

# Random Mating Phase

In [19]:
multi_prefounders.evolve(
    matingScheme=sim.RandomMating(ops=[
            sim.IdTagger(),
            sim.PedigreeTagger(),
            sim.Recombinator(rates=0.01)
        ],
        subPopSize=[2000]),
    gen=10,
)

(10, 10, 10, 10, 10, 10, 10, 10, 10, 10)

In [20]:
sample_size = 200

In [21]:
sample_library = small.collect_samples(multi_prefounders, [200])

In [22]:
sample_library

{0: [<simuPOP.Population>],
 1: [<simuPOP.Population>],
 2: [<simuPOP.Population>],
 3: [<simuPOP.Population>],
 4: [<simuPOP.Population>],
 5: [<simuPOP.Population>],
 6: [<simuPOP.Population>],
 7: [<simuPOP.Population>],
 8: [<simuPOP.Population>],
 9: [<simuPOP.Population>]}

In [23]:
for rep_id, sample_list in sample_library.items():
    sim.stat(sample_list[0], numOfSegSites=sim.ALL_AVAIL, vars=['numOfSegSites', 'segSites'])
    sim.stat(sample_list[0], alleleFreq=sim.ALL_AVAIL)

In [24]:
sample = sample_library[0][0]

In [25]:
astates = small.gather_allele_data(sample)

In [26]:
alleles = np.array([astates[:, 1], astates[:, 2]]).T

In [27]:
af = small.gather_allele_frequencies(sample, astates)

In [28]:
segregating_loci = np.array(sample.dvars().segSites)

In [29]:
trait = parameters.Trait()

In [30]:
qtl = sorted(list(random.sample(list(segregating_loci), 20)))

In [31]:
allele_effects = trait.construct_allele_effects_table(alleles, qtl, random.expovariate, 1)

In [32]:
ae_array = trait.construct_ae_array(allele_effects, qtl)

In [33]:
operators.calculate_g(sample, ae_array)

In [34]:
operators.calculate_error_variance(sample, 0.7)

In [35]:
operators.calculate_p(sample)

# Storing Data

In [36]:
small_data = h5py.File('small_data.hdf5')

In [37]:
small_data['allele/states'] = astates
small_data['segregating_loci'] = segregating_loci
small_data['qtl'] = np.array(qtl)
small_data['allele/effects'] = allele_effects

In [38]:
for rep, sample_list in sample_library.items():
    small_data['allele/frequency/replicate/' + str(rep)] = small.gather_allele_frequencies(sample_list[0], astates)
    operators.calculate_g(sample_list[0], ae_array)
    operators.calculate_error_variance(sample_list[0], 0.7)
    operators.calculate_p(sample_list[0])
    small_data['trait/g/replicate/' + str(rep)] = np.array([sample_list[0].indInfo('ind_id'), 
                                                            sample_list[0].indInfo('g')]).T
    small_data['trait/p/replicate/' + str(rep)] = np.array([sample_list[0].indInfo('ind_id'),
                                                          sample_list[0].indInfo('p')]).T
    

In [39]:
small_data['trait'].attrs['heritability'] = np.array([0.7])

In [40]:
segregating_loci

array([   1,    2,    3,    4,    5,    6,    8,   10,   12,   13,   14,
         15,   16,   20,   21,   23,   24,   25,   26,   27,   29,   30,
         31,   32,   34,   36,   37,   38,   39,   40,   42,   45,   46,
         50,   52,   53,   54,   56,   59,   61,   62,   63,   66,   67,
         68,   70,   71,   74,   76,   77,   79,   80,   82,   86,   88,
         90,   91,   95,   96,   97,   98,   99,  100,  101,  103,  104,
        106,  107,  108,  109,  114,  115,  116,  119,  120,  121,  122,
        123,  124,  126,  127,  128,  129,  130,  132,  134,  136,  138,
        139,  140,  141,  144,  145,  146,  147,  149,  152,  153,  155,
        158,  159,  163,  165,  168,  170,  171,  172,  173,  174,  175,
        177,  181,  182,  185,  186,  190,  191,  192,  193,  196,  198,
        202,  208,  209,  211,  212,  214,  215,  218,  219,  220,  221,
        223,  224,  225,  227,  229,  232,  235,  240,  241,  246,  247,
        249,  250,  252,  253,  254,  255,  256,  2

In [41]:
minalls = np.array(small_data['allele/states'], dtype=np.int_)[:, 3]

In [None]:
minalls[segregating_loci]

In [42]:
gwas = analyze.GWAS(sample_library[0][0], segregating_loci, minalls, 'small')

In [43]:
small.single_gen_multi_rep_tassel_input(sample_library, small_data, 'gwas_pipeline.xml')

TypeError: 'str' object cannot be interpreted as an integer

In [None]:
gwas.single_gen_multi_rep_tassel_config(0, 'gwas_pipeline.xml', 
                                        output_prefix = '/home/vakanas/tassel-5-standalone/output/small_output_')

In [None]:
analyze.GWAS()

In [45]:
cm = gwas.calculate_count_matrix()

In [46]:
ps, svd = gwas.pop_struct_eigendecomp(cm)

In [47]:
gwas.population_structure_formatter(ps, svd)

Unnamed: 0,0,1
I240040,-0.025466,-0.027220
I240056,-0.289714,-0.132141
I240136,0.207240,0.153284
I240163,-0.043662,-0.202815
I240182,-0.752386,0.084806
I240223,-0.141871,-0.251398
I240227,-0.533531,0.137630
I240275,0.145696,0.163592
I240304,0.138332,-0.177832
I240325,-0.080273,0.129299


In [48]:
hm = gwas.hapmap_formatter()

In [49]:
hm

Unnamed: 0,rs,alleles,chrom,pos,strand,assembly,center,protLSID,assayLSID,panelLSID,...,I241711,I241823,I241901,I241907,I241919,I241922,I241950,I241966,I241995,I242019
0,1,3,1,0,,,,,,,...,CC,CC,CC,CC,CC,CC,CC,CC,CT,CC
1,2,1,1,1,,,,,,,...,CT,TT,TT,TT,TT,TT,TT,TT,TT,TT
2,3,2,1,2,,,,,,,...,AA,AA,AA,AA,AG,AG,AA,AA,AA,AA
3,4,0,1,3,,,,,,,...,GG,GG,AG,GG,GG,AG,AG,GG,GG,GG
4,5,2,1,4,,,,,,,...,AA,AA,AG,AG,AG,AG,AG,AA,AA,GG
5,6,2,1,5,,,,,,,...,AA,AG,AA,AA,AA,AA,AA,AA,AG,AA
6,8,2,1,6,,,,,,,...,AA,AA,AA,AA,AA,AG,AA,AA,AA,AA
7,10,3,1,7,,,,,,,...,TT,CT,CT,TT,CT,CT,CT,CT,CC,CC
8,12,1,1,8,,,,,,,...,CC,TT,CC,CC,TT,TT,TT,TT,TT,TT
9,13,0,1,9,,,,,,,...,GG,AG,GG,GG,GG,GG,GG,AG,AG,GG


In [50]:
km = gwas.calc_kinship_matrix(cm)

In [51]:
km

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
I240040,1868.28755,1595.38255,1599.41255,1611.88755,1615.84255,1631.21255,1602.38755,1565.04755,1594.87755,1580.30755,...,1570.71755,1599.90755,1622.28755,1593.89255,1569.63255,1607.21755,1594.51255,1607.83255,1593.39755,1591.45255
I240056,1595.38255,1852.47755,1590.50755,1599.98255,1599.93755,1625.30755,1556.48255,1551.14255,1585.97255,1557.40255,...,1567.81255,1593.00255,1585.38255,1572.98755,1585.72755,1569.31255,1551.60755,1571.92755,1569.49255,1588.54755
I240136,1599.41255,1590.50755,1864.53755,1623.01255,1618.96755,1634.33755,1590.51255,1548.17255,1589.00255,1583.43255,...,1578.84255,1607.03255,1612.41255,1597.01755,1569.75755,1579.34255,1595.63755,1584.95755,1599.52255,1597.57755
I240163,1611.88755,1599.98255,1623.01255,1873.48755,1643.44255,1630.81255,1607.98755,1573.64755,1607.47755,1590.90755,...,1600.31755,1633.50755,1633.88755,1580.49255,1588.23255,1613.81755,1593.11255,1596.43255,1579.99755,1599.05255
I240182,1615.84255,1599.93755,1618.96755,1643.44255,1889.39755,1621.76755,1601.94255,1590.60255,1605.43255,1596.86255,...,1592.27255,1646.46255,1613.84255,1564.44755,1600.18755,1627.77255,1603.06755,1638.38755,1612.95255,1607.00755
I240223,1631.21255,1625.30755,1634.33755,1630.81255,1621.76755,1897.13755,1609.31255,1582.97255,1615.80255,1620.23255,...,1612.64255,1634.83255,1636.21255,1592.81755,1621.55755,1618.14255,1610.43755,1610.75755,1596.32255,1629.37755
I240227,1602.38755,1556.48255,1590.51255,1607.98755,1601.94255,1609.31255,1858.48755,1573.14755,1562.97755,1576.40755,...,1556.81755,1595.00755,1617.38755,1575.99255,1544.73255,1599.31755,1560.61255,1586.93255,1559.49755,1568.55255
I240275,1565.04755,1551.14255,1548.17255,1573.64755,1590.60255,1582.97255,1573.14755,1796.80755,1550.63755,1549.06755,...,1551.47755,1565.66755,1587.04755,1532.65255,1538.39255,1551.97755,1518.27255,1559.59255,1550.15755,1558.21255
I240304,1594.87755,1585.97255,1589.00255,1607.47755,1605.43255,1615.80255,1562.97755,1550.63755,1834.46755,1574.89755,...,1587.30755,1613.49755,1585.87755,1557.48255,1577.22255,1590.80755,1568.10255,1589.42255,1559.98755,1614.04255
I240325,1580.30755,1557.40255,1583.43255,1590.90755,1596.86255,1620.23255,1576.40755,1549.06755,1574.89755,1827.32755,...,1564.73755,1601.92755,1582.30755,1565.91255,1568.65255,1573.23755,1566.53255,1582.85255,1568.41755,1565.47255


# Correcting the Kinship Matrix Calculation

In [None]:
M = np.matrix(allele_count_matrix - 1)
P = 2*(allele_frequencies - 0.5)
Z = M - P
scaling_terms = np.zeros((len(self.loci)))
for idx, probability in enumerate(allele_frequencies):
    scaling_terms[idx] = 2*probability*(1 - probability)

scaling_factor = sum(scaling_terms)

G = Z*Z.T/scaling_factor

In [120]:
M = np.matrix(cm - 1)
P = 2*(segmin_af - 0.5)

In [121]:
Z = M - P

In [122]:
scaling_terms = np.zeros((943))

In [123]:
for idx, prob in enumerate(segmin_af):
    scaling_terms[idx] = 2*prob*(1 - prob)

In [124]:
scaling_factor = sum(scaling_terms)

In [127]:
G = (Z*Z.T)/scaling_factor

In [128]:
G

matrix([[ 0.954,  0.023, -0.021, ...,  0.014,  0.016, -0.036],
        [ 0.023,  1.026,  0.011, ..., -0.052, -0.007,  0.019],
        [-0.021,  0.011,  0.953, ..., -0.063,  0.044, -0.007],
        ..., 
        [ 0.014, -0.052, -0.063, ...,  0.99 ,  0.043,  0.01 ],
        [ 0.016, -0.007,  0.044, ...,  0.043,  0.96 ,  0.066],
        [-0.036,  0.019, -0.007, ...,  0.01 ,  0.066,  0.996]])

In [52]:
np.array(small_data['allele/frequencies/replicate/0'])

KeyError: 'Unable to open object (Component not found)'

In [58]:
minor_allele_frequencies = np.array(small_data['allele/frequency/replicate/0'])[segregating_loci, 3]

In [61]:
segmin_af = minor_allele_frequencies[segregating_loci]

In [62]:
np.argmax(segmin_af)

135

In [None]:
V = ((-1)*count_matrix) + 1

In [92]:
V = cm

In [93]:
V

array([[0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 1],
       ..., 
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
P = np.array([self.pop.dvars().alleleFreq[locus][allele]
                  for locus, allele in zip(self.segregating_loci,
                       self.segregating_minor_alleles)])

In [66]:
P = np.array(small_data['allele/frequency/replicate/0'])[segregating_loci, 3]

In [103]:
P = 1 - P

In [None]:
Z = np.zeros((self.pop.popSize(),
                        len(self.segregating_loci)))



In [104]:
Z = np.zeros((200, 943))

In [None]:
G = np.zeros((self.pop.popSize(), self.pop.popSize()), dtype=np.float)

In [105]:
G = np.zeros((200, 200))

In [106]:
for i in range(self.pop.popSize()):
    Z[i, :] = V[i, :] - 2*(P - 0.5)

NameError: name 'self' is not defined

In [107]:
for i in range(200):
    Z[i, :] = V[i, :] - 2*(P - 0.5)

In [108]:
Z

array([[-0.805, -0.76 , -0.675, ...,  0.665, -0.49 , -0.72 ],
       [ 0.195, -0.76 , -0.675, ...,  0.665, -0.49 , -0.72 ],
       [-0.805,  0.24 , -0.675, ...,  0.665, -0.49 ,  0.28 ],
       ..., 
       [-0.805, -0.76 , -0.675, ...,  0.665, -0.49 , -0.72 ],
       [ 0.195, -0.76 , -0.675, ...,  0.665, -0.49 ,  0.28 ],
       [-0.805, -0.76 , -0.675, ..., -0.335, -0.49 , -0.72 ]])

In [74]:
Z

array([[ 1.805,  1.76 ,  1.675, ...,  0.335,  1.49 ,  1.72 ],
       [ 0.805,  1.76 ,  1.675, ...,  0.335,  1.49 ,  1.72 ],
       [ 1.805,  0.76 ,  1.675, ...,  0.335,  1.49 ,  0.72 ],
       ..., 
       [ 1.805,  1.76 ,  1.675, ...,  0.335,  1.49 ,  1.72 ],
       [ 0.805,  1.76 ,  1.675, ...,  0.335,  1.49 ,  0.72 ],
       [ 1.805,  1.76 ,  1.675, ...,  1.335,  1.49 ,  1.72 ]])

In [None]:
for i in range(self.pop.popSize()):
    for j in range(self.pop.popSize()):
        G[i, j] = np.sum(Z[i, :]*Z.T[:, j])

In [111]:
for i in range(200):
    G[i, i] = np.sum(Z[i, :]*Z.T[:, i])

In [116]:
Z[:, 0]

array([-0.805,  0.195, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
       -0.805, -0.805,  0.195, -0.805, -0.805, -0.805,  0.195, -0.805,
       -0.805,  0.195,  0.195, -0.805, -0.805, -0.805, -0.805, -0.805,
       -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
       -0.805, -0.805, -0.805,  0.195, -0.805, -0.805,  0.195, -0.805,
       -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
       -0.805,  0.195, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
       -0.805,  0.195, -0.805, -0.805, -0.805, -0.805,  0.195, -0.805,
       -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
       -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
        0.195,  0.195, -0.805, -0.805,  0.195, -0.805, -0.805,  0.195,
       -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,  0.195,
       -0.805, -0.805, -0.805,  0.195, -0.805, -0.805, -0.805,  0.195,
       -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805, -0.805,
      

In [119]:
Z.T[0, :] == Z[:, 0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [113]:
G

array([[ 524.508,  263.603,  257.633, ...,  273.053,  269.618,  251.673],
       [ 263.603,  532.698,  260.728, ...,  249.148,  257.713,  260.768],
       [ 257.633,  260.728,  524.758, ...,  252.178,  277.743,  259.798],
       ..., 
       [ 273.053,  249.148,  252.178, ...,  546.598,  283.163,  270.218],
       [ 269.618,  257.713,  277.743, ...,  283.163,  530.728,  281.783],
       [ 251.673,  260.768,  259.798, ...,  270.218,  281.783,  532.838]])

In [112]:
G

array([[ 524.508,  263.603,  257.633, ...,  273.053,  269.618,  251.673],
       [ 263.603,  532.698,  260.728, ...,  249.148,  257.713,  260.768],
       [ 257.633,  260.728,  524.758, ...,  252.178,  277.743,  259.798],
       ..., 
       [ 273.053,  249.148,  252.178, ...,  546.598,  283.163,  270.218],
       [ 269.618,  257.713,  277.743, ...,  283.163,  530.728,  281.783],
       [ 251.673,  260.768,  259.798, ...,  270.218,  281.783,  532.838]])

In [89]:
(Z*Z).shape

(200, 943)

In [None]:
eigenvectors = [ps[i]*svd[i] for i in range(2)]

In [None]:
eigvals = np.array(ps, dtype=np.float_)

In [None]:
np.sort_complex(ps)

In [None]:
svd.dtype

In [None]:
eigenvectors[0].dtype

In [None]:
ps.dtype

In [None]:
svd.dtype

In [None]:
(ps[0]*svd[0]).dtype

In [None]:
svd.dtype

In [None]:
ps.dtype

In [None]:
ps[0]*svd[:, 0]

In [None]:
cm.dtype

In [None]:
minalls[segregating_loci]

In [None]:
small_data.

In [None]:
indir = "/home/vakanas/tassel-5-standalone/input/"
outdir = "/home/vakanas/tassel-5-standalone/output/"
rep_id_name = "0"

In [None]:
gwas = analyze.GWAS(meta_pop, list(range(meta_pop.totNumLoci())), run_id)

In [None]:
ccm = gwas.calculate_count_matrix(minor_alleles, list(range(meta_pop.totNumLoci())))

In [None]:
ps_svd = gwas.pop_struct_svd(ccm)

In [None]:
name = run_id+'_'+rep_id_name

In [None]:
gwas.population_structure_formatter(ps_svd, indir+name+'_structure_matrix.txt')

In [None]:
int_to_snp_map = {0:'A', 1:'C', 2:'G', 3:'T', 4:'-', 5:'+'}

In [None]:
locus_names = list(concordant_segregating_loci)

In [None]:
alleles_column = ['NA']*len(concordant_segregating_loci)

In [None]:
chromosomes = [meta_pop.chromLocusPair(locus)[0]+1 for locus in concordant_segregating_loci]

In [None]:
gwas.hapmap_formatter(concordant_segregating_loci, alleles_column, 
                      locus_names, chromosomes, 
                      locus_names, 
                      indir+name+'_simulated_hapmap.txt')

In [None]:
minor_allele_frequency_table = analyze.minor_allele_frequencies_table(
        meta_pop.dvars().alleleFreq, minor_alleles)

In [None]:
minor_allele_frequencies = np.array(minor_allele_frequency_table.minor_frequency)

In [None]:
minor_allele_frequencies

In [None]:
gwas.calc_kinship_matrix(ccm, minor_allele_frequencies, indir+name+'_kinship_matrix.txt')

In [None]:
gwas.trait_formatter(indir+name+'_trait_vector.txt')

In [None]:
import xml.etree.ElementTree as ET
import lxml.etree as etree

In [None]:
config_file_template = '/home/vakanas/BISB/rjwlab-scripts/saegus_project/devel/magic/1478/gwas_pipeline.xml'

In [None]:
tree = ET.parse(config_file_template)
root = tree.getroot()
lxml_tree = etree.fromstring(ET.tostring(root))
lxml_root = lxml_tree.getroottree()

In [None]:
lxml_root.find('fork1/h').text = indir+name+'_simulated_hapmap.txt'
lxml_root.find('fork2/t').text = indir+name+'_trait_vector.txt'
lxml_root.find('fork3/q').text = indir+name+'_structure_matrix.txt'
lxml_root.find('fork4/k').text = indir+name+'_kinship_matrix.txt'

lxml_root.find('combine6/export').text = outdir+name+'_out_'

In [None]:
lxml_root.write("/home/vakanas/tassel-5-standalone/"+"R"+rep_id_name+'_'+
                run_id+'_'+"_sim_gwas_pipeline.xml",
                encoding="UTF-8",
                method="xml", 
                xml_declaration=True, 
                standalone='',
                pretty_print=True) 

# Run TASSEL at This Point

# Use R Qvalue package to get Qvalues

# Combine TASSEL, Qvalues and Other Information

In [None]:
qvalues = pd.read_csv("/home/vakanas/tassel-5-standalone/output/epsilon_0_qvalues.txt", sep='\t')

In [None]:
qvalues.index = list(concordant_segregating_loci)

In [None]:
qvalues

In [None]:
raw_gwas_results = pd.read_csv("/home/vakanas/tassel-5-standalone/output/epsilon_0_out_2.txt", sep='\t')

In [None]:
raw_gwas_results.drop(0, axis=0, inplace=True)

In [None]:
raw_gwas_results.drop('Trait', axis=1, inplace=True)

In [None]:
raw_gwas_results.index = np.array(list(map(int, raw_gwas_results.Marker)))

In [None]:
raw_gwas_results

In [None]:
raw_gwas_results = raw_gwas_results.join(qvalues)

In [None]:
minor_allele_frequencies

In [None]:
mafrqs = analyze.minor_allele_frequencies_table(meta_pop.dvars().alleleFreq, minor_alleles)

In [None]:
raw_gwas_results = raw_gwas_results.join(mafrqs.ix[concordant_segregating_loci, :])

In [None]:
raw_gwas_results

In [None]:
def tassel_results_tables(gwas_file_name, q_values_file_name, 
                              minor_allele_frequency_table, 
                              quantitative_allele_table):
    raw_gwas_results = pd.read_csv(gwas_file_name, sep='\t')
    raw_gwas_results.drop(0, axis=0, inplace=True)
    raw_gwas_results.drop('Trait', axis=1, inplace=True)
    raw_gwas_results.index = np.array(list(map(int, raw_gwas_results.Marker)))
    q_values = pd.read_csv(q_values_file_name, sep='\t')
    q_values.index = np.array(list(map(int, raw_gwas_results.Marker)))
    raw_gwas_results = raw_gwas_results.join(q_values)
    
    assert minor_allele_frequency_table.index.dtype == raw_gwas_results.index.dtype, "Indexes of these tables are different"
    
    raw_gwas_results = raw_gwas_results.join(minor_allele_frequency_table.ix[raw_gwas_results.index, :])
    
    assert quantitative_allele_table.index.dtype == raw_gwas_results.index.dtype, "Indexes of these tables are different"
    
    raw_gwas_results = raw_gwas_results.join(quantitative_allele_table.ix[raw_gwas_results.index, :])
    return raw_gwas_results

In [None]:
pwd

In [None]:
cd /home/vakanas/tassel-5-standalone/output/

In [None]:
ls

In [None]:
mafrqs = pd.read_csv('epsilon_0_maf_table.txt', sep='\t', index_col=0)

In [None]:
mafrqs

In [None]:
qtad = pd.read_csv('epsilon_0_quant_allele_table.txt', sep='\t', index_col=0)

In [None]:
qtad

In [None]:
super_table = tassel_results_tables('epsilon_0_out_2.txt', 'epsilon_0_qvalues.txt', mafrqs, qtad)

In [None]:
super_table.ix[super_table.q < 0.05]

In [None]:
super_table.ix[super_table.alpha_effect > 0]

In [None]:
mg.multiple_sample_analyzer(meta_populations, qtl, allele_effects, 
                            minor_alleles, concordant_segregating_loci)

In [None]:
analyze.store_allele_effect_frequency_tables(meta_population, alleles, 
                                             qtl,
                                             exponential_allele_effects,
                                            run_id, 'exponential')

In [None]:
loci_conversions = shelve.open(run_id+'_loci_conversions')
saegus_to_tassel_loci = {}
tassel_to_saegus_loci = {}
for idx, locus in enumerate(concordant_segregating_loci):
    saegus_to_tassel_loci[locus] = idx
    tassel_to_saegus_loci[idx] = locus
loci_conversions['saegus_to_tassel'] = saegus_to_tassel_loci
loci_conversions['tassel_to_saegus'] = tassel_to_saegus_loci
loci_conversions.close()

In [None]:
seg_loc_storage = shelve.open('segregating_loci_storage')
seg_loc_storage['bacchus'] = concordant_segregating_loci
seg_loc_storage.close()

In [None]:
int_to_snp = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: '+'}
snp_to_int = {'A': 0, 'C': 1, '-': 4, 'G': 2, '+': 5, 'T': 3}
conv = shelve.open('synthesis_parameters')
conv['integer_to_snp'] = int_to_snp
conv['snp_to_integer'] = snp_to_int
conv.close()

In [None]:
exponential_allele_effects_table = analyze.generate_allele_effects_table(qtl, alleles, 
                                                exponential_allele_effects, saegus_to_tassel_loci)

In [None]:
analyze.remap_allele_frequency_table_loci(analyze.reload_allele_frequencies_table(run_id, 0, 250, 
                                                                                  'exponential'), 
                                          concordant_segregating_loci)

In [None]:
analyze.write_multiple_sample_analyzer(sample_library, sample_sizes, qtl, alleles, 
                                       exponential_allele_effects, 0.7,  concordant_segregating_loci, 
                                       run_id=run_id, sub_run_id='_exponential', 
                                       allele_frequency_hdf=run_id+'_allele_frequency_storage.h5')

In [None]:
multiple_sample_analyzer(meta_populations, qtl, allele_effects, minor_alleles, concordant_segregating_loci)

In [None]:
import h5py

In [None]:
with h5py.File('bia_allele_frequencies.hdf5') as biaf:
    reloaded_af = np.array(biaf[afname])

In [None]:
fisegloc = list(concordant_segregating_loci)

In [None]:
minor_allele_frequencies = reloaded_af[fisegloc]

In [None]:
def write_super_tables(power_and_fpr_raw_data, sample_sizes, number_of_replicates, run_id, sub_run_id=''):
    for size in sample_sizes:
        for rep in range(number_of_replicates):
            name = run_id + '_' + sub_run_id + '_' + str(rep) + '_' + str(size) + '_super_table.txt'
            power_and_fpr_raw_data[size][rep].to_csv(name, sep='\t')

In [None]:
expo_power_fpr_raw_data = analyze.collect_power_analysis_data(run_id, sample_sizes, number_of_replicates, concordant_segregating_loci, 'exponential')

In [None]:
expo_power_fpr_raw_data[250]

In [None]:
write_super_tables(expo_power_fpr_raw_data,
                  sample_sizes,
                  number_of_replicates,
                  'bacchus',
                  sub_run_id='exponential')

In [None]:
expo_results, expo_true_positives, expo_false_positives = study.calculate_power_fpr(expo_power_fpr_raw_data, sample_sizes, 
                                                                             number_of_replicates, number_of_qtl)

In [None]:
expo_results

In [None]:
mean_and_stdev = pd.DataFrame([expo_results.mean(), expo_results.std()], index=['mean', 'stdev']).T
mean_and_stdev.to_csv('bacchus_exponential_mean_and_stdev_power_fpr.csv', sep='\t')

In [None]:
geo_results

In [None]:
geometric_allele_effects_table

In [None]:
exponential_allele_effects_table

In [None]:
expo_results.to_csv("bacchus_exponential_power_fpr_results.txt", sep='\t')

In [None]:
mean_and_stdev = pd.DataFrame([geo_results.mean(), geo_results.std()], index=['mean', 'stdev']).T
mean_and_stdev.to_csv('full_icecrown_geometric_mean_and_stdev_power_fpr.txt', sep='\t')

In [None]:
expo_results, expo_true_positives, expo_false_positives = full_icecrown.calculate_power_fpr(expo_power_fpr_raw_data,
                                                                                      sample_sizes,
                                                                                      number_of_replicates,
                                                                                      number_of_qtl)

In [None]:
expo_results

In [None]:
expo_results.to_csv('full_icecrown_exponential_power_fpr_results.txt', sep='\t')

In [None]:
mean_and_stdev = pd.DataFrame([expo_results.mean(), expo_results.std()], index=['mean', 'stdev']).T
mean_and_stdev.to_csv('full_icecrown_exponential_mean_and_stdev_power_fpr.txt', sep='\t')

In [None]:
write_super_tables(expo_power_fpr_raw_data, sample_sizes, number_of_replicates, run_id, 'exponential')

In [None]:
geo_aggregate_estimated_actual = pd.DataFrame([np.array(geo_agg_estimated), np.array(geo_agg_actual)], index=['estimated', 'actual']).T

In [None]:
geo_aggregate_estimated_actual['estimated'] = geo_aggregate_estimated_actual['estimated'].apply(np.fabs)

In [None]:
geo_aggregate_estimated_actual

In [None]:
geo_corr = geo_aggregate_estimated_actual['estimated'].corr(geo_aggregate_estimated_actual['actual'])

In [None]:
geo_agg_estimated

In [None]:
aggregate_estimated_actual

In [None]:
geo_corr

In [None]:
pwd

In [None]:
geo_aggregate_estimated_actual.to_csv('full_icecrown_geometric_estimated_vs_actual_allele_effects.txt', sep='\t')

In [None]:
agg_estimated = []
agg_actual = []

In [None]:
for rep in reps:
    for size in sample_sizes:
        sutable = sutable_collection[rep][size]
        droppable = list(sutable.ix[sutable.ix[:, 'difference'] == 0.0].index)
        qtloci = sutable.drop(droppable, axis=0)
        agg_estimated.extend(list(qtloci['add_effect']))
        agg_actual.extend(list(qtloci['difference']))

In [None]:
aggregate_estimated_actual = pd.DataFrame([np.array(agg_estimated), np.array(agg_actual)], index=['estimated', 'actual']).T

In [None]:
aggregate_estimated_actual['estimated'] = np.fabs(aggregate_estimated_actual['estimated'])

In [None]:
aggregate_estimated_actual

In [None]:
correlation_actual_vs_effects = aggregate_estimated_actual['estimated'].corr(aggregate_estimated_actual['actual'])

In [None]:
aggregate_estimated_actual.to_csv('full_icecrown_exponential_estimated_vs_actual_allele_effects.txt', sep='\t')

In [None]:
aggregate_estimated_actual['estimated'] = np.fabs(aggregate_estimated_actual['estimated'])

In [None]:
cd C:\tassel\output\full_icecrown\exponential

In [None]:
expo_estimated_actual = pd.read_csv('full_icecrown_exponential_estimated_vs_actual_allele_effects.txt', sep='\t', index_col=0)

In [None]:
expo_estimated_actual

In [None]:
aggregate_estimated_actual

In [None]:
, from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

In [None]:
output_notebook()

In [None]:
aggregate_estimated_actual

In [None]:
geo_x = aggregate_estimated_actual['estimated']
geo_y = aggregate_estimated_actual['actual']

In [None]:
p = figure(title="Estimated vs Actual Allele Effects - Geometric Series", 
           title_text_font_size="16",
          x_range=(-0.2, 4))

In [None]:
p.scatter(geo_x, y, x="Estimated", y="Actual")

p.xaxis.axis_label = "Estimated"
p.yaxis.axis_label = "Actual"

In [None]:
show(p)

In [None]:
expo

In [None]:
p = figure(title="Estimated vs Actual Allele Effects - Geometric Series", title_text_font_size="16")

In [None]:
expo_plot = figure(title="Estimated vs Actual Effects - Exponential(lambda=1)", 
                   title_text_font_size="16", 
                  x_range=(0, 4))

x = np.array(expo_estimated_actual['estimated'])
y = np.array(expo_estimated_actual['actual'])

expo_plot.xaxis.axis_label = "Estimated"
expo_plot.yaxis.axis_label = "Actual"

In [None]:
expo_plot.scatter(x, y)

In [None]:
show(expo_plot)

In [None]:
from bokeh.io import hplot

In [None]:
geo_plot = figure(title="Estimated vs Actual Allele Effects - Geometric Series", 
           title_text_font_size="16",
          x_range=(0, 4), y_range=(0, 4))

In [None]:
geo_x = aggregate_estimated_actual['actual']
geo_y = aggregate_estimated_actual['estimated']

In [None]:
geo_plot.xaxis.axis_label = "Actual"
geo_plot.yaxis.axis_label = "Estimated"
geo_plot.scatter(geo_x, geo_y, x="Actual", y="Estimated")

In [None]:
expo_plot = figure(title="Estimated vs Actual Effects - Exponential(lambda=1)", 
                   title_text_font_size="16", 
                  x_range=(0, 4), y_range=(0, 4))

expo_x = np.array(expo_estimated_actual['actual'])
expo_y = np.array(expo_estimated_actual['estimated'])

expo_plot.xaxis.axis_label = "Actual"
expo_plot.yaxis.axis_label = "Estimated"
expo_plot.scatter(expo_x, expo_y)

In [None]:
multi_plot = hplot(geo_plot, expo_plot)
show(multi_plot)

In [None]:
output_file("multi_plot.png")

In [None]:
ls