# Small

In [1]:
import simuOpt
simuOpt.setOptions(alleleType='short', numThreads=4, quiet=True)
import simuPOP as sim
import pandas as pd
from saegus import breed, operators, simulate, analyze, parse, parameters
import shelve
import numpy as np
import random
import h5py
from os import path
import collections as col
np.set_printoptions(suppress=True, precision=3)
pd.options.display.float_format = '{:.4f}'.format

In [2]:
small_data = h5py.File('small_data.hdf5')

In [3]:
small = analyze.Study('small', number_of_replicates=10, data_file=small_data)

In [4]:
run_id = 'small'
generations_of_random_mating = 10
number_of_qtl = 30
number_of_replicates = 10
founders = [[2, 26], [3, 25], [4, 24], [5, 23]]
os_per_pair = 1000
mating_pop_size = len(founders)*os_per_pair
recombination_rates = [0.01]*1478
sample_size = 1000

In [5]:
prefounders = sim.loadPopulation('bia_prefounders.pop')

In [6]:
prefounders.infoFields()

('ind_id',
 'father_id',
 'mother_id',
 'fitness',
 'p',
 'g',
 'generation',
 'replicate')

In [7]:
sim.tagID(prefounders, reset=True)

In [8]:
prefounders.popSize()

26

In [9]:
multi_prefounders = sim.Simulator(prefounders, 10, stealPops=False)

In [10]:
magic = breed.MAGIC(multi_prefounders, founders, recombination_rates)

In [11]:
magic.generate_f_one(founders, os_per_pair)

In [12]:
mrc = breed.MultiRandomCross(multi_prefounders, 4, 1000)

In [13]:
mother_choices, father_choices = mrc.determine_random_cross()

In [14]:
multi_snd_ord_chooser = breed.MultiSecondOrderPairIDChooser(
    mother_choices, father_choices)

In [15]:
multi_prefounders.evolve(
    matingScheme=sim.HomoMating(
        sim.PyParentsChooser(multi_snd_ord_chooser.snd_ord_id_pairs),
        sim.OffspringGenerator(ops=[
            sim.IdTagger(),
            sim.PedigreeTagger(),
            sim.Recombinator(rates=0.01)
        ],
            numOffspring=1),
        subPopSize=[mating_pop_size],
    ),
    gen=1,
)

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

In [16]:
final_mrc = breed.MultiRandomCross(multi_prefounders, 2, 2000)

In [17]:
final_mothers, final_fathers = final_mrc.determine_random_cross()

In [18]:
final_multi_snd_ord_chooser = breed.MultiSecondOrderPairIDChooser(
    final_mothers, final_fathers)

In [19]:
multi_prefounders.evolve(
    matingScheme=sim.HomoMating(
        sim.PyParentsChooser(final_multi_snd_ord_chooser.snd_ord_id_pairs),
        sim.OffspringGenerator(ops=[
            sim.IdTagger(),
            sim.PedigreeTagger(),
            sim.Recombinator(rates=0.01)
        ],
            numOffspring=1),
        subPopSize=[mating_pop_size],
    ),
    gen=1,
)

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

# Random Mating Phase

In [20]:
multi_prefounders.evolve(
    matingScheme=sim.RandomMating(ops=[
            sim.IdTagger(),
            sim.PedigreeTagger(),
            sim.Recombinator(rates=0.01)
        ],
        subPopSize=[mating_pop_size]),
    gen=100,
)

(100, 100, 100, 100, 100, 100, 100, 100, 100, 100)

In [21]:
sample_library = small.collect_samples(multi_prefounders, [sample_size])

In [22]:
for rep_id, sample_list in sample_library.items():
    sim.stat(sample_list[0], numOfSegSites=sim.ALL_AVAIL, vars=['numOfSegSites', 'segSites'])
    sim.stat(sample_list[0], alleleFreq=sim.ALL_AVAIL)

In [23]:
list_of_segs = [sample_library[i][0].dvars().segSites for i in range(number_of_replicates)]

In [24]:
commonly_segregating_loci = list(set(sample_library[0][0].dvars().segSites).intersection(*list_of_segs))

In [25]:
len(commonly_segregating_loci)

943

In [26]:
sample = sample_library[0][0]

In [27]:
astates = small.gather_allele_data(sample)

In [28]:
alleles = np.array([astates[:, 1], astates[:, 2]]).T

In [29]:
segregating_loci = np.array(commonly_segregating_loci)

In [30]:
trait = parameters.Trait()

In [31]:
qtl = sorted(list(random.sample(list(segregating_loci), number_of_qtl)))

In [32]:
allele_effects = trait.construct_allele_effects_table(alleles, qtl, random.expovariate, 1)

In [None]:
allele_effects[qtl]

In [33]:
ae_array = trait.construct_ae_array(allele_effects, qtl)

# Storing Data

In [34]:
small_data['allele/states'] = astates
small_data['segregating_loci'] = np.array(commonly_segregating_loci)
small_data['qtl'] = np.array(qtl)
small_data['allele/effects'] = allele_effects
small_data['recombination_rates'] = np.array(recombination_rates)
small_data['allele/effects_array'] = ae_array

In [35]:
for rep, sample_list in sample_library.items():
    small_data['allele/frequency/replicate/' + str(rep)] = small.gather_allele_frequencies(sample_list[0], astates)
    operators.calculate_g(sample_list[0], ae_array)
    operators.calculate_error_variance(sample_list[0], 0.7)
    operators.calculate_p(sample_list[0])
    small_data['trait/g/replicate/' + str(rep)] = np.array([sample_list[0].indInfo('ind_id'), 
                                                            sample_list[0].indInfo('g')]).T
    small_data['trait/p/replicate/' + str(rep)] = np.array([sample_list[0].indInfo('ind_id'),
                                                          sample_list[0].indInfo('p')]).T
    

In [36]:
small_data['trait'].attrs['heritability'] = np.array([0.7])

In [37]:
minor_alleles = np.array(small_data['allele/states'])[:, 3]

In [38]:
indir = '/home/vakanas/tassel-5-standalone/input'
outdir = '/home/vakanas/tassel-5-standalone/output'

In [39]:
gwas = analyze.GWAS(sample_library[0][0], segregating_loci, minor_alleles, 'small')
cm = gwas.calculate_count_matrix()
val, vec = gwas.pop_struct_eigendecomp(cm)

In [43]:
print("Computing TASSEL Input:")
for rep, sample_list in sample_library.items():
    name = small.run_id + '_' + str(rep)
    print("{current_rep}\t".format(current_rep=str(rep)))
    minor_allele_fs = np.array(small_data['allele/frequency/replicate/' + str(rep)])[segregating_loci, 3]
    gwas = analyze.GWAS(sample_list[0], segregating_loci, minor_alleles, 'small')
    cm = gwas.calculate_count_matrix(count_matrix_file_name=path.join(indir, name+'_count_matrix.txt'))
    ps, svd = gwas.pop_struct_eigendecomp(cm)
    gwas.population_structure_formatter(ps, svd, number_of_pcs=2, 
                                        pop_struct_file_name=path.join(indir, name+'_structure_matrix.txt'))
    gwas.trait_formatter(trait_file_name=path.join(indir, name+'_phenotype_vector.txt'))
    gwas.calc_kinship_matrix(cm, minor_allele_fs, kinship_matrix_file_name=path.join(indir, name+'_kinship_matrix.txt'))
    gwas.hapmap_formatter(hapmap_file_name=path.join(indir, name+'_simulated_hapmap.txt'))
    gwas.single_gen_multi_rep_tassel_config(rep, 'gwas_pipeline.xml', output_prefix=name+'_out_')

Computing TASSEL Input:
0	
1	
2	
3	
4	
5	
6	
7	
8	


LinAlgError: SVD did not converge

# Run TASSEL at This Point

Contents of bash script to automate TASSEL via configuration files.

### simulated_mlm.sh

```bash
#!/bin/bash


echo "Run ID: $1, Number of Replicates $2"
run_id=$1
number_of_replicates=$2
final_rep_index="$((number_of_replicates - 1))"

echo "Beginning TASSEL analysis of Run ID: $run_id"
echo "Number of Replicates: $number_of_replicates"
echo "First configuration file: small_0_gwas_pipeline.xml"onca
echo "Final configuration file: small_"$final_rep_index"_gwas_pipeline.xml"

for i in `seq 0 $final_rep_index`
do
    config_file_name=$run_id$i"_gwas_pipeline.xml"
    echo "$config_file_name"
    ./run_pipeline.pl -Xmx6g -configFile $config_file_name
done


```

### Example output: 
+ small_0_out_1.txt
+ small_0_out_2.txt
+ small_0_out_3.txt
+ small_1_out_1.txt
+ ...
+ small_9_out_3.txt

# Use R Qvalue package to get Qvalues

Contents of R script to obtain qvalues for p column of TASSEL results

```R
#!/usr/bin/env Rscript

library(qvalue)
library(ggplot2)
library(gap)

args = commandArgs(trailingOnly=TRUE)

# test to determine if the file name parameter is supplied to the script
if (length(args)==0) {
  stop("At least one argument must be suppled (input file).\n", call.=FALSE)
}
#setwd("/home/vakanas/tassel-5-standalone/output")  

run_id = args[1]
file_name_match_pattern = paste(run_id, "(.*)_2.txt", sep='')
file_names = list.files(pattern = file_name_match_pattern)

for(n in file_names) {
    print(n)
    input_file_name = n
    run_id_prefix_terminus = nchar(input_file_name) - 5
    run_id_prefix = substring(input_file_name, 1, run_id_prefix_terminus)
    output_file_name = paste(run_id_prefix, 'q_values.txt', sep='')
    print(output_file_name)
    results_header = scan(input_file_name, what="character", nlines=1, sep="\t")
    gwas_results = read.table(input_file_name, header=F, row.names = NULL, skip=2)
    colnames(gwas_results) = results_header
    pvalues = gwas_results$p
    qobj = qvalue(p = pvalues)
    qvalues = data.frame(qobj$qvalues)
    colnames(qvalues) = "q"
    rownames(qvalues) = gwas_results$Marker
    write.table(qvalues, output_file_name, sep="\t")
}

```

# Analysis of TASSEL Results: Comutation of Power & FPR

## Subsetting Raw TASSEL Results and Data Storage
    Each replicate has an associated set of TASSEL output files. The raw
    results are modified and stored in the run's HDF5 file

## Statistical Power and False Positive Rate

In [44]:
power_fprs = small.calculate_power_false_positive_rate(qtl, allele_effects, 8, 
                                                       hdf5_file=small_data)

In [45]:
power_fprs

array([[ 0.   ,  0.033,  0.1  ],
       [ 1.   ,  0.033,  0.133],
       [ 2.   ,  0.033,  0.067],
       [ 3.   ,  0.033,  0.1  ],
       [ 4.   ,  0.033,  0.1  ],
       [ 5.   ,  0.033,  0.1  ],
       [ 6.   ,  0.033,  0.1  ],
       [ 7.   ,  0.033,  0.1  ]])

In [59]:
power_fprs_frame = pd.DataFrame(power_fprs[:, 1:], index=np.array(power_fprs[:, 0], dtype=np.int) ,columns=['power', 'false_positive_rate'])

In [101]:
list(set(detected_loci).intersection(qtl))

[610, 1102, 338, 310, 23, 539, 158, 191]

In [115]:
power_and_fprs = np.zeros((8, 3))
loci_tracker = {}
for i in range(8):
    results = pd.DataFrame(np.array(small_data['tassel/test/replicate/'+str(i)]), index=segregating_loci)
    detected_loci = results.ix[results.ix[:, 4] < 0.05].index
    set_of_true_positives = set(detected_loci).intersection(set(qtl))
    true_positives = sorted(list(set_of_true_positives))
    loci_tracker[i] = true_positives
    power = len(true_positives) / 30
    print("RepID: {rep_id}\tTrue Positive Loci: {true_pos}\tPower: {powe}".format(
        rep_id=i, true_pos=true_positives, 
        powe=power))
    power_and_fprs[i, 0] = i
    power_and_fprs[i, 1] = power

RepID: 0	True Positive Loci: [23, 158, 191, 310, 338, 539, 610, 1102]	Power: 0.26666666666666666
RepID: 1	True Positive Loci: [23, 158, 191, 310, 338, 539, 610]	Power: 0.23333333333333334
RepID: 2	True Positive Loci: [158, 191, 338, 539, 610]	Power: 0.16666666666666666
RepID: 3	True Positive Loci: [23, 158, 191, 310, 338, 539, 610, 1102]	Power: 0.26666666666666666
RepID: 4	True Positive Loci: [23, 158, 191, 338, 539, 888, 1102]	Power: 0.23333333333333334
RepID: 5	True Positive Loci: [23, 158, 191, 338, 539, 1102]	Power: 0.2
RepID: 6	True Positive Loci: [23, 95, 158, 191, 338, 539, 888]	Power: 0.23333333333333334
RepID: 7	True Positive Loci: [23, 158, 191, 338, 539, 610]	Power: 0.2


In [116]:
loci_tracker

{0: [23, 158, 191, 310, 338, 539, 610, 1102],
 1: [23, 158, 191, 310, 338, 539, 610],
 2: [158, 191, 338, 539, 610],
 3: [23, 158, 191, 310, 338, 539, 610, 1102],
 4: [23, 158, 191, 338, 539, 888, 1102],
 5: [23, 158, 191, 338, 539, 1102],
 6: [23, 95, 158, 191, 338, 539, 888],
 7: [23, 158, 191, 338, 539, 610]}

In [121]:
header = "Replicate ID\tDetected_Loci\n"
with open('loci_tracker.txt', 'w') as f:
    f.write(header)
    for i in range(8):
        f.write(str(i) + '\t' + str(loci_tracker[i])+'\n')

In [124]:
allele_effects[loci_tracker[0]]

array([[   23.   ,     4.   ,     1.114,     5.   ,     2.843],
       [  158.   ,     1.   ,     0.047,     3.   ,     2.327],
       [  191.   ,     0.   ,     1.948,     2.   ,     0.27 ],
       [  310.   ,     1.   ,     1.442,     3.   ,     0.129],
       [  338.   ,     4.   ,     0.419,     5.   ,     2.698],
       [  539.   ,     0.   ,     0.003,     3.   ,     1.939],
       [  610.   ,     0.   ,     0.037,     1.   ,     1.578],
       [ 1102.   ,     0.   ,     1.559,     3.   ,     0.228]])

In [126]:
allele_fs = np.array(small_data['allele/frequency/replicate/0'])

In [128]:
allele_fs[loci_tracker[0]]

array([[   23.   ,     0.477,     0.522,     0.477,     0.522],
       [  158.   ,     0.768,     0.232,     0.232,     0.768],
       [  191.   ,     0.559,     0.441,     0.441,     0.559],
       [  310.   ,     0.207,     0.792,     0.207,     0.792],
       [  338.   ,     0.161,     0.84 ,     0.161,     0.84 ],
       [  539.   ,     0.38 ,     0.621,     0.38 ,     0.621],
       [  610.   ,     0.116,     0.884,     0.116,     0.884],
       [ 1102.   ,     0.404,     0.597,     0.404,     0.597]])

In [129]:
allele_fs[qtl]

array([[   12.   ,     0.402,     0.599,     0.402,     0.599],
       [   23.   ,     0.477,     0.522,     0.477,     0.522],
       [   95.   ,     0.269,     0.731,     0.269,     0.731],
       [   96.   ,     0.873,     0.127,     0.127,     0.873],
       [  158.   ,     0.768,     0.232,     0.232,     0.768],
       [  191.   ,     0.559,     0.441,     0.441,     0.559],
       [  255.   ,     0.321,     0.679,     0.321,     0.679],
       [  257.   ,     0.358,     0.641,     0.358,     0.641],
       [  275.   ,     0.949,     0.051,     0.051,     0.949],
       [  304.   ,     0.163,     0.838,     0.163,     0.838],
       [  310.   ,     0.207,     0.792,     0.207,     0.792],
       [  338.   ,     0.161,     0.84 ,     0.161,     0.84 ],
       [  539.   ,     0.38 ,     0.621,     0.38 ,     0.621],
       [  570.   ,     0.624,     0.376,     0.376,     0.624],
       [  577.   ,     0.197,     0.803,     0.197,     0.803],
       [  610.   ,     0.116,     0.884,

numpy.int64

In [238]:
allele_fs[qtl, 1]

array([ 0.402,  0.477,  0.269,  0.873,  0.768,  0.559,  0.321,  0.358,
        0.949,  0.163,  0.207,  0.161,  0.38 ,  0.624,  0.197,  0.116,
        0.897,  0.906,  0.124,  0.851,  0.387,  0.404,  0.028,  0.117,
        0.877,  0.106,  0.936,  0.136,  0.486,  0.625])

In [311]:
power_and_fprs = np.zeros((8, 3))
loci_tracker = {}
for i in range(8):
    results = pd.DataFrame(np.array(small_data['tassel/test/replicate/'+str(i)]), index=segregating_loci)
    detected_loci = results.ix[results.ix[:, 4] < 0.05].index
    set_of_true_positives = set(detected_loci).intersection(set(qtl))
    true_positives = sorted(list(set_of_true_positives))
    loci_tracker[i] = true_positives
    power = len(true_positives) / 30
    print("RepID: {rep_id}\tTrue Positive Loci: {true_pos}\tPower: {powe}".format(
        rep_id=i, true_pos=true_positives, 
        powe=power))
    power_and_fprs[i, 0] = i
    power_and_fprs[i, 1] = power
bulk_loci = []
for i in range(8):
    bulk_loci += loci_tracker[i]
from collections import Counter
count_of_loci = Counter(bulk_loci)
prob_detection = np.zeros((30, 2))
prob_detection[:, 0] = qtl
for idx, loc in enumerate(prob_detection):
    prob_detection[idx, 1] = count_of_loci[int(loc[0])]/8

In [236]:
for locus in list(map(int, qtl)):
    print(locus, sample.chromLocusPair(locus))

12 (0, 12)
23 (0, 23)
95 (0, 95)
96 (0, 96)
158 (0, 158)
191 (0, 191)
255 (1, 46)
257 (1, 48)
275 (1, 66)
304 (1, 95)
310 (1, 101)
338 (1, 129)
539 (3, 5)
570 (3, 36)
577 (3, 43)
610 (3, 76)
658 (3, 124)
879 (5, 36)
888 (5, 45)
917 (5, 74)
1081 (6, 126)
1102 (7, 8)
1198 (7, 104)
1232 (8, 0)
1342 (8, 110)
1354 (8, 122)
1366 (9, 2)
1411 (9, 47)
1424 (9, 60)
1440 (9, 76)


In [244]:
chromosomes = np.array(results.ix[qtl, 0], dtype=np.int_)

In [274]:
alpha = np.array(allele_effects[qtl, 1], dtype=np.int_)

In [251]:
alpha_frequency = np.array(allele_fs[qtl, 1])

In [254]:
alpha_effect = np.array(allele_effects[qtl, 2])

In [256]:
omega = np.array(allele_effects[qtl, 3], dtype=np.int_)

In [262]:
omega_frequency = np.array(allele_fs[qtl, 2])

In [270]:
omega_effect = allele_effects[qtl, 4]

In [271]:
abs_diff = np.abs(alpha_effect - omega_effect)

In [272]:
abs_diff

array([ 0.597,  1.729,  0.486,  1.189,  2.281,  1.678,  0.146,  0.525,
        0.793,  0.302,  1.313,  2.279,  1.937,  0.501,  0.362,  1.541,
        0.608,  0.009,  1.678,  0.092,  0.345,  1.331,  0.152,  0.306,
        0.331,  1.309,  1.136,  0.175,  0.018,  0.541])

In [344]:
alpha_frqs = np.zeros((len(qtl), 9))
omega_frqs = np.zeros((len(qtl), 9))
alpha_frqs[:, 0] = qtl
omega_frqs[:, 0] = qtl
for i in range(8):
    alpha_frqs[:, i+1] = np.array(small_data['allele/frequency/replicate/'+str(i)])[qtl, 1]
    omega_frqs[:, i+1] = np.array(small_data['allele/frequency/replicate/'+str(i)])[qtl, 2]
summary_at_qtl = np.zeros((len(qtl), 5))
summary_at_qtl[:, 0] = qtl
summary_at_qtl[:, 1] = np.apply_along_axis(np.mean, 1, alpha_frqs[:, 1:])
summary_at_qtl[:, 2] = np.std(alpha_frqs[:, 1:], axis=1)
summary_at_qtl[:, 3] = np.mean(omega_frqs[:, 1:], axis=1)
summary_at_qtl[:, 4] = np.std(omega_frqs[:, 1:], axis=1)

In [353]:
small_data['allele/frequency/averages_at_qtl'] = summary_at_qtl

SyntaxError: keyword can't be an expression (<ipython-input-353-71af03c20da7>, line 1)

In [367]:
bulk_loci = []
for i in range(8):
    bulk_loci += loci_tracker[i]
array_loci = np.array(bulk_loci)
from collections import Counter
count_of_loci = Counter(bulk_loci)
prob_detection = np.zeros((30, 2))
prob_detection[:, 0] = qtl
for idx, loc in enumerate(prob_detection):
    prob_detection[idx, 1] = count_of_loci[int(loc[0])]/8

In [368]:
prob_detection

array([[   12.   ,     0.   ],
       [   23.   ,     0.875],
       [   95.   ,     0.125],
       [   96.   ,     0.   ],
       [  158.   ,     1.   ],
       [  191.   ,     1.   ],
       [  255.   ,     0.   ],
       [  257.   ,     0.   ],
       [  275.   ,     0.   ],
       [  304.   ,     0.   ],
       [  310.   ,     0.375],
       [  338.   ,     1.   ],
       [  539.   ,     1.   ],
       [  570.   ,     0.   ],
       [  577.   ,     0.   ],
       [  610.   ,     0.625],
       [  658.   ,     0.   ],
       [  879.   ,     0.   ],
       [  888.   ,     0.25 ],
       [  917.   ,     0.   ],
       [ 1081.   ,     0.   ],
       [ 1102.   ,     0.5  ],
       [ 1198.   ,     0.   ],
       [ 1232.   ,     0.   ],
       [ 1342.   ,     0.   ],
       [ 1354.   ,     0.   ],
       [ 1366.   ,     0.   ],
       [ 1411.   ,     0.   ],
       [ 1424.   ,     0.   ],
       [ 1440.   ,     0.   ]])

In [354]:
print(summary_at_qtl)

[[   12.        0.33      0.055     0.67      0.055]
 [   23.        0.533     0.071     0.467     0.071]
 [   95.        0.292     0.032     0.708     0.032]
 [   96.        0.881     0.044     0.119     0.044]
 [  158.        0.73      0.022     0.27      0.022]
 [  191.        0.554     0.06      0.446     0.06 ]
 [  255.        0.271     0.038     0.729     0.038]
 [  257.        0.454     0.068     0.546     0.068]
 [  275.        0.945     0.018     0.055     0.018]
 [  304.        0.12      0.034     0.88      0.034]
 [  310.        0.261     0.046     0.739     0.046]
 [  338.        0.182     0.034     0.818     0.034]
 [  539.        0.368     0.032     0.633     0.032]
 [  570.        0.645     0.057     0.355     0.057]
 [  577.        0.245     0.047     0.755     0.047]
 [  610.        0.182     0.056     0.818     0.056]
 [  658.        0.867     0.044     0.133     0.044]
 [  879.        0.937     0.021     0.063     0.021]
 [  888.        0.147     0.03      0.853     

In [349]:
alpha_frqs

array([[   12.   ,     0.402,     0.286,     0.394,     0.258,     0.276,
            0.374,     0.361,     0.289],
       [   23.   ,     0.477,     0.673,     0.567,     0.472,     0.535,
            0.443,     0.503,     0.597],
       [   95.   ,     0.269,     0.283,     0.349,     0.252,     0.316,
            0.257,     0.292,     0.322],
       [   96.   ,     0.873,     0.885,     0.877,     0.924,     0.925,
            0.907,     0.882,     0.777],
       [  158.   ,     0.768,     0.719,     0.722,     0.689,     0.745,
            0.727,     0.743,     0.729],
       [  191.   ,     0.559,     0.5  ,     0.515,     0.677,     0.515,
            0.489,     0.566,     0.614],
       [  255.   ,     0.321,     0.225,     0.258,     0.298,     0.328,
            0.268,     0.231,     0.24 ],
       [  257.   ,     0.358,     0.431,     0.492,     0.381,     0.517,
            0.438,     0.582,     0.433],
       [  275.   ,     0.949,     0.939,     0.907,     0.939,     0.952

In [350]:
omega_frqs

array([[   12.   ,     0.599,     0.714,     0.607,     0.743,     0.724,
            0.625,     0.639,     0.711],
       [   23.   ,     0.522,     0.327,     0.433,     0.528,     0.465,
            0.557,     0.497,     0.403],
       [   95.   ,     0.731,     0.717,     0.65 ,     0.749,     0.684,
            0.744,     0.708,     0.678],
       [   96.   ,     0.127,     0.115,     0.123,     0.075,     0.075,
            0.093,     0.117,     0.223],
       [  158.   ,     0.232,     0.281,     0.279,     0.311,     0.256,
            0.273,     0.258,     0.272],
       [  191.   ,     0.441,     0.499,     0.485,     0.323,     0.484,
            0.511,     0.434,     0.387],
       [  255.   ,     0.679,     0.775,     0.742,     0.702,     0.672,
            0.733,     0.769,     0.76 ],
       [  257.   ,     0.641,     0.569,     0.508,     0.619,     0.483,
            0.562,     0.418,     0.567],
       [  275.   ,     0.051,     0.061,     0.093,     0.061,     0.048

In [345]:
summary_at_qtl

array([[   12.   ,     0.33 ,     0.055,     0.67 ,     0.055],
       [   23.   ,     0.533,     0.071,     0.467,     0.071],
       [   95.   ,     0.292,     0.032,     0.708,     0.032],
       [   96.   ,     0.881,     0.044,     0.119,     0.044],
       [  158.   ,     0.73 ,     0.022,     0.27 ,     0.022],
       [  191.   ,     0.554,     0.06 ,     0.446,     0.06 ],
       [  255.   ,     0.271,     0.038,     0.729,     0.038],
       [  257.   ,     0.454,     0.068,     0.546,     0.068],
       [  275.   ,     0.945,     0.018,     0.055,     0.018],
       [  304.   ,     0.12 ,     0.034,     0.88 ,     0.034],
       [  310.   ,     0.261,     0.046,     0.739,     0.046],
       [  338.   ,     0.182,     0.034,     0.818,     0.034],
       [  539.   ,     0.368,     0.032,     0.633,     0.032],
       [  570.   ,     0.645,     0.057,     0.355,     0.057],
       [  577.   ,     0.245,     0.047,     0.755,     0.047],
       [  610.   ,     0.182,     0.056,

In [264]:
alpha_frequency

array([ 0.402,  0.477,  0.269,  0.873,  0.768,  0.559,  0.321,  0.358,
        0.949,  0.163,  0.207,  0.161,  0.38 ,  0.624,  0.197,  0.116,
        0.897,  0.906,  0.124,  0.851,  0.387,  0.404,  0.028,  0.117,
        0.877,  0.106,  0.936,  0.136,  0.486,  0.625])

In [265]:
omega_frequency

array([ 0.599,  0.522,  0.731,  0.127,  0.232,  0.441,  0.679,  0.641,
        0.051,  0.838,  0.792,  0.84 ,  0.621,  0.376,  0.803,  0.884,
        0.102,  0.093,  0.875,  0.149,  0.614,  0.597,  0.973,  0.882,
        0.122,  0.893,  0.064,  0.865,  0.513,  0.374])

In [252]:
alphas

array([1, 4, 0, 1, 1, 0, 1, 1, 0, 1, 1, 4, 0, 1, 1, 0, 1, 0, 0, 2, 2, 0, 1,
       0, 1, 1, 1, 0, 2, 1])

In [253]:
alpha_frequency

array([ 0.402,  0.477,  0.269,  0.873,  0.768,  0.559,  0.321,  0.358,
        0.949,  0.163,  0.207,  0.161,  0.38 ,  0.624,  0.197,  0.116,
        0.897,  0.906,  0.124,  0.851,  0.387,  0.404,  0.028,  0.117,
        0.877,  0.106,  0.936,  0.136,  0.486,  0.625])

In [255]:
alpha_effect

array([ 0.324,  1.114,  0.549,  1.894,  0.047,  1.948,  1.398,  1.7  ,
        0.946,  1.625,  1.442,  0.419,  0.003,  0.933,  0.348,  0.037,
        1.645,  0.524,  1.159,  2.874,  0.43 ,  1.559,  0.188,  0.384,
        0.087,  0.191,  1.663,  3.133,  0.36 ,  0.594])

In [257]:
omega

array([3, 5, 2, 3, 3, 2, 3, 3, 2, 2, 3, 5, 3, 2, 2, 1, 3, 1, 2, 3, 3, 3, 3,
       2, 2, 3, 3, 1, 3, 3])

In [239]:
data_table = {}

In [356]:
summary_at_qtl[:5]

array([[  12.   ,    0.33 ,    0.055,    0.67 ,    0.055],
       [  23.   ,    0.533,    0.071,    0.467,    0.071],
       [  95.   ,    0.292,    0.032,    0.708,    0.032],
       [  96.   ,    0.881,    0.044,    0.119,    0.044],
       [ 158.   ,    0.73 ,    0.022,    0.27 ,    0.022]])

In [419]:
data_table = dict(
    QTL=qtl, 
    Chromosome=chromosomes,
    Alpha=alpha,
    AverageAlphaFrequency=summary_at_qtl[:, 1],
    AlphaEffect=alpha_effect,
    Omega=omega,
    AverageOmegaFrequency=summary_at_qtl[:, 3],
    OmegaEffect=omega_effect,
    StdDevAlleleFrq=summary_at_qtl[:, 2],
    AbsEffectDiff=abs_diff,
    FrequencyDetected=prob_detection[:, 1]
)

In [420]:
data_table

{'AbsEffectDiff': array([ 0.597,  1.729,  0.486,  1.189,  2.281,  1.678,  0.146,  0.525,
         0.793,  0.302,  1.313,  2.279,  1.937,  0.501,  0.362,  1.541,
         0.608,  0.009,  1.678,  0.092,  0.345,  1.331,  0.152,  0.306,
         0.331,  1.309,  1.136,  0.175,  0.018,  0.541]),
 'Alpha': array([1, 4, 0, 1, 1, 0, 1, 1, 0, 1, 1, 4, 0, 1, 1, 0, 1, 0, 0, 2, 2, 0, 1,
        0, 1, 1, 1, 0, 2, 1]),
 'AlphaEffect': array([ 0.324,  1.114,  0.549,  1.894,  0.047,  1.948,  1.398,  1.7  ,
         0.946,  1.625,  1.442,  0.419,  0.003,  0.933,  0.348,  0.037,
         1.645,  0.524,  1.159,  2.874,  0.43 ,  1.559,  0.188,  0.384,
         0.087,  0.191,  1.663,  3.133,  0.36 ,  0.594]),
 'AverageAlphaFrequency': array([ 0.33 ,  0.533,  0.292,  0.881,  0.73 ,  0.554,  0.271,  0.454,
         0.945,  0.12 ,  0.261,  0.182,  0.368,  0.645,  0.245,  0.182,
         0.867,  0.937,  0.147,  0.867,  0.432,  0.349,  0.064,  0.137,
         0.882,  0.123,  0.945,  0.128,  0.512,  0.628]),
 'Av

In [282]:
data_table.keys()

dict_keys(['qtl', 'chromosomes', 'alpha', 'alpha_frequency', 'alpha_effect', 'omega', 'omega_frequency', 'omega_effect', 'abs_diff'])

In [423]:
allele_data_table = pd.DataFrame(data_table, columns=['QTL', 'Chromosome', 'Alpha', 'AlphaEffect', 
                                 'AverageAlphaFrequency', 'Omega','OmegaEffect', 'AverageOmegaFrequency',  
                                                      'StdDevAlleleFrq', 'AbsEffectDiff', 'FrequencyDetected'])

In [424]:
allele_data_table

Unnamed: 0,QTL,Chromosome,Alpha,AlphaEffect,AverageAlphaFrequency,Omega,OmegaEffect,AverageOmegaFrequency,StdDevAlleleFrq,AbsEffectDiff,FrequencyDetected
0,12,1,1,0.3244,0.3299,3,0.9214,0.6701,0.0546,0.597,0.0
1,23,1,4,1.1141,0.5334,5,2.8435,0.4666,0.0713,1.7294,0.875
2,95,1,0,0.5485,0.2924,2,0.0627,0.7076,0.0321,0.4858,0.125
3,96,1,1,1.8937,0.8814,3,0.7047,0.1186,0.0439,1.189,0.0
4,158,1,1,0.0466,0.7301,3,2.3273,0.2699,0.0216,2.2807,1.0
5,191,1,0,1.948,0.5545,2,0.2702,0.4455,0.0601,1.6778,1.0
6,255,2,1,1.3982,0.271,3,1.2521,0.729,0.0377,0.146,0.0
7,257,2,1,1.7003,0.454,3,1.1757,0.546,0.0685,0.5246,0.0
8,275,2,0,0.9465,0.9454,2,1.7394,0.0546,0.0181,0.7929,0.0
9,304,2,1,1.6253,0.1196,2,1.3229,0.8804,0.0341,0.3025,0.0


In [425]:
allele_data_table.to_csv('small_allele_data_summary.txt', sep='\t', index=False, float_format='%.4f')

In [373]:
small_data['allele/summary_data'] = allele_data_table

In [376]:
small_data['allele/summary_data'].attrs['columns'] = list(map(np.string_, allele_data_table.columns))

In [416]:
abseffectdiff_correlations = allele_data_table.corr()['AbsEffectDiff']

In [418]:
abseffectdiff_correlations

QTL                     -0.3668
Chromosome              -0.3736
Alpha                    0.1629
AlphaEffect             -0.1659
AverageAlphaFrequency   -0.0734
Omega                    0.4025
AverageOmegaFrequency    0.0734
StdDevAlleleFrq          0.0609
AbsEffectDiff            1.0000
FrequencyDetected        0.8555
Name: AbsEffectDiff, dtype: float64

In [417]:
allele_data_table.corr()

Unnamed: 0,QTL,Chromosome,Alpha,AlphaEffect,AverageAlphaFrequency,Omega,AverageOmegaFrequency,StdDevAlleleFrq,AbsEffectDiff,FrequencyDetected
QTL,1.0,0.9943,-0.1361,-0.0424,-0.0231,-0.2167,0.0231,-0.4153,-0.3668,-0.4024
Chromosome,0.9943,1.0,-0.1404,-0.0379,-0.0255,-0.2042,0.0255,-0.397,-0.3736,-0.4118
Alpha,-0.1361,-0.1404,1.0,-0.0399,0.0431,0.814,-0.0431,0.3166,0.1629,0.2084
AlphaEffect,-0.0424,-0.0379,-0.0399,1.0,0.1734,-0.066,-0.1734,0.1611,-0.1659,-0.1704
AverageAlphaFrequency,-0.0231,-0.0255,0.0431,0.1734,1.0,0.0133,-1.0,-0.1157,-0.0734,-0.1205
Omega,-0.2167,-0.2042,0.814,-0.066,0.0133,1.0,-0.0133,0.2061,0.4025,0.3555
AverageOmegaFrequency,0.0231,0.0255,-0.0431,-0.1734,-1.0,-0.0133,1.0,0.1157,0.0734,0.1205
StdDevAlleleFrq,-0.4153,-0.397,0.3166,0.1611,-0.1157,0.2061,0.1157,1.0,0.0609,0.1721
AbsEffectDiff,-0.3668,-0.3736,0.1629,-0.1659,-0.0734,0.4025,0.0734,0.0609,1.0,0.8555
FrequencyDetected,-0.4024,-0.4118,0.2084,-0.1704,-0.1205,0.3555,0.1205,0.1721,0.8555,1.0


In [82]:
detected_loci = results.ix[results.ix[:, 4] < 0.05].index

In [83]:
detected_loci

Int64Index([23, 158, 191, 310, 338, 539, 610, 1102], dtype='int64')

In [93]:
allele_effects[detected_loci]

array([[   23.   ,     4.   ,     1.114,     5.   ,     2.843],
       [  158.   ,     1.   ,     0.047,     3.   ,     2.327],
       [  191.   ,     0.   ,     1.948,     2.   ,     0.27 ],
       [  310.   ,     1.   ,     1.442,     3.   ,     0.129],
       [  338.   ,     4.   ,     0.419,     5.   ,     2.698],
       [  539.   ,     0.   ,     0.003,     3.   ,     1.939],
       [  610.   ,     0.   ,     0.037,     1.   ,     1.578],
       [ 1102.   ,     0.   ,     1.559,     3.   ,     0.228]])

In [94]:
set(detected_loci).intersection(qtl)

{23, 158, 191, 310, 338, 539, 610, 1102}

In [85]:
len(set(detected_loci).intersection(qtl))

8

## Allele Effects: Estimated vs. Actual
    Purpose of this table to compare TASSEL's effect size estimates
    with the known true values we assigned.

In [None]:
gt = small.genotypic_effects_table(sample_library, ae_array,
                          segregating_loci, qtl, hdf5_file=small_data)

In [141]:
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

In [142]:
output_notebook()

In [144]:
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
from bokeh.plotting import figure

In [161]:
list(map(str, sorted(list(map(int, count_of_loci.keys())))))

['23', '95', '158', '191', '310', '338', '539', '610', '888', '1102']

In [164]:
ordered_loci = sorted(list(map(int, count_of_loci.keys())))

In [177]:
counts = [count_of_loci[locus] for locus in ordered_loci]

In [178]:
counts

[7, 1, 8, 8, 3, 8, 8, 5, 2, 4]

In [179]:
count_of_loci

Counter({23: 7,
         95: 1,
         158: 8,
         191: 8,
         310: 3,
         338: 8,
         539: 8,
         610: 5,
         888: 2,
         1102: 4})

In [180]:
loci = list(map(str, sorted(list(map(int, count_of_loci.keys())))))

In [181]:
dict(loci=loci, counts=counts)

{'counts': [7, 1, 8, 8, 3, 8, 8, 5, 2, 4],
 'loci': ['23', '95', '158', '191', '310', '338', '539', '610', '888', '1102']}

In [None]:
dic

In [182]:
source = ColumnDataSource(data=dict(loci=loci, counts=counts))

In [216]:
p = figure(x_range=loci, plot_height=350, toolbar_location=None, title="Loci Counts")

In [217]:
p.vbar(x='loci', top='counts', width=0.9, source=source,
       line_color='white')

In [218]:
p.xgrid.grid_line_color = None

In [221]:
p.xaxis.axis_label = "Loci"
p.yaxis.axis_label = "Counts"

In [219]:
p.y_range.start = 0
p.y_range.end = 9