In [1]:
import numpy as np
import pandas as pd
import collections as col
import shelve

In [5]:
# Read the simulated HapMap export and strip the marker-metadata columns so
# that only the remaining (genotype-call) columns are kept.
af_from_hmp = pd.read_csv(
    'R0_100_infinite_simulated_hapmap.txt', sep='\t'
).drop(
    ['rs', 'alleles', 'chrom', 'pos', 'strand',
     'assembly', 'center', 'protLSID', 'assayLSID',
     'panelLSID', 'QCcode'],
    axis=1,
)

In [6]:
# Convert the genotype table to a plain NumPy array (one row per table row,
# one column per remaining column) for row-wise iteration.
genotypes = np.array(af_from_hmp)

In [7]:
# Inspect the genotype matrix; each cell is a two-character genotype string.
genotypes

array([['CC', 'CT', 'CC', ..., 'CC', 'CT', 'CC'],
       ['TT', 'TT', 'TT', ..., 'TT', 'CT', 'TT'],
       ['AG', 'GG', 'GG', ..., 'GG', 'GG', 'AG'],
       ..., 
       ['GG', 'AA', 'AA', ..., 'AA', 'AG', 'AA'],
       ['AA', 'CC', 'CC', ..., 'AC', 'AC', 'CC'],
       ['CT', 'TT', 'TT', ..., 'TT', 'CT', 'TT']], dtype=object)

In [2]:
# Load the allele-string -> integer lookup from the parameter shelf.
# The context manager closes the shelf as soon as the mapping has been read
# into memory, so no database handle is left open for the rest of the
# session.  `syn_parameters` is therefore a *closed* shelf after this cell.
# NOTE(review): shelve.open() creates an empty shelf when the file is
# missing, which is what the assertion below guards against.
with shelve.open('synthesis_parameters') as syn_parameters:
    assert 'snp_to_integer' in syn_parameters, "This shelf does not contain the proper string to integer conversions."
    snp_to_integer = syn_parameters['snp_to_integer']

In [84]:
# Zero-filled int8 scratch array of shape (866, 200).
# NOTE(review): presumably 866 loci x (2 alleles * 100 individuals) -- TODO
# confirm; the array is not referenced again in the visible cells.
integer_alleles = np.zeros((866, 200), dtype=np.int8)

In [105]:
# Two (866, 2) buffers: one for allele codes (int8) and one for the matching
# allele frequencies (float64).
allele_array = np.zeros((866, 2), dtype=np.int8)
frequencies_array = np.zeros((866, 2))

In [106]:
# Stack both buffers into a single (2, 866, 2) array.  np.array() copies its
# inputs and promotes to a common dtype, so the int8 allele codes are stored
# as float64 here and the two source buffers are not updated by later writes.
array_based_afs = np.array([allele_array, frequencies_array])

In [108]:
# Copy each locus's alleles (dict keys) and frequencies (dict values) into
# the stacked array; `locus` is used directly as the row index.
# NOTE(review): `gwide_allele_frequencies` is not defined in any cell visible
# here, so this cell fails on a fresh kernel unless it is created elsewhere.
for locus, af_data in gwide_allele_frequencies.items():
    array_based_afs[0][locus][...] = list(af_data.keys())
    array_based_afs[1][locus][...] = list(af_data.values())

In [109]:
# Same layout as the stacked allele/frequency array built above, under new
# names: a (2, 866, 2) float64 array whose slot 0 is for allele codes and
# slot 1 for frequencies (the int8 dtype is lost when the arrays are stacked).
arr_alleles = np.zeros((866, 2), dtype=np.int8)
arr_allele_frequencies = np.zeros((866, 2))
af_array = np.array([arr_alleles, arr_allele_frequencies])

In [110]:
def allele_frq(hapmap_file_name, sep='\t', allele_codes=None):
    """
    Tabulate the alleles and allele frequencies at every locus of a HapMap file.

    Parameters
    ----------
    hapmap_file_name : str
        Path to a delimited HapMap-style file.  It must contain the eleven
        metadata columns 'rs' ... 'QCcode'; every other column is treated as
        a genotype column whose cells are strings of allele characters
        (e.g. 'CT').
    sep : str
        Column delimiter passed to ``pandas.read_csv``.
    allele_codes : mapping, optional
        Maps an allele character to its integer code.  When omitted, the
        module-level ``snp_to_integer`` mapping is used.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(2, n_loci, 2)`` where ``n_loci`` is the
        number of rows in the file.  ``result[0, locus]`` holds the integer
        allele codes in order of first appearance in the row and
        ``result[1, locus]`` holds the matching relative frequencies.

    Raises
    ------
    ValueError
        If a locus does not have one or two distinct allele characters.

    Notes
    -----
    A locus with a single allele fills *both* slots with that allele and its
    frequency (1.0), because the single value is broadcast across the row.
    """
    if allele_codes is None:
        # Fall back to the notebook-level lookup loaded from the shelf.
        allele_codes = snp_to_integer

    genotypes = np.array(
        pd.read_csv(hapmap_file_name, sep=sep).drop(
            ['rs', 'alleles', 'chrom', 'pos', 'strand',
             'assembly', 'center', 'protLSID', 'assayLSID',
             'panelLSID', 'QCcode'],
            axis=1,
        )
    )

    # Size the output from the data rather than a fixed number of loci.
    n_loci = genotypes.shape[0]
    array_alleles_and_frequencies = np.zeros((2, n_loci, 2))

    for locus, geno_row in enumerate(genotypes):
        # Joining the genotype strings gives one character per allele copy.
        allele_counts = col.Counter(''.join(geno_row))
        if not 1 <= len(allele_counts) <= 2:
            raise ValueError(
                'locus {} has {} distinct alleles; expected 1 or 2'.format(
                    locus, len(allele_counts)))
        total_copies = sum(allele_counts.values())
        array_alleles_and_frequencies[0, locus, :] = [
            allele_codes[str_allele] for str_allele in allele_counts
        ]
        array_alleles_and_frequencies[1, locus, :] = [
            count / total_copies for count in allele_counts.values()
        ]
    return array_alleles_and_frequencies

In [111]:
# Allele codes and frequencies for the replicate-0, size-100 HapMap file.
allele_frequencies = allele_frq('R0_100_infinite_simulated_hapmap.txt')

In [112]:
# First ten loci: allele codes (slot 0) next to their frequencies (slot 1).
allele_frequencies[0][:10], allele_frequencies[1][:10]

(array([[ 1.,  3.],
        [ 1.,  3.],
        [ 2.,  0.],
        [ 2.,  0.],
        [ 2.,  0.],
        [ 2.,  0.],
        [ 1.,  3.],
        [ 1.,  3.],
        [ 2.,  0.],
        [ 1.,  3.]]), array([[ 0.82 ,  0.18 ],
        [ 0.085,  0.915],
        [ 0.93 ,  0.07 ],
        [ 0.255,  0.745],
        [ 0.14 ,  0.86 ],
        [ 0.295,  0.705],
        [ 0.71 ,  0.29 ],
        [ 0.29 ,  0.71 ],
        [ 0.855,  0.145],
        [ 0.555,  0.445]]))

In [3]:
# Allele-effects table; later cells use its 'difference' column and join it
# onto the GWAS results by row index.
ae_table = pd.read_hdf('expanded_infinite_allele_effects.hdf')

In [14]:
def replicate_gwas_results(gwas_results_file, q_values_file, expanded_allele_effects_table, delim="\t"):
    """
    Combine one replicate's TASSEL output, q values and allele effects.

    Parameters
    ----------
    gwas_results_file : str
        Delimited GWAS results file.  Must contain the columns 'Trait',
        'Marker', 'Pos' and 'p', with 'Marker' positioned before 'p'.
    q_values_file : str
        Delimited file with a single column of q values.  Its first line is
        consumed as the header and the column is renamed to 'q'.
    expanded_allele_effects_table : pandas.DataFrame
        Table joined onto the results by row index.
    delim : str
        Delimiter used by both input files.

    Returns
    -------
    pandas.DataFrame
        The result columns between 'Marker' (exclusive) and 'p' (inclusive),
        followed by 'q' and the columns of ``expanded_allele_effects_table``.

    Notes
    -----
    The first data row of the GWAS file is discarded and the remaining rows
    are re-indexed from 0.  The q-value rows are shifted down by one as well,
    so the first q value lands on index -1 and is dropped by the left join.
    NOTE(review): presumably both files carry one leading non-marker record;
    confirm against the TASSEL / q-value output format.
    """
    gwas_results = pd.read_csv(gwas_results_file, sep=delim)
    gwas_results = gwas_results.drop(['Trait', 'Pos'], axis=1).drop(0, axis=0)
    # Label-based slice; `.loc` replaces `.ix`, which pandas 1.0 removed.
    gwas_results = gwas_results.loc[:, 'Marker':'p']
    gwas_results.index = gwas_results.index - 1
    gwas_results = gwas_results.drop('Marker', axis=1)

    qvalues = pd.read_csv(q_values_file, sep=delim)
    qvalues.columns = ['q']
    qvalues.index = qvalues.index - 1

    results = gwas_results.join(qvalues)
    greater_results = results.join(expanded_allele_effects_table)

    return greater_results

In [115]:
# Sample sizes that appear in the result file names.
sample_sizes = [100, 250, 500]
# NOTE(review): `replicates` is not referenced in the visible cells.
replicates = 0

In [118]:
# Worked example: combined results table for replicate 0 at sample size 500.
rgwas = replicate_gwas_results('R0_500_infinite_out_2.txt', 'R0_500_infinite_qvalues.txt', ae_table)

In [119]:
# Inspect the combined table (test statistics, p, q and effect 'difference').
rgwas

Unnamed: 0,Chr,df,F,p,q,difference
0,1,2,3.62291,0.02742,0.961930,0
1,1,2,2.02774,0.13273,0.961930,0
2,1,2,0.12017,0.88680,0.961930,0
3,1,2,2.30223,0.10111,0.961930,0
4,1,2,0.09147,0.91260,0.961930,0
5,1,2,1.43023,0.24024,0.961930,0
6,1,2,3.66628,0.02627,0.961930,0
7,1,2,1.17395,0.31000,0.961930,0
8,1,2,1.01512,0.36311,0.961930,0
9,1,2,0.30656,0.73612,0.961930,0


In [120]:
# Rows significant at q < 0.05.  Plain column selection replaces `.ix`,
# which was removed in pandas 1.0.
rgwas[rgwas['q'] < 0.05]

Unnamed: 0,Chr,df,F,p,q,difference
207,3,2,17.03032,7.0338e-08,5.9e-05,5.626062
230,3,2,10.61181,3.0719e-05,0.012903,5.446156


In [121]:
# Rows with a non-zero (positive) allele-effect difference.  Plain column
# selection replaces `.ix`, which was removed in pandas 1.0.
rgwas[rgwas['difference'] > 0.0]

Unnamed: 0,Chr,df,F,p,q,difference
44,1,2,0.18151,0.83406,0.96193,0.36361
157,2,2,2.32406,0.09894,0.96193,1.073728
207,3,2,17.03032,7.0338e-08,5.9e-05,5.626062
230,3,2,10.61181,3.0719e-05,0.012903,5.446156
241,3,2,5.22477,0.00568,0.795283,2.28243
425,5,2,0.22756,0.79655,0.96193,0.373849
539,6,2,0.92245,0.39822,0.96193,1.640569
646,8,2,3.0659,0.0475,0.96193,2.194446
787,9,2,1.63438,0.19612,0.96193,0.950822
792,9,2,0.51433,0.59822,0.96193,2.282157


In [122]:
# Rows of the allele-effects table with a positive 'difference', as a NumPy
# array.  Plain column selection replaces `.ix` (removed in pandas 1.0).
aes = np.array(ae_table[ae_table['difference'] > 0])

In [123]:
# Collapse the (rows x columns) selection into a 1-D vector of effect sizes.
# Re-running this cell is harmless: flattening a 1-D array returns a copy.
aes = aes.flatten()

In [124]:
# Inspect the non-zero allele effects.
aes

array([ 0.36360971,  1.07372753,  5.62606182,  5.44615592,  2.2824303 ,
        0.37384873,  1.64056879,  2.19444571,  0.95082249,  2.282157  ])

In [41]:
# Integers 0..9.
# NOTE(review): presumably one position per non-zero allele effect (`aes`
# has ten entries) -- TODO confirm; not referenced in the visible cells.
indices = np.arange(0, 10)

In [6]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [4]:
# File-name tables with one row per replicate and one column per sample
# size; both are filled in by the cell that follows.
# The builtin `object` is used as the dtype because the `np.object` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
gwas_file_names = np.zeros((50, 3), dtype=object)
qvalues_file_names = np.empty((50, 3), dtype=object)
reps = list(range(50))
sample_sizes = [100, 250, 500]

In [5]:
# Fill both tables row by row.  Iterating a 2-D array yields row views, so
# assigning through `[...]` writes into the original tables.  Each row gets
# one file name per sample size: 'R<rep>_<size>_infinite_out_2.txt' for the
# GWAS output and 'R<rep>_<size>_infinite_qvalues.txt' for the q values.
for rep, gwas_files, qvalue_files in zip(reps, gwas_file_names, qvalues_file_names):
    gwas_files[...] = np.array(['R' + str(rep)+ '_' + str(sample_size) + '_infinite_out_2.txt' 
                           for sample_size in sample_sizes])
    qvalue_files[...] = np.array(['R'+str(rep)+ '_' + str(sample_size) + '_infinite_qvalues.txt' 
                      for sample_size in sample_sizes])

In [9]:
# Stack both file-name tables: slot 0 holds the GWAS result file names and
# slot 1 the q-value file names.  Not referenced again in the visible cells.
aggregated_file_names = np.array([gwas_file_names, qvalues_file_names])

In [15]:
# Combined results table for each of the 50 replicates, keyed by replicate
# number, built from column 0 of the file-name tables.
panel_map_100 = {
    rep: replicate_gwas_results(gwas_file[0], qvalue_file[0], ae_table)
    for rep, gwas_file, qvalue_file in zip(range(50), gwas_file_names, qvalues_file_names)
}

In [16]:
# Combined results table for each of the 50 replicates, keyed by replicate
# number, built from column 1 of the file-name tables.
panel_map_250 = {
    rep: replicate_gwas_results(gwas_file[1], qvalue_file[1], ae_table)
    for rep, gwas_file, qvalue_file in zip(range(50), gwas_file_names, qvalues_file_names)
}

In [17]:
# Combined results table for each of the 50 replicates, keyed by replicate
# number, built from column 2 of the file-name tables.
panel_map_500 = {
    rep: replicate_gwas_results(gwas_file[2], qvalue_file[2], ae_table)
    for rep, gwas_file, qvalue_file in zip(range(50), gwas_file_names, qvalues_file_names)
}

In [18]:
# Bundle each {replicate: DataFrame} mapping into a 3-D container so that
# `panel[rep]` returns that replicate's table.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25,
# so this cell needs an older pandas.  A dict of DataFrames or a concatenated
# MultiIndex frame is the usual replacement, but that would change the type
# written to the HDF files below.
pf_100 = pd.Panel(panel_map_100)
pf_250 = pd.Panel(panel_map_250)
pf_500 = pd.Panel(panel_map_500)

In [19]:
# Output HDF file names, one per sample size (100, 250, 500).
panel_names = ['infinite_gwas_results_panel_100.hdf', 'infinite_gwas_results_panel_250.hdf',
              'infinite_gwas_results_panel_500.hdf']

In [20]:
# Persist each panel to its own HDF file under the key 'data'.  The order of
# `sample_panels` must match the order of `panel_names`.
sample_panels = [pf_100, pf_250, pf_500]
for panel, name in zip(sample_panels, panel_names):
    panel.to_hdf(name, 'data')

In [21]:
# Zero-filled array transposed to shape (6, 50).
# NOTE(review): not referenced again in the visible cells.
testing_capabilities = np.zeros((50, 6)).T

In [22]:
# One independent (50, 2) zero array per sample size; the rows are indexed by
# replicate and the two columns are filled in by a later cell.
testing_100, testing_250, testing_500 = (np.zeros((50, 2)) for _ in range(3))

In [23]:
# Result buffers ordered by sample size (100, 250, 500).
testing_data_sets = [testing_100, testing_250, testing_500]

## Calculating Power and False Positive Rate (FPR)

In [24]:
# For every sample size and replicate, record
#   column 0 (power): significant loci with a non-zero effect, divided by 10
#   column 1 (FPR):   significant loci with a zero effect, divided by 855
# where "significant" means q < 0.05.
# `.loc` replaces `.ix`, which was removed in pandas 1.0, and each
# replicate's table is fetched from the panel once instead of six times.
# NOTE(review): 10 and 855 are hard-coded counts, presumably the number of
# loci with a non-zero effect and the number of remaining tested loci --
# confirm them against the data.
for tds, panel in zip(testing_data_sets, sample_panels):
    for rep in range(50):
        rep_results = panel[rep]
        significant = rep_results.loc[:, 'q'] < 0.05
        has_effect = rep_results.loc[:, 'difference'] > 0.0
        no_effect = rep_results.loc[:, 'difference'] == 0.0
        tds[rep, 0] = len(rep_results[significant & has_effect]) / 10
        tds[rep, 1] = len(rep_results[significant & no_effect]) / 855

In [25]:
# Place the three per-sample-size (power, FPR) tables side by side, giving
# one row per replicate and six columns.
gwas_success_analysis = np.hstack(testing_data_sets)

In [26]:
# Column labels for the stacked table: (power, fpr) for sizes 100, 250, 500.
columns = ['size_100_power', 'size_100_fpr', 'size_250_power', 
           'size_250_fpr', 'size_500_power', 'size_500_fpr']

In [29]:
# Save the power / FPR table.  The file is tab-separated despite its .csv
# extension, so it must be read back with sep='\t'.
pd.DataFrame(gwas_success_analysis, columns=columns).to_csv('gwas_power_evaluation.csv', sep='\t')

In [103]:
# NOTE(review): same six labels as `columns`; this name is not referenced in
# the visible cells.
combined_columns = ['size_100_power', 'size_100_fpr', 'size_250_power', 'size_250_fpr', 'size_500_power', 'size_500_fpr']

In [28]:
import sqlite3

In [1]:
import pandas as pd

In [4]:
# Reload the saved power / FPR table (tab-separated; the first column is the
# replicate index).
stats = pd.read_csv('gwas_power_evaluation.csv', sep='\t', index_col=0)

In [5]:
# Inspect power and false-positive rate per replicate and sample size.
stats

Unnamed: 0,size_100_power,size_100_fpr,size_250_power,size_250_fpr,size_500_power,size_500_fpr
0,0.0,0.0,0.1,0.0,0.2,0.0
1,0.0,0.0,0.2,0.0,0.3,0.002339
2,0.0,0.0,0.0,0.0,0.2,0.0
3,0.0,0.0,0.0,0.0,0.2,0.0
4,0.0,0.0,0.0,0.0,0.2,0.003509
5,0.0,0.0,0.1,0.0,0.2,0.00117
6,0.0,0.0,0.0,0.0,0.2,0.0
7,0.0,0.0,0.0,0.0,0.1,0.0
8,0.0,0.0,0.1,0.00117,0.2,0.0
9,0.0,0.0,0.0,0.0,0.2,0.0
