In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import time
import amt.method as md
import amt.data_loader as dl
import pickle

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Load the data 

In [9]:
output_folder = '/home/martin/adapative_MC_test/results/GWAS'
output_file = output_folder + '/small_GWAS.pickle'
with open('/home/martin/adapative_MC_test/parkinsons/parkinsons.pickle', 'rb') as f:
    X = pickle.load(f)
    y = pickle.load(f)
    miss_prop = pickle.load(f)
y = y-1
file_map = '/home/martin/adapative_MC_test/parkinsons/parkinsons.map'
df_map = pd.read_csv(file_map, delimiter='\t', 
                     names=['chromosome', 'snp', 'start', 'end'])
n_sample, n_snp = X.shape
ind_small = np.array(df_map['chromosome']==4, dtype=bool)
ind_snp = np.array(miss_prop[ind_small]<0.05, dtype=bool)
n_hypothesis = np.sum(ind_snp)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Compute the expected observations
Exp = np.zeros([8, n_snp], dtype=float)
for iy in range(2):
    for ix in range(4):
        Exp[iy*4+ix,:] = np.mean(y==iy) * np.mean(X==ix,axis=0)
Exp = Exp*n_sample
r_Exp = 1/Exp.clip(min=1e-6)*(Exp>0)
chi2_obs = md.compute_chi2(y, X, Exp, r_Exp)
data_gwas = {'X':X, 'y':y, 'Exp':Exp, 'r_Exp':r_Exp, 'chi2_obs':chi2_obs}

# fMC MC sample generation

In [5]:
# alpha = 0.05
# ind_small = np.zeros([n_snp], dtype=bool)
# ind_small[0:100] = True
# n_hypothesis = np.sum(ind_small)
# n_fMC = int(2.5e5)
n_fMC = int(5e5)
data_gwas_small = {'X':X[:,ind_small], 'y':y, 'Exp':Exp[:,ind_small],
                   'r_Exp':r_Exp[:,ind_small], 'chi2_obs':chi2_obs[ind_small]}

In [6]:
start_time = time.time()
B = md.permute_chi2_batch_ncore(data_gwas_small['y'],
                                data_gwas_small['X'],
                                data_gwas_small['Exp'],
                                data_gwas_small['r_Exp'],
                                data_gwas_small['chi2_obs'], n_fMC,
                                verbose=True)
p_fmc = (np.sum(B, axis=0)+1)/(n_fMC+1)
print('# Time=%0.1fs'%(time.time()-start_time))
time_fMC = time.time()-start_time
res_fMC = {'time':time.time()-start_time,
           'p_fmc': p_fmc,
           'B1':B[0:100000,:],
           'B2':B[100000:,:]}
with open(output_file, "wb") as f:
    pickle.dump(res_fMC, f)    

# Time=7.5s


# Result analysis

In [9]:
with open(output_file, 'rb')as f:
    res_dic = pickle.load(f)
B = np.concatenate([res_dic['B1'], res_dic['B2']], axis=0)
p_fmc = res_dic['p_fmc']

In [34]:
h_amt.shape

(23915,)

In [32]:
snp_list = ['rs10501570', 'rs281357', 'rs2242330', 'rs1480597', 'rs6826751', 'rs4888984',
            'rs4862792', 'rs3775866', 'rs2235617', 'rs988421', 'rs7097094', 'rs999473',
            'rs1912373', 'rs1887279', 'rs2986574', 'rs11090762', 'rs6125829', 'rs7796855',
            'rs355477', 'rs3010040', 'rs2296713', 'rs355461', 'rs355506', 'rs355464',
            'rs1497430', 'rs11946612']
tau_fmc = md.bh(p_fmc[ind_snp], alpha=0.1)
h_fmc = (p_fmc[ind_snp] <= tau_fmc)
print(np.sum(h_fmc))
df_map_c4 = df_map.loc[ind_small]
for snp in snp_list:
    temp = df_map_c4['snp']==snp
    if np.sum(temp)>0:
        print('###')
        print(df_map_c4.loc[temp])
        print('decision', h_fmc[temp[ind_snp]])
        print('miss_prop=%0.4f, p_fmc='%(miss_prop[ind_small][temp]),
                                        p_fmc[temp])
        print('')

47
###
       chromosome        snp  start       end
100847          4  rs2242330      0  68276015
decision [ True]
miss_prop=0.0000, p_fmc= [7.999968e-06]

###
       chromosome        snp  start       end
100837          4  rs6826751      0  68262621
decision [ True]
miss_prop=0.0019, p_fmc= [1.5999936e-05]

###
       chromosome        snp  start        end
115749          4  rs4862792      0  188576499
decision [ True]
miss_prop=0.0483, p_fmc= [3.999984e-06]

###
       chromosome        snp  start       end
100842          4  rs3775866      0  68272946
decision [ True]
miss_prop=0.0000, p_fmc= [3.5999856e-05]

###
       chromosome       snp  start       end
100830          4  rs355477      0  68225291
decision [ True]
miss_prop=0.0074, p_fmc= [7.999968e-05]

###
       chromosome       snp  start       end
100828          4  rs355461      0  68209490
decision [ True]
miss_prop=0.0000, p_fmc= [7.999968e-05]

###
       chromosome       snp  start       end
100829          4  rs355

In [27]:
np.sum(ind_snp)

23915

# Corresponding AMT result

In [15]:
temp_B = B[:,ind_snp]

In [28]:
start_time = time.time()
p_hat_ub, p_hat, tau_hat, n_amt = md.amt(md.f_sample_dummy, temp_B, n_hypothesis,
                                         alpha=0.05, n_fMC=n_fMC,
                                         verbose=True, delta=0.001)
h_amt = (p_hat_ub <= tau_hat)
print('# AMT: avg. MC samples = %0.1f, time=%0.2fs'%(np.mean(n_amt),
                                                     time.time()-start_time))
print('# D_AMT=%d, D_overlap=%d, D_fMC=%d'%(md.result_compare(h_amt, h_fmc)))
print('')

# Initialization parameters
# n_hypothesis=23915, n_fMC=250000, alpha=0.05, increment=1.10
# delta 0.001
# delta_CI 3.3642314968430376e-09
# r_hat=23915, tau_hat=0.0500
# batch_size [  100   111   122   134   147   162   178   195   215   236   260   286
   314   346   380   418   460   506   556   612   673   741   815   896
   985  1084  1192  1311  1443  1587  1745  1920  2112  2323  2555  2811
  3092  3401  3741  4115  4526  4979  5477  6025  6627  7290  8018  8820
  9702 10672 11740 12913 14205 15625 17188 18906 20797 22210]
# sum of batch size = 250000
# Initialization completed: time=-0.006s
# 0, avg_sample=100.0, tau=0.00706, r_hat=3379, n_u=3379, n_g=20536, n_l=0
# 1, avg_sample=115.7, tau=0.00468, r_hat=2240, n_u=2240, n_g=21675, n_l=0
# 2, avg_sample=127.1, tau=0.00346, r_hat=1656, n_u=1656, n_g=22259, n_l=0
# 3, avg_sample=136.4, tau=0.00273, r_hat=1307, n_u=1307, n_g=22608, n_l=0
# 4, avg_sample=144.4, tau=0.00230, r_hat=1101, n_u=1101, n_g=22814, n_l=0
# 5, avg_sample=151

# Directly run AMT 

In [20]:
temp_data = {'X':X[:,ind_small][:, ind_snp], 'y':y, 
             'Exp':Exp[:,ind_small][:, ind_snp],
             'r_Exp':r_Exp[:,ind_small][:, ind_snp],
             'chi2_obs':chi2_obs[ind_small][ind_snp]}

In [25]:
start_time = time.time()
p_hat_ub, p_hat, tau_hat, n_amt = md.amt(md.f_sample_chi2, temp_data, n_hypothesis,
                                         alpha=0.05, n_fMC=n_fMC,
                                         verbose=True, delta=0.001,
                                         random_state=0)
h_amt = (p_hat_ub <= tau_hat)
print('# AMT: avg. MC samples = %0.1f, time=%0.2fs'%(np.mean(n_amt),
                                                     time.time()-start_time))
print('# D_AMT=%d, D_overlap=%d, D_fMC=%d'%(md.result_compare(h_amt, h_fmc)))
print('')

# Initialization parameters
# n_hypothesis=23915, n_fMC=250000, alpha=0.05, increment=1.10
# delta 0.001
# delta_CI 3.3642314968430376e-09
# r_hat=23915, tau_hat=0.0500
# batch_size [  100   111   122   134   147   162   178   195   215   236   260   286
   314   346   380   418   460   506   556   612   673   741   815   896
   985  1084  1192  1311  1443  1587  1745  1920  2112  2323  2555  2811
  3092  3401  3741  4115  4526  4979  5477  6025  6627  7290  8018  8820
  9702 10672 11740 12913 14205 15625 17188 18906 20797 22210]
# sum of batch size = 250000
# Initialization completed: time=-0.006s
# 0, avg_sample=100.0, tau=0.00719, r_hat=3438, n_u=3438, n_g=20477, n_l=0
# 1, avg_sample=116.0, tau=0.00463, r_hat=2216, n_u=2216, n_g=21699, n_l=0
# 2, avg_sample=127.3, tau=0.00340, r_hat=1628, n_u=1628, n_g=22287, n_l=0
# 3, avg_sample=136.4, tau=0.00266, r_hat=1271, n_u=1271, n_g=22644, n_l=0
# 4, avg_sample=144.2, tau=0.00229, r_hat=1096, n_u=1096, n_g=22819, n_l=0
# 5, avg_sample=151

# Old code

In [None]:
permute_chi2_batch_ncore(y, X, Exp, r_Exp, chi2_obs, n_permute, random_state=0,
                             verbose=False, n_core=32)

In [None]:
f_sample_chi2(data, ind_sample, n_new_sample, sample_start=None, n_core=32,
        random_state=None):

In [7]:
start_time = time.time()
B = md.permute_chi2_batch(data_gwas_small['y'],
                          data_gwas_small['X'],
                          data_gwas_small['Exp'],
                          data_gwas_small['r_Exp'],
                          data_gwas_small['chi2_obs'], n_fMC,
                          verbose=True)
p_fmc = (np.sum(B, axis=0)+1)/(n_fMC+1)
tau_fmc = md.bh(p_fmc, alpha=alpha)
h_fmc = (p_fmc <= tau_fmc)
print('# fMC: avg. MC samples = %d, time=%0.2fs, discoveries=%d'%(np.mean(n_fMC),
                                                                  time.time()-start_time,
                                                                  np.sum(h_fmc)))
time_fMC = time.time()-start_time
res_fMC = {'time':time.time()-start_time,
           'p_fmc': p_fmc,
           'B':B}
with open(output_file, "wb") as f:
    pickle.dump(res_fMC, f)    

0/241930, time=0.2s
1000/241930, time=143.0s
2000/241930, time=286.5s
3000/241930, time=430.7s
4000/241930, time=581.5s
5000/241930, time=728.4s
6000/241930, time=865.7s
7000/241930, time=1003.0s
8000/241930, time=1140.0s
9000/241930, time=1277.5s
10000/241930, time=1417.8s
11000/241930, time=1553.7s
12000/241930, time=1703.7s
13000/241930, time=1841.5s
14000/241930, time=1993.8s
15000/241930, time=2136.5s
16000/241930, time=2277.4s
17000/241930, time=2422.3s
18000/241930, time=2558.3s
19000/241930, time=2694.1s
20000/241930, time=2828.3s
21000/241930, time=2973.9s
22000/241930, time=3109.6s
23000/241930, time=3251.7s
24000/241930, time=3391.4s
25000/241930, time=3532.9s
26000/241930, time=3671.8s
27000/241930, time=3813.7s
28000/241930, time=3950.0s
29000/241930, time=4086.3s
30000/241930, time=4225.5s
31000/241930, time=4378.6s
32000/241930, time=4519.0s
33000/241930, time=4666.2s
34000/241930, time=4814.0s
35000/241930, time=4952.4s
36000/241930, time=5093.6s
37000/241930, time=5227

OverflowError: cannot serialize a bytes object larger than 4 GiB

In [16]:
res_fMC = {'time':time.time()-start_time,
           'p_fmc': p_fmc,
           'B1':B[0:100000,:],
           'B2':B[100000:,:]}
with open(output_file, "wb") as f:
    pickle.dump(res_fMC, f) 

In [17]:
with open(output_file, 'rb')as f:
    res_dic = pickle.load(f)
B = np.concatenate([res_fMC['B1'], res_fMC['B2']], axis=0)

In [70]:
np.sum(ind_sample)

23915

In [69]:
ind_sample = (miss_prop[ind_small]<0.05)
tau_fmc = md.bh(p_fmc[ind_sample], alpha=0.1)
h_fmc = (p_fmc[ind_sample] <= tau_fmc)
print(np.sum(h_fmc))
df_map_c4 = df_map.loc[df_map['chromosome']==4]
df_map_c4 = df_map_c4.loc[ind_sample]
for snp in snp_list:
    if np.sum(df_map_c4['snp']==snp)>0:
        print(df_map_c4.loc[df_map_c4['snp']==snp], h_fmc[df_map_c4['snp']==snp],
              p_fmc[ind_sample][df_map_c4['snp']==snp])

44
       chromosome        snp  start       end
100847          4  rs2242330      0  68276015 [ True] [4.13340994e-06]
       chromosome        snp  start       end
100837          4  rs6826751      0  68262621 [ True] [1.24002298e-05]
       chromosome        snp  start        end
115749          4  rs4862792      0  188576499 [ True] [4.13340994e-06]
       chromosome        snp  start       end
100842          4  rs3775866      0  68272946 [ True] [4.13340994e-05]
       chromosome       snp  start       end
100830          4  rs355477      0  68225291 [ True] [0.00010334]
       chromosome       snp  start       end
100828          4  rs355461      0  68209490 [ True] [9.50684286e-05]
       chromosome       snp  start       end
100829          4  rs355506      0  68214848 [ True] [9.50684286e-05]
       chromosome       snp  start       end
100825          4  rs355464      0  68207890 [ True] [0.00013227]
       chromosome        snp  start       end
100821          4  rs1497430 

In [45]:
# with open(output_file, 'rb')as f:
#     res_dic = pickle.load(f)
# B = np.concatenate([res_fMC['B1'], res_fMC['B2']], axis=0)
alpha=0.05
start_time = time.time()
p_hat_ub, p_hat, tau_hat, n_amt = md.amt(md.f_sample_dummy, B, n_hypothesis,
                                         alpha=alpha, n_fMC=n_fMC,
                                         verbose=True, delta=0.001)
h_amt = (p_hat_ub <= tau_hat)
print('# AMT: avg. MC samples = %0.1f, time=%0.2fs'%(np.mean(n_amt),
                                                     time.time()-start_time))
print('# D_AMT=%d, D_overlap=%d, D_fMC=%d'%(md.result_compare(h_amt, h_fmc)))
print('')

# Initialization parameters
# n_hypothesis=24193, n_fMC=241930, alpha=0.05, increment=1.10
# delta 0.001
# delta_CI 3.334375938385421e-09
# r_hat=24193, tau_hat=0.0500
# batch_size [  100   111   122   134   147   162   178   195   215   236   260   286
   314   346   380   418   460   506   556   612   673   741   815   896
   985  1084  1192  1311  1443  1587  1745  1920  2112  2323  2555  2811
  3092  3401  3741  4115  4526  4979  5477  6025  6627  7290  8018  8820
  9702 10672 11740 12913 14205 15625 17188 18906 20797 14140]
# sum of batch size = 241930
# Initialization completed: time=-0.005s
# 0, avg_sample=100.0, tau=0.00729, r_hat=3526, n_u=3526, n_g=20667, n_l=0
# 1, avg_sample=116.2, tau=0.00490, r_hat=2373, n_u=2373, n_g=21820, n_l=0
# 2, avg_sample=128.1, tau=0.00367, r_hat=1778, n_u=1778, n_g=22415, n_l=0
# 3, avg_sample=138.0, tau=0.00311, r_hat=1505, n_u=1505, n_g=22688, n_l=0
# 4, avg_sample=147.1, tau=0.00255, r_hat=1236, n_u=1236, n_g=22957, n_l=0
# 5, avg_sample=155.

In [67]:
ind_sample = (miss_prop[ind_small]<0.05)
tau_fmc = md.bh(p_fmc[ind_sample], alpha=0.1)
h_fmc = (p_fmc[ind_sample] <= tau_fmc)
print(np.sum(h_fmc))
# h_fmc = (p_fmc <= tau_fmc)

44


In [68]:
df_map_c4 = df_map.loc[df_map['chromosome']==4]
df_map_c4 = df_map_c4.loc[ind_sample]
for snp in snp_list:
    if np.sum(df_map_c4['snp']==snp)>0:
        print(df_map_c4.loc[df_map_c4['snp']==snp], h_fmc[df_map_c4['snp']==snp],
              p_fmc[ind_sample][df_map_c4['snp']==snp])

       chromosome        snp  start       end
100847          4  rs2242330      0  68276015 [ True] [4.13340994e-06]
       chromosome        snp  start       end
100837          4  rs6826751      0  68262621 [ True] [1.24002298e-05]
       chromosome        snp  start        end
115749          4  rs4862792      0  188576499 [ True] [4.13340994e-06]
       chromosome        snp  start       end
100842          4  rs3775866      0  68272946 [ True] [4.13340994e-05]
       chromosome       snp  start       end
100830          4  rs355477      0  68225291 [ True] [0.00010334]
       chromosome       snp  start       end
100828          4  rs355461      0  68209490 [ True] [9.50684286e-05]
       chromosome       snp  start       end
100829          4  rs355506      0  68214848 [ True] [9.50684286e-05]
       chromosome       snp  start       end
100825          4  rs355464      0  68207890 [ True] [0.00013227]
       chromosome        snp  start       end
100821          4  rs1497430    

In [55]:
df_map_c4 = df_map.loc[df_map['chromosome']==4]
for snp in snp_list:
    if np.sum(df_map_c4['snp']==snp)>0:
        print(df_map_c4.loc[df_map_c4['snp']==snp], h_fmc[df_map_c4['snp']==snp],
              p_fmc[df_map_c4['snp']==snp])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 23915 but corresponding boolean dimension is 24193

# Full GWAS

In [25]:
snp_list = ['rs10501570', 'rs281357', 'rs2242330', 'rs1480597', 'rs6826751', 'rs4888984',
            'rs4862792', 'rs3775866', 'rs2235617', 'rs988421', 'rs7097094', 'rs999473',
            'rs1912373', 'rs1887279', 'rs2986574', 'rs11090762', 'rs6125829', 'rs7796855',
            'rs355477', 'rs3010040', 'rs2296713', 'rs355461', 'rs355506', 'rs355464',
            'rs1497430', 'rs11946612']

In [38]:
for snp in snp_list:
    if np.sum(df_map['snp']==snp)>0:
        print(df_map.loc[df_map['snp']==snp])
    else:
        print('not found %s'%snp)

       chromosome         snp  start       end
262094         11  rs10501570      0  84095494
       chromosome       snp  start       end
343235         17  rs281357      0  19683106
       chromosome        snp  start       end
100847          4  rs2242330      0  68276015
       chromosome        snp  start       end
237247         10  rs1480597      0  44481115
       chromosome        snp  start       end
100837          4  rs6826751      0  68262621
       chromosome        snp  start       end
337424         16  rs4888984      0  78066835
       chromosome        snp  start        end
115749          4  rs4862792      0  188576499
       chromosome        snp  start       end
100842          4  rs3775866      0  68272946
       chromosome        snp  start       end
380211         20  rs2235617      0  47988384
      chromosome       snp  start       end
10385          1  rs988421      0  72261857
       chromosome        snp  start       end
237252         10  rs7097094      0 

In [5]:
np.sum(data_gwas_small['chi2_obs']==md.compute_chi2(data_gwas_small['y'],
                                                    data_gwas_small['X'],
                                                    data_gwas_small['Exp'],
                                                    data_gwas_small['r_Exp']))

1000

In [6]:
B = md.permute_chi2_batch(data_gwas_small['y'],
                          data_gwas_small['X'],
                          data_gwas_small['Exp'],
                          data_gwas_small['r_Exp'],
                          data_gwas_small['chi2_obs'], 5000)

In [7]:
np.mean(B, axis=0)[0:5]

array([0.0582, 0.1778, 0.4416, 0.4678, 1.    ])

In [14]:
ind_sample = np.zeros([1000], dtype=bool)
ind_sample[0:5] = True
n_new_sample = np.ones([5], dtype=int)*5000
n_success = md.f_sample_chi2(data_gwas_small, ind_sample, n_new_sample, sample_start=None)

In [15]:
n_success/5000

array([0.0656, 0.172 , 0.4394, 0.4764, 1.    ])

In [12]:
n_new_sample

array([5000, 5000, 5000, 5000, 5000])

In [13]:
n_success

array([10840, 10840, 10840, 10840, 10840])