In [1]:
## import public packages
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.cluster import KMeans
from scipy.stats import norm
from scipy.stats import multivariate_normal
from scipy.stats import ttest_ind
from sklearn.cluster import KMeans
import time
import os

## import self-written packages 
from adafdr.util import *
import adafdr.method as md
import adafdr.data_loader as dl

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
data_name = 'Colon_Sigmoid'
file_input = '/data3/martin/gtex_data/GTEx_Analysis_v7_eQTL_all_associations/' +\
             '%s.allpairs.txt.processed'%data_name
file_output = '/data3/martin/nfdr2_simulation_data/gtex_cluster_data/' +\
             '%s.allpairs.txt.processed'%data_name
file_output_small = '/data3/martin/nfdr2_simulation_data/gtex_cluster_data_small/' +\
                    '%s.allpairs.txt.processed'%data_name

In [12]:
def preprocess_gtex(x_input):
    x = np.copy(x_input)
    for i in range(x.shape[1]):
        ind_nan = np.isnan(x[:, i])
        x[ind_nan, i] = np.mean(x[~ind_nan, i])
    x[:, 0] = np.log10(x[:, 0]+0.5)
    return x

# Pretrain the clustering model
n_pretrain = 5000
p = np.zeros([n_pretrain])
x = np.zeros([n_pretrain, 4])
f_input = open(file_input, 'r')
for i_line,line in enumerate(f_input):
    line = line.strip().split(', ')
    x[i_line] = line[1:5]
    p[i_line] = line[-1]
    if i_line == n_pretrain-1:
        break
f_input.close()
x = preprocess_gtex(x)
x_mean = np.mean(x, axis=0)
x_std = np.std(x, axis=0)
x = (x-x_mean)/x_std
kmeans_pretrain = KMeans(n_clusters=20, random_state=0).fit(x)

In [43]:
np.random.rand(1)[0]>0.01

True

In [40]:
# Writing the clustering data
f_input = open(file_input, 'r')
f_output = open(file_output, 'w')
f_output_small = open(file_output_small, 'w')
n_batch = 5
n_small = 100
for i_line,line in enumerate(f_input):
    if i_line%n_batch == 0:
        if i_line > 0:
            x = preprocess_gtex(x)
            x = (x-x_mean)/x_std
            x_label = kmeans_pretrain.predict(x)
            for i_hypothesis in range(len(p_value_list)):
                csv_str = '%s, %s, %d\n'%(cis_name_list[i_hypothesis],\
                                          p_value_list[i_hypothesis],\
                                          x_label[i_hypothesis])
                f_output.write(csv_str)
                if i_line <= n_small:
                    f_output_small.write(csv_str)
        cis_name_list = []
        p_value_list = []
        x = np.zeros([n_batch, 4], dtype=float)
    line = line.strip().split(', ')
    cis_name_list.append(line[0])
    p_value_list.append(line[-1])
    x[i_line%n_batch,:] = line[1:5]
    print(i_line, line)
    if i_line > 10:
        break
# write the last few hypotheses
x = preprocess_gtex(x)
x = (x-x_mean)/x_std
x_label = kmeans_pretrain.predict(x)
for i_hypothesis in range(len(p_value_list)):
    csv_str = '%s, %s, %d\n'%(cis_name_list[i_hypothesis],\
                              p_value_list[i_hypothesis],\
                              x_label[i_hypothesis])
    print(csv_str)
    f_output.write(csv_str)
    
f_input.close()
f_output.close()
f_output_small.close()

0 ['ENSG00000227232.4-1_13417_C_CGAGA_b37', '14.2', 'nan', '-16136', '5', '25', '0.0160937']
1 ['ENSG00000227232.4-1_17559_G_C_b37', '14.2', 'nan', '-11994', '5', '25', '0.0228588']
2 ['ENSG00000227232.4-1_54421_A_G_b37', '14.2', 'nan', '24868', '15', '25', '0.387377']
3 ['ENSG00000227232.4-1_54490_G_A_b37', '14.2', '0.09605', '24937', '15', '25', '0.603472']
4 ['ENSG00000227232.4-1_61920_G_A_b37', '14.2', '0.01158', '32367', '15', '25', '0.642313']
5 ['ENSG00000227232.4-1_64649_A_C_b37', '14.2', '0.01038', '35096', '15', '25', '0.649705']
6 ['ENSG00000227232.4-1_115746_C_T_b37', '14.2', '0.02756', '86193', '15', '25', '0.00975286']
7 ['ENSG00000227232.4-1_133160_G_A_b37', '14.2', 'nan', '103607', '15', '25', '0.442331']
8 ['ENSG00000227232.4-1_135203_G_A_b37', '14.2', '0.02796', '105650', '15', '25', '0.0173291']
9 ['ENSG00000227232.4-1_230894_TAA_T_b37', '14.2', 'nan', '201341', '15', '25', '0.689686']
10 ['ENSG00000227232.4-1_231153_CTT_C_b37', '14.2', 'nan', '201600', '15', '25', '

In [36]:
x_label

array([12, 12, 12, 12, 12], dtype=int32)