In [1]:
% matplotlib inline

import numpy as np
import pandas as pd
import matplotlib, collections, itertools, os, re, textwrap, logging
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from functools import reduce

from logging.config import dictConfig
from logging import getLogger

# Configure the root logger: a single stream handler at DEBUG level that
# prefixes every record with timestamp, logger name and level.
dictConfig({
    'version': 1,
    'formatters': {
        'f': {
            'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        },
    },
    'handlers': {
        'h': {
            'class': 'logging.StreamHandler',
            'formatter': 'f',
            'level': logging.DEBUG,
        },
    },
    'root': {
        'handlers': ['h'],
        'level': logging.DEBUG,
    },
})

# Default font for all figures: 16pt sans-serif, trying HelveticaNeue first
# and Helvetica second. 'sans-serif' is not a valid Python identifier, so that
# one rc key still has to be passed through a ** mapping.
matplotlib.rc(
    'font',
    size=16,
    family='sans-serif',
    **{'sans-serif': ['HelveticaNeue', 'Helvetica']}
)

# Logger used by the notebook's own messages.
logger = getLogger('notebook')


In [2]:
# Canonical path of the directory two levels above the current working
# directory (presumably the repository root -- confirm against the layout).
repo_dir = os.path.realpath(os.path.join(os.getcwd(), os.pardir, os.pardir))


In [3]:
# Canonical path of the `private_data` directory that sits next to the
# current working directory (i.e. inside its parent directory).
data_dir = os.path.realpath(os.path.join(os.getcwd(), os.pardir, 'private_data'))

In [42]:
# Load the per-bin table as an uncompressed CSV. The first column holds the
# bin labels and the remaining columns hold one value per principal component.
# NOTE(review): the file is called `loadingsSquared.csv` while the variables
# say "scores" -- confirm which quantity this actually is.
PCA_scores_tmp = pd.read_csv(
    os.path.join(data_dir, 'loadingsSquared.csv'), compression=None
)

In [43]:
# Preview the first five rows of the loaded table.
PCA_scores_tmp.head(n=5)

Unnamed: 0.1,Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC643,PC644,PC645,PC646,PC647,PC648,PC649,PC650,PC651,PC652
0,chr1_9,3.34676e-08,1.652835e-06,1.018775e-06,9.793547e-10,1.424085e-06,2.80597e-09,1.098704e-05,8.466703e-08,3.518017e-07,...,6.9e-05,2.4e-05,0.004622,1e-06,0.000364378,0.003338,0.002233,0.007102,1.102895e-05,0.002538
1,chr1_10,1.107327e-07,2.233828e-06,7.540501e-07,6.449884e-08,1.98789e-07,5.751813e-08,9.136495e-06,3.674732e-08,9.50677e-07,...,0.000237,0.00392,0.014245,0.001979,0.005243388,0.00168,0.008684,0.007596,0.0006995106,0.000614
2,chr1_11,1.187491e-05,7.828986e-06,3.388332e-09,2.527551e-06,3.058192e-10,9.145115e-09,2.779372e-07,1.194209e-07,1.51698e-07,...,5e-06,0.000576,0.000164,0.001285,3.625869e-07,0.000916,0.000228,0.001897,0.0002249642,0.000805
3,chr1_13,1.26453e-07,4.100765e-07,2.341848e-07,1.279432e-07,3.809663e-10,4.49875e-07,5.803602e-07,3.396979e-08,1.290322e-05,...,2.5e-05,3e-06,0.000303,0.001172,0.0002260119,0.000786,0.002089,0.000186,1.035302e-07,0.002028
4,chr1_14,4.416048e-08,2.714393e-07,1.402728e-10,5.970114e-08,7.029725e-09,4.257667e-09,7.345866e-07,1.034405e-07,7.987249e-06,...,0.000391,0.001577,0.001045,0.013824,0.003604385,0.001266,0.003559,0.001278,0.001905602,0.005421


In [48]:
# Numeric part of the table as a 2-D NumPy array: every column except the
# first one (which holds the bin labels).
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same array and exists in both old and new pandas.
PCA_scores_mat = PCA_scores_tmp.iloc[:, 1:].values


In [49]:
# Sanity check: (number of bins, number of components).
np.shape(PCA_scores_mat)

(379541, 652)

In [53]:
# Bin labels (first column of the table) copied into a standalone NumPy array.
PCA_scores_idx = np.array(PCA_scores_tmp.iloc[:, 0])

In [80]:
def _bin_name_to_interval(bin_name, bin_size=1000):
    """Convert a ``<chrom>_<bin number>`` label into ``(chrom, start, end)``.

    The label is split on its LAST underscore, so chromosome names that
    themselves contain underscores (e.g. ``chr6_apd_hap1_12``) keep their full
    name and the trailing token is always the one parsed as the bin number.

    Parameters
    ----------
    bin_name : str
        Label of the form ``<chrom>_<bin number>``, e.g. ``'chr1_9'``.
    bin_size : int, optional
        Width of one bin in coordinate units (default 1000).

    Returns
    -------
    tuple
        ``(chrom, bin_number * bin_size, (bin_number + 1) * bin_size)``.

    Raises
    ------
    ValueError
        If the label has no underscore or its last token is not an integer.
    """
    chrom, bin_number = bin_name.rsplit('_', 1)
    start = int(bin_number) * bin_size
    return chrom, start, start + bin_size


# Parse every bin label exactly once, then lay the pieces out as the first
# four BED columns (chrom, chromStart, chromEnd, name).
bed_intervals = [_bin_name_to_interval(bin_name) for bin_name in PCA_scores_idx]

bed_file_df = pd.DataFrame(collections.OrderedDict((
    ('chrom', [chrom for chrom, start, end in bed_intervals]),
    ('chromStart', [start for chrom, start, end in bed_intervals]),
    ('chromEnd', [end for chrom, start, end in bed_intervals]),
    ('name', PCA_scores_idx),
)))

In [81]:
# Preview the first five BED intervals.
bed_file_df.head(n=5)

Unnamed: 0,chrom,chromStart,chromEnd,name
0,chr1,9000,10000,chr1_9
1,chr1,10000,11000,chr1_10
2,chr1,11000,12000,chr1_11
3,chr1,13000,14000,chr1_13
4,chr1,14000,15000,chr1_14


In [61]:
# Rank cut-off: for each component, the topk-th largest value is used as the
# selection threshold when exporting bins.
topk = 5000

In [63]:
# Component selected for interactive inspection; the export loop further down
# reuses this name as its loop variable and therefore overwrites it.
component_idx = 0

In [88]:
# For every component (column of PCA_scores_mat), write the rows of
# bed_file_df whose value reaches the topk-th largest value of that column to
# `<data_dir>/GREAT_query/<component index>.bed` (tab separated, no header,
# no index column).
great_query_dir = os.path.join(data_dir, 'GREAT_query')
if not os.path.isdir(great_query_dir):
    # to_csv does not create missing parent directories.
    os.makedirs(great_query_dir)

for component_idx in range(PCA_scores_mat.shape[1]):
    # Light progress report: one line every 20 components.
    if component_idx % 20 == 0:
        print(component_idx)

    component_scores = PCA_scores_mat[:, component_idx]

    # Threshold = topk-th largest value of this component.
    topk_value = np.sort(component_scores)[-topk]

    # Vectorized row mask (replaces a per-row Python comprehension). `>=`
    # keeps every row tied with the threshold, so a file can contain more
    # than `topk` rows when ties occur.
    top_bins_filter = component_scores >= topk_value

    bed_file_df[top_bins_filter].to_csv(
        os.path.join(great_query_dir, '{}.bed'.format(component_idx)),
        sep='\t', index=False, header=False
    )

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
