In [2]:
import pandas as pd
import numpy as np
import os
import shutil
import glob
import time
import subprocess as sp
from collections import OrderedDict

import snmcseq_utils
from snmcseq_utils import create_logger
from __init__ import *

In [3]:
# create logger
# `create_logger` is imported from snmcseq_utils above. The object it returns is
# used by the cells below through `log.info(...)` to emit the timestamped
# progress messages shown in the cell outputs.
log = create_logger()

In [4]:
# define names and paths
#
# Everything the rest of the notebook needs to locate its inputs and outputs is
# defined in this one cell: the dataset/ensemble names, the input file lists,
# the output locations, and the cell names parsed from the input file names.
dataset = 'CEMBA_3C_171206'
ens = 'Ens1'
ens_description = 'Singleton ensemble of {} dataset.'.format(dataset)

# BIN_SIZE, PATH_DATASETS and PATH_ENSEMBLES are not defined in this notebook;
# presumably they are provided by `from __init__ import *` -- verify there.
# path_datasets is the path of "datasets" folder
bin_size = BIN_SIZE
path_datasets = PATH_DATASETS
path_ensembles = PATH_ENSEMBLES

# dataset_path or dataset_paths is/are the path(s) of specific datasets
dataset_path = os.path.join(path_datasets, dataset)
ens_path = os.path.join(path_ensembles, ens)

# metadata (mapping summaries)
meta_fin = os.path.join(dataset_path, 'mapping_summary_{}.tsv'.format(dataset))
meta_fout = os.path.join(ens_path, 'mapping_summary_{}.tsv'.format(ens))

# file paths should be sorted!
# (the cell-name lists below inherit this order and are compared against the
# sorted metadata sample names in the setup cell)
# allcs
allc_paths = sorted(glob.glob(os.path.join(dataset_path, 'allc/allc_*.tsv.bgz')))
# genebodys
genebody_paths = sorted(glob.glob(os.path.join(dataset_path, 'gene_level/genebody_*.tsv.bgz')))
# bincs
binc_paths = sorted(glob.glob(os.path.join(dataset_path, 'binc/binc_*_{}.tsv.bgz'.format(bin_size))))

# README.txt
readme_f = os.path.join(ens_path, 'README_{}.txt'.format(ens))

# ensemble genelevel path
ens_genelevel_path = os.path.join(ens_path, 'gene_level')
# ensemble binc path
ens_binc_path = os.path.join(ens_path, 'binc')

# Cell names, obtained by stripping the fixed prefix and suffix from each file
# name. They are defined here, unconditionally, because the genebody/binc cells
# further down use them; previously their only live definition was inside the
# "ensemble folder does not exist yet" branch of the setup cell, so re-running
# the notebook against an existing ensemble failed with a NameError.
cells_allc = [os.path.basename(allc_path)[len('allc_'):-len('.tsv.bgz')]
              for allc_path in allc_paths]
cells_genebody = [os.path.basename(genebody_path)[len('genebody_'):-len('.tsv.bgz')]
                  for genebody_path in genebody_paths]
cells_binc = [os.path.basename(binc_path)[len('binc_'):-len('_{}.tsv.bgz'.format(bin_size))]
              for binc_path in binc_paths]

In [45]:
# setup ensemble
#
# Creates the ensemble folder with a README and a copy of the dataset's mapping
# summary. Nothing is created unless the mapping summary agrees with the
# allc / gene_level / binc files found for the dataset.

# Cell names are parsed from the file names by stripping the fixed prefix and
# suffix. They are computed outside the branch below because later cells use
# cells_genebody / cells_binc even when the ensemble folder already exists.
cells_allc = [os.path.basename(allc_path)[len('allc_'):-len('.tsv.bgz')]
              for allc_path in allc_paths]
cells_genebody = [os.path.basename(genebody_path)[len('genebody_'):-len('.tsv.bgz')]
                  for genebody_path in genebody_paths]
cells_binc = [os.path.basename(binc_path)[len('binc_'):-len('_{}.tsv.bgz'.format(bin_size))]
              for binc_path in binc_paths]

if not os.path.isdir(ens_path):
    # check if metadata agree with allc/gene_level/binc info
    # for singleton ensemble, metadata samples need to match allc/gene_level/binc
    # for non-singleton ensemble, metadata samples need to be in allc/gene_level/binc
    #
    # This runs BEFORE anything is written: if it ran afterwards, a failed check
    # would leave the folder behind and the next run would take the
    # "already exists" branch without ever re-validating.
    meta_df = pd.read_table(meta_fin, index_col='Sample')
    cells = sorted(meta_df.index.tolist())

    assert cells == cells_allc, \
        "Samples in {} do not match the allc files of {}".format(meta_fin, dataset)
    assert cells == cells_genebody, \
        "Samples in {} do not match the gene_level files of {}".format(meta_fin, dataset)
    assert cells == cells_binc, \
        "Samples in {} do not match the binc files of {}".format(meta_fin, dataset)
    log.info("Mapping summary (cell metadata) matches allc, gene_level, and binc files!")

    # create folder
    os.makedirs(ens_path)
    log.info("{} created!".format(ens_path))

    # create README.txt
    with open(readme_f, 'w') as readme:
        readme.write(ens_description + '\n')
    # slicing already clamps to the string length, so [:50] logs at most 50 chars
    log.info("{} created:\n{}".format(readme_f, ens_description[:50]))

    # copy metadata over
    shutil.copyfile(meta_fin, meta_fout)
    log.info("{} created from {}".format(meta_fout, meta_fin))

else:
    # Only logged, not raised: the raise below is left disabled as in the
    # original, so the following cells still run against an existing ensemble.
    log.info("Error: {} already exists!".format(ens_path))
#     raise ValueError("Error: {} already exists!".format(ens_path))


01/29/2018 04:06:43 PM /cndd/Public_Datasets/CEMBA/Ensembles/Ens1 created!
01/29/2018 04:06:43 PM /cndd/Public_Datasets/CEMBA/Ensembles/Ens1/README_Ens1.txt created:
Singleton ensemble of CEMBA_3C_171206 dataset.
01/29/2018 04:06:43 PM /cndd/Public_Datasets/CEMBA/Ensembles/Ens1/mapping_summary_Ens1.tsv created from /cndd/Public_Datasets/CEMBA/Datasets/CEMBA_3C_171206/mapping_summary_CEMBA_3C_171206.tsv
01/29/2018 04:06:43 PM Mapping summary (cell metadata) matches allc, gene_level, and binc files!


In [3]:
# begin by creating metadata

# set up file paths
# # set up file paths
# datasets_path = '/cndd/Public_Datasets/CEMBA/Datasets'
# datasets = ['CEMBA_3C_171206',
#            'CEMBA_3C_171207',
#            'CEMBA_4B_171212',
#            'CEMBA_4B_171213']

# meta_fout = os.path.join(ens_path, 'mapping_summary_Ens1.tsv')
# meta_fins = ['{}/{}/mapping_summary_{}.tsv'.format(datasets_path, dataset, dataset) for dataset in datasets]

# dfs = []
# for dataset, meta_fin in zip(datasets, meta_fins):
#     df = pd.read_table(meta_fin, index_col='Sample')
#     df['Dataset'] = dataset
#     dfs.append(df)
    
# df_meta_out = pd.concat(dfs)
# print(df_meta_out.shape)
# df_meta_out.head()

# # save it
# df_meta_out.to_csv(meta_fout, sep='\t', na_rep='NA', header=True, index=True)

In [53]:
# pull genebody info from dataset
#
# For each context in CONTEXTS, builds one table indexed by gene_id with two
# columns per cell (`<cell>_mc` from the 'm'+context column and `<cell>_c` from
# the context column of that cell's genebody file), writes it as a TSV into the
# ensemble's gene_level folder and compresses it to `.tsv.bgz`.

# Derive the cell names from the file names here so this cell does not depend on
# a variable that the setup cell only defines when it creates the ensemble.
cells_genebody = [os.path.basename(genebody_path)[len('genebody_'):-len('.tsv.bgz')]
                  for genebody_path in genebody_paths]
if not genebody_paths:
    # fail with a clear message instead of a NameError further down
    raise ValueError("No genebody files found for dataset at {}".format(dataset_path))

log.info("Pulling genebody information ({} cells)...".format(len(cells_genebody)))
ti = time.time()

contexts = CONTEXTS
if not os.path.isdir(ens_genelevel_path):
    # create folder
    os.makedirs(ens_genelevel_path)
    log.info("{} created!".format(ens_genelevel_path))

output_fnames = [os.path.join(ens_genelevel_path, 'genebody_m{}_{}.tsv'.format(context, ens))
                 for context in contexts]

# One independent column store per context. The previous
# `[pd.DataFrame(...)] * len(contexts)` repeated a single DataFrame object, so
# all contexts wrote into the same frame and every output file ended up with the
# values of the last context. Columns are collected as arrays and assembled once
# at the end, which also avoids inserting 2 columns per cell into a growing frame.
gene_index = None
columns_by_context = [OrderedDict() for _ in contexts]

for i, (cell, genebody_path) in enumerate(zip(cells_genebody, genebody_paths)):

    df_gnb = pd.read_table(genebody_path, index_col='gene_id', compression='gzip')

    if gene_index is None:
        # the first cell defines the row order every other cell must share
        gene_index = df_gnb.index

    # rows are combined positionally below, so the index must match exactly
    assert gene_index.equals(df_gnb.index), \
        "gene_id index of {} differs from that of the first cell".format(genebody_path)

    for columns, context in zip(columns_by_context, contexts):
        columns[cell+'_mc'] = df_gnb['m'+context].values
        columns[cell+'_c'] = df_gnb[context].values

    log.info('Loaded cell: {} ({}/{})'.format(cell, i+1, len(cells_genebody)))

dfs = [pd.DataFrame(columns, index=gene_index) for columns in columns_by_context]

for df, output in zip(dfs, output_fnames):
    df.to_csv(output, sep='\t', na_rep='NA', index=True, header=True)
    # compress and name them .bgz
    # argument list (no shell) so paths are never re-parsed by a shell;
    # check=True stops here if bgzip fails instead of renaming a missing file
    sp.run(['bgzip', '-f', output], check=True)
    # bgzip writes <output>.gz; os.replace overwrites an existing target like `mv`
    os.replace(output + '.gz', output + '.bgz')

    log.info('Output genebody info to \n{}.bgz'.format(output))

tf = time.time()
log.info("Time spent on pulling genebody information: {} sec".format(tf - ti))

01/29/2018 04:50:16 PM Pulling genebody information (1202 cells)...
01/29/2018 04:50:16 PM /cndd/Public_Datasets/CEMBA/Ensembles/Ens1/gene_level created!
01/29/2018 04:50:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed (1/1202)
01/29/2018 04:50:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed (2/1202)
01/29/2018 04:50:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed (3/1202)
01/29/2018 04:50:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed (4/1202)
01/29/2018 04:50:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD008_indexed (5/1202)
01/29/2018 04:50:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD010_indexed (6/1202)
01/29/2018 04:50:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206

01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A7_AD012_indexed (64/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD001_indexed (65/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD004_indexed (66/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD006_indexed (67/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD007_indexed (68/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD008_indexed (69/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD010_indexed (70/1202)
01/29/2018 04:50:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_

01/29/2018 04:50:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD001_indexed (128/1202)
01/29/2018 04:50:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD002_indexed (129/1202)
01/29/2018 04:50:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD004_indexed (130/1202)
01/29/2018 04:50:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD006_indexed (131/1202)
01/29/2018 04:50:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD007_indexed (132/1202)
01/29/2018 04:50:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD010_indexed (133/1202)
01/29/2018 04:50:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD012_indexed (134/1202)
01/29/2018 04:50:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C1_AD012_indexed (192/1202)
01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD001_indexed (193/1202)
01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD002_indexed (194/1202)
01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD004_indexed (195/1202)
01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD006_indexed (196/1202)
01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD007_indexed (197/1202)
01/29/2018 04:50:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD008_indexed (198/1202)
01/29/2018 04:50:34 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 04:50:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D11_AD008_indexed (256/1202)
01/29/2018 04:50:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD001_indexed (257/1202)
01/29/2018 04:50:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD002_indexed (258/1202)
01/29/2018 04:50:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD006_indexed (259/1202)
01/29/2018 04:50:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD007_indexed (260/1202)
01/29/2018 04:50:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD008_indexed (261/1202)
01/29/2018 04:50:40 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD010_indexed (262/1202)
01/29/2018 04:50:40 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA1

01/29/2018 04:50:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D8_AD008_indexed (320/1202)
01/29/2018 04:50:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D8_AD010_indexed (321/1202)
01/29/2018 04:50:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D8_AD012_indexed (322/1202)
01/29/2018 04:50:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD001_indexed (323/1202)
01/29/2018 04:50:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD002_indexed (324/1202)
01/29/2018 04:50:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD004_indexed (325/1202)
01/29/2018 04:50:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD006_indexed (326/1202)
01/29/2018 04:50:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 04:50:59 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD004_indexed (384/1202)
01/29/2018 04:50:59 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD007_indexed (385/1202)
01/29/2018 04:51:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD008_indexed (386/1202)
01/29/2018 04:51:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD010_indexed (387/1202)
01/29/2018 04:51:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD012_indexed (388/1202)
01/29/2018 04:51:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E7_AD001_indexed (389/1202)
01/29/2018 04:51:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E7_AD004_indexed (390/1202)
01/29/2018 04:51:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 04:51:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD001_indexed (448/1202)
01/29/2018 04:51:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD002_indexed (449/1202)
01/29/2018 04:51:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD006_indexed (450/1202)
01/29/2018 04:51:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD007_indexed (451/1202)
01/29/2018 04:51:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD008_indexed (452/1202)
01/29/2018 04:51:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD010_indexed (453/1202)
01/29/2018 04:51:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD012_indexed (454/1202)
01/29/2018 04:51:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 04:51:24 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD004_indexed (512/1202)
01/29/2018 04:51:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD006_indexed (513/1202)
01/29/2018 04:51:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD007_indexed (514/1202)
01/29/2018 04:51:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD008_indexed (515/1202)
01/29/2018 04:51:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD001_indexed (516/1202)
01/29/2018 04:51:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD002_indexed (517/1202)
01/29/2018 04:51:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD004_indexed (518/1202)
01/29/2018 04:51:26 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 04:51:36 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H12_AD008_indexed (576/1202)
01/29/2018 04:51:36 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H12_AD010_indexed (577/1202)
01/29/2018 04:51:36 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H12_AD012_indexed (578/1202)
01/29/2018 04:51:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD001_indexed (579/1202)
01/29/2018 04:51:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD002_indexed (580/1202)
01/29/2018 04:51:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD004_indexed (581/1202)
01/29/2018 04:51:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD006_indexed (582/1202)
01/29/2018 04:51:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA17120

01/29/2018 04:51:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD001_indexed (640/1202)
01/29/2018 04:51:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD002_indexed (641/1202)
01/29/2018 04:51:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD004_indexed (642/1202)
01/29/2018 04:51:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD006_indexed (643/1202)
01/29/2018 04:51:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD007_indexed (644/1202)
01/29/2018 04:51:50 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD008_indexed (645/1202)
01/29/2018 04:51:50 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD010_indexed (646/1202)
01/29/2018 04:51:50 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 04:52:01 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD002_indexed (704/1202)
01/29/2018 04:52:01 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD004_indexed (705/1202)
01/29/2018 04:52:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD006_indexed (706/1202)
01/29/2018 04:52:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD010_indexed (707/1202)
01/29/2018 04:52:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD012_indexed (708/1202)
01/29/2018 04:52:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A9_AD001_indexed (709/1202)
01/29/2018 04:52:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A9_AD002_indexed (710/1202)
01/29/2018 04:52:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 04:52:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B4_AD008_indexed (768/1202)
01/29/2018 04:52:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B4_AD010_indexed (769/1202)
01/29/2018 04:52:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B4_AD012_indexed (770/1202)
01/29/2018 04:52:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD001_indexed (771/1202)
01/29/2018 04:52:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD002_indexed (772/1202)
01/29/2018 04:52:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD004_indexed (773/1202)
01/29/2018 04:52:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD006_indexed (774/1202)
01/29/2018 04:52:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 04:52:26 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C1_AD008_indexed (832/1202)
01/29/2018 04:52:26 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C1_AD010_indexed (833/1202)
01/29/2018 04:52:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD001_indexed (834/1202)
01/29/2018 04:52:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD004_indexed (835/1202)
01/29/2018 04:52:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD007_indexed (836/1202)
01/29/2018 04:52:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD008_indexed (837/1202)
01/29/2018 04:52:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD010_indexed (838/1202)
01/29/2018 04:52:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 04:52:38 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD007_indexed (896/1202)
01/29/2018 04:52:38 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD008_indexed (897/1202)
01/29/2018 04:52:38 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD010_indexed (898/1202)
01/29/2018 04:52:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD012_indexed (899/1202)
01/29/2018 04:52:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD002_indexed (900/1202)
01/29/2018 04:52:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD004_indexed (901/1202)
01/29/2018 04:52:40 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD008_indexed (902/1202)
01/29/2018 04:52:40 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 04:52:51 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD006_indexed (960/1202)
01/29/2018 04:52:51 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD007_indexed (961/1202)
01/29/2018 04:52:51 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD008_indexed (962/1202)
01/29/2018 04:52:51 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD010_indexed (963/1202)
01/29/2018 04:52:52 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD012_indexed (964/1202)
01/29/2018 04:52:52 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D9_AD001_indexed (965/1202)
01/29/2018 04:52:52 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D9_AD002_indexed (966/1202)
01/29/2018 04:52:52 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 04:53:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD007_indexed (1024/1202)
01/29/2018 04:53:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD008_indexed (1025/1202)
01/29/2018 04:53:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD010_indexed (1026/1202)
01/29/2018 04:53:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD012_indexed (1027/1202)
01/29/2018 04:53:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD001_indexed (1028/1202)
01/29/2018 04:53:05 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD002_indexed (1029/1202)
01/29/2018 04:53:05 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD004_indexed (1030/1202)
01/29/2018 04:53:05 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 04:53:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD002_indexed (1087/1202)
01/29/2018 04:53:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD004_indexed (1088/1202)
01/29/2018 04:53:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD006_indexed (1089/1202)
01/29/2018 04:53:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD007_indexed (1090/1202)
01/29/2018 04:53:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD008_indexed (1091/1202)
01/29/2018 04:53:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD010_indexed (1092/1202)
01/29/2018 04:53:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD012_indexed (1093/1202)
01/29/2018 04:53:18 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 04:53:29 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G3_AD012_indexed (1150/1202)
01/29/2018 04:53:29 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD001_indexed (1151/1202)
01/29/2018 04:53:29 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD007_indexed (1152/1202)
01/29/2018 04:53:30 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD008_indexed (1153/1202)
01/29/2018 04:53:30 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD010_indexed (1154/1202)
01/29/2018 04:53:30 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD012_indexed (1155/1202)
01/29/2018 04:53:30 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G5_AD007_indexed (1156/1202)
01/29/2018 04:53:30 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

In [9]:
# pull binc info from dataset (and reduce to 100kb bin)
# consider hdf5 format as an option
# define a function
# store 100kb file with X chromosome doubled only
# NOTE(review): the four lines above read as TODO notes; this cell only merges
# the per-cell binc files at BIN_SIZE and does not itself reduce the bin size.
#
# For each context in CONTEXTS, builds one table indexed like the binc files
# with two columns per cell (`<cell>_mc` from the 'm'+context column and
# `<cell>_c` from the context column), writes it as a TSV into the ensemble's
# binc folder and compresses it to `.tsv.bgz`.

contexts = CONTEXTS
bin_size = BIN_SIZE

# Derive the cell names from the file names here so this cell does not depend on
# a variable that the setup cell only defines when it creates the ensemble.
cells_binc = [os.path.basename(binc_path)[len('binc_'):-len('_{}.tsv.bgz'.format(bin_size))]
              for binc_path in binc_paths]
if not binc_paths:
    # fail with a clear message instead of a NameError further down
    raise ValueError("No binc files found for dataset at {}".format(dataset_path))

log.info("Pulling binc information ({} cells)...".format(len(cells_binc)))
ti = time.time()

if not os.path.isdir(ens_binc_path):
    # create folder
    os.makedirs(ens_binc_path)
    log.info("{} created!".format(ens_binc_path))

output_fnames = [os.path.join(ens_binc_path, 'binc_m{}_{}_{}.tsv'.format(context, ens, bin_size))
                 for context in contexts]

# One independent column store per context. The previous
# `[pd.DataFrame(...)] * len(contexts)` repeated a single DataFrame object, so
# all contexts wrote into the same frame and every output file ended up with the
# values of the last context. Columns are collected as arrays and assembled once
# at the end, which also avoids inserting 2 columns per cell into a growing frame.
bin_index = None
columns_by_context = [OrderedDict() for _ in contexts]

for i, (cell, binc_path) in enumerate(zip(cells_binc, binc_paths)):

    df_bin = snmcseq_utils.read_binc(binc_path, compression='gzip')

    if bin_index is None:
        # the first cell defines the row order every other cell must share
        bin_index = df_bin.index

    # rows are combined positionally below, so the index must match exactly
    assert bin_index.equals(df_bin.index), \
        "bin index of {} differs from that of the first cell".format(binc_path)

    for columns, context in zip(columns_by_context, contexts):
        columns[cell+'_mc'] = df_bin['m'+context].values
        columns[cell+'_c'] = df_bin[context].values

    log.info('Loaded cell: {} ({}/{})'.format(cell, i+1, len(cells_binc)))

dfs = [pd.DataFrame(columns, index=bin_index) for columns in columns_by_context]

for df, output in zip(dfs, output_fnames):
    df.to_csv(output, sep='\t', na_rep='NA', index=True, header=True)
    # compress and name them .bgz
    # argument list (no shell) so paths are never re-parsed by a shell;
    # check=True stops here if bgzip fails instead of renaming a missing file
    sp.run(['bgzip', '-f', output], check=True)
    # bgzip writes <output>.gz; os.replace overwrites an existing target like `mv`
    os.replace(output + '.gz', output + '.bgz')

    log.info('Output binc info to \n{}.bgz'.format(output))

tf = time.time()
log.info("Time spent on pulling binc information: {} sec".format(tf - ti))

01/29/2018 05:27:09 PM Pulling binc information (1202 cells)...
01/29/2018 05:27:09 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed (1/1202)
01/29/2018 05:27:10 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed (2/1202)
01/29/2018 05:27:11 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed (3/1202)
01/29/2018 05:27:11 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed (4/1202)
01/29/2018 05:27:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD008_indexed (5/1202)
01/29/2018 05:27:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD010_indexed (6/1202)
01/29/2018 05:27:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD012_indexed (7/1202)
01/29/2018 05:27:13 PM Loaded cell: 1712

01/29/2018 05:27:53 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD001_indexed (65/1202)
01/29/2018 05:27:54 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD004_indexed (66/1202)
01/29/2018 05:27:54 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD006_indexed (67/1202)
01/29/2018 05:27:55 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD007_indexed (68/1202)
01/29/2018 05:27:56 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD008_indexed (69/1202)
01/29/2018 05:27:56 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD010_indexed (70/1202)
01/29/2018 05:27:57 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A8_AD012_indexed (71/1202)
01/29/2018 05:27:58 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A9_

01/29/2018 05:28:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD002_indexed (129/1202)
01/29/2018 05:28:40 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD004_indexed (130/1202)
01/29/2018 05:28:41 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD006_indexed (131/1202)
01/29/2018 05:28:41 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD007_indexed (132/1202)
01/29/2018 05:28:42 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD010_indexed (133/1202)
01/29/2018 05:28:43 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B5_AD012_indexed (134/1202)
01/29/2018 05:28:43 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_B6_AD001_indexed (135/1202)
01/29/2018 05:28:44 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 05:29:24 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD001_indexed (193/1202)
01/29/2018 05:29:24 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD002_indexed (194/1202)
01/29/2018 05:29:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD004_indexed (195/1202)
01/29/2018 05:29:26 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD006_indexed (196/1202)
01/29/2018 05:29:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD007_indexed (197/1202)
01/29/2018 05:29:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD008_indexed (198/1202)
01/29/2018 05:29:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_C2_AD010_indexed (199/1202)
01/29/2018 05:29:29 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 05:30:10 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD001_indexed (257/1202)
01/29/2018 05:30:11 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD002_indexed (258/1202)
01/29/2018 05:30:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD006_indexed (259/1202)
01/29/2018 05:30:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD007_indexed (260/1202)
01/29/2018 05:30:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD008_indexed (261/1202)
01/29/2018 05:30:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD010_indexed (262/1202)
01/29/2018 05:30:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D12_AD012_indexed (263/1202)
01/29/2018 05:30:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA1

01/29/2018 05:30:57 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D8_AD010_indexed (321/1202)
01/29/2018 05:30:58 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D8_AD012_indexed (322/1202)
01/29/2018 05:30:58 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD001_indexed (323/1202)
01/29/2018 05:30:59 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD002_indexed (324/1202)
01/29/2018 05:31:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD004_indexed (325/1202)
01/29/2018 05:31:00 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD006_indexed (326/1202)
01/29/2018 05:31:01 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_D9_AD007_indexed (327/1202)
01/29/2018 05:31:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 05:31:42 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD007_indexed (385/1202)
01/29/2018 05:31:43 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD008_indexed (386/1202)
01/29/2018 05:31:44 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD010_indexed (387/1202)
01/29/2018 05:31:44 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E6_AD012_indexed (388/1202)
01/29/2018 05:31:45 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E7_AD001_indexed (389/1202)
01/29/2018 05:31:46 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E7_AD004_indexed (390/1202)
01/29/2018 05:31:46 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_E7_AD006_indexed (391/1202)
01/29/2018 05:31:47 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 05:32:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD002_indexed (449/1202)
01/29/2018 05:32:29 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD006_indexed (450/1202)
01/29/2018 05:32:31 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD007_indexed (451/1202)
01/29/2018 05:32:31 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD008_indexed (452/1202)
01/29/2018 05:32:32 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD010_indexed (453/1202)
01/29/2018 05:32:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F4_AD012_indexed (454/1202)
01/29/2018 05:32:33 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_F5_AD001_indexed (455/1202)
01/29/2018 05:32:34 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 05:33:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD006_indexed (513/1202)
01/29/2018 05:33:16 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD007_indexed (514/1202)
01/29/2018 05:33:17 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G2_AD008_indexed (515/1202)
01/29/2018 05:33:18 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD001_indexed (516/1202)
01/29/2018 05:33:19 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD002_indexed (517/1202)
01/29/2018 05:33:19 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD004_indexed (518/1202)
01/29/2018 05:33:20 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_G3_AD006_indexed (519/1202)
01/29/2018 05:33:21 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3

01/29/2018 05:34:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H12_AD010_indexed (577/1202)
01/29/2018 05:34:03 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H12_AD012_indexed (578/1202)
01/29/2018 05:34:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD001_indexed (579/1202)
01/29/2018 05:34:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD002_indexed (580/1202)
01/29/2018 05:34:05 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD004_indexed (581/1202)
01/29/2018 05:34:06 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD006_indexed (582/1202)
01/29/2018 05:34:06 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_H1_AD008_indexed (583/1202)
01/29/2018 05:34:07 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206

01/29/2018 05:34:47 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD002_indexed (641/1202)
01/29/2018 05:34:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD004_indexed (642/1202)
01/29/2018 05:34:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD006_indexed (643/1202)
01/29/2018 05:34:50 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD007_indexed (644/1202)
01/29/2018 05:34:50 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD008_indexed (645/1202)
01/29/2018 05:34:51 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD010_indexed (646/1202)
01/29/2018 05:34:52 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A11_AD012_indexed (647/1202)
01/29/2018 05:34:53 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 05:35:36 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD004_indexed (705/1202)
01/29/2018 05:35:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD006_indexed (706/1202)
01/29/2018 05:35:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD010_indexed (707/1202)
01/29/2018 05:35:38 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A8_AD012_indexed (708/1202)
01/29/2018 05:35:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A9_AD001_indexed (709/1202)
01/29/2018 05:35:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A9_AD002_indexed (710/1202)
01/29/2018 05:35:40 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_A9_AD004_indexed (711/1202)
01/29/2018 05:35:41 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 05:36:23 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B4_AD010_indexed (769/1202)
01/29/2018 05:36:24 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B4_AD012_indexed (770/1202)
01/29/2018 05:36:24 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD001_indexed (771/1202)
01/29/2018 05:36:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD002_indexed (772/1202)
01/29/2018 05:36:26 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD004_indexed (773/1202)
01/29/2018 05:36:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD006_indexed (774/1202)
01/29/2018 05:36:27 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_B5_AD007_indexed (775/1202)
01/29/2018 05:36:28 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 05:37:10 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C1_AD010_indexed (833/1202)
01/29/2018 05:37:10 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD001_indexed (834/1202)
01/29/2018 05:37:11 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD004_indexed (835/1202)
01/29/2018 05:37:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD007_indexed (836/1202)
01/29/2018 05:37:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD008_indexed (837/1202)
01/29/2018 05:37:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD010_indexed (838/1202)
01/29/2018 05:37:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_C2_AD012_indexed (839/1202)
01/29/2018 05:37:15 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 05:37:57 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD008_indexed (897/1202)
01/29/2018 05:37:58 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD010_indexed (898/1202)
01/29/2018 05:37:58 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D11_AD012_indexed (899/1202)
01/29/2018 05:37:59 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD002_indexed (900/1202)
01/29/2018 05:38:02 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD004_indexed (901/1202)
01/29/2018 05:38:03 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD008_indexed (902/1202)
01/29/2018 05:38:03 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D12_AD010_indexed (903/1202)
01/29/2018 05:38:04 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 05:38:46 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD007_indexed (961/1202)
01/29/2018 05:38:46 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD008_indexed (962/1202)
01/29/2018 05:38:47 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD010_indexed (963/1202)
01/29/2018 05:38:48 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D8_AD012_indexed (964/1202)
01/29/2018 05:38:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D9_AD001_indexed (965/1202)
01/29/2018 05:38:49 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D9_AD002_indexed (966/1202)
01/29/2018 05:38:50 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_D9_AD004_indexed (967/1202)
01/29/2018 05:38:51 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3

01/29/2018 05:39:34 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD008_indexed (1025/1202)
01/29/2018 05:39:35 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD010_indexed (1026/1202)
01/29/2018 05:39:35 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E5_AD012_indexed (1027/1202)
01/29/2018 05:39:36 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD001_indexed (1028/1202)
01/29/2018 05:39:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD002_indexed (1029/1202)
01/29/2018 05:39:37 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD004_indexed (1030/1202)
01/29/2018 05:39:38 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_E6_AD006_indexed (1031/1202)
01/29/2018 05:39:39 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 05:40:20 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD004_indexed (1088/1202)
01/29/2018 05:40:21 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD006_indexed (1089/1202)
01/29/2018 05:40:21 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD007_indexed (1090/1202)
01/29/2018 05:40:22 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD008_indexed (1091/1202)
01/29/2018 05:40:23 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD010_indexed (1092/1202)
01/29/2018 05:40:23 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F3_AD012_indexed (1093/1202)
01/29/2018 05:40:24 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_F4_AD004_indexed (1094/1202)
01/29/2018 05:40:25 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

01/29/2018 05:41:09 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD001_indexed (1151/1202)
01/29/2018 05:41:10 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD007_indexed (1152/1202)
01/29/2018 05:41:11 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD008_indexed (1153/1202)
01/29/2018 05:41:11 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD010_indexed (1154/1202)
01/29/2018 05:41:12 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G4_AD012_indexed (1155/1202)
01/29/2018 05:41:13 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G5_AD007_indexed (1156/1202)
01/29/2018 05:41:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_G5_AD008_indexed (1157/1202)
01/29/2018 05:41:14 PM Loaded cell: 171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA1

In [42]:
# # test loading time, comparing with hdf5
# gene_mc_c = os.path.join(ens_path, 'gene_level/genebody_mCH_Ens1.tsv.bgz')
# bin_mc_c = os.path.join(ens_path, 'binc/binc_mCH_Ens1_10000.tsv.bgz')

# # contexts = CONTEXTS
# # ti = time.time()
# # dfs = []
# # for context in contexts: 
# #     gene_mc_c = os.path.join(ens_path, 'gene_level/genebody_m{}_Ens1.tsv.bgz'.format(context))
# #     dfs.append(pd.read_table(gene_mc_c, index_col='gene_id', compression='gzip'))
# # tf = time.time()

# # print(tf-ti)

# # ti = time.time()
# # for context,df in zip(contexts, dfs):
# #     f = os.path.join(ens_path, 'gene_level/genebody_Ens1.tsv.bgz.hdf5')
# #     df.to_hdf(f, key='m'+context, format='fixed')
# # tf = time.time()
# # print(tf-ti)

# ti = time.time()
# df = pd.read_table(bin_mc_c, index_col=['chr', 'bin'], compression='gzip', dtype={'chr': object, 'bin': int})
# tf = time.time()
# # print(tf-ti)

# f = os.path.join(ens_path, 'binc/binc_Ens1.tsv.bgz.hdf5')
# ti = time.time()
# df.to_hdf(f, key='mCH')
# tf = time.time()
# print(tf-ti)


In [5]:
# load 10kb bins (mCH counts) for the current ensemble
# The file name is derived from `ens`, like the other ensemble paths in the
# config cell, so changing `ens` there keeps this cell pointing at the right file.
f = os.path.join(ens_path, 'binc/binc_{}.tsv.bgz.hdf5'.format(ens))
# reset_index() turns the stored index levels back into regular columns
df = pd.read_hdf(f, key='mCH').reset_index()

print(df.shape)
df.head()

(263369, 2406)


Unnamed: 0,chr,bin,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed_c,...,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD007_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD007_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD008_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD008_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD012_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD012_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H9_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H9_AD008_indexed_c
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,10000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,20000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,30000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,40000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
# Spot-check five rows further into the table (positions 1000-1004) instead of the first rows shown by head()
df.iloc[1000:1005]

Unnamed: 0,chr,bin,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed_c,...,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD007_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD007_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD008_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD008_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD012_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD012_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H9_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H9_AD008_indexed_c
1000,1,10000000,0,50,1,27,5,107,1,25,...,0,10,1,52,5,50,0,13,7,133
1001,1,10010000,1,55,0,25,0,25,0,10,...,1,32,4,95,0,9,4,54,1,58
1002,1,10020000,0,47,0,15,0,75,0,38,...,2,51,1,50,1,26,3,63,2,51
1003,1,10030000,0,55,0,19,1,50,0,28,...,0,108,0,58,0,44,1,45,1,113
1004,1,10040000,0,40,1,52,0,31,0,33,...,0,82,1,82,0,20,3,59,1,98


In [18]:
def merge_bins(df, bin_size=10*BIN_SIZE, double_xsize=True, 
               output_file=None):
    """
    Merge bins of BIN_SIZE to n*BIN_SIZE, where n has to be an integer.
    The last incomplete bin for each chromosome is removed.

    Counts of every non-('chr', 'bin') column are summed within each merged
    bin; merged bins that receive no input rows get a count of 0.

    Parameters
    ----------
    df : DataFrame
        Has columns ['chr', 'bin'] (bin start, 0 based) and
        [$sample_mc, $sample_c, ....].
    bin_size : int
        Size of the merged bins.
    double_xsize : bool
        If True, chromosome 'X' is merged into bins of 2*bin_size.
    output_file : str or None
        If given, the result is written there as a tab-separated file,
        compressed with bgzip and renamed to "<output_file>.bgz".

    Returns
    -------
    DataFrame with columns 'chr' (object), 'bin' (int, 0-based start of the
    merged bin) and one int column per input count column, in input order.
    Chromosomes appear in the order produced by df.groupby('chr').

    Raises
    ------
    subprocess.CalledProcessError
        If bgzip exits with a non-zero status.
    """

    value_cols = [col for col in df.columns if col not in ('chr', 'bin')]
    # look the chromosome lengths up once instead of once (or twice) per chromosome
    chrom_lengths = snmcseq_utils.get_chrom_lengths_mouse()

    # Per-chromosome pieces are collected and concatenated once at the end.
    # The empty seed arrays keep the final concatenation valid for an empty df.
    chr_labels = []
    bin_starts = [np.empty(0, dtype=int)]
    value_blocks = [np.empty((0, len(value_cols)), dtype=int)]

    for chromosome, df_sub in df.groupby('chr'):
        step = 2*bin_size if (double_xsize and chromosome == 'X') else bin_size

        # here -1 is very important!
        # pd.cut intervals are right-closed, (lo, hi], so shifting the edges
        # by -1 makes each interval cover the 0-based starts
        # k*step ... (k+1)*step - 1.
        edges = np.arange(0, chrom_lengths[chromosome], step) - 1

        # Sum only the count columns; 'chr' and 'bin' are rebuilt below.
        # NOTE(review): this relies on groupby keeping empty categories
        # (one output row per interval, i.e. observed=False behaviour).
        merged = (df_sub[value_cols]
                  .groupby(pd.cut(df_sub['bin'], edges))
                  .sum()
                  .fillna(0))

        chr_labels.extend([chromosome] * (len(edges) - 1))
        bin_starts.append((edges + 1)[:-1])  # +1 to restore 0-based
        value_blocks.append(merged.values)

    # binc
    binc = pd.DataFrame(np.concatenate(value_blocks, axis=0).astype(int),
                        columns=value_cols)
    binc.insert(0, 'bin', np.concatenate(bin_starts).astype(int))
    binc.insert(0, 'chr', np.asarray(chr_labels, dtype=object))

    if output_file:
        binc.to_csv(output_file, na_rep='NA', sep="\t", header=True, index=False)
        # compress and name them .bgz
        # Argument list (no shell) so paths with spaces or shell
        # metacharacters are safe; check=True so a failed compression raises
        # instead of being logged as success.
        sp.run(['bgzip', '-f', output_file], check=True)
        os.replace('{}.gz'.format(output_file), '{}.bgz'.format(output_file))
        log.info("Done with binc processing, saving results to: {}.bgz".format(output_file))

    return binc

    

In [19]:
# 10kb bins to 100kb bins

# destination of the merged table; merge_bins bgzips it and renames it to "<output_file>.bgz"
output_file = os.path.join(ens_path, 'binc/binc_100kb_test.tsv')

# time the merge (including writing the output file); elapsed seconds go to the log
ti = time.time()
df_bin = merge_bins(df, bin_size=100000, double_xsize=True, output_file=output_file)
print(df_bin.shape)
tf = time.time()
log.info(tf - ti)


01/30/2018 05:18:10 PM Done with binc processing, saving results to: /cndd/Public_Datasets/CEMBA/Ensembles/Ens1/binc/binc_100kb_test.tsv.bgz
01/30/2018 05:18:10 PM 503.4617955684662


(25475, 2406)


In [14]:
# Spot-check ten rows (positions 1000-1009) of the merged 100kb table
df_bin.iloc[1000:1010]

Unnamed: 0,chr,bin,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD001_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD002_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD004_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_1_CEMBA171206_3C_3_A10_AD007_indexed_c,...,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD007_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD007_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H6_AD008_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD008_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD012_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H7_AD012_indexed_c,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H9_AD008_indexed_mc,171213_CEMBA_mm_P56_P63_3C_MOp_CEMBA171206_3C_4_CEMBA171206_3C_5_H9_AD008_indexed_c
1000,1,100000000,31,584,45,577,21,980,19,335,...,10,342,45,517,25,480,32,328,24,611
1001,1,100100000,18,347,40,601,12,412,26,589,...,18,366,44,713,33,483,54,623,39,892
1002,1,100200000,37,383,55,569,11,345,17,323,...,6,93,24,494,20,388,26,420,39,726
1003,1,100300000,30,528,34,425,10,400,13,425,...,1,57,23,578,23,460,31,536,51,931
1004,1,100400000,27,549,36,438,27,571,23,510,...,19,511,26,602,22,463,24,476,26,640
1005,1,100500000,17,467,34,399,17,555,23,437,...,10,579,24,423,13,286,23,504,30,970
1006,1,100600000,25,432,31,360,22,480,14,368,...,17,433,28,519,23,427,21,380,31,724
1007,1,100700000,33,513,29,509,16,568,37,583,...,7,173,39,640,26,353,21,488,26,658
1008,1,100800000,14,388,39,462,17,418,20,517,...,9,225,41,419,39,607,16,275,46,971
1009,1,100900000,22,391,33,570,7,459,34,679,...,15,353,27,648,29,457,20,264,54,1243


In [20]:
# runtime of the merge reported in the log above (~503 s), converted to minutes
503/60

8.383333333333333