In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
import glob
import os



In [2]:
# Cluster annotation labels, grouped by the broad class assigned to each.
excitatory_types = [
    'hDL-1', 'hDL-2', 'hDL-3', 'hL2/3', 'hL4',
    'hL5-1', 'hL5-2', 'hL5-3', 'hL5-4',
    'hL6-1', 'hL6-2', 'hL6-3',
]
inhibitory_types = [
    'hNdnf', 'hNos', 'hPv-1', 'hPv-2',
    'hSst-1', 'hSst-2', 'hSst-3', 'hVip-1', 'hVip-2',
]
# NOTE(review): 'unlabeled' is mapped to the 'glia' class -- confirm this is intended.
other_types = ['unlabeled']

# Parallel lists: cell_classes[i] is the class of cell_types[i].
cell_types = excitatory_types + inhibitory_types + other_types
cell_classes = (
    ['exci'] * len(excitatory_types)
    + ['inhi'] * len(inhibitory_types)
    + ['glia'] * len(other_types)
)

# Lookup table indexed by cell type with a single 'cell_class' column.
df_cell_class = pd.DataFrame(
    {'cell_type': cell_types, 'cell_class': cell_classes}
).set_index('cell_type')
df_cell_class

Unnamed: 0_level_0,cell_class
cell_type,Unnamed: 1_level_1
hDL-1,exci
hDL-2,exci
hDL-3,exci
hL2/3,exci
hL4,exci
hL5-1,exci
hL5-2,exci
hL5-3,exci
hL5-4,exci
hL6-1,exci


In [3]:
# Input tables, relative to the notebook's working directory.
meta_file = './data/metadata/metadata_human_combined_updated.tsv'
cluster_file = './data/cluster/cluster_MB_v1_MB_EA_MB_EB/clusters_v1_binc_mCH_louvain.tsv'
tsne_file = './data/tsne/tsne_perp30_binc_mCH_human_combined_100000_summary_nmcc_v3.tsv'
gene_id_name_file = './data/references/gene_id_to_names.tsv'
annotation_file = './data/cluster/cluster_MB_v1_MB_EA_MB_EB/clusters_v1_binc_mCH_louvain_annotation.tsv'

df_meta = pd.read_table(meta_file, index_col='Sample')
df_tsne = pd.read_table(tsne_file, index_col='sample')
df_gene = pd.read_table(gene_id_name_file, index_col='geneID')

# Missing values in the annotation table are replaced by the label 'unlabeled'.
df_annot = pd.read_table(annotation_file, index_col='cluster_ID')
df_annot = df_annot.fillna('unlabeled')

# Per-sample cluster assignment, extended with the cluster annotation and then
# with the cell class of that annotation. Both merges use the default inner
# join, so rows without a match are dropped.
df_cluster = (
    pd.read_table(cluster_file, index_col='sample')
    .merge(df_annot, left_on='cluster_ID', right_index=True)
    .merge(df_cell_class, left_on='cluster_annotation', right_index=True)
)

for loaded in (df_meta, df_tsne, df_cluster, df_gene):
    print(loaded.shape)

# One row per sample: metadata + cluster info + t-SNE columns, joined on the index.
df_info = df_meta.merge(df_cluster, left_index=True, right_index=True)
df_info = df_info.merge(df_tsne, left_index=True, right_index=True)
print(df_info.shape)

(6435, 16)
(6435, 2)
(6435, 3)
(57820, 5)
(6435, 21)


In [4]:
# Preview the first five rows of the merged per-sample table.
df_info.head(n=5)

Unnamed: 0,Library pool,Layer,Total reads,Mapped reads,Mapping rate,Nonclonal reads,Nonclonal rates,Lambda mC/C,mCCC/CCC,mCG/CG,...,Estimated mCG/CG,Estimated mCH/CH,% Genome covered,Biosample,allc file location (Mukamel lab),cluster_ID,cluster_annotation,cell_class,tsne_x,tsne_y
170508_MB_EA_hs_58yr_BA10_pool_1873_AD006_indexed,U,Deep,4222180.0,2709090.0,64.2%,1539515.0,56.8%,,0.00757,0.82011,...,0.81874,0.03269,5.375979,MB_EA,,cluster_5,hPv-1,inhi,-7.209526,52.020226
170508_MB_EA_hs_58yr_BA10_pool_1874_AD010_indexed,U,Deep,6158974.0,4073199.0,66.1%,2184050.0,53.6%,,0.00833,0.79886,...,0.79717,0.03729,7.390827,MB_EA,,cluster_7,hL4,exci,5.325146,-6.188778
170508_MB_EA_hs_58yr_BA10_pool_1875_AD010_indexed,U,Deep,4494592.0,2906561.0,64.7%,1597297.0,55.0%,,0.0048,0.77485,...,0.77376,0.01248,5.492284,MB_EA,,cluster_9,unlabeled,glia,57.501087,10.186932
170508_MB_EA_hs_58yr_BA10_pool_1876_AD006_indexed,U,Deep,5027606.0,3186947.0,63.4%,1661907.0,52.1%,,0.00382,0.76026,...,0.75934,0.00395,5.600525,MB_EA,,cluster_11,unlabeled,glia,81.862511,-6.417159
170508_MB_EA_hs_58yr_BA10_pool_1876_AD010_indexed,U,Deep,5041710.0,3265710.0,64.8%,1702322.0,52.1%,,0.00388,0.75388,...,0.75292,0.00338,5.719636,MB_EA,,cluster_11,unlabeled,glia,80.223312,-16.956831


In [46]:
# For every biosample, collect the samples annotated as 'hL4':
#   dct[biosample]       -> comma-separated string of sample names
#   dct_files[biosample] -> list of paths formed as path + '/' + sample name
# The same listing is also written to 'test.txt' (biosample on one line,
# its sample names on the next).
dct = OrderedDict()
dct_files = OrderedDict()
path = '/cndd/fangming/snmcseq_dev/data/allc/hs_combined'

hl4_cells = df_info[df_info.cluster_annotation == 'hL4']
with open('test.txt', 'w') as file:
    for biosample, df_bio in hl4_cells.groupby('Biosample'):
        sample_names = df_bio.index.values

        # Build the string BEFORE storing it. Storing first would raise
        # NameError on a fresh kernel, or silently record the previous
        # biosample's samples under the current key on a stale kernel.
        samples = ', '.join(sample_names)
        dct[biosample] = samples

        file.write('{}\n'.format(biosample))
        file.write(samples + '\n')

        dct_files[biosample] = [path + '/' + sample for sample in sample_names]

 

In [51]:
# Gather every '*.tsv.gz' file found directly inside the MB_v1 entries of dct_files.
files = [
    tsv_path
    for sample_dir in dct_files['MB_v1']
    for tsv_path in glob.glob(os.path.join(sample_dir, '*.tsv.gz'))
]
print(len(files))

# Store all paths on a single space-separated line.
# Note that `files` is rebound from a list to that joined string here.
with open('./data/allc_merged/MB_v1_files.txt', 'w') as f:
    files = ' '.join(files)
    f.write(files + '\n')
    

2568


In [20]:
# Three allc tables to combine as a small check.
allcs = [
    '/cndd/fangming/snmcseq_dev/data/allc/hs_combined/171030_MB_EB_hs_25yr_BA10_FCL1_NA_A11_AD008_indexed/allc_171030_MB_EB_hs_25yr_BA10_FCL1_NA_A11_AD008_indexed_22.tsv.gz',
    '/cndd/fangming/snmcseq_dev/data/allc/hs_combined/171030_MB_EB_hs_25yr_BA10_FCU1_NA_B11_AD010_indexed/allc_171030_MB_EB_hs_25yr_BA10_FCU1_NA_B11_AD010_indexed_22.tsv.gz',
    '/cndd/fangming/snmcseq_dev/data/allc/hs_combined/171030_MB_EB_hs_25yr_BA10_FCU1_NA_B12_AD010_indexed/allc_171030_MB_EB_hs_25yr_BA10_FCU1_NA_B12_AD010_indexed_22.tsv.gz',
]

# Load every table, then report each shape and the total number of rows read.
df_list = [pd.read_table(allc) for allc in allcs]
for df in df_list:
    print(df.shape)

n = sum(df.shape[0] for df in df_list)
print(n)

# Stack the tables and sum the columns for each (chr, pos) pair.
pd.concat(df_list).groupby(['chr', 'pos']).sum()


(593679, 7)
(470073, 7)
(611209, 7)
1674961


Unnamed: 0_level_0,Unnamed: 1_level_0,mc_count,total,methylated
chr,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22,16050396,0,1,0
22,16050397,0,1,0
22,16050409,0,1,0
22,16050414,0,1,0
22,16050415,0,1,0
22,16050424,0,1,0
22,16050430,0,1,0
22,16050431,0,1,0
22,16050441,0,1,0
22,16050445,0,1,0


In [1]:
# Scratch check: length of a run of 796 '.' characters, written with string
# repetition so the count is readable instead of a hand-typed literal.
len('.' * 796)

796