In [None]:
import queue 

from __init__ import *
import snmcseq_utils
from CEMBA_update_mysql import connect_sql

In [None]:
pd.__version__

In [None]:
# get a clustering result
ens = 'Ens7'
ens_path = os.path.join(PATH_ENSEMBLES, ens)
 
database = 'CEMBA'
engine = connect_sql(database)
sql = '''SELECT cell_name, dataset, cluster_mCHmCG_lv_npc50_k30 
        FROM cells 
        RIGHT JOIN {} 
        ON cells.cell_id = {}.cell_id'''.format(ens, ens)
df_cluster = pd.read_sql(sql, engine, index_col='cell_name')
df_cluster.columns = ['dataset', 'cluster'] 
print(df_cluster.shape)
df_cluster.head()

In [9]:
def encode_allc_chrom(chrom):
    """give every chromosome an integer name to facilitate sorting (as CEMBA order)
    """
    trans_dict={'L': -4,
                'M': -3, 
                'X': -2, 
                'Y': -1, 
                }
    try:
        chrom = int(chrom)
    except:
        chrom = trans_dict[chrom]
    return chrom


In [16]:
# def merge_allc(allc_paths, context='CG', chunksize=100000):
#     """Merge allc tables given the allc_files
#     Allc files are assumed to have CEMBA format (no header, all chromosomes in one file, and bgzipped)
#     """
#     iter_allcs = [snmcseq_utils.read_allc_CEMBA(allc_path, chunksize=chunksize)
#                for allc_path in allc_paths]
    
#     merged_cks = []
#     i = 0
#     ti = time.time()
#     while True:
#         print(".", end='')
        
#         i += 1
#         if i%5 == 0:
#             print(i)
#             print(time.time() - ti)
#             ti = time.time()
        
#         dfs_ck = []
#         empty = True
#         # iterate over all iterators once
#         for iter_allc in iter_allcs:
#             try:
#                 df_ck = next(iter_allc)
#                 df_ck = df_ck.loc[df_ck.context.isin(snmcseq_utils.get_expanded_context(context)), ['mc', 'c']]
#                 dfs_ck.append(df_ck)
#                 empty = False
#             except:
#                 pass

#         if empty:
#             break # end the while-loop 
#         else:
#             # merge and append merged chunk
#             merged_ck = pd.concat(dfs_ck).groupby(['chr', 'pos']).sum()
#             merged_cks.append(merged_ck)
#     return merged_cks 

def merge_allc_v2(allc_paths, context='CG', chunksize=100000):
    """Merge allc tables given the allc_files
    Allc files are assumed to have CEMBA format (no header, all chromosomes in one file, and bgzipped)
    """
    iter_allcs = [snmcseq_utils.read_allc_CEMBA(allc_path, chunksize=chunksize, pindex=False)
               for allc_path in allc_paths]
    
    merged_cks = []
    i = 0
    ti = time.time()
    while True:
        print(".", end='')
        
        i += 1
        if i%5 == 0:
            print(i)
            print(time.time() - ti)
            ti = time.time()
        
        dfs_ck = []
        empty = True
        # read phase 
        # iterate over all iterators once
        for iter_allc in iter_allcs:
            try:
                df_ck = next(iter_allc)
                df_ck = df_ck.loc[df_ck.context.isin(snmcseq_utils.get_expanded_context(context)), 
                                  ['chr', 'pos', 'mc', 'c']]
                dfs_ck.append(df_ck)
                empty = False
            except:
                pass

        if empty: # end the while-loop
            break  
        else: # concat and merge phase
            # concat
            df_ck = pd.concat(dfs_ck)
            df_ck['chr_code'] = df_ck['chr'].apply(encode_allc_chrom)
            df_ck = df_ck.set_index(['chr_code', 'pos'])[['mc', 'c']]
            
            # merge 
            merged_ck = df_ck.groupby(['chr_code', 'pos']).sum()
            
            # enqueue
            merged_cks.append(merged_ck)
    return merged_cks 

def merge_allc_v3(allc_paths, context='CG', chunksize=100000):
    """Merge allc tables given the allc_files
    Allc files are assumed to have CEMBA format (no header, all chromosomes in one file, and bgzipped)
    """
    iter_allcs = [snmcseq_utils.read_allc_CEMBA(allc_path, chunksize=chunksize, pindex=False)
               for allc_path in allc_paths]
    
    merged_cks = []
    i = 0
    ti = time.time()
    
    
    while True: # iterate over all iterators once 
                # end while loop if empty for all of them 
        print(".", end='')
        
        i += 1
        if i%5 == 0:
            print(i)
            print(time.time() - ti)
            ti = time.time()
        
        
        
        # establish queue by iterating over all file once
        q = queue.Queue()
        # read phase  (50 at a time)
        j = 0
        tj = time.time()
        for iter_allc_ck in snmcseq_utils.chunks(iter_allcs, 50):
            
            print("-", end='')

            j += 1
            if j%5 == 0:
                print(j)
                print(time.time() - tj)
                tj = time.time()
            
            
            
            # load 50  
            dfs_ck = []
            for iter_allc in iter_allc_ck:
                try:
                    df_ck = next(iter_allc)
                    df_ck = df_ck.loc[df_ck.context.isin(snmcseq_utils.get_expanded_context(context)), 
                                      ['chr', 'pos', 'mc', 'c']]
                    dfs_ck.append(df_ck)
#                     empty = False
                    empty_in = False
                except:
                    pass
                
            if not dfs_ck:
                pass
            else: # concat and merge phase
                df_ck = pd.concat(dfs_ck)
                df_ck['chr_code'] = df_ck['chr'].apply(encode_allc_chrom)
                df_ck = df_ck.set_index(['chr_code', 'pos'])[['mc', 'c']]

                # merge 
                merged_ck_tmp = df_ck.groupby(['chr_code', 'pos']).sum()

                # enqueue
                q.put(merged_ck_tmp)
            
            
        # dequeue
        if q.empty(): # end the while-loop
            break  
        else: # concat and merge phase (second merge)
            merged_ck = queue_merge(q, 20)
            
            # enqueue
            merged_cks.append(merged_ck)
            
    return merged_cks 


def queue_merge(q, n_chunk):
    """Queue merge
    
    Arguments: q (a queue object with dataframes)
    """
    i = 0
    # get n_chunk out if not empty
    dfs = [q.get() for i in range(n_chunk) if not q.empty()]
    ti = time.time()
    while not q.empty():
        i += 1
        print('.', end='')
        if i%10==0:
            print(i, time.time()-ti)
            ti = time.time()

        # merge them and put back in queue
        df = pd.concat(dfs).groupby(['chr_code', 'pos']).sum()
        q.put(df)

        # get n_chunk out if not empty
        dfs = [q.get() for i in range(n_chunk) if not q.empty()]

    # merge them 
    df_final = pd.concat(dfs).groupby(['chr_code', 'pos']).sum()
    
    return df_final

In [13]:
# group allcs for each cluster_id
for cluster_id, df_sub in df_cluster.groupby('cluster'):
#     print(cluster_id)
#     allc_paths = [os.path.join(PATH_DATASETS, '{}/allc/allc_{}.tsv.bgz').format(dataset, cell) 
#                       for (dataset, cell) in zip(df_sub.dataset, df_sub.index)]
        
#     print(len(allc_paths))
    
    if cluster_id == 1:
        allc_paths = [os.path.join(PATH_DATASETS, '{}/allc/allc_{}.tsv.bgz').format(dataset, cell) 
                      for (dataset, cell) in zip(df_sub.dataset, df_sub.index)]
        
        n_files = len(allc_paths)
        print(len(allc_paths))
        
        ti = time.time()
#         chunksize = int(50*100000/n_files)
        chunksize = 100000
        merged_cks = merge_allc_v3(allc_paths, context='CG', chunksize=chunksize)
        tf = time.time()

print(tf-ti)

445
.-----5
15.946000576019287
----.-----5
14.444870471954346
----.-----5
14.13407826423645
----.-----5
14.199315071105957
----.5
132.86358499526978
-----5
13.857809066772461
----.-----5
13.86664366722107
----.-----5
13.539661169052124
----.-----5
13.706116914749146
----.-----5
12.152093410491943
----.10
153.09321475028992
-----5
12.337215185165405
----.-----5
13.327457427978516
----.-----5
13.339228868484497
----.-----5
12.430927276611328
----.-----5
10.946303844451904
----.15
145.71687173843384
-----5
13.503376007080078
----.-----5
13.121249198913574
----.-----5
12.668793439865112
----.-----5
13.060347080230713
----.-----5
12.32868480682373
----.20
151.51914930343628
-----5
11.372429132461548
----.-----5
13.015902042388916
----.-----5
12.85295581817627
----.-----5
11.5370774269104
----.-----5
11.176199913024902
----.25
140.1683053970337
-----5
12.992091178894043
----.-----5
11.959651947021484
----.-----5
12.346632719039917
----.-----5
12.27901029586792
----.-----5
11.783691644668579


----.235
136.57371950149536
-----5
12.608751058578491
----.-----5
12.51162314414978
----.-----5
10.778774976730347
----.-----5
11.360918283462524
----.-----5
12.0990731716156
----.240
138.46681308746338
-----5
10.721327304840088
----.-----5
10.866297006607056
----.-----5
10.710044622421265
----.-----5
10.83768916130066
----.-----5
10.965251684188843
----.245
127.63300657272339
-----5
10.994908809661865
----.-----5
10.97047734260559
----.-----5
11.059962034225464
----.-----5
11.045299053192139
----.-----5
11.10091757774353
----.250
129.4646372795105
-----5
10.685959100723267
----.-----5
10.671967029571533
----.-----5
10.677012920379639
----.-----5
10.72856879234314
----.-----5
10.644976139068604
----.255
126.3274974822998
-----5
10.677276134490967
----.-----5
10.6582612991333
----.-----5
10.882297277450562
----.-----5
10.689499378204346
----.-----5
10.59207558631897
----.260
125.81151223182678
-----5
10.593566656112671
----.-----5
11.071518421173096
----.-----5
10.665956974029541
----.-

----.-----5
3.6755411624908447
----.-----5
3.5666983127593994
----.-----5
3.480149269104004
----.-----5
3.4850170612335205
----.475
53.58407282829285
-----5
3.5108137130737305
----.-----5
3.4782512187957764
----.-----5
3.4206132888793945
----.-----5
3.5589635372161865
----.-----5
3.701361894607544
----.480
51.77638912200928
-----5
3.4293923377990723
----.-----5
3.3666398525238037
----.-----5
3.297206163406372
----.-----5
3.3577258586883545
----.-----5
3.28932523727417
----.485
49.95013761520386
-----5
3.2535688877105713
----.-----5
3.223402738571167
----.-----5
3.167501211166382
----.-----5
3.173450469970703
----.-----5
3.1097216606140137
----.490
47.636120080947876
-----5
2.9503636360168457
----.-----5
2.9175403118133545
----.-----5
2.9318301677703857
----.-----5
2.952453136444092
----.-----5
2.9365737438201904
----.495
45.30369281768799
-----5
2.934861183166504
----.-----5
2.9533257484436035
----.-----5
2.983452081680298
----.-----5
2.9659619331359863
----.-----5
2.976454973220825
--

----.-----5
0.12610268592834473
----.-----5
0.125640869140625
----.705
2.8081002235412598
-----5
0.12913155555725098
----.-----5
0.12617063522338867
----.-----5
0.12489795684814453
----.-----5
0.12864089012145996
----.-----5
0.1273207664489746
----.710
2.4040415287017822
-----5
0.1295456886291504
----.-----5
0.12819933891296387
----.-----5
0.12867116928100586
----.-----5
0.12633705139160156
----.-----5
0.1297597885131836
----.715
2.324474811553955
-----5
0.13122081756591797
----.-----5
0.13022208213806152
----.-----5
0.1340494155883789
----.-----5
0.13288211822509766
----.-----5
0.12923097610473633
----.720
2.347007989883423
-----5
0.1318957805633545
----.-----5
0.13142657279968262
----.-----5
0.13207030296325684
----.-----5
0.1312882900238037
----.-----5
0.13283705711364746
----.725
2.353078842163086
-----5
0.13333916664123535
----.-----5
0.12756609916687012
----.-----5
0.12849020957946777
----.-----5
0.1253964900970459
----.-----5
0.12873601913452148
----.730
2.1374199390411377
-----

----.-----5
0.0018270015716552734
----.925
1.0108554363250732
-----5
0.0017960071563720703
----.-----5
0.0017838478088378906
----.-----5
0.0017142295837402344
----.-----5
0.0018546581268310547
----.-----5
0.001966238021850586
----.930
1.0066943168640137
-----5
0.001730203628540039
----.-----5
0.0018270015716552734
----.-----5
0.001714468002319336
----.-----5
0.001953601837158203
----.-----5
0.0018203258514404297
----.935
1.0167264938354492
-----5
0.0017855167388916016
----.-----5
0.0020668506622314453
----.-----5
0.0018284320831298828
----.-----5
0.001842498779296875
----.-----5
0.0018568038940429688
----.940
1.010538101196289
-----5
0.0018024444580078125
----.-----5
0.0017108917236328125
----.-----5
0.00176239013671875
----.-----5
0.0019538402557373047
----.-----5
0.0019478797912597656
----.945
1.0182380676269531
-----5
0.0017268657684326172
----.-----5
0.001714468002319336
----.-----5
0.00173187255859375
----.-----5
0.0019681453704833984
----.-----5
0.0019631385803222656
----.950
1.0

----.-----5
0.0023398399353027344
----.-----5
0.0023288726806640625
----.1140
0.40579962730407715
-----5
0.002367734909057617
----.-----5
0.0023412704467773438
----.-----5
0.002334117889404297
----.-----5
0.002340555191040039
----.-----5
0.0023055076599121094
----.1145
0.41362953186035156
-----5
0.0024313926696777344
----.-----5
0.002348661422729492
----.-----5
0.0023691654205322266
----.-----5
0.002347230911254883
----.-----5
0.0024251937866210938
----.1150
0.38823533058166504
-----5
0.0024154186248779297
----.-----5
0.002345561981201172
----.-----5
0.0023751258850097656
----.-----5
0.0023446083068847656
----.-----5
0.0023453235626220703
----.1155
0.38515806198120117
-----5
0.0023241043090820312
----.-----5
0.0023775100708007812
----.-----5
0.002357959747314453
----.-----5
0.002353191375732422
----.-----5
0.002351999282836914
----.1160
0.39206862449645996
-----5
0.0023651123046875
----.-----5
0.0023696422576904297
----.-----5
0.0022430419921875
----.-----5
0.0016765594482421875
----.-

In [14]:
print(len(merged_cks))

a = [mck.shape[0] for mck in merged_cks]
sum(a) 

1325


719458627

In [None]:
q = queue.Queue()
for mck in merged_cks:
    q.put(mck)
q.qsize()

n_chunk = 20 
print(n_chunk)

tii = time.time()
df_final = queue_merge(q, n_chunk)

# 2. keep last few (5*n_chunks in queue) and do splitting and combining
# groupby(chrom), append to chromosome specific list and do concat and merge seperately


# 1. first get keys of all c in mm10, play with it and see if they can be easily stored as a dataframe (not easy)






# get n_chunk out if not empty
# dfs = [q.get() for i in range(n_chunk) if not q.empty()]
# while not q.empty():
#     i += 1
#     print('.', end='')
#     if i%10==0:
#         print(i, time.time()-ti)
#         ti = time.time()
    
#     # merge them and put back in queue
#     df = pd.concat(dfs).groupby(['chr_code', 'pos']).sum()
#     q.put(df)
     
#     # get n_chunk out if not empty
#     dfs = [q.get() for i in range(n_chunk) if not q.empty()]
    
# # merge them 
# df_final = pd.concat(dfs).groupby(['chr_code', 'pos']).sum()
print(time.time()-tii)

20
..........10 231.436594247818
..........20 367.7123522758484
..........30 178.97358012199402
..........40 11.897481203079224
..........50 1.6734213829040527
..........60 0.4113740921020508
........

KeyboardInterrupt: 

In [67]:
# check df_final
unique_chr = []
for ind in df_final.index.get_level_values(0):
    if ind not in unique_chr:
        unique_chr.append(ind)
    else:
        pass
print(unique_chr) 

# scale up...
# check total length after phase 1 (insensitive to input number N?)
# setup phase 1 parameters (chunksize)


# add context info back in 
df_final.iloc[1000500:1000505]

[-4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


Unnamed: 0_level_0,Unnamed: 1_level_0,mc,c
chr_code,pos,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,10487025,1,1
-1,10487026,2,2
-1,10487802,0,1
-1,10487949,1,1
-1,10488096,1,1


In [68]:
# 
a = [mck.shape[0] for mck in merged_cks]

total_rows = df_final.shape[0]
total_rows_raw = sum(a) 

total_rows_raw/total_rows

2.38468025501715

In [1]:
n_chunk = 10
tii = time.time()
ti = time.time()
merged_cks_tmp = []
for i, mcks in enumerate(snmcseq_utils.chunks(merged_cks, n_chunk)): 
    print('.', end='')
    if i%10==0:
        print(i)
        print(time.time() - ti)
        ti = time.time()
    tmp = pd.concat(mcks).groupby(['chr', 'pos']).sum()
    merged_cks_tmp.append(tmp)
    
print(time.time()-tii)

NameError: name 'time' is not defined

In [28]:
n_chunk = 10
tii = time.time()
ti = time.time()
merged_cks_tmp2 = []
for i, mcks in enumerate(snmcseq_utils.chunks(merged_cks_tmp, n_chunk)): 
    print('.', end='')
    if i%10==0:
        print(i)
        print(time.time() - ti)
        ti = time.time()
    tmp = pd.concat(mcks).groupby(['chr', 'pos']).sum()
    merged_cks_tmp2.append(tmp)
    
print(time.time()-tii)

.0
0.0002295970916748047
........58.811389207839966


In [31]:
ti = time.time()
merged_final = pd.concat(merged_cks).groupby(['chr', 'pos']).sum()
print(time.time()-ti)

45.11698007583618


In [2]:
print(54040047/33804630)
print(75457653/33804630)
print(80554541/33804630)
print(719458627/33804630)

# group by chromosome (splitting & combining)


1.5985989788972694
2.2321691732759685
2.382944022756646
21.28284282360138


In [8]:
df = snmcseq_utils.read_allc_CEMBA(allc_paths[0])
print(df.shape)
df.head()

  mask |= (ar1 == a)


(47614230, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,strand,context,mc,c,methylated
chr,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L,15,+,CGG,0,1,1
L,23,+,CGC,0,1,1
L,25,+,CTA,0,1,1
L,42,+,CCG,0,1,1
L,43,+,CGG,0,1,1


In [45]:
df_s = df.iloc[:100000]
df_s.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,strand,context,mc,c,methylated
chr,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L,15,+,CGG,0,1,1
L,23,+,CGC,0,1,1
L,25,+,CTA,0,1,1
L,42,+,CCG,0,1,1
L,43,+,CGG,0,1,1


In [13]:
context = 'CG'
df = df.loc[df.context.isin(snmcseq_utils.get_expanded_context(context)), ['mc', 'c']] 

In [48]:
# new_index = [for index in df.index]
# df.reindex()

# new_index = [(allc_chrom_order_CEMBA(chrom), pos) for (chrom, pos) in df_s.index]
new_index = [allc_chrom_order_CEMBA(chrom) for chrom in df_s.index.get_level_values(0)]
df_s.reindex(new_index, level=0)

NotImplementedError: Index._join_level on non-unique index is not implemented

In [38]:
df_test = pd.DataFrame([[1, 2], [2, 3]])

In [39]:
print(type(df_test.index))

<class 'pandas.core.indexes.range.RangeIndex'>
