In [21]:
%load_ext autoreload
%autoreload 2

import numpy as np
import util

import warnings
# Blanket-ignore every warning raised from this point on (no category or module
# filter is given). NOTE(review): this also hides numerical warnings emitted by
# the analysis cells below — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
# Tasic dataset

identities = np.load('../data/tasic/iden-tasic.npy')
# Transposed right after loading so that row i of ss_data pairs with identities[i] below
ss_data = np.load('../data/tasic/npdata-tasic.npy').T

# Pair every row with its cluster identity, then order the pairs by identity
# (stable sort) so rows that share an identity end up contiguous
combined = sorted(
    ((identities[idx], ss_data[idx]) for idx in range(len(identities))),
    key=lambda pair: pair[0],
)
ss_data = np.array([pair[1] for pair in combined])

# Log normalize: log(x + 1)
ss_data = np.log(1 + ss_data)

# Count how many entries carry each identity; identities are truncated to int
# and used directly as list positions, so slot k holds the size of cluster k
cluster_sizes = [0] * int(1 + max(identities))
for label in identities:
    cluster_sizes[int(label)] += 1

# Get compressibility with first 10 PCs
# NOTE(review): reduce_dim is not passed here, so util's default applies — confirm
# that default matches the reduction this cell is meant to perform.
C, avg_inter_tasic, avg_intra_tasic = util.get_compressibility(
    ss_data,
    cluster_sizes,
    10,
    seq=True,
)

In [46]:
# Per-cluster averages first (inter, then intra) ...
for per_cluster in (avg_inter_tasic, avg_intra_tasic):
    print(per_cluster)
# ... followed by their unweighted means, in the same order
for per_cluster in (avg_inter_tasic, avg_intra_tasic):
    print(np.mean(per_cluster))

[1.70394092 2.43997804 1.86283541 2.43339189 1.75709956 2.0138611
 1.69034742 2.32287857]
[ 8.24586712  4.21065732  7.73153907  4.13706243  7.82577833 10.51190154
 10.57733006  2.53632435]
2.028041613807816
6.972057528782857


In [47]:
# Scale each per-cluster average by a pair count before pooling:
#   intra -> size * (size - 1)        (ordered pairs inside the cluster)
#   inter -> size * (total - size)    (pairs with exactly one member in the cluster)
# The factors are multiplied in the order avg * size * count so the floating-point
# result is the same as a plain left-to-right product.
tasic_total = sum(cluster_sizes)
intra_compress = [
    avg * cluster_sizes[k] * (cluster_sizes[k] - 1)
    for k, avg in enumerate(avg_intra_tasic)
]
inter_compress = [
    avg * cluster_sizes[k] * (tasic_total - cluster_sizes[k])
    for k, avg in enumerate(avg_inter_tasic)
]

In [48]:
# Pair counts per cluster: comb holds size * (size - 1) (within-cluster pairs),
# comb2 holds size * (total - size) (pairs reaching outside the cluster)
tasic_total = sum(cluster_sizes)
comb = [size * (size - 1) for size in cluster_sizes]
comb2 = [size * (tasic_total - size) for size in cluster_sizes]

In [56]:
# Pair-weighted means: summed weighted values divided by summed pair counts,
# inter-cluster first, then intra-cluster
for weighted_values, pair_counts in ((inter_compress, comb2), (intra_compress, comb)):
    print(np.sum(weighted_values) / np.sum(pair_counts))

2.341184247254701
4.179817045377


In [54]:
# Duo et al. datasets
# dspath[i] is the .h5ad file for the dataset named dsname[i]; keep both lists aligned.
dspath = [
    '../data/Koh/koh.h5ad',
    '../data/Kumar/kumar.h5ad',
    '../data/simkumar/simkumar4easy.h5ad',
    '../data/simkumar/simkumar4hard.h5ad',
    '../data/simkumar/simkumar8hard.h5ad',
    '../data/Trapnell/trapnell.h5ad',
    '../data/zheng/sce_full_Zhengmix4eq.h5ad',
    '../data/zheng/sce_full_Zhengmix4uneq.h5ad',
    '../data/zheng/sce_full_Zhengmix8eq.h5ad',
]
dsname = [
    'Koh',
    'Kumar',
    'simkumar4easy',
    'simkumar4hard',
    'simkumar8hard',
    'Trapnell',
    'Zheng4eq',
    'Zheng4uneq',
    'Zheng8eq',
]

def process(data,cs,labels,dimension,fix_ch):
    """Print per-cluster and pair-weighted inter/intra compressibility for one dataset.

    Parameters
    ----------
    data
        Forwarded unchanged as the first argument of ``util.get_compressibility``.
    cs
        Per-cluster counts. ``cs[i]`` is multiplied with the i-th per-cluster
        average, so it must be indexable and at least as long as the returned
        averages.
    labels
        Not used by this function; kept so existing callers keep working.
    dimension
        Forwarded as the third positional argument of ``util.get_compressibility``
        (called with ``reduce_dim=True, seq=True``).
    fix_ch
        Not used by this function; kept so existing callers keep working.

    Prints, in order: the per-cluster inter averages, the per-cluster intra
    averages, their unweighted means (inter, intra), and then the pair-weighted
    means (inter, intra).

    Returns
    -------
    None
    """
    # The first returned value is not needed here.
    _, avg_inter, avg_intra = util.get_compressibility(data, cs, dimension, reduce_dim=True, seq=True)

    print(avg_inter)
    print(avg_intra)
    print(np.mean(avg_inter))
    print(np.mean(avg_intra))

    # Loop-invariant: compute the grand total once instead of once per cluster.
    total = sum(cs)

    # Weights: ordered pairs inside a cluster, and pairs with exactly one member
    # inside the cluster.
    intra_pairs = [n * (n - 1) for n in cs]
    inter_pairs = [n * (total - n) for n in cs]

    # Multiply as avg * size * count (left to right) so the printed values stay
    # bit-identical to the straightforward per-element product.
    intra_compress = [avg_intra[i] * cs[i] * (cs[i] - 1) for i in range(len(avg_intra))]
    inter_compress = [avg_inter[i] * cs[i] * (total - cs[i]) for i in range(len(avg_inter))]

    print(np.sum(inter_compress) / np.sum(inter_pairs))
    print(np.sum(intra_compress) / np.sum(intra_pairs))

In [55]:
# Run the compressibility report for every dataset listed in dsname.
# Iterating over dsname itself (rather than a hard-coded count) keeps the loop
# in step with the list if datasets are added or removed.
for i in range(len(dsname)):
    data, cs, labels = util.initiate(fix_ch=i, dsname=dsname, dspath=dspath)
    # The number of clusters is passed to process as its `dimension` argument
    k = len(cs)
    print(dsname[i])
    process(data,cs,labels,k,i)
    print()

Koh
[2.38896511 2.49758144 2.65013279 2.47731001 2.48533099 2.89492615
 2.32211225 2.60024809 2.00142873]
[7.3149614  9.17370346 6.40218809 5.76634565 6.14379324 8.02608899
 6.34435389 6.99869275 7.06631549]
2.479781728670819
7.026271441008337
2.4591391070288227
6.834903059882185

Kumar
[2.01377167 1.9041688  1.51138835]
[10.94612409  8.73087202 10.34746097]
1.8097762751465822
10.008152359782665
1.8213756365746636
9.850538074623607

simkumar4easy
[3.39515966 3.30827046 3.07721169 2.80944844]
[14.09592463 14.59219633 13.54697101 13.2656434 ]
3.147522560576424
13.87518384237309
3.07835491307922
13.568106243859708

simkumar4hard
[4.92631478 4.53597503 4.66258449 4.3331671 ]
[12.47330674 14.27119879 12.32181331 11.77220617]
4.614510346576734
12.709631251632945
4.618550315210353
12.329185164625859

simkumar8hard
[4.31808964 4.45923483 4.24690334 3.96011005 4.26250373 3.93207221
 3.92825694 3.6124513 ]
[8.49811361 9.59935225 8.01092148 7.66473021 8.43577178 8.70695602
 7.96468426 9.89330056]