In [1]:
import pandas as pd
import sklearn.metrics
import numpy as np
import dit
from functools import wraps

In [2]:
def results_matrix(x, y):
    """Build the contingency table of two labelings of the same items.

    Parameters
    ----------
    x, y : pandas.Series
        Label assignments. ``y`` is looked up by the index labels of ``x``,
        so every label of ``x.index`` must be present in ``y.index``.

    Returns
    -------
    pandas.DataFrame
        Integer counts. Rows are the sorted unique values of ``x``, columns
        the sorted unique values of ``y``; cell ``(i, j)`` is the number of
        items labelled ``i`` in ``x`` and ``j`` in ``y``. Label pairs that
        never occur together are 0.
    """
    row_labels = sorted(x.unique())
    col_labels = sorted(y.unique())
    if len(x) == 0:
        # Nothing to count: return an all-zero table of the right shape.
        return pd.DataFrame(0, index=row_labels, columns=col_labels, dtype=int)

    # Align y to x by index label, then count on plain arrays so that Series
    # names and indexes cannot interfere with the cross-tabulation.
    y_for_x = y.loc[x.index]
    counts = pd.crosstab(np.asarray(x), np.asarray(y_for_x))

    # Guarantee the full, sorted label sets on both axes; missing pairs -> 0.
    counts = counts.reindex(index=row_labels, columns=col_labels, fill_value=0)
    counts.index.name = None
    counts.columns.name = None
    return counts

In [3]:
def van_dongen(x, y):
    """Van Dongen-style distance between two labelings.

    With ``m`` the contingency table from ``results_matrix`` and ``n`` its
    grand total, returns ``2 * n - sum_i max_j m[i, j] - sum_j max_i m[i, j]``.

    Parameters
    ----------
    x, y : pandas.Series
        Label assignments, passed straight to ``results_matrix``.

    Returns
    -------
    float
        0 when every row and every column of the table has a single
        non-zero cell; larger values mean more disagreement.
    """
    # A float array keeps the arithmetic independent of the table's dtype.
    counts = np.asarray(results_matrix(x, y), dtype=float)
    if counts.size == 0:
        return 0.0
    n = counts.sum()
    # Vectorised maxima replace np.sum(<generator>), which NumPy deprecates.
    row_max_total = counts.max(axis=1).sum()
    col_max_total = counts.max(axis=0).sum()
    return 2 * n - row_max_total - col_max_total

In [4]:
def variation_of_information(x, y):
    """Variation of information between two labelings, computed with ``dit``.

    The contingency table from ``results_matrix`` is divided by its grand
    total to give a joint distribution over (x label, y label) pairs. The
    returned value is ``H(X) + H(Y) - 2 * I(X; Y)`` as reported by
    ``dit.shannon``.
    """
    table = results_matrix(x, y)
    total = table.sum().sum()

    # Enumerate every (row label, column label) pair in row-major order.
    outcomes = [(row, col) for row in table.index for col in table.columns]
    probabilities = [table.loc[row, col] / total for row, col in outcomes]

    joint = dit.Distribution(outcomes, pmf=probabilities)
    entropy_x = dit.shannon.entropy(joint, rvs=[0])
    entropy_y = dit.shannon.entropy(joint, rvs=[1])
    shared = dit.shannon.mutual_information(joint, rvs_X=[0], rvs_Y=[1])
    return entropy_x + entropy_y - 2 * shared

In [5]:
def map_score(x, y):
    """Two-sided mapping score between two labelings.

    For every row of the contingency table the ratio
    ``(row total - row maximum) / row maximum`` is accumulated; the sum is
    divided by the number of rows, by ``number of columns - 1`` and by 2.
    The same is done for columns with the roles swapped, and the two halves
    are added.

    Parameters
    ----------
    x, y : pandas.Series
        Label assignments, passed straight to ``results_matrix``.

    Returns
    -------
    float
        0 when every row and every column has all of its mass in one cell.
        A side whose opposite labeling has fewer than two clusters
        contributes 0 instead of dividing by zero.
    """
    counts = np.asarray(results_matrix(x, y), dtype=float)
    n_rows, n_cols = counts.shape

    def _one_sided(table, n_groups, n_other):
        # With fewer than two opposite clusters there is no off-maximum mass,
        # and the original normaliser (n_other - 1) would be zero.
        if n_groups == 0 or n_other < 2:
            return 0.0
        best = table.max(axis=1)
        off_best_ratio = (table.sum(axis=1) - best) / best
        return off_best_ratio.sum() / n_groups / (n_other - 1) / 2

    left_score = _one_sided(counts, n_rows, n_cols)
    right_score = _one_sided(counts.T, n_cols, n_rows)
    return left_score + right_score

In [6]:
# Names of the scikit-learn metrics to evaluate; each is resolved on
# sklearn.metrics below.
# NOTE(review): jaccard_similarity_score was deprecated in scikit-learn 0.21
# and removed in 0.23, so this lookup needs an older scikit-learn -- confirm
# the version this notebook is meant to run against.
MEASURE_NAMES = [
    'adjusted_rand_score',
    'adjusted_mutual_info_score',
    'normalized_mutual_info_score',
    'fowlkes_mallows_score',
    'jaccard_similarity_score',
    'cohen_kappa_score',
]

# All measure callables: the scikit-learn ones first, then the three measures
# defined in this notebook.
MEASURES = [getattr(sklearn.metrics, name) for name in MEASURE_NAMES]
MEASURES.extend([van_dongen, variation_of_information, map_score])

In [7]:
# Load the cluster assignments, name the two columns, and index the table by
# cell identifier.
clusters = pd.read_csv('../01-cluster-sc01-sc02/SC01_assgn.csv')
clusters.columns = ['Cell', 'Cluster']
clusters = clusters.set_index('Cell')

In [8]:
# Number of cells assigned to each cluster.
clusters['Cluster'].value_counts()

0     1255
1      819
2      789
3      721
4      445
5      410
6      311
7      281
8      272
9      214
10     202
11     197
12     186
13     177
14     165
15      89
16      78
17      55
Name: Cluster, dtype: int64

In [9]:
def frac_size(sample, frac):
    """Return how many items a fraction ``frac`` of ``sample`` amounts to.

    The count is ``round(frac * len(sample))``, computed directly rather than
    by drawing a throw-away random sample, so the call is cheap and leaves
    the global random state untouched.

    Parameters
    ----------
    sample : sized collection (e.g. pandas.Series or DataFrame)
        Population whose length is scaled.
    frac : float
        Fraction of the population, between 0 and 1 inclusive.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If ``frac`` is outside ``[0, 1]``.
    """
    if frac < 0 or frac > 1:
        raise ValueError('frac must be between 0 and 1, got {!r}'.format(frac))
    return int(round(frac * len(sample)))

In [10]:
def on_some(func, selector):
    """Decorator factory: run ``func`` on a random subset of selected cells.

    Parameters
    ----------
    func : callable
        Called as ``func(exp, idx)`` with a copy of the labels and the index
        labels of the sampled cells; its return value is passed through.
    selector : callable
        Called as ``selector(exp)``; its result is used to index ``exp`` and
        so defines the population the sample is drawn from.

    Returns
    -------
    callable
        ``inner(clusters, frac=0.1, random_state=None)``. The number of
        sampled cells is ``frac`` of *all* of ``clusters`` (not of the
        selected subset). ``random_state`` is forwarded to pandas' ``sample``
        so a run can be made reproducible.

    Raises
    ------
    ValueError
        From ``inner`` if the selected subset holds fewer cells than the
        requested sample size.
    """
    @wraps(func)
    def inner(clusters, frac=0.1, random_state=None):
        exp = clusters.copy()
        candidates = exp[selector(exp)]
        n_selected = frac_size(clusters, frac)
        if n_selected > len(candidates):
            raise ValueError(
                'cannot sample {} cells from a selection of only {}'.format(
                    n_selected, len(candidates)))
        idx = candidates.sample(n=n_selected, random_state=random_state).index
        return func(exp, idx)
    return inner

def on_all(func):
    """Wrap ``func`` with ``on_some`` so that cells are sampled from all of them."""
    def every_cell(exp):
        # Selecting by the full index keeps every entry.
        return exp.index
    return on_some(func, every_cell)

def on_big(func):
    """Wrap ``func`` with ``on_some``, sampling only cells whose label is below ``BIG``.

    ``BIG`` is a module-level name read when the wrapped function is called,
    so it must be defined before then.
    """
    def below_big_threshold(exp):
        return exp < BIG
    return on_some(func, below_big_threshold)

def on_small(func):
    """Wrap ``func`` with ``on_some``, sampling only cells whose label is above ``SMALL``.

    ``SMALL`` is a module-level name read when the wrapped function is
    called, so it must be defined before then.
    """
    def above_small_threshold(exp):
        return exp > SMALL
    return on_some(func, above_small_threshold)

In [11]:
@on_all
def rename(exp, idx):
    """Shift every label by 18; the sampled ``idx`` is not used."""
    every_cell = exp.index
    exp.loc[every_cell] += 18
    return exp

In [12]:
@on_all
def mistakes_all(exp, idx):
    """Decrease the label of each sampled cell by one, wrapping -1 around to 17."""
    exp.loc[idx] -= 1
    wrapped = exp == -1
    exp.loc[wrapped] = 17
    return exp

In [13]:
@on_big
def mistakes_big(exp, idx):
    """Decrease the label of each sampled cell by one, wrapping -1 around to 17.

    The cells in ``idx`` are drawn from the ``on_big`` selection.
    """
    exp.loc[idx] -= 1
    wrapped = exp == -1
    exp.loc[wrapped] = 17
    return exp

In [14]:
@on_small
def mistakes_small(exp, idx):
    """Decrease the label of each sampled cell by one (no wrap-around is applied).

    The cells in ``idx`` are drawn from the ``on_small`` selection.
    """
    exp.loc[idx] -= 1
    return exp

In [15]:
@on_all
def splits_all(exp, idx):
    """Split clusters by adding 18 to the label of each sampled cell."""
    exp.loc[idx] += 18
    return exp

In [16]:
@on_big
def splits_big(exp, idx):
    """Split clusters by adding 18 to the label of each sampled cell.

    The cells in ``idx`` are drawn from the ``on_big`` selection.
    """
    exp.loc[idx] += 18
    return exp

In [17]:
@on_small
def splits_small(exp, idx):
    """Split clusters by adding 18 to the label of each sampled cell.

    The cells in ``idx`` are drawn from the ``on_small`` selection.
    """
    exp.loc[idx] += 18
    return exp

In [18]:
@on_all
def unites_all(exp, idx):
    """Shift sampled cells by 18, then merge each odd shifted label into the even one below.

    After the shift, every label above 17 that is odd is decreased by one.
    """
    exp.loc[idx] += 18
    shifted = exp > 17
    odd_label = exp % 2 == 1
    exp.loc[shifted & odd_label] -= 1
    return exp

In [19]:
@on_big
def unites_big(exp, idx):
    """Shift sampled cells by 18, then merge each odd shifted label into the even one below.

    After the shift, every label above 17 that is odd is decreased by one.
    The cells in ``idx`` are drawn from the ``on_big`` selection.
    """
    exp.loc[idx] += 18
    shifted = exp > 17
    odd_label = exp % 2 == 1
    exp.loc[shifted & odd_label] -= 1
    return exp

In [20]:
@on_small
def unites_small(exp, idx):
    """Shift sampled cells by 18, then merge each odd shifted label into the even one below.

    After the shift, every label above 17 that is odd is decreased by one.
    The cells in ``idx`` are drawn from the ``on_small`` selection.
    """
    exp.loc[idx] += 18
    shifted = exp > 17
    odd_label = exp % 2 == 1
    exp.loc[shifted & odd_label] -= 1
    return exp

In [21]:
@on_all
def splits_unites_all(exp, idx):
    """Shift sampled cells by 18, then merge only some of the shifted labels.

    After the shift, every label above 17 whose remainder modulo 4 is 1 is
    decreased by one; other shifted labels stay as separate clusters.
    """
    exp.loc[idx] += 18
    shifted = exp > 17
    merge_candidate = exp % 4 == 1
    exp.loc[shifted & merge_candidate] -= 1
    return exp

In [22]:
@on_big
def splits_unites_big(exp, idx):
    """Shift sampled cells by 18, then merge only some of the shifted labels.

    After the shift, every label above 17 whose remainder modulo 4 is 1 is
    decreased by one; other shifted labels stay as separate clusters.
    The cells in ``idx`` are drawn from the ``on_big`` selection.
    """
    exp.loc[idx] += 18
    shifted = exp > 17
    merge_candidate = exp % 4 == 1
    exp.loc[shifted & merge_candidate] -= 1
    return exp

In [23]:
@on_small
def splits_unites_small(exp, idx):
    """Shift sampled cells by 18, then merge only some of the shifted labels.

    After the shift, every label above 17 whose remainder modulo 4 is 1 is
    decreased by one; other shifted labels stay as separate clusters.
    The cells in ``idx`` are drawn from the ``on_small`` selection.
    """
    exp.loc[idx] += 18
    shifted = exp > 17
    merge_candidate = exp % 4 == 1
    exp.loc[shifted & merge_candidate] -= 1
    return exp

In [24]:
# Names of the perturbation functions to run: 'rename' plus every
# <kind>_<scope> combination, kinds in the outer loop so the three scopes of
# one kind stay adjacent.
EXPERIMENTS = ['rename'] + [
    kind + '_' + scope
    for kind in ('mistakes', 'splits', 'unites', 'splits_unites')
    for scope in ('all', 'big', 'small')
]

In [25]:
# Label thresholds read by on_big / on_small when the experiments run:
# labels below BIG count as "big" clusters, labels above SMALL as "small".
BIG = 4
SMALL = 9

# The experiments draw random cells with pandas' .sample(), which uses NumPy's
# global generator when no random_state is given. Seeding it here makes a
# Restart & Run All produce the same perturbed labelings every time.
SEED = 0
np.random.seed(SEED)

for key in EXPERIMENTS:
    # Look the experiment function up by name in the module namespace and
    # store its perturbed labeling as a new column next to the original.
    clusters[key] = globals()[key](clusters.Cluster, frac=0.1)

In [26]:
# One row per measure, one column per experiment: each cell compares the
# original labeling with that experiment's perturbed labeling.
results = pd.DataFrame(index=[fn.__name__ for fn in MEASURES])
for key in EXPERIMENTS:
    results[key] = [score(clusters['Cluster'], clusters[key]) for score in MEASURES]

In [27]:
# Display the score table (rows: measures, columns: experiments).
results

Unnamed: 0,rename,mistakes_all,mistakes_big,mistakes_small,splits_all,splits_big,splits_small,unites_all,unites_big,unites_small,splits_unites_all,splits_unites_big,splits_unites_small
adjusted_rand_score,1.0,0.812389,0.766745,0.971039,0.894717,0.849268,0.987869,0.88597,0.83591,0.979844,0.891815,0.845596,0.983552
adjusted_mutual_info_score,1.0,0.875596,0.90136,0.947183,0.886603,0.908144,0.956253,0.884372,0.906095,0.954997,0.885907,0.90723,0.955659
normalized_mutual_info_score,1.0,0.882788,0.912535,0.951019,0.942559,0.95344,0.978163,0.928727,0.939848,0.964511,0.936662,0.947393,0.970948
fowlkes_mallows_score,1.0,0.830072,0.788722,0.973821,0.907996,0.870009,0.989072,0.899771,0.855995,0.981764,0.905233,0.865787,0.985132
jaccard_similarity_score,0.0,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994,0.89994
cohen_kappa_score,0.0,0.889684,0.88996,0.889359,0.890509,0.891101,0.889687,0.890546,0.891097,0.889689,0.89052,0.89108,0.889689
van_dongen,0.0,1242.0,1150.0,1028.0,667.0,667.0,482.0,953.0,934.0,776.0,793.0,800.0,631.0
variation_of_information,0.0,0.879453,0.659524,0.363636,0.467819,0.372665,0.168166,0.565439,0.469262,0.267209,0.509376,0.415823,0.220653
map_score,0.0,0.0059,0.002801,0.019281,0.001705,0.001206,0.00634,0.008925,0.003326,0.011704,0.004405,0.002469,0.008715
