### This script tests different ways of iterating through all sample pairs and determining whether or not they are in the same cluster
### Tests 4 and 5 (the cells that fill the NumPy arrays `pws_same_4` and `pws_same_5`) are the fastest methods and take essentially the same time

In [1]:
import pandas as pd
import MarineDNA as md
import plotly.express as px
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
import itertools

In [2]:
# 16S ASV count table: tab-separated, first column used as the row index.
file1 = "../../../Data/Flyer2018_16S_table_counts.tsv"
asvs_16S = pd.read_csv(file1, sep="\t", index_col=0)

# 18S ASV count table, read with the same options.
file2 = "../../../Data/Flyer2018_18S_table_counts.tsv"
asvs_18S = pd.read_csv(file2, sep="\t", index_col=0)

In [3]:
# n_clust is handed to md.doClustering (presumably the number of clusters);
# n_rep is how many times the random clustering is repeated.
n_clust, n_rep = 4, 10

def isSameCluster(pws, df, col):
    """Tell whether two rows of ``df`` hold the same value in one column.

    Args:
        pws: Pair of positional row indices; only ``pws[0]`` and ``pws[1]``
            are read.
        df: Object indexed positionally through ``.iloc[row, col]``.
        col: Positional column index.

    Returns:
        The result of ``==`` between the two selected cells.
    """
    first_row = pws[0]
    second_row = pws[1]
    return df.iloc[first_row, col] == df.iloc[second_row, col]
    
def maxSame(row):
    """Return how often the most frequent value occurs in ``row``.

    Args:
        row: Object exposing ``value_counts()`` (e.g. a pandas Series).

    Returns:
        The largest entry of ``row.value_counts()``.
    """
    counts = row.value_counts()
    return counts.max()
    
# Repeat the clustering n_rep times; every repetition clusters a fresh
# md.ranRelPct(asvs_16S) draw (a random sample of logit(relative percentages)).
cluster_samples = [
    md.doClustering(md.ranRelPct(asvs_16S), n_clust)
    for _ in range(n_rep)
]

In [6]:
%%time
# Convert the list of clustering runs into a DataFrame and transpose it so
# that each run becomes one column (the pairwise cells below iterate over
# pairs of rows and over n_rep columns).
# The isinstance guard keeps this cell idempotent: without it, executing the
# cell a second time transposes the frame back and the pairwise cells would
# then compare the wrong axis.
if not isinstance(cluster_samples, pd.DataFrame):
    cluster_samples = pd.DataFrame(cluster_samples).transpose()

CPU times: user 507 µs, sys: 48 µs, total: 555 µs
Wall time: 559 µs


In [19]:
%%time
# Test 1: nested list comprehension fed directly from the combinations
# iterator. Each output row is one unordered pair of cluster_samples rows;
# each output column corresponds to one column of cluster_samples.
pws_same_1 = pd.DataFrame(
    [
        [
            isSameCluster(row_pair, cluster_samples, rep)
            for rep in range(cluster_samples.shape[1])
        ]
        for row_pair in itertools.combinations(range(cluster_samples.shape[0]), 2)
    ]
)
pws_same_1.head(3)

CPU times: user 103 ms, sys: 3.93 ms, total: 107 ms
Wall time: 106 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,True,False,False,False,False,False,True,False,False,False,...,False,True,False,True,False,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,False,False,False,False,False,True,False,False,False,...,False,True,False,True,False,True,True,True,True,True


In [13]:
%%time
# Test 2: same nested comprehension as test 1, but the unique pairs of row
# positions are materialized into a list before the DataFrame is built.
pws_rows = list(itertools.combinations(range(cluster_samples.shape[0]), 2))
pws_same_2 = pd.DataFrame(
    [
        [
            isSameCluster(row_pair, cluster_samples, rep)
            for rep in range(cluster_samples.shape[1])
        ]
        for row_pair in pws_rows
    ]
)
pws_same_2.head(3)

CPU times: user 96.1 ms, sys: 75 µs, total: 96.2 ms
Wall time: 95 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,True,False,False,False,False,False,True,False,False,False,...,False,True,False,True,False,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,False,False,False,False,False,True,False,False,False,...,False,True,False,True,False,True,True,True,True,True


In [20]:
%%time
# Test 3: pre-allocate an empty DataFrame (one row per pair of row positions,
# one column per clustering run) and fill it cell by cell through .iloc.
pws_rows = list(itertools.combinations(range(cluster_samples.shape[0]), 2))
pws_same_3 = pd.DataFrame(index=range(len(pws_rows)), columns=range(n_rep))
for c in range(n_rep):
    # Column-major fill: finish every pair for run c before moving on.
    for r, pair in enumerate(pws_rows):
        pws_same_3.iloc[r, c] = isSameCluster(pair, cluster_samples, c)
pws_same_3.head(3)

CPU times: user 41.9 ms, sys: 156 µs, total: 42 ms
Wall time: 41.3 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,True,False,False,False,False,False,True,False,False,False
1,True,True,True,True,True,True,True,True,True,True
2,True,False,False,False,False,False,True,False,False,False


In [29]:
%%timeit
# Test 4: pre-allocate a NumPy array (one row per pair of row positions, one
# column per clustering run) and fill it cell by cell via isSameCluster.
*pws_rows, = itertools.combinations(range(cluster_samples.shape[0]), 2)
# dtype=bool: np.empty defaults to float64, which would store the flags as
# 0.0/1.0 instead of True/False like tests 1-3 do.
pws_same_4 = np.empty([len(pws_rows), n_rep], dtype=bool)
for c in range(pws_same_4.shape[1]):
    for r in range(pws_same_4.shape[0]):
        pws_same_4[r, c] = isSameCluster(pws_rows[r], cluster_samples, c)
pd.DataFrame(pws_same_4).head(3)

9.95 ms ± 56.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# NOTE(review): pws_same_4 is only assigned inside the %%timeit cell above.
# IPython does not appear to keep names assigned under %%timeit, so this
# lookup likely raises NameError unless the array is also built in a regular
# cell -- confirm before relying on it.
pws_same_4

In [30]:
%%timeit
# Test 5: same pre-allocated NumPy fill as test 4, but with the comparison
# written inline instead of going through isSameCluster.
*pws_rows, = itertools.combinations(range(cluster_samples.shape[0]), 2)
# dtype=bool: np.empty defaults to float64, which would store the flags as
# 0.0/1.0 instead of True/False like tests 1-3 do.
pws_same_5 = np.empty([len(pws_rows), n_rep], dtype=bool)
for c in range(pws_same_5.shape[1]):
    for r in range(pws_same_5.shape[0]):
        # Compare the labels of the two rows of pair r in clustering run c.
        pws_same_5[r, c] = cluster_samples.iloc[pws_rows[r][0], c] == cluster_samples.iloc[pws_rows[r][1], c]
pd.DataFrame(pws_same_5).head(3)

9.68 ms ± 272 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
