In [1]:
import pandas as pd
import MarineDNA as md
import seaborn as sn
import numpy as np

In [2]:
# Tab-separated 16S read-count table, located relative to this notebook.
file1 = "../../../Data/Flyer2018_16S_table_counts.tsv"
# The file's first column becomes the row index (used downstream as the ASV labels).
asvs1 = pd.read_csv(file1, sep="\t", index_col=0)

In [3]:
# Return 3-d array of Beta alpha and beta parameters for a data frame of raw read counts
# Output dimensions are (parameters, ASVs, samples)
def betaParams(raw_counts):
    def colParams(col):
        return (col + 1, col.sum() - col + 1)
    return np.stack(
        [colParams(raw_counts.iloc[:, i]) for i in range(raw_counts.shape[1])], 
        axis = 2,
        dtype = np.dtype(np.int64, metadata = {"asvs": asvs1.index, "samples": asvs1.columns})
    )

In [4]:
# Build the Beta parameter array from the loaded count table and display it.
p = betaParams(asvs1)
p

array([[[  553,  7416,  8750, ...,  2016,  1848,  1887],
        [  211,  1934,  2809, ...,  8727,  9331,  8853],
        [  146,  2090,  2531, ...,    23,    29,    45],
        ...,
        [    1,     1,     1, ...,     1,     1,     1],
        [    1,     1,     1, ...,     1,     1,     1],
        [    1,     1,     1, ...,     1,     1,     1]],

       [[ 3411, 33496, 54308, ..., 48232, 56372, 51754],
        [ 3753, 38978, 60249, ..., 41521, 48889, 44788],
        [ 3818, 38822, 60527, ..., 50225, 58191, 53596],
        ...,
        [ 3963, 40911, 63057, ..., 50247, 58219, 53640],
        [ 3963, 40911, 63057, ..., 50247, 58219, 53640],
        [ 3963, 40911, 63057, ..., 50247, 58219, 53640]]])

In [5]:
# Return random draw from Beta distribution of read percentages.
# Output is a data frame with rows = samples and columns = ASVs
def ranRelPct_new(beta_params, asLogOdds = True, rng = None):
    """Draw one random set of per-sample relative read proportions.

    Parameters
    ----------
    beta_params : numpy.ndarray
        Array of shape (2, n_asvs, n_samples); index 0 on the first axis is the
        Beta ``a`` parameter and index 1 the Beta ``b`` parameter. If its dtype
        carries metadata with ``"samples"`` / ``"asvs"`` entries, they label the
        result; otherwise pandas' default labels are used.
    asLogOdds : bool, default True
        If True, return ``log(p / (1 - p))`` of the normalized proportions
        instead of the proportions themselves.
    rng : numpy.random.Generator, optional
        Generator used for the draws, e.g. ``np.random.default_rng(seed)`` for
        reproducible results. When omitted, NumPy's global ``np.random`` state
        is used.

    Returns
    -------
    pandas.DataFrame
        Rows are samples and columns are ASVs (transposed relative to
        ``beta_params``). Before the optional log-odds transform, each row
        sums to 1.
    """
    # Fall back to the module-level generator so np.random.seed() still applies.
    sampler = np.random if rng is None else rng
    # Transposing puts samples on rows, so one call draws every sample at once.
    draws = sampler.beta(beta_params[0].T, beta_params[1].T)
    # Rescale each sample's draws so they sum to 1.
    result = draws / draws.sum(axis = 1, keepdims = True)
    if asLogOdds:
        result = np.log(result / (1 - result))
    # dtype.metadata is None for arrays that were not labelled.
    labels = beta_params.dtype.metadata or {}
    return pd.DataFrame(result, index = labels.get("samples"), columns = labels.get("asvs"))

In [6]:
%%time
# Time one random draw; asLogOdds is left at its default (True), so lo_df holds log-odds.
lo_df = ranRelPct_new(p)
lo_df

CPU times: user 20.1 ms, sys: 1.99 ms, total: 22.1 ms
Wall time: 21.1 ms


ASV ID,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,a140195871278e8fcf9447e42bad8786,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
CN18Fc12_8_eDNA,-2.482497,-3.372453,-3.929155,-3.941854,-3.675396,-4.868287,-7.175893,-4.283818,-3.566674,-4.121777,...,-7.944596,-8.239636,-9.406338,-9.290135,-8.499904,-7.778991,-13.447584,-7.798745,-10.486547,-8.106554
CN18Fc19_5_eDNA,-1.602445,-3.067253,-2.970020,-3.134322,-3.168373,-4.489017,-5.224283,-4.232380,-4.024320,-3.803584,...,-9.916714,-10.331589,-9.850553,-12.125296,-11.071119,-10.760761,-12.647413,-11.865538,-16.161217,-11.390026
CN18Fc21_6_eDNA,-1.871843,-3.128108,-3.222800,-3.196497,-3.584604,-4.431885,-4.588735,-4.002870,-3.707520,-4.001419,...,-12.240816,-12.335745,-10.761990,-11.521752,-9.242597,-11.827339,-10.019897,-13.508473,-14.099467,-11.352880
CN18Fc22_6_eDNA,-1.701029,-3.205109,-3.186796,-3.174771,-3.311810,-4.661550,-4.975694,-4.213150,-4.015982,-4.044792,...,-13.814150,-13.110847,-11.257118,-14.637416,-10.307194,-10.583961,-11.152100,-11.051184,-10.176532,-9.792099
CN18Fc24_6_eDNA,-1.882129,-3.469615,-3.074475,-3.131952,-3.344010,-4.280186,-5.114053,-4.139956,-3.778453,-4.179285,...,-12.118052,-11.075628,-14.459912,-10.417430,-11.051113,-10.496508,-10.595897,-11.454550,-11.098257,-11.176752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CN18SESPkoa_SC42,-3.059620,-1.614692,-6.902313,-5.872030,-6.436036,-4.616366,-3.016618,-5.717092,-6.387763,-5.277537,...,-9.429014,-11.477770,-10.899572,-10.466202,-16.191389,-13.719681,-15.679952,-10.906642,-10.655353,-12.686100
CN18SESPkoa_SC44,-5.904932,-4.688109,-5.208801,-4.820635,-4.680770,-3.224438,-10.749887,-3.503430,-5.396000,-8.178404,...,-9.296137,-9.437723,-8.737774,-9.620677,-9.668309,-8.976626,-8.973744,-7.973343,-8.633349,-9.602706
CN18SESPkoa_SC45,-3.225555,-1.625303,-7.737947,-6.199121,-6.867544,-5.009891,-2.861057,-6.146659,-6.248491,-5.850504,...,-12.181451,-11.065742,-11.704675,-12.204790,-11.517881,-11.430694,-13.612065,-12.539411,-10.893068,-11.801298
CN18SESPkoa_SC47,-3.460759,-1.717478,-7.896068,-6.272470,-6.637340,-5.197082,-3.175606,-6.530200,-6.631821,-5.674820,...,-11.684278,-12.848542,-11.741065,-10.708043,-11.504759,-13.764356,-11.885744,-10.158822,-11.524498,-13.338486
