In [1]:
import pandas as pd
import MarineDNA as md
import seaborn as sn
import numpy as np

In [2]:
# Tab-separated count table; its first column becomes the row index on load.
file1 = "../../../Data/Flyer2018_16S_table_counts.tsv"
# Transpose after loading so that rows are samples and columns are ASVs.
asvs1 = pd.read_csv(file1, sep="\t", index_col=0).T

# The raw data

In [3]:
# Preview the first five rows of the transposed count table.
asvs1.head(n=5)

ASV ID,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,a140195871278e8fcf9447e42bad8786,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
CN18Fc12_8_eDNA,552,210,145,130,156,49,0,89,190,97,...,0,0,0,0,0,0,0,0,0,0
CN18Fc19_5_eDNA,7415,1933,2089,1830,1742,488,234,595,767,918,...,0,0,0,0,0,0,0,0,0,0
CN18Fc21_6_eDNA,8749,2808,2530,2516,1761,787,632,1162,1545,1252,...,0,0,0,0,0,0,0,0,0,0
CN18Fc22_6_eDNA,8152,1967,2086,2178,1855,510,353,750,988,904,...,0,0,0,0,0,0,0,0,0,0
CN18Fc24_6_eDNA,7124,1671,2343,2256,1812,720,308,888,1179,824,...,0,0,0,0,0,0,0,0,0,0


# Extract an example row (one sample) of read counts for a handful of columns (ASVs)

In [4]:
# Row position 0 with column positions 50..55 (the end of the slice is exclusive).
ex_row = asvs1.iloc[0, slice(50, 56)]
ex_row

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c     0
f936cc0095df2ce79b485df5f7fe631a     4
45aae5b06129baf3325f68a675e9c8e2    16
defcb02ec20f29352cb1b1b267f162a4     8
e1453e7b5954ac141ec0b8c91939512d     0
2ed7d51e061664183c05fbbb56c0787e    31
Name: CN18Fc12_8_eDNA, dtype: int64

# Total number of read counts for the example sample

In [5]:
# Sum of the selected counts; used below as the denominator for the proportions.
ex_row.agg("sum")

59

# The observed counts as relative proportions (fractions of the sample total)

In [6]:
# Divide every count by the row total to get the observed proportions.
print(ex_row.div(ex_row.sum()))

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c    0.000000
f936cc0095df2ce79b485df5f7fe631a    0.067797
45aae5b06129baf3325f68a675e9c8e2    0.271186
defcb02ec20f29352cb1b1b267f162a4    0.135593
e1453e7b5954ac141ec0b8c91939512d    0.000000
2ed7d51e061664183c05fbbb56c0787e    0.525424
Name: CN18Fc12_8_eDNA, dtype: float64


# Compute the Beta distribution parameters for each count: alpha = count + 1, beta = (sample total - count) + 1

In [7]:
# First shape parameter: the count itself plus one.
alpha = ex_row.add(1)
# Second shape parameter: all remaining counts in the row (total - count) plus one.
beta = ex_row.rsub(ex_row.sum()).add(1)
# Show both parameter vectors separated by a blank line.
print(alpha, "", beta, sep="\n")

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c     1
f936cc0095df2ce79b485df5f7fe631a     5
45aae5b06129baf3325f68a675e9c8e2    17
defcb02ec20f29352cb1b1b267f162a4     9
e1453e7b5954ac141ec0b8c91939512d     1
2ed7d51e061664183c05fbbb56c0787e    32
Name: CN18Fc12_8_eDNA, dtype: int64

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c    60
f936cc0095df2ce79b485df5f7fe631a    56
45aae5b06129baf3325f68a675e9c8e2    44
defcb02ec20f29352cb1b1b267f162a4    52
e1453e7b5954ac141ec0b8c91939512d    60
2ed7d51e061664183c05fbbb56c0787e    29
Name: CN18Fc12_8_eDNA, dtype: int64


# Draw a random value from the Beta distribution for each cell, then rescale the draws so that they sum to 1

In [8]:
# One independent Beta(alpha, beta) draw per cell; the raw draws need not sum to 1.
beta_draw = np.random.beta(alpha, beta)
print(beta_draw, beta_draw.sum(), "", sep="\n")
# Rescale by the total so the draws form proportions that sum to 1.
unit_draw = beta_draw / beta_draw.sum()
print(unit_draw, unit_draw.sum(), sep="\n")

[0.00246123 0.06513282 0.30288044 0.10908594 0.01240175 0.51766544]
1.00962761229246

[0.00243776 0.06451173 0.29999223 0.10804572 0.01228349 0.51272908]
1.0


# Function to draw a random sample from a row

In [9]:
def betaRow(row):
    """Draw one random set of proportions for a row of counts.

    Each count ``x`` in a row with total ``n`` is replaced by a draw from
    ``Beta(x + 1, n - x + 1)`` using the global ``np.random`` state, and the
    draws are then divided by their sum so that they add up to 1.

    row: one row of counts (e.g. a pandas Series).
    Returns the rescaled draws, one per entry of ``row``.
    """
    total = row.sum()
    draws = np.random.beta(row + 1, total - row + 1)
    # The independent draws do not sum to 1 on their own, so rescale them.
    return draws / draws.sum()

In [10]:
# Each call gives a fresh random draw, so repeated runs show different values.
betaRow(row=ex_row)

array([0.01781951, 0.05229668, 0.22989198, 0.22256364, 0.00055951,
       0.47686869])

# Function to draw a random sample for the full data frame

In [11]:
def ranRelPct(df, asLogOdds = True, rng = None):
    """Return one random draw of relative occurrence for a table of counts.

    Each cell with count ``x`` in a row with total ``n`` is replaced by a draw
    from ``Beta(x + 1, n - x + 1)``; every row of draws is then rescaled so
    that it sums to 1. The whole table is drawn in one vectorised call
    instead of applying a Python function row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Observed occurrence counts, rows = samples and columns = ASVs.
    asLogOdds : bool, default True
        If True, return ``log(p / (1 - p))`` of the rescaled draws instead of
        the draws themselves.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used (the previous behaviour). Pass e.g. ``np.random.default_rng(seed)``
        to get a reproducible draw without touching global state.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``df``, labelled with the values of ``df.index`` and
        ``df.columns``.
    """
    counts = df.to_numpy(dtype=float)
    # keepdims keeps the totals as a column so they broadcast across each row
    row_totals = counts.sum(axis=1, keepdims=True)
    sampler = np.random if rng is None else rng
    # one Beta draw per cell, all rows at once
    draws = sampler.beta(counts + 1, row_totals - counts + 1)
    # rescale every row so its draws sum to 1
    draws = draws / draws.sum(axis=1, keepdims=True)
    # assign row and column names
    result = pd.DataFrame(draws, index=df.index.values, columns=df.columns.values)
    # convert to log-odds if requested
    if asLogOdds:
        result = np.log(result / (1 - result))
    return result

In [12]:
# One random draw for the full table, returned as log-odds.
ran_sample = ranRelPct(asvs1, asLogOdds=True)
ran_sample.head(n=5)

Unnamed: 0,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,a140195871278e8fcf9447e42bad8786,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
CN18Fc12_8_eDNA,-2.394804,-3.532137,-4.0247,-3.87917,-3.623685,-4.850891,-9.674529,-4.120602,-3.423055,-4.188203,...,-8.658247,-9.875247,-7.880846,-9.217175,-9.23879,-8.991688,-9.159413,-8.873349,-8.422475,-9.368209
CN18Fc19_5_eDNA,-1.583237,-3.080483,-3.000659,-3.097429,-3.173882,-4.478593,-5.187715,-4.221063,-4.007482,-3.890304,...,-11.423668,-10.599291,-11.940396,-11.669951,-9.739395,-10.274153,-10.93642,-12.23706,-10.565731,-11.832068
CN18Fc21_6_eDNA,-1.858554,-3.095736,-3.216205,-3.250337,-3.57875,-4.398078,-4.610524,-3.99804,-3.738419,-3.985716,...,-12.365443,-11.692929,-11.199033,-10.972999,-10.953304,-12.054582,-10.548127,-10.919341,-11.850264,-10.379839
CN18Fc22_6_eDNA,-1.695568,-3.243549,-3.144198,-3.105497,-3.300293,-4.626052,-5.044419,-4.26634,-4.01579,-4.047355,...,-12.930963,-10.159565,-10.457774,-17.806621,-14.278484,-10.967202,-14.161786,-12.122079,-10.835438,-10.186954
CN18Fc24_6_eDNA,-1.87635,-3.449592,-3.100245,-3.097987,-3.385167,-4.278255,-5.117024,-4.123165,-3.776595,-4.251998,...,-10.325792,-13.350903,-11.948377,-10.526095,-12.39994,-10.818177,-13.639935,-12.818145,-11.639712,-11.651464
