In [1]:
import pandas as pd
import MarineDNA as md
import seaborn as sn
import numpy as np

In [2]:
# Tab-separated ASV count table; the first column is used as the row index.
file1 = "../../../Data/Flyer2018_16S_table_counts.tsv"
# Flip the table after reading so that rows are samples and columns are ASVs.
asvs1 = pd.read_csv(file1, sep="\t", index_col=0).T

# The raw data

In [3]:
# Preview the first five samples (rows) of the count table.
asvs1.head(n=5)

ASV ID,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,a140195871278e8fcf9447e42bad8786,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
CN18Fc12_8_eDNA,552,210,145,130,156,49,0,89,190,97,...,0,0,0,0,0,0,0,0,0,0
CN18Fc19_5_eDNA,7415,1933,2089,1830,1742,488,234,595,767,918,...,0,0,0,0,0,0,0,0,0,0
CN18Fc21_6_eDNA,8749,2808,2530,2516,1761,787,632,1162,1545,1252,...,0,0,0,0,0,0,0,0,0,0
CN18Fc22_6_eDNA,8152,1967,2086,2178,1855,510,353,750,988,904,...,0,0,0,0,0,0,0,0,0,0
CN18Fc24_6_eDNA,7124,1671,2343,2256,1812,720,308,888,1179,824,...,0,0,0,0,0,0,0,0,0,0


# Extract an example row (one sample) of read counts for a handful of columns (ASVs)

In [4]:
# Narrow to ASV columns 50-55 first, then take the first sample (row 0).
ex_row = asvs1.iloc[:, 50:56].iloc[0]
ex_row

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c     0
f936cc0095df2ce79b485df5f7fe631a     4
45aae5b06129baf3325f68a675e9c8e2    16
defcb02ec20f29352cb1b1b267f162a4     8
e1453e7b5954ac141ec0b8c91939512d     0
2ed7d51e061664183c05fbbb56c0787e    31
Name: CN18Fc12_8_eDNA, dtype: int64

# Total number of read counts across the selected ASVs for this sample

In [5]:
# Sum over the six selected ASVs only (not the sample's full read depth);
# this is the total used as the denominator in the cells that follow.
ex_row.sum()

59

# The observed counts as relative proportions (fractions of the total, summing to 1)

In [6]:
# Each count divided by the row total gives the observed proportion.
print(ex_row.div(ex_row.sum()))

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c    0.000000
f936cc0095df2ce79b485df5f7fe631a    0.067797
45aae5b06129baf3325f68a675e9c8e2    0.271186
defcb02ec20f29352cb1b1b267f162a4    0.135593
e1453e7b5954ac141ec0b8c91939512d    0.000000
2ed7d51e061664183c05fbbb56c0787e    0.525424
Name: CN18Fc12_8_eDNA, dtype: float64


# Compute the Beta distribution parameters for each count: alpha = count + 1, beta = (total - count) + 1

In [7]:
# alpha: the ASV's own count plus one.
alpha = ex_row.add(1)
# beta: the count of everything else in the row, plus one.
beta = (ex_row.sum() - ex_row).add(1)
# Show both parameter vectors separated by a blank line.
print(alpha, beta, sep="\n\n")

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c     1
f936cc0095df2ce79b485df5f7fe631a     5
45aae5b06129baf3325f68a675e9c8e2    17
defcb02ec20f29352cb1b1b267f162a4     9
e1453e7b5954ac141ec0b8c91939512d     1
2ed7d51e061664183c05fbbb56c0787e    32
Name: CN18Fc12_8_eDNA, dtype: int64

ASV ID
9cdadd8a7359a3163fb31ad06be74e8c    60
f936cc0095df2ce79b485df5f7fe631a    56
45aae5b06129baf3325f68a675e9c8e2    44
defcb02ec20f29352cb1b1b267f162a4    52
e1453e7b5954ac141ec0b8c91939512d    60
2ed7d51e061664183c05fbbb56c0787e    29
Name: CN18Fc12_8_eDNA, dtype: int64


# Take a random draw from the Beta distribution for each cell, then rescale the draws so they sum to 1

In [8]:
# One independent Beta(alpha, beta) draw per ASV; the raw draws need not sum to 1.
beta_draw = np.random.beta(a=alpha, b=beta)
print(beta_draw, beta_draw.sum(), sep="\n")
print()
# Rescale by the total so the draws form a set of proportions summing to 1.
unit_draw = np.divide(beta_draw, beta_draw.sum())
print(unit_draw, unit_draw.sum(), sep="\n")

[0.01025295 0.13256931 0.34202312 0.13124267 0.02472988 0.61205727]
1.2528751953232349

[0.00818354 0.10581206 0.27299058 0.10475318 0.01973851 0.48852214]
1.0


# Function to draw a random sample from a row

In [9]:
def betaRow(row, rng=None):
    """Draw one random set of relative proportions for a row of counts.

    Each count ``c`` in a row with total ``N`` gets an independent draw from
    Beta(c + 1, N - c + 1). The draws are then divided by their sum so that
    the returned values sum to 1.

    Parameters
    ----------
    row : array-like of counts (e.g. a pandas Series, NumPy array or list)
        Observed counts for one sample.
    rng : object with a NumPy-style ``beta(a, b)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``. When ``None``
        (the default) the global ``np.random`` state is used, which matches
        the behavior before this parameter existed.

    Returns
    -------
    numpy.ndarray
        Random proportions, one per input count, normalized to sum to 1.
    """
    # Fall back to the module-level generator so existing callers are unaffected.
    gen = np.random if rng is None else rng
    counts = np.asarray(row)
    total = counts.sum()
    ran_row = gen.beta(counts + 1, total - counts + 1)
    # Independent Beta draws do not sum to 1 on their own; rescale them.
    return ran_row / ran_row.sum()

In [10]:
# One random draw of proportions for the example row (differs on every run).
betaRow(row=ex_row)

array([0.03029297, 0.05280715, 0.17457084, 0.13208423, 0.01889795,
       0.59134686])

# Function to draw a random sample for the full data frame

In [11]:
def ranRelPct(df, asLogOdds = True, rng = None):
    """Return one random draw of relative occurrence for every cell of a count table.

    For each row, every count ``c`` with row total ``N`` gets an independent
    draw from Beta(c + 1, N - c + 1), and the draws in the row are divided by
    their sum so that each row sums to 1. Optionally the proportions ``p`` are
    then converted to log-odds, ``log(p / (1 - p))``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observed occurrence counts, where rows = samples and columns = ASVs.
    asLogOdds : bool, default True
        If True, return log-odds instead of proportions.
    rng : object with a NumPy-style ``beta(a, b)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``. When ``None``
        (the default) the global ``np.random`` state is used, which matches
        the behavior before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``df``, carrying the row and column labels of ``df``.
    """
    # Fall back to the module-level generator so existing callers are unaffected.
    gen = np.random if rng is None else rng
    counts = df.to_numpy()
    # Per-row totals kept as a column vector so they broadcast across ASVs.
    totals = counts.sum(axis=1, keepdims=True)
    # One Beta draw per cell, done for the whole table in a single call.
    draws = gen.beta(counts + 1, totals - counts + 1)
    # Rescale each row so its draws sum to 1.
    draws = draws / draws.sum(axis=1, keepdims=True)
    # assign row and column names
    result = pd.DataFrame(draws, index=df.index.values, columns=df.columns.values)
    # convert to log-odds if requested
    if asLogOdds:
        result = np.log(result / (1 - result))
    return result

In [12]:
# One random log-odds draw for the full count table; preview the first five samples.
ran_sample = ranRelPct(df=asvs1, asLogOdds=True)
ran_sample.head(n=5)

Unnamed: 0,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,a140195871278e8fcf9447e42bad8786,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
CN18Fc12_8_eDNA,-2.450644,-3.47304,-3.811117,-3.86913,-3.720959,-4.806017,-9.116762,-4.121835,-3.523816,-4.254303,...,-9.338834,-9.911631,-9.596166,-8.339286,-8.499551,-8.922937,-7.47269,-10.145096,-11.068144,-10.165443
CN18Fc19_5_eDNA,-1.566279,-3.031368,-2.999913,-3.131192,-3.152215,-4.469255,-5.117076,-4.271957,-4.065181,-3.873513,...,-12.246497,-11.476438,-10.68191,-11.346624,-11.758307,-11.407817,-10.247861,-10.979977,-9.674515,-11.347914
CN18Fc21_6_eDNA,-1.873763,-3.112599,-3.234755,-3.246793,-3.551054,-4.454543,-4.595539,-4.033739,-3.668874,-3.978673,...,-10.214942,-13.959934,-10.703853,-12.818941,-11.585801,-12.031712,-17.443891,-11.073531,-9.746249,-10.867278
CN18Fc22_6_eDNA,-1.677531,-3.219684,-3.212618,-3.147837,-3.313944,-4.610109,-5.010904,-4.200386,-4.019917,-4.050784,...,-13.01577,-11.134813,-10.624284,-11.406457,-11.142277,-14.519355,-10.695151,-9.967446,-12.496356,-10.355515
CN18Fc24_6_eDNA,-1.897783,-3.425755,-3.122939,-3.117828,-3.316237,-4.31293,-5.185001,-4.1465,-3.805895,-4.191375,...,-12.951,-11.747912,-10.414784,-12.73102,-11.278223,-12.856991,-10.318914,-10.582493,-12.178239,-11.089133
