In [1]:
import pandas as pd
import numpy as np
import MarineDNA as md
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px

In [2]:
# Tab-separated count table; the first column of the file becomes the row index.
file1 = "../../../Data/Flyer2018_16S_table_counts.tsv"
asvs1 = pd.read_csv(file1, sep="\t", index_col=0)

### Scores Distribution

In [3]:
def plotScoreDistribution(scores, x = 0, y = 1):
    """Plot per-row median scores with min-max whiskers for two score columns.

    The score matrices are first passed through md.harmonizeColumnSigns_std and
    stacked along a third axis, so that every (row, column) cell has one value
    per input matrix. For each row the median over that third axis is drawn as
    a marker, with a horizontal and a vertical grey line spanning the minimum
    to the maximum on the x and y column respectively.

    Parameters
    ----------
    scores : list
        Score matrices of identical shape, in whatever form
        md.harmonizeColumnSigns_std accepts (presumably one matrix per PCA
        run -- confirm against MarineDNA).
    x : int, default 0
        Positional index of the score column shown on the horizontal axis.
    y : int, default 1
        Positional index of the score column shown on the vertical axis.

    Returns
    -------
    None
        The figure is displayed with ``.show()``; nothing is returned.

    Raises
    ------
    IndexError
        If ``x`` or ``y`` is not a valid column position.
    """
    score_list = md.harmonizeColumnSigns_std(scores)
    score_arr = np.stack(score_list, axis = 2)

    # Reduce over the stacking axis in one call each instead of looping over cells.
    median_score = np.median(score_arr, axis = 2)
    min_score = np.min(score_arr, axis = 2)
    max_score = np.max(score_arr, axis = 2)

    # Use the requested columns; these were previously hard-coded to 0 and 1,
    # which silently ignored the x and y arguments.
    medians = go.Scatter(
        x = median_score[:, x],
        y = median_score[:, y],
        mode = 'markers'
    )

    whisker_style = dict(color = 'grey', width = 1)
    n_rows = median_score.shape[0]

    # Horizontal whisker: min..max on the x column, drawn at the median of the y column.
    horiz_lines = [
        dict(
            type = 'line',
            x0 = min_score[i, x],
            y0 = median_score[i, y],
            x1 = max_score[i, x],
            y1 = median_score[i, y],
            line = whisker_style
        )
        for i in range(n_rows)
    ]

    # Vertical whisker: min..max on the y column, drawn at the median of the x column.
    vert_lines = [
        dict(
            type = 'line',
            x0 = median_score[i, x],
            y0 = min_score[i, y],
            x1 = median_score[i, x],
            y1 = max_score[i, y],
            line = whisker_style
        )
        for i in range(n_rows)
    ]

    layout = go.Layout(
        shapes = horiz_lines + vert_lines,
        autosize = False,
        width = 1000,
        height = 1000,
        xaxis = dict(title = dict(text = f"score column {x}")),
        yaxis = dict(title = dict(text = f"score column {y}"))
    )

    go.Figure(data = medians, layout = layout).show()

In [6]:
# Call md.doPCA on md.ranRelPct(asvs1) 100 times, keep each run's "scores", then plot their spread.
sample_scores = [md.doPCA(md.ranRelPct(asvs1))["scores"] for _ in range(100)]
plotScoreDistribution(sample_scores)

In [17]:
# sample_scores holds one score array per PCA run (100 runs).
# Each array is 62 samples (rows) x 61 components (columns); the values are scores.
print(sample_scores[0].shape, len(sample_scores), sep="\n")
sample_scores[0][0]

(62, 61)
100


array([ 22.26606085,  85.25742796, -34.31605665,  14.94829194,
         5.46386378,   4.59671847,  -4.2825726 ,   3.99805582,
        -5.29610078,   4.68330541,  -6.26071212, -12.05251874,
         0.29853125,  -1.24907059, -12.21885974,   4.43806958,
         8.95574229,  -5.71533186,  -1.39261435,  -1.24649599,
        -3.8340801 ,   6.04213844,   1.32929321,   4.56762751,
         7.94138038,  -4.68905501,  -1.03342909,   1.18295542,
         5.37227273,   4.75152951,  -3.29340711,   4.7617558 ,
        -0.81895425,  -2.3728414 ,   4.95174875,   6.00743958,
        20.071957  ,   1.12816601,   5.89328689,  20.24225892,
       -12.23716033,  19.02310895,   8.0340693 ,   7.34921075,
       -19.37939544,  -3.28691166,  12.07822238,  -6.54683717,
         1.49316037,  -5.45300136,   5.89873287,  10.80630289,
        -7.68340474,  -0.62016692,  -0.46935925,  -1.25767191,
        -4.38056169,  11.1915061 ,   2.02220877,  -7.3593391 ,
         4.433641  ])

### Loadings Distribution

In [56]:
# Collect the "loadings" from 10 calls of md.doPCA on md.ranRelPct(asvs1).
raw_loadings = [md.doPCA(md.ranRelPct(asvs1))["loadings"] for _ in range(10)]
# asvs1.index holds the same labels as asvs1.transpose().columns.
df = md.sortLoadings_std(raw_loadings, 0, asvs1.index.values)
# z is drawn with its rows reversed, so the y labels must be reversed the same way.
# The previous df.index[:-1] dropped the last label instead of reversing, which left
# the labels out of step with the rows (NOTE(review): assumed to be a typo for [::-1]).
go.Figure(
    data = go.Heatmap(y = df.index[::-1], z = df.iloc[::-1]),
    layout = go.Layout(autosize = False, width = 1000, height = 600)
)

In [57]:
# raw_loadings holds one loadings array per PCA run (10 runs).
# Each array is 2752 ASVs (rows) x 61 components (columns); the values are loadings.
print(raw_loadings[0].shape, len(raw_loadings), sep="\n")
raw_loadings[0][0]

(2752, 61)
10


array([-0.01973929, -0.01589915, -0.03502688,  0.00720891,  0.02592526,
        0.0048165 , -0.00491794,  0.01556318, -0.01017266, -0.00074785,
        0.00341544, -0.00133221,  0.00202869,  0.00454482, -0.00286587,
       -0.00321024, -0.00058252,  0.00216946, -0.00246804, -0.00431233,
        0.00296257, -0.00475843,  0.00506805,  0.00315624, -0.00114604,
        0.00159109, -0.00065954, -0.00133412, -0.00203054,  0.0017161 ,
        0.00249257,  0.00175826,  0.00212427,  0.0035373 ,  0.00399895,
       -0.00252547,  0.00056124, -0.00043194,  0.00167874,  0.00529118,
        0.000802  ,  0.00114462, -0.00548398,  0.0082304 , -0.00061786,
       -0.00423737, -0.0050166 , -0.00372039, -0.0020779 ,  0.00120451,
        0.00114317,  0.00048405, -0.00066286,  0.0015817 ,  0.00246521,
        0.00187456, -0.00502934, -0.00153234,  0.00132418, -0.00664417,
       -0.00537104])

In [114]:
# Work on a copy of the first run's loadings, restricted to the first 10 columns (components).
pca_result_ld = raw_loadings[0].copy()[:, :10]
print(pca_result_ld.shape)
pca_result_ld

(2752, 10)


array([[-0.01973929, -0.01589915, -0.03502688, ...,  0.01556318,
        -0.01017266, -0.00074785],
       [ 0.01311951, -0.02160951, -0.01082298, ..., -0.00187358,
        -0.00871797,  0.00691388],
       [-0.04305193,  0.01679369,  0.00508102, ...,  0.02250564,
        -0.00576973, -0.00738908],
       ...,
       [ 0.00315026,  0.01287852, -0.01767164, ...,  0.00202815,
        -0.00589751, -0.01050173],
       [-0.0028501 ,  0.02519944, -0.00489467, ...,  0.01808289,
        -0.00449733, -0.01067813],
       [ 0.00403695,  0.01058189, -0.00985288, ...,  0.00566429,
        -0.0086824 ,  0.01290494]])

In [115]:
#pca_result_ld = pd.DataFrame(raw_loadings[0])
arr = raw_loadings[0].copy()
#just look at first 10 PCAs
# first ten columns...
arr = arr[:,:10]
print(arr.shape)
arr

(2752, 10)


array([[-0.01973929, -0.01589915, -0.03502688, ...,  0.01556318,
        -0.01017266, -0.00074785],
       [ 0.01311951, -0.02160951, -0.01082298, ..., -0.00187358,
        -0.00871797,  0.00691388],
       [-0.04305193,  0.01679369,  0.00508102, ...,  0.02250564,
        -0.00576973, -0.00738908],
       ...,
       [ 0.00315026,  0.01287852, -0.01767164, ...,  0.00202815,
        -0.00589751, -0.01050173],
       [-0.0028501 ,  0.02519944, -0.00489467, ...,  0.01808289,
        -0.00449733, -0.01067813],
       [ 0.00403695,  0.01058189, -0.00985288, ...,  0.00566429,
        -0.0086824 ,  0.01290494]])

### One Solution (Array first then dataframe) - KP

In [19]:
# Outlier fences derived from the interquartile range: Q1 - 1.5*IQR and Q3 + 1.5*IQR
def iqrThresh(x):
    """Return the [lower, upper] fences for the values in x.

    The fences sit 1.5 interquartile ranges below the 25% quantile and above
    the 75% quantile. Each fence is returned as a one-element numpy array
    (not a scalar), because the interquartile range is kept as a length-1 array.
    """
    q1, q3 = np.quantile(x, [0.25, 0.75])
    # Keep the spread as a length-1 array so both fences come out as length-1 arrays.
    spread = np.atleast_1d(q3 - q1)
    margin = 1.5 * spread
    return [q1 - margin, q3 + margin]
    
# Try the fences on the first column of df (reading top to bottom, df is the frame
# returned by md.sortLoadings_std above). Each fence comes back as a one-element array.
iqrThresh(df.iloc[:,0])

[array([-0.01810848]), array([0.02622454])]

In [23]:
# Flag every value of x that lies on or outside the iqrThresh fences
def isOutlier(x):
    """Return a list with one outlier flag per element of x.

    A value is flagged when it is <= the lower fence or >= the upper fence
    returned by iqrThresh(x). Because those fences are one-element arrays,
    each flag is itself a one-element boolean array rather than a plain bool.
    """
    fences = iqrThresh(x)
    lower = fences[0]
    upper = fences[1]
    flags = []
    for value in x:
        below = value <= lower
        # Same short-circuit as `a or b`: keep the first test if it is truthy, else the second.
        flags.append(below if below else value >= upper)
    return flags

# Sanity check: isOutlier returns a plain Python list (one entry per input value).
type(isOutlier(df.iloc[:,0]))

list

In [196]:
# ARRAY VERSION - KP
# Copy the first run's loadings and keep only the first ten columns (PCs).
arr = raw_loadings[0].copy()[:, :10]
print(arr.shape)

# One isOutlier result per PC: a list of 10 lists with 2752 entries each, where every
# entry is a single-value numpy array holding True or False.
# Goal: for each PC, the list of ASVs whose flag is True.
outliers = [isOutlier(arr[:, pc]) for pc in range(arr.shape[1])]
print(len(outliers), len(outliers[1]), sep="\n")

# Rows are PCs, columns are ASV IDs.
df = pd.DataFrame(data=outliers, columns=asvs1.index.tolist())

# Unwrap the single-value arrays so each cell is just True/False.
for asv_id in list(df):
    df[asv_id] = df[asv_id].str[0]

# Flip to ASVs x PCs, then gather each PC's flagged ASV IDs into one comma-joined string.
df = df.T
asv_out_lists = [
    ','.join(df.index[(df[pc] == True).to_numpy()].tolist())
    for pc in list(df)
]


(2752, 10)
10
2752


In [197]:
# Summarise the outlier lists: one row per column of df, holding the comma-joined ASV IDs.
test = pd.DataFrame(index=list(df), data=asv_out_lists, columns=['ASV_outliers'])
# Count the IDs per row. ''.split(',') is [''] (length 1), so an empty string has to be
# counted as 0 explicitly; otherwise a PC with no outliers is reported as having one.
test['Number_ASVs'] = [len(ids.split(',')) if ids else 0 for ids in asv_out_lists]
test

Unnamed: 0,ASV_outliers,Number_ASVs
0,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",523
1,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",147
2,"4bbec3bb723375416616a87d785ac74a,041792056bd7b...",202
3,"53f8cc55ef2454c8901d55f93e52a65c,3f9f49c6bb456...",49
4,"041792056bd7be0d5d925d8a6546e833,1dc845204ed8a...",59
5,"36602542caa215eaa04c5068fe515205,7b3d60ba8f753...",69
6,"9cdadd8a7359a3163fb31ad06be74e8c,86edd6692e8cc...",47
7,"43d87801069220664c1981be9255d43a,8fb4413ed236c...",28
8,"b96ea0d3eaa1a062c1e8411c138f7e28,9cdadd8a7359a...",57
9,"137a91c0c7bdcc0e7fd8f6a8e93bea05,775225d70b3ba...",34


### Now want to apply this solution to multiple runs of the data

In [210]:
def get_ASV_outliers(arr, asv_names):
    """List the outlier ASVs of every column of a loadings array.

    Each column of ``arr`` is passed to ``isOutlier``; the names of the rows it
    flags are collected for that column.

    Parameters
    ----------
    arr : 2-D numpy array
        One row per ASV and one column per PC (rows must line up with
        ``asv_names``).
    asv_names : sequence
        Row labels for ``arr``, in row order.

    Returns
    -------
    pandas.DataFrame
        Indexed 0..n_columns-1 with two columns: ``ASV_outliers`` (the flagged
        names joined by commas, ``''`` when none are flagged) and
        ``Number_ASVs`` (how many names were flagged, 0 when none).

    Raises
    ------
    ValueError
        If the number of flags for a column differs from ``len(asv_names)``.
    """
    asv_names = list(asv_names)
    n_pcs = arr.shape[1]

    rows = []
    for pc in range(n_pcs):
        # isOutlier yields one flag per row; each flag may be a one-element array
        # (because the iqrThresh fences are arrays), so coerce to a flat bool vector.
        flags = np.asarray(isOutlier(arr[:, pc]), dtype=bool).reshape(-1)
        if len(flags) != len(asv_names):
            raise ValueError(
                "arr has {} rows but {} ASV names were given".format(len(flags), len(asv_names))
            )
        flagged = [str(name) for name, is_out in zip(asv_names, flags) if is_out]
        # Count the names directly: splitting the joined string would report 1 for
        # a column with no outliers, because ''.split(',') is [''].
        rows.append((','.join(flagged), len(flagged)))

    return pd.DataFrame(rows, index=list(range(n_pcs)), columns=['ASV_outliers', 'Number_ASVs'])


In [211]:
# One loadings array per PCA run; build one outlier summary per run.
arr_list = raw_loadings.copy()
dfs = [get_ASV_outliers(run_loadings, asvs1.index.tolist()) for run_loadings in arr_list]

In [212]:
# Combine the per-run summaries side by side (one row per PC, 61 per run here);
# the outer column level is the run number.
# Keys are derived from len(dfs) rather than a hard-coded 10, so this keeps working
# when the number of PCA runs changes.
df = pd.concat(dfs, axis=1, keys=list(range(len(dfs))))
df

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9
Unnamed: 0_level_1,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs,ASV_outliers,Number_ASVs
0,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",523,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",541,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",512,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",533,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",536,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",511,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",538,"c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe...",515,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",528,"495c1bd1608a1dad54d3e2824ce899ef,c8e360969108f...",526
1,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",147,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",161,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",161,"a900b6678ce86851fb16bfafb87f3326,0c35cfa523aa2...",161,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",141,"a900b6678ce86851fb16bfafb87f3326,0c35cfa523aa2...",153,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",139,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",147,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",148,"495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86...",168
2,"4bbec3bb723375416616a87d785ac74a,041792056bd7b...",202,"4bbec3bb723375416616a87d785ac74a,b96ea0d3eaa1a...",201,"4bbec3bb723375416616a87d785ac74a,1f9d97b07aa56...",190,"4bbec3bb723375416616a87d785ac74a,1f9d97b07aa56...",205,"4bbec3bb723375416616a87d785ac74a,b96ea0d3eaa1a...",209,"b96ea0d3eaa1a062c1e8411c138f7e28,5ac48f1cd4727...",185,"4bbec3bb723375416616a87d785ac74a,041792056bd7b...",213,"4bbec3bb723375416616a87d785ac74a,041792056bd7b...",215,"4bbec3bb723375416616a87d785ac74a,b96ea0d3eaa1a...",204,"4bbec3bb723375416616a87d785ac74a,041792056bd7b...",225
3,"53f8cc55ef2454c8901d55f93e52a65c,3f9f49c6bb456...",49,"b96ea0d3eaa1a062c1e8411c138f7e28,c98f01664c8b5...",55,"b96ea0d3eaa1a062c1e8411c138f7e28,c98f01664c8b5...",51,"b96ea0d3eaa1a062c1e8411c138f7e28,3f9f49c6bb456...",43,"2a1b9aefd806ce0cae1c26cd91f422bc,041ff3f475b58...",29,"c98f01664c8b5c71736c631c6fdb6c80,3f9f49c6bb456...",40,"b96ea0d3eaa1a062c1e8411c138f7e28,3f9f49c6bb456...",49,"b96ea0d3eaa1a062c1e8411c138f7e28,c98f01664c8b5...",43,"b96ea0d3eaa1a062c1e8411c138f7e28,152ca12e2ab9a...",28,"b96ea0d3eaa1a062c1e8411c138f7e28,c98f01664c8b5...",42
4,"041792056bd7be0d5d925d8a6546e833,1dc845204ed8a...",59,"1dc845204ed8a3db5c8fa893e2e529d4,3c25aefde2ab9...",59,"041792056bd7be0d5d925d8a6546e833,84b738cddad75...",56,"b96ea0d3eaa1a062c1e8411c138f7e28,5f4ba6ed21ca4...",76,"041792056bd7be0d5d925d8a6546e833,5f4ba6ed21ca4...",66,"041792056bd7be0d5d925d8a6546e833,5ac48f1cd4727...",71,"041792056bd7be0d5d925d8a6546e833,afe88da0a2513...",62,"9cdadd8a7359a3163fb31ad06be74e8c,7b3d60ba8f753...",63,"041792056bd7be0d5d925d8a6546e833,1dc845204ed8a...",65,"84cec816b1ff951af7b866347fc9ed47,2a1b9aefd806c...",59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,"7ec8be180094b588aa83a84c0d44387e,08c85fe6ede53...",42,"64b7f64aac846cb86f1eaa8fac017de6,17ab7e2aa39f7...",31,"5f510cf434513fffae3da23d8a55ced4,2b7ad807634c3...",45,"b96ea0d3eaa1a062c1e8411c138f7e28,d9066528eb422...",46,"cdfa6944261963a29a9e8193263a1875,7c878ce8a9adb...",29,"86edd6692e8cc7e48dcec26fa149a666,2b7ad807634c3...",33,"9d3c387e6dd89ec0ec390b283f2a9dcd,bca9f328e441d...",40,"4acf587ad601d102a7c998c2825a613d,3c25aefde2ab9...",38,"2b7ad807634c3b16ea3f2186b187a9ed,5eadbce56d28c...",27,"1dc845204ed8a3db5c8fa893e2e529d4,20ebab43718fe...",29
57,"9d3c387e6dd89ec0ec390b283f2a9dcd,52e58c7fb3c1e...",54,"137a91c0c7bdcc0e7fd8f6a8e93bea05,675b8ccf722df...",34,"cdfa6944261963a29a9e8193263a1875,fd187133c3cb6...",37,"8fb4413ed236cf03df5e94100c843f31,dfc0bb50086fd...",41,"3e848010a1fafbdceb45e8531e296532,ba99fc71575fe...",30,"152ca12e2ab9a2e3f92d78d4b40c55ca,5bf5f0e700dd4...",44,"e06f3f27920f8a3e33177cb0ad67f5cf,1c5137af8879c...",40,"1512f5e7d39b6518d5fc44028793e352,b21a8ad25f000...",34,"cdfa6944261963a29a9e8193263a1875,775225d70b3ba...",44,"ad9480c98df5e0bb6467716fecb99efa,4a52357fda8f4...",59
58,"1b21d2b71f96b4a59b392aee9aa1d655,0ab2c16c7082d...",44,"3f9f49c6bb456d8589809fcfd4fa80d9,1c5137af8879c...",39,"86edd6692e8cc7e48dcec26fa149a666,319aab5b67538...",43,"c98f01664c8b5c71736c631c6fdb6c80,cdfa694426196...",39,"7c878ce8a9adb33dc5ead88079151bca,9400842855da1...",29,"d92fd995c906b193643c289dece20f43,757b734a3c4d1...",38,"7b3d60ba8f753f9cd0ccc217bec52f6d,a2afcad6722a7...",36,"3d667bfa3b233a38ee37216c6ac7a371,8ab579c5a813a...",29,"137a91c0c7bdcc0e7fd8f6a8e93bea05,775225d70b3ba...",29,"1c9d4888cc82399ca530f0698dba4502,52e58c7fb3c1e...",41
59,"775225d70b3ba7cd64d29ac389305846,92d29d3e7aa8e...",20,"d7f52e8f8465d1d364a6176fb2bf6d23,91a0133a75e2f...",35,"0ab2c16c7082d494ba4d3ad819ec4f7a,1fd219ca91dfd...",43,"dfc0bb50086fd17c3830499a53fa5895,ab1eb4bae9d6a...",23,"63b98d69f205f4ee460f94a3a59b2615,3f8370d534169...",38,"3c25aefde2ab931cbf9149166d8644af,63f0f4f126dd5...",27,"7c878ce8a9adb33dc5ead88079151bca,128e7e2aeb2d5...",58,"1c9d4888cc82399ca530f0698dba4502,6cc4c17fa6b07...",28,"7b3d60ba8f753f9cd0ccc217bec52f6d,1c9d4888cc823...",44,"7b3d60ba8f753f9cd0ccc217bec52f6d,88a4d20f62b43...",44
