Amplicon data of two different marker genes from the same samples
========

### Testing functionality with RAPIDS

#### Import all the dependencies

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns # will this work with RAPIDS?
import pandas as pd
import numpy as np

# RAPIDS libraries
import cudf # RAPIDS equivalent of pandas
import cupy as cp # RAPIDS equivalent of numpy
from cuml.decomposition import PCA # RAPIDS equivalent of scikit-learn

In [2]:
# Check hardware configuration
!nvidia-smi

Tue Feb 21 21:38:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:21:00.0  On |                  Off |
| 30%   49C    P2    78W / 300W |    441MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Create helper functions

#### Return data frame of a draw of relative percent of occurrence from a beta distribution fit to observed occurrence counts df: data frame where rows = samples and columns = ASVs

In [13]:
def ranRelPct_cupy(df, asLogOdds = True):
    for i in range(df.shape[1]):
        col = df.iloc[:,i]
        a = col + 1
        b = col.sum() - col + 1
        beta_dist = cp.random.beta(a,b)
        beta_dist /= col.sum()
        df.iloc[:,i] = beta_dist
    # convert to log-odds if requested
    if asLogOdds:
        result = np.log(df / (1 - df))
    return result.transpose()

#### Draws sample of relative percent of occurrence and conducts PCA df: data frame where rows = samples and columns = ASVs 
#### Returns a dictionary containing : 
#####   df: data frame containing a random draw of the original count data frame as log-odds
#####   scores: array of PCA scores
#####   loadings: array of PCA loadings

In [15]:
def samplePCA(df, num_pcs = None):
    max_pcs = min(df.shape[0] - 1, df.shape[1] - 1)
    if num_pcs is None:
        num_pcs = max_pcs
    elif num_pcs > max_pcs:
        num_pcs = max_pcs
    ran_df = ranRelPct_cupy(df, asLogOdds = True)
    pca = PCA(n_components = num_pcs)
    pca_fit = pca.fit(ran_df)
    scores = pca_fit.transform(ran_df)
    loadings = np.transpose(pca_fit.components_)
    pca_results = {
        "df": ran_df,
        "scores": scores,
        "loadings": loadings
    }
    return pca_results

#### If the sign of the first element in a column in matrices after the first is different than the first, multiply all values in that column by -1

In [4]:
def harmonizeColumnSigns(mat_list):
    for i in range(1, len(mat_list)):
        for col in range(mat_list[i].shape[1]):
            if cupy.sign(mat_list[i][0, col]) != cupy.sign(mat_list[0][0, col]):
                mat_list[i][:, col] *= -1
    return mat_list

#### Sorts PCA loadings from a list 

In [5]:
def sortLoadings(loading_list, pc, asvs, asRanks = False):
    # Harmonize signs across replicates
    harm_loadings = harmonizeColumnSigns(loading_list)
    # Create 3 dimensional array and select component 'pc'
    loadings = cupy.stack(harm_loadings, axis = 2)[:, pc, :]
    # Convert to ranks if 'asRanks == True'
    if asRanks:
        loadings = cupy.array([rankdata(loadings[:, i]) for i in range(loadings.shape[1])]).transpose()
    # Get sorted order based on median for each ASV 
    row_sort = cupy.apply_along_axis(np.median, 1, loadings).ravel().argsort()[::-1]
    # Sort based on median, add ASV names (also sorted) and return data frame
    df = cudf.DataFrame(loadings[row_sort, :])
    df.index = asvs[row_sort]
    return df

#### Run functions with one data set

In [6]:
file1 = "./Data/Flyer2018_16S_table_counts.tsv"
# no need to transpose with new ranRelPct function that works on columns
asvs1 = cudf.read_csv(file1, index_col=0, sep="\t") 

In [7]:
asvs1.head(3)

Unnamed: 0_level_0,CN18Fc12_8_eDNA,CN18Fc19_5_eDNA,CN18Fc21_6_eDNA,CN18Fc22_6_eDNA,CN18Fc24_6_eDNA,CN18Fc25_5_eDNA,CN18Fc27_4_eDNA,CN18Fc29_6_eDNA,CN18Fc30_4_eDNA,CN18Fc32_4_eDNA,...,CN18SESPkoa_SC36,CN18SESPkoa_SC37,CN18SESPkoa_SC39,CN18SESPkoa_SC40,CN18SESPkoa_SC41,CN18SESPkoa_SC42,CN18SESPkoa_SC44,CN18SESPkoa_SC45,CN18SESPkoa_SC47,CN18SESPkoa_SC49
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
495c1bd1608a1dad54d3e2824ce899ef,552,7415,8749,8152,7124,12422,215,8080,8799,2231,...,4324,1339,95,1554,147,2720,21,2015,1847,1886
a900b6678ce86851fb16bfafb87f3326,210,1933,2808,1967,1671,4912,57,3688,3343,1134,...,21023,8898,1367,8291,1112,9881,60,8726,9330,8852
c8e360969108fa2125a3d56eb4dad24f,145,2089,2530,2086,2343,2395,129,1625,1664,639,...,4830,9,566,20,551,60,43,22,28,44


In [14]:
df = ranRelPct_cupy(asvs1, asLogOdds = True)
df

Unnamed: 0,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,a140195871278e8fcf9447e42bad8786,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
CN18Fc12_8_eDNA,-22.891597,-23.541860,-22.236355,-22.312313,-22.607437,-23.293388,-24.811241,-24.836845,-23.600426,-22.655929,...,-22.343866,-23.485769,-24.537687,-23.021141,-23.723140,-22.591102,-22.468469,-22.755133,-22.344359,-22.319600
CN18Fc19_5_eDNA,-25.502183,-25.171862,-25.140357,-25.605764,-25.640167,-25.771789,-25.281586,-25.729454,-25.559860,-26.015494,...,-25.632839,-30.795714,-25.220474,-25.423559,-26.975746,-25.424765,-26.009868,-25.895081,-26.358824,-25.113652
CN18Fc21_6_eDNA,-26.493776,-27.675757,-25.577585,-25.558575,-25.694896,-27.124402,-26.032197,-27.023699,-25.886825,-26.330878,...,-25.917138,-25.890891,-27.584265,-26.303183,-26.127981,-29.215416,-26.489109,-25.684730,-26.247587,-25.636617
CN18Fc22_6_eDNA,-25.331899,-25.401497,-25.266766,-25.815348,-25.950126,-25.228700,-25.307538,-26.300431,-25.682291,-25.204972,...,-26.958035,-25.703295,-26.141335,-25.938236,-25.411199,-27.244379,-25.469988,-27.083402,-25.332256,-25.766349
CN18Fc24_6_eDNA,-27.161813,-25.388994,-28.747507,-28.008983,-26.743832,-26.374393,-26.801285,-25.818552,-26.647134,-25.742977,...,-25.657077,-25.337321,-26.135482,-26.920018,-25.457587,-25.606698,-25.731238,-25.854634,-26.425344,-25.436920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CN18SESPkoa_SC42,-25.444809,-26.597599,-25.433057,-26.125989,-26.280569,-25.669367,-26.398743,-25.499296,-26.848714,-25.698717,...,-26.148740,-26.021584,-25.724290,-27.016737,-25.384493,-26.420679,-25.523812,-28.579273,-28.511954,-25.455227
CN18SESPkoa_SC44,-26.427707,-22.888073,-22.990150,-22.610429,-22.866960,-23.485303,-22.631199,-22.392847,-22.357235,-22.703097,...,-23.703275,-23.154217,-22.416885,-24.190472,-22.996533,-24.730476,-22.751074,-23.308645,-25.038548,-22.288086
CN18SESPkoa_SC45,-27.586806,-26.794064,-25.420334,-26.147939,-25.308510,-27.511936,-26.786236,-27.220975,-25.514898,-26.176316,...,-25.650983,-25.541900,-25.583788,-29.582686,-27.505592,-25.479612,-27.874936,-25.855660,-27.136395,-25.870032
CN18SESPkoa_SC47,-27.584326,-26.400231,-25.518827,-26.461159,-25.691251,-25.980371,-25.698576,-26.074798,-26.030666,-25.882254,...,-25.886460,-27.002878,-25.438982,-25.555292,-30.856728,-28.938681,-25.730317,-25.612428,-26.403855,-26.053664


In [16]:
asv_PCAs = [samplePCA(asvs1, 3) for i in range(10)]
#raw_loadings = [x["loadings"] for x in asv_PCAs]
asv_PCAs

RuntimeError: exception occurred! file=/opt/conda/envs/rapids/include/raft/linalg/detail/eig.cuh line=131: 

In [None]:
df = sortLoadings(raw_loadings, 0, asvs1.columns.values, True)
sns.heatmap(df)

#### Create a list of files to plot

In [4]:
# Set the folder containing the data
DATA_DIR = "./Data/"

file_names = ["Flyer2018_16S_table_counts.tsv",
              "Flyer2018_18S_table_counts.tsv"]
for file in file_names:
    # Create the file path 
    file_path = DATA_DIR + file
    
    # Load the dataframe
    asvs = cudf.read_csv(file_name, index_col=0, sep="\t").transpose()
    
    ### DO OTHER STUFF HERE
    asv_PCAs = [samplePCA(asvs, 3) for i in range(10)]
    raw_loadings = [x["loadings"] for x in asv_PCAs]
    
    df = sortLoadings(raw_loadings, 0, asvs1.columns.values, True)
    sns.heatmap(df)