Amplicon data of two different marker genes from the same samples
========

### Testing functionality with RAPIDS

#### Import all the dependencies

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns # will this work with RAPIDS?
import pandas as pd

# RAPIDS libraries
import cudf # RAPIDS equivalent of pandas
import cupy as cp # RAPIDS equivalent of numpy
from cuml.decomposition import PCA # RAPIDS equivalent of scikit-learn

In [2]:
# Check hardware configuration
# (the leading "!" is IPython syntax: the rest of the line is run as a shell
# command, so this cell only works inside IPython/Jupyter)
!nvidia-smi

Tue Feb 21 21:38:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:21:00.0  On |                  Off |
| 30%   49C    P2    78W / 300W |    441MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Create helper functions

#### Returns a data frame holding one random draw of relative percent of occurrence from beta distributions fit to the observed occurrence counts
#####   df: data frame where rows = samples and columns = ASVs

In [None]:
def ranRelPct(df, asLogOdds=True):
    """Draw one random set of relative occurrences from beta distributions.

    For a count ``c`` in a sample (row) whose counts sum to ``n``, a value is
    drawn from ``Beta(c + 1, n - c + 1)``.  Every row of draws is then
    rescaled so that it sums to 1.

    Parameters
    ----------
    df : cudf.DataFrame
        Occurrence counts, rows = samples and columns = ASVs.  All columns
        must be numeric so the frame can be viewed as a single 2-D array.
    asLogOdds : bool, default True
        If true, return ``log(p / (1 - p))`` of the rescaled draws instead of
        the draws themselves.

    Returns
    -------
    cudf.DataFrame
        Random draw with the same shape, index and column labels as ``df``.
    """
    # Operate on one 2-D device array rather than DataFrame.apply(): cuDF's
    # apply() does not support ``result_type`` and cannot run a Python
    # function that calls CuPy's random generator.
    counts = cp.asarray(df.values, dtype=cp.float64)
    # keepdims=True keeps the totals as a column vector so they broadcast
    # against every ASV of the same sample.
    row_totals = counts.sum(axis=1, keepdims=True)
    draws = cp.random.beta(counts + 1, row_totals - counts + 1)
    # rescale each sample so its relative occurrences sum to 1
    draws = draws / draws.sum(axis=1, keepdims=True)
    # convert to log-odds if requested
    if asLogOdds:
        draws = cp.log(draws / (1 - draws))
    # Re-attach the labels; passing the index/columns objects themselves
    # (not ``.values``) also works for string labels.
    return cudf.DataFrame(draws, index=df.index, columns=df.columns)

#### Draws a sample of relative percent of occurrence and conducts PCA on it
#####   df: data frame where rows = samples and columns = ASVs
#####   num_pcs: number of principal components to compute
#### Returns a dictionary containing : 
#####   df: data frame containing a random draw of the original count data frame as log-odds
#####   scores: array of PCA scores
#####   loadings: array of PCA loadings

In [None]:
def samplePCA(df, num_pcs):
    """Draw one random log-odds sample of ``df`` and run a PCA on it.

    Parameters
    ----------
    df : data frame
        Occurrence counts, rows = samples and columns = ASVs; passed
        unchanged to ``ranRelPct``.
    num_pcs : int
        Number of principal components requested from the PCA.

    Returns
    -------
    dict
        ``"df"``: the random draw (as log-odds) the PCA was fitted on,
        ``"scores"``: array of PCA scores for that draw,
        ``"loadings"``: array of PCA loadings (the transposed fitted
        ``components_``).
    """
    ran_df = ranRelPct(df, asLogOdds=True)
    # Ask cuML for CuPy output explicitly: by default it mirrors the input
    # type, and callers index the loadings as 2-D arrays (``mat[0, col]``).
    pca = PCA(n_components=num_pcs, output_type="cupy")
    scores = pca.fit_transform(ran_df)
    loadings = cp.transpose(pca.components_)
    return {
        "df": ran_df,
        "scores": scores,
        "loadings": loadings,
    }

#### For every matrix after the first: if the sign of the first element of a column differs from the sign of the first element of the same column in the first matrix, multiply all values in that column by -1

In [None]:
def harmonizeColumnSigns(mat_list):
    """Give every matrix the same per-column sign as the first matrix.

    The first row decides the sign of a column: whenever the sign of
    ``mat[0, col]`` differs from the sign of ``mat_list[0][0, col]``, the
    whole column ``col`` of ``mat`` is multiplied by -1.

    Parameters
    ----------
    mat_list : list of 2-D arrays
        Matrices with the same number of columns.  The matrices after the
        first are modified IN PLACE.

    Returns
    -------
    list
        The same ``mat_list`` object, returned for convenience.
    """
    if not mat_list:
        return mat_list
    # Signs of the reference matrix's first row, computed once for all
    # comparisons instead of once per element.
    ref_signs = cp.sign(mat_list[0][0, :])
    for mat in mat_list[1:]:
        # boolean mask of the columns whose sign disagrees with the reference
        flip = cp.sign(mat[0, :]) != ref_signs
        mat[:, flip] *= -1
    return mat_list

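#### Sorts the PCA loadings of one component, taken from a list of replicate loading matrices, by their median across replicates (largest first)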

In [None]:
def sortLoadings(loading_list, pc, asvs, asRanks=False):
    """Collect one component's loadings across replicates, sorted by median.

    Parameters
    ----------
    loading_list : list of 2-D arrays
        One loading matrix per replicate (rows = ASVs, columns = components).
        The matrices are sign-harmonized IN PLACE by ``harmonizeColumnSigns``.
    pc : int
        Zero-based index of the component to extract.
    asvs : sequence of labels
        ASV names, in the same order as the rows of the loading matrices.
    asRanks : bool, default False
        If true, replace the loadings of every replicate by their ranks
        within that replicate (ties get the average rank).

    Returns
    -------
    cudf.DataFrame
        Rows = ASVs (labelled with ``asvs``), columns = replicates, ordered
        by descending median across replicates.
    """
    # Harmonize signs across replicates
    harm_loadings = harmonizeColumnSigns(loading_list)
    # Create 3 dimensional array (ASV x component x replicate) and select
    # component 'pc'
    loadings = cp.stack(harm_loadings, axis=2)[:, pc, :]
    # Convert to ranks if 'asRanks == True'; ranking is done per column
    # (replicate) on the GPU through cuDF.
    if asRanks:
        per_replicate = cudf.DataFrame(cp.ascontiguousarray(loadings))
        loadings = per_replicate.rank(axis=0, method="average").values
    # Get sorted order (largest first) based on the median for each ASV
    row_sort = cp.argsort(cp.median(loadings, axis=1))[::-1]
    # Sort based on median, add ASV names (also sorted) and return data frame
    df = cudf.DataFrame(loadings[row_sort, :])
    # The labels live on the host, so they cannot be indexed with a device
    # array; pull the order back as plain Python ints first.
    df.index = [asvs[i] for i in row_sort.tolist()]
    return df

#### Run functions with one data set

In [None]:
# Tab-separated count table to analyse.
file1 = "./Data/Flyer2018_16S_table_counts.tsv"
# The first column of the file is used as the index; the table is then
# transposed (the helper functions expect rows = samples, columns = ASVs).
asvs1 = cudf.read_csv(
    file1,
    sep="\t",
    index_col=0,
).transpose()

In [3]:
# Ten independent random draws, each followed by a 3-component PCA
asv_PCAs = [samplePCA(asvs1, 3) for _ in range(10)]
# Keep only the loading matrix of every replicate
raw_loadings = [replicate["loadings"] for replicate in asv_PCAs]

In [None]:
# Rank-transformed loadings of the first component (index 0), sorted by
# their median across replicates
df = sortLoadings(raw_loadings, 0, asvs1.columns.values, True)
# seaborn plots host (pandas/NumPy) data, so copy the cuDF frame off the GPU
sns.heatmap(df.to_pandas())

#### Create a list of files to plot

In [4]:
# Set the folder containing the data
DATA_DIR = "./Data/"

file_names = ["Flyer2018_16S_table_counts.tsv",
              "Flyer2018_18S_table_counts.tsv"]
for file in file_names:
    # Create the file path
    file_path = DATA_DIR + file

    # Load the dataframe (first column = index), transposed like asvs1
    asvs = cudf.read_csv(file_path, index_col=0, sep="\t").transpose()

    # Ten random draws + 3-component PCA each; keep the loadings only
    asv_PCAs = [samplePCA(asvs, 3) for _ in range(10)]
    raw_loadings = [x["loadings"] for x in asv_PCAs]

    # Label the rows with the ASVs of THIS file, not those of another table
    df = sortLoadings(raw_loadings, 0, asvs.columns.values, True)

    # One figure per file; otherwise every heatmap is drawn onto the same
    # current axes.  seaborn needs host data, hence to_pandas().
    fig, ax = plt.subplots()
    sns.heatmap(df.to_pandas(), ax=ax)
    ax.set_title(file)