In [30]:
suppressMessages(library(DESeq2))
suppressMessages(library(edgeR))
suppressMessages(library(lubridate))
suppressMessages(library(dplyr))
suppressMessages(library(tidyverse))
suppressMessages(library(gsheet))
suppressMessages(library(data.table))
suppressMessages(library(vegan))
suppressMessages(library(EnvStats))
suppressMessages(library(stringr))

In [46]:
# Start from a clean slate: remove every object in the current environment
# (the global environment when this runs at top level). Attached packages are
# not affected.
rm(list=ls())
# Relative paths used later in this script (e.g. "metadata/QC_pass_samples.csv")
# are resolved against this project directory.
setwd("/workdir/omm35/paper_urine_cfrna")
# Run the project's ggplot theme script. Presumably it defines shared plotting
# theme settings -- its contents are not visible here, TODO confirm.
source("/workdir/omm35/paper_urine_cfrna/scripts/theme_ggplot_cfrna.R")

In [48]:
# ---- Healthy urine: mean cell-type-of-origin fractions ----

# Metadata for samples that passed QC, restricted to the two healthy biofluids
# used in this comparison.
samples_after_qc <- fread("metadata/QC_pass_samples.csv")
samples_after_qc <- samples_after_qc %>%
  filter(biofluid %in% c("healthy_plasma", "healthy_urine"))

# Urine deconvolution output. The first column is assumed to be `sample_id`;
# every remaining column is treated as one tissue's per-sample fraction.
tissues.urine <- fread(
  "/workdir/omm35/paper_urine_cfrna/cto_deconvolution_output/healthy_urine_updated_ref_BP.protein_coding.tsv",
  header = TRUE
)
# Sample IDs in this file use "." where the QC metadata uses "-".
tissues.urine$sample_id <- gsub("\\.", "-", tissues.urine$sample_id)
# Keep only samples that passed QC.
tissues.urine <- tissues.urine[tissues.urine$sample_id %in% samples_after_qc$sample_id, ]

# Mean fraction per tissue across the retained samples (every column except
# the first). No `row.names` argument is passed: the previous
# `row.names = TRUE` was not a valid value and only triggered a warning that
# it was being ignored.
urine_means <- as.data.frame(colMeans(tissues.urine[, 2:ncol(tissues.urine)]))

# Long table with one row per tissue: name and mean fraction (0-1 scale).
column.name <- colnames(tissues.urine)
mat <- data.frame(
  tissue = column.name[-1],
  fraction = urine_means[[1]],
  stringsAsFactors = FALSE
)

# Keep the five tissues with the largest mean fraction (sorted, largest
# first) and collapse the remainder into a single "other" row so the rows
# still add up to 1.
urine <- mat %>%
  arrange(desc(fraction)) %>%
  slice(1:5)
urine_other <- data.frame(tissue = "other", fraction = 1 - sum(urine$fraction))
urine <- rbind(urine, urine_other)

# Express the values as percentages. The plasma table built later in this
# script is scaled by 100 before both tables are row-bound and drawn in pie
# facets that share one scale; leaving urine on the 0-1 scale would shrink its
# pie to a sliver and make the "%" labels below wrong by a factor of 100.
urine$fraction <- 100 * urine$fraction

# Percentage labels (two decimals) for the six rows of `urine`.
urine_lab <- paste0(round(urine$fraction, 2), "%")

# Percentages for all tissues, not just the top five.
mat$percent <- mat$fraction * 100

In [7]:

# ---- Healthy plasma: same summary as for urine, then merge both ----

# Plasma deconvolution output. The first column is assumed to be `sample_id`;
# every remaining column is treated as one tissue's per-sample fraction.
tissues.plasma <- fread(
  "/workdir/omm35/paper_urine_cfrna/cto_deconvolution_output/healthy_plasma_updated_ref_BP.protein_coding.tsv",
  header = TRUE
)
# Sample IDs in this file use "." where the QC metadata uses "-".
tissues.plasma$sample_id <- gsub("\\.", "-", tissues.plasma$sample_id)
# Keep only samples that passed QC.
tissues.plasma <- tissues.plasma[tissues.plasma$sample_id %in% samples_after_qc$sample_id, ]

# Mean fraction per tissue across the retained samples. No `row.names`
# argument is passed: the previous `row.names = TRUE` was not a valid value
# and only triggered a warning that it was being ignored.
plasma_means <- as.data.frame(colMeans(tissues.plasma[, 2:ncol(tissues.plasma)]))

# Long table with one row per tissue: name and mean fraction (0-1 scale).
column.name <- colnames(tissues.plasma)
mat <- data.frame(
  tissue = column.name[-1],
  fraction = plasma_means[[1]],
  stringsAsFactors = FALSE
)

# Top five tissues by mean fraction (sorted, largest first) plus an "other"
# row holding the remainder so the rows add up to 1.
plasma <- mat %>%
  arrange(desc(fraction)) %>%
  slice(1:5)
plasma_other <- data.frame(tissue = "other", fraction = 1 - sum(plasma$fraction))
plasma <- rbind(plasma, plasma_other)

# Convert to percentages and build two-decimal "%" labels.
plasma$fraction <- 100 * plasma$fraction
plasma_lab <- paste0(round(plasma$fraction, 2), "%")

# Tag each table with its biofluid and stack them for the faceted pie chart.
# NOTE(review): both tables must be on the same scale (percent) at this point,
# because the pie facets share a single y scale.
urine$biofluid <- "urine"
plasma$biofluid <- "plasma"
urine.plasma.df <- rbind(plasma, urine)

In [6]:
# Truncate tissue names to at most 40 characters for the legend.
urine.plasma.df <- urine.plasma.df %>%
  mutate(tissue = substr(tissue, 1, 40))

# Candidate fill colours. The order matters: it is shuffled below with a fixed
# seed so the tissue-to-colour assignment is reproducible.
color_palette <- c(
  "#4477AA", "#EE6677", "#228833", "#CCBB44", "#66CCEE", "#AA3377",
  "#88CCEE", "#CC6677", "#DDCC77", "#117733", "#332288", "#AA4499",
  "#44AA99", "#999933", "#882255", "#661100", "#6699CC", "#888888",
  "#BBBBBB", "#E54C44", "#6DED88", "#732D9F", "#01A03F", "#FF85EE",
  "#287500", "#705CD2", "#EFD245", "#0F80F8", "#E2A423", "#310D67",
  "#AFE382", "#6E006A", "#01A665", "#D944A6", "#6C8400", "#A493FF",
  "#E98429", "#0175CE", "#FDCD69", "#002965", "#A9E395", "#F14793",
  "#006630", "#FF8BC0", "#525F00", "#7EACFF", "#B0320A", "#004589",
  "#FD7547", "#3B1B55", "#FFAF73", "#AB7EC0", "#A05500", "#E1B6FF",
  "#654700", "#A60030", "#FF7D6D", "#6C0200", "#800030"
)
set.seed(1000)
color_palette <- sample(color_palette)

# One pie per biofluid: stacked bars of width 1 wrapped onto polar
# coordinates, with thin white borders separating the slices.
ggplot(data = urine.plasma.df, mapping = aes(x = "", y = fraction, fill = tissue)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y", start = 0) +
  facet_grid(. ~ biofluid) +
  scale_fill_manual(values = color_palette) +
  theme_void() +
  theme(
    # Axis text settings kept for reference (disabled):
    # axis.title.x = element_text(size = 8, family = "Helvetica", color = "black"),
    # axis.text.x = element_text(size = 8, family = "Helvetica", color = "black"),
    # axis.title.y = element_text(size = 8, family = "Helvetica", color = "black"),
    # axis.text.y = element_text(size = 6, family = "Helvetica", color = "black"),
    # legend.position = c(0.8, 0.8),
    legend.position = "bottom",
    legend.text = element_text(size = 6),
    legend.title = element_blank(),
    legend.box.background = element_rect()
  )