/
vis_identifier_dim_dist.R
182 lines (160 loc) 路 5.68 KB
/
vis_identifier_dim_dist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#' Visualize the distribution difference of samples after Molecule Identifier dimension reduction analysis
#'
#' NOTE: the dataset must be dense matrix in UCSC Xena data hubs.
#'
#'
#' @param dataset the dataset to obtain identifiers.
#' @param ids the molecule identifiers.
#' @param grp_df When `dataset` and `id` are all not `NULL`, it should be a `data.frame` with 2 columns.
#' - The first column refers to sample ID.
#' - The second column refers to groups indicated in axis X.
#' @param samples default is `NULL`, can be common sample names for two datasets.
#' @param return.data whether to reture the raw meta/matrix data (list) instead of plot
#' @param DR_method the dimension reduction method
#' @param add_margin the marginal plot (NULL, "density", "boxplot")
#' @param palette the color setting of RColorBrewer
#'
#' @return a `ggplot` object.
#' @export
#'
#' @examples
#' \donttest{
#' library(UCSCXenaTools)
#' expr_dataset <- "TCGA.LUAD.sampleMap/HiSeqV2_percentile"
#' ids = c("TP53", "KRAS", "PTEN", "MDM2", "CDKN1A")
#'
#' cli_dataset <- "TCGA.LUAD.sampleMap/LUAD_clinicalMatrix"
#' cli_df <- XenaGenerate(
#' subset = XenaDatasets == cli_dataset
#' ) %>%
#' XenaQuery() %>%
#' XenaDownload() %>%
#' XenaPrepare()
#' grp_df = cli_df[, c("sampleID", "gender")]
#' vis_identifier_dim_dist(expr_dataset, ids, grp_df, DR_method="PCA")
#' }
#' @export
vis_identifier_dim_dist = function(dataset = NULL, ids = NULL, grp_df, samples = NULL,
return.data = FALSE,
DR_method = c("PCA", "UMAP", "tSNE"), add_margin = NULL,
palette = "Set1"){
stopifnot(ncol(grp_df) > 1)
DR_method <- match.arg(DR_method)
colnames(grp_df)[1] <- "sample"
if (!is.null(dataset) && !is.null(ids)) {
df <- tryCatch(
{
purrr::map(ids, function(x) {
message("Querying data of identifier ", x, " from dataset ", dataset, " for DR")
data <- if (dataset == "custom_feature_dataset") query_custom_feature_value(x) else
query_molecule_value(dataset, x)
data <- dplyr::tibble(
sample = names(data),
y = as.numeric(data)
)
colnames(data)[2] <- x
data
}) %>%
purrr::reduce(dplyr::full_join, by = "sample")
},
error = function(e) {
rlang::inform("access data failed, the message is provided below")
rlang::warn(conditionMessage(e))
NULL
}
)
if (is.null(df)) return(NULL)
df <- dplyr::inner_join(grp_df, df, by = "sample")
} else {
message("Directly use 'grp_df' for comparison analysis.")
df <- grp_df
ids = colnames(grp_df)[-1:-2]
}
if (!is.null(samples)) {
df <- dplyr::filter(df, .data$sample %in% samples)
}
ids_NAN <- colnames(df[, ids])[apply(df[, ids], 2, function(x) all(is.na(x)))]
ids_SD0 <- colnames(df[, ids])[apply(df[, ids], 2, function(x) stats::sd(x) == 0)] %>% na.omit()
ids_OK <- setdiff(ids, c(ids_NAN, ids_SD0))
if (length(ids_OK) < 3) {
stop("The number of valid ids is less than three. Please inspect the input ids and data_type(?query_pancan_value)")
}
if (return.data) {
return(list(meta = grp_df, exp = df))
}
if (DR_method == "PCA") {
pca_obj <- prcomp(df[, ids_OK], center = TRUE, scale = TRUE)
res_dims <- pca_obj$x[, 1:2] %>%
as.data.frame() %>%
dplyr::rename("PC_1" = "PC1", "PC_2" = "PC2")
} else if (DR_method == "tSNE") {
set.seed(123)
tsne_obj <- Rtsne::Rtsne(df[, ids_OK])
res_dims <- tsne_obj$Y %>%
as.data.frame() %>%
dplyr::rename("tSNE_1" = "V1", "tSNE_2" = "V2")
} else if (DR_method == "UMAP") {
umap_obj <- umap::umap(df[, ids_OK])
res_dims <- umap_obj$layout %>%
as.data.frame() %>%
dplyr::rename("UMAP_1" = "V1", "UMAP_2" = "V2")
}
res_dims <- cbind(df[1:2], res_dims)
groups <- length(unique(res_dims[,colnames(grp_df)[2],drop=TRUE]))
if (groups < 2) {
stop("The number of valid group levels is less than two")
}
if (groups > 6) {
color <- grDevices::hcl(
h = seq(15, 375, length = groups + 1),
l = 65, c = 100
)[seq(groups)]
shape <- rep(16, groups)
} else {
color <- RColorBrewer::brewer.pal(n = 6, name = palette)[seq(groups)]
shape <- c(15:20)[seq(groups)]
}
p <- ggplot2::ggplot(res_dims,
aes_string(colnames(res_dims)[3], colnames(res_dims)[4], color = colnames(res_dims)[2], shape = colnames(res_dims)[2])) +
ggplot2::geom_point() +
ggplot2::stat_ellipse() +
ggplot2::theme_classic(base_size = 20) +
ggplot2::guides(
color = guide_legend(title = NULL),
shape = guide_legend(title = NULL)
) +
ggplot2::theme(
legend.background = element_blank(),
legend.position = c(0, 0),
legend.justification = c(0, 0)
) +
ggplot2::scale_color_manual(values = color) +
ggplot2::scale_shape_manual(values = shape)
if (!is.null(add_margin)) {
geom_type <- switch(add_margin,
"density" = geom_density,
"boxplot" = geom_boxplot,
stop("Please choose one of density/boxplot marginal type")
)
p_right <- cowplot::axis_canvas(p, axis = "x") +
geom_type(
data = p$data, aes_string(x = colnames(p$data)[3], fill = colnames(grp_df)[2]),
alpha = 0.8, linewidth = 0.3
) +
ggplot2::scale_fill_manual(values = color)
p_top <- cowplot::axis_canvas(p, axis = "y", coord_flip = TRUE) +
geom_type(
data = p$data, aes_string(x = colnames(p$data)[4], fill = colnames(grp_df)[2]),
alpha = 0.8, linewidth = 0.3
) +
coord_flip() +
ggplot2::scale_fill_manual(values = color)
p_tmp <- p %>%
cowplot::insert_xaxis_grob(p_right, grid::unit(.2, "null"), position = "top") %>%
cowplot::insert_yaxis_grob(p_top, grid::unit(.2, "null"), position = "right")
p_tmp2 <- cowplot::ggdraw(p_tmp)
p_tmp2$data <- p$data
p <- p_tmp2
}
return(p)
}