-
Notifications
You must be signed in to change notification settings - Fork 0
/
make-data-Levine-32dim.R
263 lines (198 loc) · 8.4 KB
/
make-data-Levine-32dim.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
##########################################################################################
# R script to prepare benchmark dataset Levine_32dim
#
# This is a 32-dimensional mass cytometry (CyTOF) dataset, consisting of expression levels
# of 32 surface protein markers. Cell population labels are available for 14 manually
# gated populations. Cells are healthy human bone marrow mononuclear cells (BMMCs), from 2
# patients.
#
# This R script loads the data, adds manually gated cell population labels, and exports it
# in SummarizedExperiment and flowSet formats.
#
# Source: "benchmark data set 2" in the following paper:
# Levine et al. (2015), "Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like
# Cells that Correlate with Prognosis", Cell, 162, 184-197.
#
# Link to paper: https://www.ncbi.nlm.nih.gov/pubmed/26095251
# Link to raw data: https://www.cytobank.org/cytobank/experiments/46102 (download the .zip
# file shown under "Exported Files")
#
# Lukas Weber, Jan 2019
##########################################################################################
# original version of this script can be found at:
# https://github.com/lmweber/cytometry-clustering-comparison
suppressPackageStartupMessages({
library(flowCore)
library(SummarizedExperiment)
library(magrittr)
})
# -------------
# Download data
# -------------
# create temporary directories
DIR_TMP <- "tmp"
dir.create(file.path(DIR_TMP))
dir.create(file.path(DIR_TMP, "fcs_files"))
# download from 'imlspenticton' server
URL <- "http://imlspenticton.uzh.ch/robinson_lab/HDCytoData"
DIR <- "Levine_32dim"
# download .fcs files
fcs_filename <- "Levine_32dim_fcs_files.zip"
download.file(file.path(URL, DIR, fcs_filename), destfile = file.path(DIR_TMP, "fcs_files", fcs_filename))
unzip(file.path(DIR_TMP, "fcs_files", fcs_filename), exdir = file.path(DIR_TMP, "fcs_files"))
files <- list.files(file.path(DIR_TMP, "fcs_files"), pattern = "\\.fcs$", full.names = TRUE)
# ---------
# Load data
# ---------
# one .fcs file per manually gated cluster, per patient (H1 and H2)
# 32 surface markers (dimensions), 14 manually gated populations, 2 patients (H1 and H2)
# "unassigned" cells are those where cluster labels are unavailable
files_assigned <- files[-grep("NotDebrisSinglets", files)]
files_unassigned <- files[grep("NotDebrisSinglets", files)]
# cell population names
pop_names <-
grep("H1\\.fcs$", files_assigned) %>%
files_assigned[.] %>%
gsub("(_H1\\.fcs$)", "", .) %>%
gsub("^.*_", "", .) %>%
gsub(" ", "_", .)
# channel and marker names
channel_name <- as.character(pData(parameters(read.FCS(files_assigned[1])))$name)
marker_name <- as.character(pData(parameters(read.FCS(files_assigned[1])))$desc)
# clean marker names
marker_name <- gsub("\\(.*", "", marker_name)
# original column names
col_names <- colnames(exprs(read.FCS(files_assigned[1])))
# marker classes (cell type, cell state, or none)
marker_class <- rep("none", length(marker_name))
marker_class[5:36] <- "type"
# vector of labels for patients H1 and H2
patient_names_assigned <- rep(NA, length(files_assigned))
patient_names_assigned[grep("_H1\\.fcs$", files_assigned)] <- "H1"
patient_names_assigned[grep("_H2\\.fcs$", files_assigned)] <- "H2"
patient_names_unassigned <- c("H1", "H2")
# load data and create vectors of population IDs and sample IDs (for "assigned" cells)
data_assigned <- matrix(nrow = 0, ncol = length(marker_name))
pop_id_assigned <- c()
patient_id_assigned <- c()
for (i in 1:length(files_assigned)) {
# load data
data_i <- exprs(read.FCS(files_assigned[i], transformation = FALSE, truncate_max_range = FALSE))
data_assigned <- rbind(data_assigned, data_i)
# population IDs
pop_id_assigned <- c(pop_id_assigned, rep(pop_names[((i - 1) %% length(pop_names)) + 1], nrow(data_i)))
# patient IDs
patient_id_assigned <- c(patient_id_assigned, rep(patient_names_assigned[i], nrow(data_i)))
}
dim(data_assigned) # 104,184 assigned cells, 32 dimensions
table(pop_id_assigned) # 14 manually gated clusters
table(patient_id_assigned) # 2 patient (72,463 and 31,721 assigned cells each)
stopifnot(nrow(data_assigned) == length(pop_id_assigned))
stopifnot(nrow(data_assigned) == length(patient_id_assigned))
# load data and create vectors of population IDs and sample IDs (for "unassigned" cells)
data_unassigned <- matrix(nrow = 0, ncol = length(marker_name))
pop_id_unassigned <- c()
patient_id_unassigned <- c()
for (i in 1:length(files_unassigned)) {
# load data
data_i <- exprs(read.FCS(files_unassigned[i], transformation = FALSE, truncate_max_range = FALSE))
data_unassigned <- rbind(data_unassigned, data_i)
# population IDs
pop_id_unassigned <- c(pop_id_unassigned, rep("unassigned", nrow(data_i)))
# patient IDs
patient_id_unassigned <- c(patient_id_unassigned, rep(patient_names_unassigned[i], nrow(data_i)))
}
dim(data_unassigned) # 161,443 unassigned cells
table(patient_id_unassigned) # 2 patients (118,888 and 42,555 unassigned cells each)
stopifnot(nrow(data_unassigned) == length(pop_id_unassigned))
stopifnot(nrow(data_unassigned) == length(patient_id_unassigned))
# combine assigned and unassigned cells
data_all <- rbind(data_assigned, data_unassigned)
pop_id_all <- c(pop_id_assigned, pop_id_unassigned)
patient_id_all <- c(patient_id_assigned, patient_id_unassigned)
stopifnot(nrow(data_all) == length(pop_id_all))
stopifnot(nrow(data_all) == length(patient_id_all))
# ----------------------
# Delete temporary files
# ----------------------
unlink(DIR_TMP, recursive = TRUE)
# ----------------------------------
# Create SummarizedExperiment object
# ----------------------------------
# set up row data
row_data <- data.frame(
patient_id = as.factor(patient_id_all),
population_id = as.factor(pop_id_all),
stringsAsFactors = FALSE
)
# set up column data
marker_info <- data.frame(
channel_name = as.character(channel_name),
marker_name = as.character(marker_name),
marker_class = factor(marker_class, levels = c("none", "type", "state")),
stringsAsFactors = FALSE
)
col_data <- marker_info
# set up expression data
d_exprs <- data_all
# use marker names as column names (for SummarizedExperiment)
colnames(d_exprs) <- marker_name
stopifnot(nrow(d_exprs) == nrow(row_data))
stopifnot(ncol(d_exprs) == nrow(col_data))
# create SummarizedExperiment object
d_SE <- SummarizedExperiment(
assays = list(exprs = d_exprs),
rowData = row_data,
colData = col_data
)
# ---------------------
# Create flowSet object
# ---------------------
# note: row data (e.g. population IDs) is stored as additional columns of data in the
# expression matrices; additional information from row data and column data (e.g. marker
# classes, cell population names) is stored in 'description' slot
# table of cell population information
population_info <- data.frame(
population_id = seq_len(length(pop_names) + 1),
population_name = c(pop_names, "unassigned"),
stringsAsFactors = FALSE
)
# create list of extra columns of data for each sample
cols_keep <- 2
row_data_fs <- do.call("cbind", lapply(row_data[, cols_keep, drop = FALSE], as.numeric))
stopifnot(nrow(row_data_fs) == nrow(row_data))
row_data_fs_list <- lapply(split(row_data_fs, row_data$patient_id), matrix, ncol = ncol(row_data_fs))
patient_id_names <- names(table(row_data$patient_id))
stopifnot(all(names(row_data_fs_list) == patient_id_names))
# replace column names
row_data_fs_list <- lapply(row_data_fs_list, function(d) {
colnames(d) <- colnames(row_data_fs)
d
})
# add extra columns of data and create new flowSet object
exprs_fs_list <- lapply(split(d_exprs, row_data$patient_id), matrix, ncol = ncol(d_exprs))
exprs_fs_list <- lapply(exprs_fs_list, function(e) {
# use original column names (for flowSet)
colnames(e) <- col_names
e
})
d_flowFrames_list <- mapply(function(e, extra_cols) {
# combine and create flowFrame
stopifnot(nrow(e) == nrow(extra_cols))
flowFrame(cbind(e, extra_cols))
}, exprs_fs_list, row_data_fs_list)
d_flowSet <- flowSet(d_flowFrames_list)
# include additional information in 'description' slot
for (i in seq_along(d_flowSet)) {
# sample information
description(d_flowSet[[i]])$PATIENT_ID <- identifier(d_flowSet[[i]])
# data frame of marker information
description(d_flowSet[[i]])$MARKER_INFO <- marker_info
# data frame of cell population information
description(d_flowSet[[i]])$POPULATION_INFO <- population_info
}
# ------------
# Save objects
# ------------
save(d_SE, file = "Levine_32dim_SE.rda")
save(d_flowSet, file = "Levine_32dim_flowSet.rda")