In [1]:
library("GEOquery")
library("curl")
library("XML")

# GEO series accession, passed to getGEO() further down
geo_id  <- "GSE93593"
# SRA study accession, used as the search term of the NCBI esearch request further down
sra_study <- "SRP096727" # TODO getting this ID can be automated by making a request to NCBI's eutils
# Path of the tab-separated file written by the final write.table() call
out_file <- "GSE93593_cell_features.tsv" # Will contain cell characteristics

Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min

Welcome to Bioconductor

    Vignettes contain introductory material; view with


In [2]:
# Reading GEO data
# GSEMatrix = TRUE asks for the parsed series matrix; the result is indexed
# with gsm[[1]] further down. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
gsm <- getGEO(geo_id, GSEMatrix = TRUE)

Found 1 file(s)
GSE93593_series_matrix.txt.gz
Parsed with column specification:
cols(
  .default = col_character()
)
See spec(...) for full column specifications.
File stored at: 
/var/folders/37/f_hljrvj7291y8ygpf4r54wh0000gq/T//RtmpGZKBen/GPL16791.soft


In [3]:
# List the phenotype (sample annotation) fields available in the first
# series matrix returned by getGEO()
names(pData(phenoData(gsm[[1]])))

In [4]:
# Getting metadata
# GEO phenotype columns to keep, and (in the same order) the names they get
# in the output table
cols <- c("title", "geo_accession", "cell type:ch1", "cre line:ch1", "days in culture:ch1")
cols_rename <- c("sampleName", "geo", "cellType_mother", "creLine", "days")

pheno <- pData(phenoData(gsm[[1]]))

# Fail early with a readable message if an expected column is absent,
# instead of the generic "undefined columns selected" error
missing_cols <- setdiff(cols, colnames(pheno))
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from the GEO phenotype data: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

metadata <- pheno[, cols, drop = FALSE]
colnames(metadata) <- cols_rename
# Composite label: cre line and days in culture joined by "_"
metadata$cellType <- paste0(metadata$creLine, "_", metadata$days)
head(metadata)

Unnamed: 0_level_0,sampleName,geo,cellType_mother,creLine,days,cellType
Unnamed: 0_level_1,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>
GSM2455567,D26Dn6A01,GSM2455567,cultured embryonic stem cells,DCX-,D26,DCX-_D26
GSM2455568,D26Dn6A02,GSM2455568,cultured embryonic stem cells,DCX-,D26,DCX-_D26
GSM2455569,D26Dn6A03,GSM2455569,cultured embryonic stem cells,DCX-,D26,DCX-_D26
GSM2455570,D26Dn6A04,GSM2455570,cultured embryonic stem cells,DCX-,D26,DCX-_D26
GSM2455571,D26Dn6A05,GSM2455571,cultured embryonic stem cells,DCX-,D26,DCX-_D26
GSM2455572,D26Dn6A06,GSM2455572,cultured embryonic stem cells,DCX-,D26,DCX-_D26


In [5]:
# Getting SRA run ids, useful for integrating with HCA data from the DCP

# Do a search for the project first. usehistory=y makes the response carry the
# QueryKey/WebEnv pair that the efetch request below refers to.
request <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=", sra_study, "&usehistory=y")
search_con <- curl(request)
# Close the connection explicitly so it is released whether or not the read succeeds
search_xml <- tryCatch(readLines(search_con), finally = close(search_con))
result <- xmlRoot(xmlParse(search_xml, asText = TRUE))

# Stop with a clear message when the search did not return a history handle
if (is.null(result[["QueryKey"]]) || is.null(result[["WebEnv"]])) {
  stop("esearch returned no QueryKey/WebEnv for ", sra_study, call. = FALSE)
}

# Download sra info table (comma-separated run info)
request <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&query_key=", xmlValue(result[["QueryKey"]]), "&WebEnv=", xmlValue(result[["WebEnv"]]), "&rettype=runinfo&retmode=text")
# quote/comment.char are set so that "'" and "#" inside a field are read as
# plain text rather than as a quote or the start of a comment
sra_info <- read.table(
  curl(request),
  sep = ",",
  header = TRUE,
  quote = "\"",
  comment.char = "",
  stringsAsFactors = FALSE
)

# Keep only rows whose SampleName starts with a GSM accession prefix
sra_info <- sra_info[grepl("^GSM", sra_info[, "SampleName"]), ]

# Row names must be unique: a sample listed in more than one row cannot be
# used as a lookup key
if (anyDuplicated(sra_info[, "SampleName"]) > 0) {
  stop(
    "Some GSM samples appear in more than one SRA run; ",
    "cannot index sra_info by SampleName",
    call. = FALSE
  )
}
rownames(sra_info) <- sra_info[, "SampleName"] # Rows can now be looked up by GSM id

In [10]:
# Merge sra run ids and sra sample ids into metadata from GEO
# match() pairs the ids exactly; character row indexing on a data.frame would
# also accept a unique partial match of a row name
sra_row <- match(metadata$geo, rownames(sra_info))
if (anyNA(sra_row)) {
  warning(
    sum(is.na(sra_row)), " GEO sample(s) have no matching SRA run info",
    call. = FALSE
  )
}
# Samples without a match get NA in both columns
metadata$sra_run <- sra_info[sra_row, "Run"]
metadata$sra_sample <- sra_info[sra_row, "Sample"]

In [7]:
# Save data
# The table is transposed first, so every output line holds one feature: the
# feature name followed by one tab-separated value per sample. No header line
# is written. Note that `metadata` is a character matrix after this cell.
metadata <- t(metadata)
write.table(metadata, out_file, sep = "\t", quote = FALSE, col.names = FALSE)