# Query genomic annotation using the biomaRt package from Bioconductor

In [3]:
library(biomaRt)
library(tidyverse)

“running command 'timedatectl' had status 1”
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.3     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ dplyr   1.0.6
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::select() masks biomaRt::select()



In [4]:
# Connect to the mouse gene dataset of the Ensembl release 104 archive.
# useMart()'s `version` argument is an alternative to the `biomart` name for
# picking a mart on the server being queried; it does not select an archived
# release, so the original call depends on release 104 being the current one.
# useEnsembl(version = 104) pins the annotation to that release explicitly.
mymart <- biomaRt::useEnsembl(
    biomart = "ensembl",
    dataset = "mmusculus_gene_ensembl",
    version = 104
)

## Example: query entrez id, and MGI id and symbol for ensembl ids

### List filters

In [5]:
# Show only the available filters whose name mentions "ensembl".
mymart %>%
    listFilters() %>%
    dplyr::filter(stringr::str_detect(name, "ensembl"))

name,description
<chr>,<chr>
ensembl_gene_id,Gene stable ID(s) [e.g. ENSMUSG00000000001]
ensembl_gene_id_version,Gene stable ID(s) with version [e.g. ENSMUSG00000000001.5]
ensembl_transcript_id,Transcript stable ID(s) [e.g. ENSMUST00000000001]
ensembl_transcript_id_version,Transcript stable ID(s) with version [e.g. ENSMUST00000000001.5]
ensembl_peptide_id,Protein stable ID(s) [e.g. ENSMUSP00000000001]
ensembl_peptide_id_version,Protein stable ID(s) with version [e.g. ENSMUSP00000000001.5]
ensembl_exon_id,Exon ID(s) [e.g. ENSMUSE00000097910]


### List attributes

In [6]:
# Show the attributes whose name mentions ensembl, entrez or mgi.
mymart %>%
    listAttributes() %>%
    dplyr::filter(stringr::str_detect(name, "ensembl|entrez|mgi"))

name,description,page
<chr>,<chr>,<chr>
ensembl_gene_id,Gene stable ID,feature_page
ensembl_gene_id_version,Gene stable ID version,feature_page
ensembl_transcript_id,Transcript stable ID,feature_page
ensembl_transcript_id_version,Transcript stable ID version,feature_page
ensembl_peptide_id,Protein stable ID,feature_page
ensembl_peptide_id_version,Protein stable ID version,feature_page
ensembl_exon_id,Exon stable ID,feature_page
entrezgene_trans_name,EntrezGene transcript name ID,feature_page
mgi_description,MGI description,feature_page
mgi_symbol,MGI symbol,feature_page


### sample query for two genes

Query by ensembl id

In [7]:
# Look up the Entrez id, MGI id and MGI symbol of two genes, selecting the
# genes by their Ensembl gene id.
biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id", "mgi_id", "mgi_symbol"),
    filters = "ensembl_gene_id",
    values = c("ENSMUSG00000023951", "ENSMUSG00000041872"),
    mart = mymart
)

ensembl_gene_id,entrezgene_id,mgi_id,mgi_symbol
<chr>,<int>,<chr>,<chr>
ENSMUSG00000023951,22339,MGI:103178,Vegfa
ENSMUSG00000041872,257630,MGI:2676631,Il17f


Conversely, query using entrez ids

In [None]:
# Same attributes as the previous query, but this time the genes are selected
# by their Entrez gene id.
biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id", "mgi_id", "mgi_symbol"),
    filters = "entrezgene_id",
    values = c(22339, 257630),
    mart = mymart
)

## Query gene ids for pathway analysis

A pathway analysis is by design limited to the genes annotated in pathways. Accordingly, we can limit our attention to this subset of genes.

### Import Pathways

In [1]:
# Directory and file holding the pathway gene sets (Entrez ids, per file name).
annodir <- "./resource/"
annofile <- file.path(annodir, "Mm.h.all.v7.1.entrez.rds")

# Record the checksum of the file that is about to be read.
tools::md5sum(annofile)

# Load the pathway collection into `Mm.H`.
Mm.H <- readRDS(annofile)

### Query annotation for genes annotated in the pathways

In [2]:
# Fetch genomic coordinates and gene identifiers for every distinct Entrez id
# that appears in the pathway collection `Mm.H`; keep the table in `pathwayids`.
pathwayids <- biomaRt::getBM(
    attributes = c(
        "chromosome_name", "start_position", "end_position", "strand",
        "ensembl_gene_id", "entrezgene_id", "mgi_id", "mgi_symbol"
    ),
    filters = "entrezgene_id",
    values = as.integer(unique(unlist(Mm.H))),
    mart = mymart
)

ERROR: Error in martCheck(mart): object 'mymart' not found


In [None]:
# Preview the first rows of the query result.
head(pathwayids)

### 

Number of pathway ids with no query result

In [None]:
# Count the pathway Entrez ids that are absent from the biomaRt result.
as.integer(unlist(Mm.H)) %>%
    setdiff(pathwayids[["entrezgene_id"]]) %>%
    length()

Pathway ids with non-unique ensembl id

In [None]:
# List the rows whose Ensembl gene id occurs more than once in `pathwayids`,
# sorted by Ensembl gene id. The grouping is only needed for the count, so it
# is dropped again before the result is shown.
pathwayids %>%
    dplyr::group_by(ensembl_gene_id) %>%
    dplyr::filter(dplyr::n() > 1) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(ensembl_gene_id)

In [None]:
# Spot-check the annotation returned for two selected Entrez ids.
biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id", "mgi_id", "mgi_symbol"),
    filters = "entrezgene_id",
    values = c(19047, 434233),
    mart = mymart
)

In [None]:
# Spot-check the annotation returned for three selected Entrez ids.
biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id", "mgi_id", "mgi_symbol"),
    filters = "entrezgene_id",
    values = c(100041273, 100042503, 68194),
    mart = mymart
)

In [None]:
# Echo the Entrez id under inspection, then query its annotation.
56066

biomaRt::getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id", "mgi_id", "mgi_symbol"),
    filters = "entrezgene_id",
    values = 56066,
    mart = mymart
)

## Import ensembl ids from study data

In [None]:
# Show the filters whose name (first column) mentions "entrez". The work is
# wrapped in local() so the helper variables do not leak into the session.
local({
    available_filters <- listFilters(mymart)
    is_entrez <- grepl("entrez", available_filters[, 1])
    available_filters[is_entrez, ]
})

## Query annotation data from MGI (http://www.informatics.jax.org)

In [None]:
# Build a string made of fifteen "c" characters.
strrep("c", 15)

In [None]:
# Location of the MGI gene model coordinate report (tab-separated).
mgifile <- "http://www.informatics.jax.org/downloads/reports/MGI_Gene_Model_Coord.rpt"

# Read the report into `mgidat`, letting readr guess the column types.
mgidat <- readr::read_tsv(mgifile)

In [None]:
# Display the imported MGI table.
mgidat 

## Get rid of genes listed as "null"

In [None]:
# Keep only the rows where neither the Entrez nor the Ensembl gene id is the
# literal string "null", and overwrite `mgidat` with the reduced table.
mgidat <- mgidat %>%
    dplyr::filter(
        `6. Entrez gene id` != "null",
        `11. Ensembl gene id` != "null"
    )

# Display the filtered table.
mgidat

### Check if ids are unique

In [None]:
# Rows whose Entrez gene id occurs more than once in `mgidat`. The grouping is
# only needed for the count, so it is dropped before the result is shown.
mgidat %>%
    dplyr::group_by(`6. Entrez gene id`) %>%
    dplyr::filter(dplyr::n() > 1) %>%
    dplyr::ungroup()

In [None]:
# Rows whose Ensembl gene id occurs more than once in `mgidat`. The grouping is
# only needed for the count, so it is dropped before the result is shown.
mgidat %>%
    dplyr::group_by(`11. Ensembl gene id`) %>%
    dplyr::filter(dplyr::n() > 1) %>%
    dplyr::ungroup()

### Number of pathway genes not in annotation file

In [None]:
# Count the pathway gene ids that do not occur in the MGI Entrez id column.
unlist(Mm.H) %>%
    setdiff(mgidat[["6. Entrez gene id"]]) %>%
    length()

### Set location of analysis results

In [None]:
# Directory holding the saved DESeq2 objects.
procdir <- "./proc"

# Additive-model object and its checksum.
addfile <- file.path(procdir, "PRJNA668393-DESeq2-additive.RDS")
tools::md5sum(addfile)

# Multiplicative-model object and its checksum.
multfile <- file.path(procdir, "PRJNA668393-DESeq2-multiplicative.RDS")
tools::md5sum(multfile)

### 

In [None]:
### The Ensembl ids from the GTF file end with a version suffix (".<digits>").
### One can use str_remove to strip that suffix from the id.
### The dot must be escaped: an unescaped "." matches any character, so an id
### that carries no suffix would otherwise lose its last few digits.
version_suffix <- "\\.[0-9]+$"
stringr::str_remove("ENSMUSG12345.01", version_suffix)
stringr::str_remove("ENSMUSG12345.1", version_suffix)



### Create a named vector (using entrez ids) for statistics from additive model

In [None]:
### Build the named statistic vector for the additive model:
### - drop genes with NA statistics
### - clean the Ensembl id by dropping the version suffix (".<digits>"); the
###   dot is escaped so that an id without a suffix is left untouched
### - merge with the MGI table by the clean Ensembl id using an inner join
### - only keep genes whose Entrez id is annotated in the pathways (Mm.H)
### - name each statistic by its Entrez gene id

addstat <- readRDS(addfile) %>%
    DESeq2::results(tidy = TRUE) %>%
    tidyr::drop_na(stat) %>%
    dplyr::mutate(ensid = stringr::str_remove(row, "\\.[0-9]+$")) %>%
    dplyr::inner_join(mgidat, by = c("ensid" = "11. Ensembl gene id")) %>%
    dplyr::filter(`6. Entrez gene id` %in% unique(unlist(Mm.H))) %>%
    dplyr::pull(stat, name = "6. Entrez gene id")

#### spot check the conversion for three genes

In [None]:
# Draw three random positions of `addstat` for the spot check.
# seq_along() avoids the c(1, 0) sequence that 1:length() yields on an empty
# vector.
idx <- sample(seq_along(addstat), 3)
idx

In [None]:
# Pull the Entrez / Ensembl id pairs of the sampled genes from the MGI table.
testres <- mgidat %>%
    dplyr::filter(`6. Entrez gene id` %in% names(addstat[idx])) %>%
    dplyr::select(`6. Entrez gene id`, `11. Ensembl gene id`)

# Display the id pairs.
testres

In [None]:
# Statistics stored under the sampled Entrez ids ...
addstat[idx]

# ... compared with the DESeq2 rows whose id contains one of the matching
# Ensembl ids (the ids are combined into a single "a|b|c" pattern).
addfile %>%
    readRDS() %>%
    DESeq2::results(tidy = TRUE) %>%
    dplyr::filter(
        stringr::str_detect(
            row,
            paste(testres[["11. Ensembl gene id"]], collapse = "|")
        )
    )

### Create a named vector (using entrez ids) for statistics from multiplicative model

In [None]:
### Build the named statistic vector for the multiplicative model:
### - drop genes with NA statistics
### - clean the Ensembl id by dropping the version suffix (".<digits>"); the
###   dot is escaped so that an id without a suffix is left untouched
### - merge with the MGI table by the clean Ensembl id using an inner join
### - only keep genes whose Entrez id is annotated in the pathways (Mm.H)
### - name each statistic by its Entrez gene id

multstat <- readRDS(multfile) %>%
    DESeq2::results(tidy = TRUE) %>%
    tidyr::drop_na(stat) %>%
    dplyr::mutate(ensid = stringr::str_remove(row, "\\.[0-9]+$")) %>%
    dplyr::inner_join(mgidat, by = c("ensid" = "11. Ensembl gene id")) %>%
    dplyr::filter(`6. Entrez gene id` %in% unique(unlist(Mm.H))) %>%
    dplyr::pull(stat, name = "6. Entrez gene id")

#### Spot check

In [None]:
# Draw three random positions of `multstat` for the spot check.
# The indices are used to subset `multstat`, so they must be sampled from its
# own positions; sampling from `addstat` could yield out-of-range indices when
# the two vectors differ in length.
idx <- sample(seq_along(multstat), 3)
idx

In [None]:
# Pull the Entrez / Ensembl id pairs of the sampled genes from the MGI table.
testres <- mgidat %>%
    dplyr::filter(`6. Entrez gene id` %in% names(multstat[idx])) %>%
    dplyr::select(`6. Entrez gene id`, `11. Ensembl gene id`)

# Display the id pairs.
testres

In [None]:
# Statistics stored under the sampled Entrez ids ...
multstat[idx]

# ... compared with the DESeq2 rows whose id contains one of the matching
# Ensembl ids (the ids are combined into a single "a|b|c" pattern).
multfile %>%
    readRDS() %>%
    DESeq2::results(tidy = TRUE) %>%
    dplyr::filter(
        stringr::str_detect(
            row,
            paste(testres[["11. Ensembl gene id"]], collapse = "|")
        )
    )

In [None]:
# Directory that receives the named statistic vectors.
outdir <- "./proc"

# Write the additive-model statistics and record the checksum of the file.
addstatfile <- file.path(outdir, "PRJNA668393-DESeq2-addstat.RDS")
saveRDS(addstat, file = addstatfile)

# Write the multiplicative-model statistics.
multstatfile <- file.path(outdir, "PRJNA668393-DESeq2-multstat.RDS")
saveRDS(multstat, file = multstatfile)

# Checksums of the two files just written.
tools::md5sum(addstatfile)
tools::md5sum(multstatfile)

In [None]:
# Record the R version and the attached packages used for this analysis.
utils::sessionInfo()