### Installation

In [1]:
# if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#   BiocManager::install("cicero")

Vignette: https://www.bioconductor.org/packages/devel/bioc/vignettes/cicero/inst/doc/website.html

### Import packages

In [2]:
library(cicero)
library(data.table)
library(Matrix)
library(proxy)
library(reshape2)
library(BuenColors)
library(umap)

Loading required package: monocle
Loading required package: Matrix
Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:Matrix’:

    colMeans, colSums, rowMeans, rowSums, which

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank,

### Preprocess

`bsub < count_reads_peaks.sh`

In [3]:
# Directory with the per-file read-count tables; every ".txt" file in it is
# picked up, and the number of files found is displayed.
path <- "./count_reads_peaks_output/"
files <- list.files(path = path, pattern = "\\.txt$")
length(files)

In [None]:
# Read column "V4" from every count file. "V4" is the name fread() gives the
# 4th column when it finds no header row, so the files are effectively treated
# as header-less tables (presumably V4 is the per-peak read count -- TODO
# confirm against count_reads_peaks.sh).
datalist <- lapply(files, function(x) fread(paste0(path, x))$V4)

# Fail early on malformed input: cbind() silently drops NULL entries (file
# without a V4 column) and recycles shorter vectors, either of which would
# misalign the matrix with `files` / the peak list.
stopifnot(
  length(datalist) > 0,
  !vapply(datalist, is.null, logical(1)),
  length(unique(lengths(datalist))) == 1
)

# One column per file, one row per line of the count files.
datafr <- do.call("cbind", datalist)

In [5]:
# Dimensions of the combined count matrix (rows x columns; one column per file).
dim(datafr)

In [6]:
# Peak regions from the merged BED file: tab-separated, no header row, so the
# columns are auto-named V1, V2, V3, ...; strings are kept as character.
df_regions <- read.csv(
  file = "../../input/combined.sorted.merged.bed",
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [7]:
# Size of the peak table read from the merged BED file.
dim(df_regions)

In [8]:
# One identifier per peak, built from the first three BED columns joined by "_"
# (e.g. chr1_3002715_3002962).
peaknames <- with(df_regions, paste(V1, V2, V3, sep = "_"))

In [9]:
# Preview the first few peak identifiers.
head(peaknames)

In [10]:
# Preview the labels derived from the file names: each name is split on "."
# and its second field is taken.
head(sapply(strsplit(files,'\\.'),'[', 2))

In [11]:
# Label the count matrix: rows are the peak identifiers, columns are the
# second "."-separated field of each count file name.
dimnames(datafr) <- list(
  peaknames,
  sapply(strsplit(files, "\\."), "[", 2)
)

In [12]:
# Inspect the top-left 5 x 5 corner of the labelled count matrix.
datafr[1:5,1:5]

Unnamed: 0,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC,AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC,AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC,AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC
chr1_3002715_3002962,0,0,0,0,0
chr1_3037090_3037634,0,0,0,0,0
chr1_3084622_3085850,0,0,0,0,0
chr1_3103610_3104006,0,0,0,0,0
chr1_3106869_3107182,0,0,0,0,0


In [13]:
# Re-check the matrix dimensions after the row/column names were assigned.
dim(datafr)

In [4]:
# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')

In [15]:
# Convert the dense count matrix to triplet (i, j, x) form so that only the
# non-zero entries are listed.
mat_sparse <- as(datafr, "dgTMatrix")

# Long-format table with one row per non-zero entry. The @i / @j slots are
# 0-based, hence the + 1 when indexing the dimnames. Columns are built with
# their final types directly, which avoids pushing the counts through a
# character matrix (cbind) and parsing them back with as.numeric().
cicero_data <- data.frame(
  Peak = rownames(datafr)[mat_sparse@i + 1],
  Cell = colnames(datafr)[mat_sparse@j + 1],
  Count = as.numeric(mat_sparse@x),
  stringsAsFactors = FALSE
)

In [16]:
# Preview the long-format (Peak, Cell, Count) table.
head(cicero_data)

Peak,Cell,Count
chr1_3254750_3255907,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,2
chr1_3378202_3378888,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,2
chr1_6443336_6444923,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,2
chr1_6584548_6585071,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,1
chr1_7127882_7129914,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,2
chr1_7136712_7137602,AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT,1


### Obtain Feature Matrix

In [17]:
# Wall-clock start of the feature-matrix computation; paired with end_time below.
start_time <- Sys.time()

In [18]:
# Metadata table with a header row; the first column becomes the row names
# (rows are later looked up by cell name). quote = "" turns off quote
# handling and strings stay character.
metadata <- read.table(
  file = "../../input/metadata.tsv",
  header = TRUE,
  row.names = 1,
  quote = "",
  stringsAsFactors = FALSE
)

In [28]:
# Build the Cicero/monocle CellDataSet from the (Peak, Cell, Count) table;
# binarize = TRUE asks for the counts to be binarized.
input_cds <- make_atac_cds(cicero_data, binarize = TRUE)

In [29]:
# Attach a label to every cell by looking its name up in the metadata row
# names; cells missing from the metadata end up with NA.
pData(input_cds)$label = metadata[rownames(pData(input_cds)),'label']

In [30]:
# Dimensions of the data set before filtering.
dim(input_cds)

In [31]:
# Ensure there are no peaks included with zero reads: keep only the rows
# (peaks) whose total over all cells is non-zero.
input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 

In [32]:
# Dimensions after removing the zero-read peaks.
dim(input_cds)

In [33]:
# Print the CellDataSet summary (features, samples, annotation columns).
input_cds

CellDataSet (storageMode: environment)
assayData: 385237 features, 12178 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: AGCGATAGAACGAATTCGAAGCCTACGACCTATCCT
    AGCGATAGAACGAATTCGAAGCCTACGATATAGCCT ...
    TCTCGCGCTTGGTAAGGATTGGTAGTCGTATAGCCT (12178 total)
  varLabels: cells Size_Factor num_genes_expressed label
  varMetadata: labelDescription
featureData
  featureNames: chr1_3002715_3002962 chr1_3037090_3037634 ...
    chrY_631121_631487 (385237 total)
  fvarLabels: site_name chr ... num_cells_expressed (5 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:  

In [34]:
# Fix the RNG seed (presumably for the stochastic tSNE step further down).
set.seed(2019)
# monocle detectGenes(); presumably refreshes the per-cell / per-feature
# detection counts on the filtered data set -- TODO confirm.
input_cds <- detectGenes(input_cds)

In [35]:
# Estimate per-cell size factors (stored in the Size_Factor column of pData).
input_cds <- estimateSizeFactors(input_cds)

In [36]:
# 2-component tSNE embedding of the cells (num_dim = 15, no extra
# normalisation).
# NOTE(review): in the recorded run this call failed with "NAs introduced by
# coercion to integer range"; the cells that depend on its result were not
# executed. See the upstream issue linked below the error output.
input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=15,
                        reduction_method = 'tSNE', norm_method = "none")

"NAs introduced by coercion to integer range"

ERROR: Error in if (any(i < 0L)) {: missing value where TRUE/FALSE needed


#### This error has been reported previously  
https://github.com/cole-trapnell-lab/cicero-release/issues/25

In [None]:
# Transpose the stored reduced-dimension matrix so that each row is a cell
# (row names are set from the cell names further down).
tsne_coords <- t(reducedDimA(input_cds))

In [None]:
# Preview the first few rows of the tSNE coordinates.
head(tsne_coords)

In [None]:
# Plotting table: the two tSNE coordinates plus the cell label.
# The columns are created with their final types. Binding the numeric
# coordinates and the labels with cbind() would coerce everything to
# character, and parsing the coordinates back with as.numeric() can lose
# floating-point precision.
df_tsne_coords <- data.frame(
  tsne_1 = as.numeric(tsne_coords[, 1]),
  tsne_2 = as.numeric(tsne_coords[, 2]),
  label = as.character(pData(input_cds)$label),
  stringsAsFactors = FALSE
)

In [None]:
# tSNE scatter plot, one point per cell, coloured by label. The rows are
# passed through shuf() before plotting (presumably to randomise the drawing
# order -- see BuenColors).
options(repr.plot.width = 4, repr.plot.height = 3)
p <- ggplot(
  data = shuf(df_tsne_coords),
  mapping = aes(x = tsne_1, y = tsne_2, color = label)
) +
  theme_classic() +
  geom_point(size = 1) +
  ggtitle("tSNE")
p

In [None]:
# mm9 chromosome-size table (used as the genome reference for run_cicero)
# and the path of the mm9 TSS BED file.
genome_ref <- read.table(file = "../../input/mm9/mm9.chrom.sizes")
mouse.mm9.genome <- genome_ref
file_tss <- "../../input/mm9/mm9-tss.bed"

In [None]:
# Preview the chromosome-size table.
head(genome_ref)

In [None]:
# Name the coordinate rows after the cells so they can be matched to the CDS,
# build the aggregated Cicero CDS from the reduced coordinates, then compute
# the co-accessibility connections.
rownames(tsne_coords) <- rownames(pData(input_cds))
cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
# Takes a few minutes to run
conns <- run_cicero(cicero_cds, genome_ref)

In [None]:
# Load the TSS BED file (tab-separated, auto-named columns V1..Vn) and call
# its 4th column "gene".
gene_annotation <- read.table(file_tss, sep = "\t")
names(gene_annotation)[4] <- "gene"

# Shrink every record to a 1-bp-wide interval at one end, chosen by the
# strand in V5: "-" records keep their end (V3), "+" records keep their
# start (V2). Rows whose V5 is NA or any other value are left out.
gene_annotation_neg <- gene_annotation[which(gene_annotation$V5 == "-"), ]
gene_annotation_neg$V2 <- gene_annotation_neg$V3 - 1
gene_annotation_pos <- gene_annotation[which(gene_annotation$V5 == "+"), ]
gene_annotation_pos$V3 <- gene_annotation_pos$V2 + 1

# "+" records first, then "-" records.
tss <- rbind(gene_annotation_pos, gene_annotation_neg)

In [None]:
# Annotate the peaks with the TSS table.
input_cds <- annotate_cds_by_site(input_cds, tss)

# Unnormalized gene activity matrix from the annotated peaks and the Cicero
# connections; rows whose total is zero are dropped.
unnorm_ga <- build_gene_activity_matrix(input_cds, conns)
unnorm_ga <- unnorm_ga[Matrix::rowSums(unnorm_ga) != 0, ]

# Named vector of num_genes_expressed, keyed by cell name.
num_genes <- setNames(
  pData(input_cds)$num_genes_expressed,
  row.names(pData(input_cds))
)

# Normalize and store the result as a dense matrix.
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)
fm_Cicero <- as.matrix(cicero_gene_activities)

In [None]:
# Wall-clock end of the feature-matrix computation.
end_time <- Sys.time()

In [None]:
# Elapsed time between start_time and end_time.
end_time - start_time

In [None]:
# Sanity check: the feature-matrix columns must be exactly the metadata rows,
# in the same order. identical() is used instead of all(a == b) because `==`
# recycles when the lengths differ and can then report TRUE for vectors that
# do not actually match.
identical(colnames(fm_Cicero), rownames(metadata))

In [None]:
# Persist the feature matrix as an RDS file.
saveRDS(fm_Cicero, file = '../../output/feature_matrices/FM_Cicero_cusanovich2018subset.rds')

### Downstream Analysis

In [None]:
# UMAP on the transposed feature matrix (one row per fm_Cicero column); keep
# only the embedding coordinates ($layout).
df_umap_Cicero <- umap(t(fm_Cicero))$layout

In [None]:
# Plotting table for the UMAP embedding.
# - Labels are looked up by the columns of fm_Cicero, i.e. the cells that were
#   actually embedded, so they stay aligned with the UMAP rows even if the
#   gene activity matrix does not contain exactly the cells of input_cds.
# - Columns are created with their final types; routing the coordinates
#   through cbind() with the labels would coerce them to character and lose
#   floating-point precision when parsed back.
df_umap <- data.frame(
  umap_1 = as.numeric(df_umap_Cicero[, 1]),
  umap_2 = as.numeric(df_umap_Cicero[, 2]),
  label = as.character(metadata[colnames(fm_Cicero), "label"]),
  stringsAsFactors = FALSE
)

# UMAP scatter plot, one point per cell, coloured by label.
options(repr.plot.width = 5, repr.plot.height = 4)
p <- ggplot(df_umap, aes(x = umap_1, y = umap_2, color = label)) +
  geom_point(size = 1) +
  ggtitle("Cicero") +
  theme_classic()
p

In [37]:
# Record the R version and attached package versions for reproducibility.
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_Cicero/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] grid      splines   stats4    parallel  stats     graphics  grDevices
 [8] utils     datasets  methods   base     

other attached packages:
 [1] umap_0.2.2.0         BuenColors_0.5.5     MASS_7.3-51.1       
 [4] reshape2_1.4.3       proxy_0.4-23         data.table_1.12.0   
 [7] cicero_1.0.15        Gviz_1.26.5          GenomicRanges_1.34.0
[10] GenomeInfoDb_1.18.1  IR

In [None]:
# Save the whole workspace to an .RData file in the working directory.
save.image(file = 'Cicero_cusanovich2018subset.RData')