# Integrating human and macaque caudate snATAC-seq

Notes on linked peaks notes: 
- human peak w/ macaque ortholog intersecting human ortholog of macaque peak
- call ties w/ multiple peak matches using nearest summits
- count matrix comes from linked peaks 1-1 in human and macaque 

Notes on integration:
- perform integration with about same number of cells in each group
- here selected representative human and macaque subject
- predefine the features during anchor finding step

In [1]:
PROJDIR=file.path('../../../data/raw_data/cross_species_peak_orthologs')

#######################################
### set up libraries and functions ####
ss <- function(x, pattern, slot = 1, ...) { 
  sapply(strsplit(x = x, split = pattern, ...), '[', slot) }
options(stringsAsFactors = F, repr.plot.width=14, repr.plot.height=6)
suppressMessages(library(Signac)); suppressMessages(library(Seurat))
suppressMessages(library(harmony))

source('../hal_scripts/narrowPeakFunctions.R')

In [2]:
# # set up future for parallelization
library(future)
library(future.apply)
plan("sequential")
options(future.globals.maxSize = 180 * 1024^3)

# 1) prepare integrated human and 

In [3]:
# integratedRDS_fn = file.path(PROJDIR,'rdas','mergedMultiSpeciesSeuratCCAHumanOnly.rds')
# human = readRDS(file = integratedRDS_fn)
# human[["peaks2"]] <- human[['integrated']]

"Cannot add objects with duplicate keys (offending key: integrated_), setting key to 'peaks2_'"


In [4]:
# DefaultAssay(object = human) <- "peaks2"
# human = DietSeurat(
#     human, counts = TRUE, data = TRUE, scale.data = TRUE, 
#     assays = c('peaks2','peaks'), dimreducs = c('integratedLSI','umap'))

# ### grab just one sample from human caudate cells ###
# cells = WhichCells(human, expression = Sample %in% c("14_1018.CAUD"))
# human = subset(human, cells = cells)

Read in the macaque snATAC-seq files.

In [5]:
## load the seurat object
saveRDS_fn = file.path(PROJDIR, 'rdas', 'multispeciesMergedSeurat.rds')
obj_seurat = readRDS(file = saveRDS_fn)

### grab just the macaque cells ###
cells = WhichCells(obj_seurat, expression = Species %in% c("rheMac10"))
macaque = subset(obj_seurat, cells = cells)
rm(obj_seurat); gc(verbose = FALSE)

## show cell clusters per sample
macaque@meta.data$log10nFrags = log10(macaque@meta.data$nFrags)
table(macaque@meta.data$Clusters2, macaque@meta.data$Sample)

## recompute TFIDR and SVD on just macaque samples
macaque <- RunTFIDF(macaque, verbose = FALSE)
macaque <- RunSVD(macaque, verbose = FALSE)

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,10966552,585.7,17062120,911.3,13329932,711.9
Vcells,973522851,7427.4,5056467981,38577.8,5697326200,43467.2


             
              CAUD_WS1H_STA682A131
  Astro                       1035
  Interneuron                  193
  Microglia                    519
  MSN_D1                       975
  MSN_D2                       925
  MSN_UNK1                     400
  Oligo                       1442
  OPC                          304

# 2) integration with Seurat CCA

In [6]:
# features <- SelectIntegrationFeatures(object.list = obj_seurat.list, nfeatures = 10000)
features <- rownames(macaque)

# find integration anchors between species, using all features
anchors <- FindIntegrationAnchors(
        object.list = list(human, macaque), reduction = 'cca', anchor.features = features,
        reference = c(1), k.filter = NA, assay = c('peaks2', 'peaks'))

Scaling features for provided objects

Finding anchors between all query and reference datasets

Running CCA

Merging objects

Finding neighborhoods

Finding anchors

	Found 20450 anchors



In [9]:
# integrate data and create a new merged object
integrated <- IntegrateData(anchors, dims = 2:30, preserve.order = TRUE)

# we now have a "corrected" TF-IDF matrix, and can run LSI again on this corrected matrix
integrated <- RunSVD(integrated, n = 30, reduction.name = 'integratedLSI', verbose = FALSE)
integrated <- RunUMAP(integrated, dims = 2:30, reduction = 'integratedLSI', verbose = FALSE)

"Overlapping ranges supplied. Ranges should be non-overlapping."
Integrating dataset 2 with reference dataset

"Overlapping ranges supplied. Ranges should be non-overlapping."
Finding integration vectors

Finding integration vector weights

Integrating data

"Adding a command log without an assay associated with it"


ERROR: Error in irlba(A = t(x = object), nv = n, work = irlba.work): BLAS/LAPACK routine 'DLASCL' gave error code -4


In [None]:
# plot embeddings
p_seuratIntegration_species = 
    DimPlot(object = integrated, label = FALSE, group.by = 'Species', cols = 'Dark2') +
    ggplot2::ggtitle('Seurat CCA Integration')

p_seuratIntegration_clusters2 = 
    DimPlot(object = integrated, label = TRUE, group.by = 'Clusters2', cols = 'Paired') +
    ggplot2::ggtitle('Seurat CCA Integration')

p_seuratIntegration_species + p_seuratIntegration_clusters2

In [None]:
DimPlot(object = integrated, label = TRUE, group.by = 'Clusters2', cols = 'Paired',split.by = 'Sample') +
    ggplot2::ggtitle('Seurat CCA Integration')

In [None]:
integrated
object.size(integrated) / 1024^3

In [None]:
integrated.trim = DietSeurat(
    integrated, counts = TRUE, data = TRUE, scale.data = FALSE, 
    assays = c('peaks', 'integrated'), dimreducs = c('integratedLSI','umap'))

object.size(integrated.trim) / 1024^3

In [None]:
## save the seurat object
integratedRDS_fn = file.path(PROJDIR,'rdas','multispeciesSeuratCCAprimate.rds')
saveRDS(integrated.trim, file = integratedRDS_fn)