In [1]:
suppressPackageStartupMessages({
    library(Seurat)
    library(zellkonverter)
    library(harmony)
    library(singlecellmethods)
    library(readxl)
    library(rlang)
    library(dplyr)
    library(uwot)
    library(ggplot2)
})

In [30]:
ad <- readH5AD("/data/norman/southark/external_datasets/fibroblast_atlas_med_2022/med_2022_normalized_expr.h5ad")
fibro_atlas <- as.Seurat(ad, counts = "X", data = NULL)

In [3]:
fibro_atlas <- subset(x = fibro_atlas, subset = sample_type == 'primary')

In [4]:
tissue <- read_excel("/data/norman/southark/external_datasets/fibroblast_atlas_med_2022/SupplementaryData.xlsx", sheet = "TableS6")
inflamm <- read_excel("/data/norman/southark/external_datasets/fibroblast_atlas_med_2022/SupplementaryData.xlsx", sheet = "TableS7")

In [5]:
filtered_tissue <- tissue[tissue$Pval < 0.05 & tissue$LogFoldChange >= 0.35,]
filtered_inflamm <- inflamm[inflamm$Pval < 0.05 & abs(inflamm$Slope) >= 0.1,]

In [6]:
select_genes <- unique(c(unique(filtered_tissue$Feature), unique(filtered_inflamm$Feature)))

In [8]:
fibro_atlas@assays$originalexp@var.features <- select_genes

In [9]:
fibro_atlas <- fibro_atlas %>% 
                RunBalancedPCA(, npcs=40, weight.by="organ__ontology_label", assay.use="originalexp", reduction.name = "pca.old") %>%
                RunHarmony(, group.by.vars=c("donor_id", "organ__ontology_label"),
                           reduction = 'pca.old', weight.by="organ__ontology_label",
                           assay.use = "originalexp", project.dim = FALSE, reduction.save = "harmony.old") # weighted harmony

use_weights



[1] TRUE


use_weights

use_weights



[1] TRUE


use_weights

Harmony 1/10

Harmony converged after 1 iterations



In [10]:
harmony_umap <- umap(fibro_atlas@reductions$harmony.old@cell.embeddings, n_neighbors = 30L, metric = 'Euclidean', init = 'Laplacian',
     spread = 0.3, min_dist = 0.05, set_op_mix_ratio = 1.0, local_connectivity = 1L, repulsion_strength = 1, negative_sample_rate = 1, seed=0)

In [11]:
umapdata <- Seurat::CreateDimReducObject(
      embeddings = harmony_umap,
      stdev = as.numeric(apply(harmony_umap, 2, stats::sd)),
      assay = "originalexp",
      key = "umap_"
    )

“No columnames present in cell embeddings, setting to 'umap_1:2'”


In [12]:
fibro_atlas[["umap.old"]] <- umapdata

In [17]:
comps <- read.csv("/data/norman/southark/external_datasets/fibro_CRISPRa_Tfs/20240331_fibroblast_bulk_comps.csv", row.names="X")

In [18]:
features <- list()
weights <- list()
for (i in rownames(comps)){
    features[[strtoi(i)+1]] <- colnames(comps[i, which(comps[i,] > 0.05)])
    weights[[strtoi(i)+1]] <- as.numeric(comps[i, which(comps[i,] > 0.05)])
}

features <- features[-56]
weights <- weights[-56]

In [373]:
program_genes <- names(which(apply(comps > 0.05, 2, any)))

In [374]:
missing_genes <- setdiff(program_genes, rownames(fibro_atlas@assays$originalexp@meta.features))

In [375]:
updated_list <- UpdateSymbolList(missing_genes)

Found updated symbols for 14 symbols

SARS -> SARS1
H2AFV -> H2AZ2
GARS -> GARS1
TARS -> TARS1
NARS -> NARS1
EPRS -> EPRS1
WARS -> WARS1
H3F3A -> H3-3A
H2AFZ -> H2AZ1
HIST1H2AC -> H2AC6
H2AFX -> H2AX
IARS -> IARS1
HIST1H4C -> H4C3
H2AFJ -> H2AJ



In [859]:
manually_mapped <- c("HLA-A", "HLA-B", "HLA-C", "KRTAP2-3", "KRTAP1-5", "MT-ATP6", "MT-CO1",
                     "MT-CO2", "MT-CO3", "MT-CYB", "MT-ND1", "MT-ND2", "MT-ND3", "MT-ND4", "MT-ND5")

In [860]:
select_program_genes <- intersect(c(manually_mapped, updated_list, colnames(comps)), rownames(fibro_atlas@assays$originalexp@meta.features))

In [861]:
fibro_atlas@assays$originalexp@var.features <- union(select_program_genes, select_genes)

In [862]:
fibro_atlas <- fibro_atlas %>% 
                RunBalancedPCA(, npcs=40, weight.by="organ__ontology_label", assay.use="originalexp", ) %>%
                RunHarmony(, group.by.vars=c("organ__ontology_label", "donor_id"),
                           reduction = 'pca', weight.by="organ__ontology_label", assay.use = "originalexp", project.dim = FALSE) # weighted harmony

use_weights



[1] TRUE


use_weights

use_weights



[1] TRUE


use_weights

Harmony 1/10

Harmony converged after 1 iterations



In [863]:
harmony_umap <- umap(fibro_atlas@reductions$harmony@cell.embeddings, n_neighbors = 30L, metric = 'Euclidean', init = 'Laplacian',
     spread = 0.3, min_dist = 0.05, set_op_mix_ratio = 1.0, local_connectivity = 1L, repulsion_strength = 1, negative_sample_rate = 1, seed=0)

In [864]:
umapdata <- Seurat::CreateDimReducObject(
      embeddings = harmony_umap,
      stdev = as.numeric(apply(harmony_umap, 2, stats::sd)),
      assay = "originalexp",
      key = "umap_"
    )

“No columnames present in cell embeddings, setting to 'umap_1:2'”


In [865]:
fibro_atlas[["umap"]] <- umapdata

In [24]:
LengthCheck <- function(values, cutoff = 0) {
  return(vapply(
    X = values,
    FUN = function(x) {
      return(length(x = x) > cutoff)
    },
    FUN.VALUE = logical(1)
  ))
}

In [25]:
AddWeightedModuleScore <- function(
  object,
  features,
  weights,
  pool = NULL,
  nbin = 24,
  ctrl = 100,
  k = FALSE,
  assay = NULL,
  name = 'Cluster',
  seed = 1,
  search = FALSE,
  slot = 'data',
  ...
) {
  if (!is.null(x = seed)) {
    set.seed(seed = seed)
  }
  assay.old <- DefaultAssay(object = object)
  assay <- assay %||% assay.old
  DefaultAssay(object = object) <- assay
  assay.data <- GetAssayData(object = object, assay = assay, slot = slot)
  features.old <- features
  features.original <- features
  if (k) {
    .NotYetUsed(arg = 'k')
    features <- list()
    for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) {
      features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster == i))
    }
    cluster.length <- length(x = features)
  } else {
    if (is.null(x = features)) {
      stop("Missing input feature list")
    }
    features <- lapply(
      X = features,
      FUN = function(x) {
        missing.features <- setdiff(x = x, y = rownames(x = object))
        if (length(x = missing.features) > 0) {
          warning(
            "The following features are not present in the object: ",
            paste(missing.features, collapse = ", "),
            ifelse(
              test = search,
              yes = ", attempting to find updated synonyms",
              no = ", not searching for symbol synonyms"
            ),
            call. = FALSE,
            immediate. = TRUE
          )
          if (search) {
            tryCatch(
              expr = {
                updated.features <- UpdateSymbolList(symbols = missing.features, ...)
                names(x = updated.features) <- missing.features
                for (miss in names(x = updated.features)) {
                  index <- which(x == miss)
                  x[index] <- updated.features[miss]
                }
              },
              error = function(...) {
                warning(
                  "Could not reach HGNC's gene names database",
                  call. = FALSE,
                  immediate. = TRUE
                )
              }
            )
            missing.features <- setdiff(x = x, y = rownames(x = object))
            if (length(x = missing.features) > 0) {
              warning(
                "The following features are still not present in the object: ",
                paste(missing.features, collapse = ", "),
                call. = FALSE,
                immediate. = TRUE
              )
            }
          }
        }
        return(intersect(x = x, y = rownames(x = object)))
      }
    )
    
    for (i in 1:length(features)){
        features[[i]] <- features.original[[i]][which(features.original[[i]] %in% features[[i]])]
        weights[[i]] <- weights[[i]][which(features.original[[i]] %in% features[[i]])]
    }
    cluster.length <- length(x = features)
  }
  if (!all(LengthCheck(values = features))) {
    warning(paste(
      'Could not find enough features in the object from the following feature lists:',
      paste(names(x = which(x = !LengthCheck(values = features)))),
      'Attempting to match case...'
    ))
    features <- lapply(
      X = features.old,
      FUN = CaseMatch,
      match = rownames(x = object)
    )
  }
  if (!all(LengthCheck(values = features))) {
    stop(paste(
      'The following feature lists do not have enough features present in the object:',
      paste(names(x = which(x = !LengthCheck(values = features)))),
      'exiting...'
    ))
  }
  pool <- pool %||% rownames(x = object)
  data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
  data.avg <- data.avg[order(data.avg)]
  data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e30, n = nbin, labels = FALSE, right = FALSE)
  names(x = data.cut) <- names(x = data.avg)
  ctrl.use <- vector(mode = "list", length = cluster.length)
  for (i in 1:cluster.length) {
    features.use <- features[[i]]
    ctrl.use[[i]] <- vector(mode = "list", length = length(x = features.use))
    for (j in 1:length(x = features.use)) {
        
      # save controls as list of lists
      ctrl.use[[i]][[j]] <- names(x = sample(
          x = data.cut[which(x = data.cut == data.cut[features.use[j]])],
          size = ctrl,
          replace = FALSE
        ))
    }
  }
  
  ctrl.scores <- matrix(
    data = numeric(length = 1L),
    nrow = length(x = ctrl.use),
    ncol = ncol(x = object)
  )
  
  for (i in 1:length(ctrl.use)) {
    features.use <- ctrl.use[[i]]
    ctrl.scores[i, ] <- 0
      
    for (f in 1:length(features.use)){
        ctrl.scores[i, ] <- ctrl.scores[i, ] + Matrix::colMeans(x = assay.data[features.use[[f]], ]) * weights[[i]][f]
    }
      
  }

  features.scores <- matrix(
    data = numeric(length = 1L),
    nrow = cluster.length,
    ncol = ncol(x = object)
  )
  
  for (i in 1:cluster.length) {
    features.use <- features[[i]]
    features.scores[i, ] <- Matrix::colSums(assay.data[features.use, , drop = FALSE] * weights[[i]])
    
  }

  features.scores.use <- (features.scores - ctrl.scores) / mapply(function(x, y) x * y, lapply(weights, mean), lapply(features, length))
  rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
  features.scores.use <- as.data.frame(x = t(x = features.scores.use))
  rownames(x = features.scores.use) <- colnames(x = object)
  object[[colnames(x = features.scores.use)]] <- features.scores.use
  CheckGC()
  DefaultAssay(object = object) <- assay.old
  return(object)
}

In [28]:
fibro_atlas <- AddWeightedModuleScore(fibro_atlas, features, weights, name="gene.program", search=TRUE)

“The following features are not present in the object: H2AFV, H2AFZ, H2AFX, HIST1H4C, attempting to find updated synonyms”
Found updated symbols for 4 symbols

H2AFV -> H2AZ2
H2AFZ -> H2AZ1
H2AFX -> H2AX
HIST1H4C -> H4C3

“The following features are not present in the object: SNHG29, attempting to find updated synonyms”
“No updated symbols found”
“The following features are still not present in the object: SNHG29”
“The following features are not present in the object: H2AFJ, LMO7.AS1, attempting to find updated synonyms”
Found updated symbols for 1 symbols

H2AFJ -> H2AJ

“The following features are still not present in the object: LMO7.AS1”
“The following features are not present in the object: MDH1, SNHG29, ZFAS1, MT.ND5, SNHG5, HLA.B, NR2F1.AS1, attempting to find updated synonyms”
“No updated symbols found”
“The following features are still not present in the object: MDH1, SNHG29, ZFAS1, MT.ND5, SNHG5, HLA.B, NR2F1.AS1”
“The following features are not present in the object: SNHG16,

In [892]:
sce_fibro_atlas <- as.SingleCellExperiment(fibro_atlas, assay = c("originalexp"))
writeH5AD(sce_fibro_atlas, "/data/norman/angel/fibro_datasets/fibroblast_atlas_med_2022.h5ad", X_name = 'counts')