In [None]:
suppressPackageStartupMessages({
  library(dplyr)
  library(plotly)
  library(viridis)
  library(data.table)
  library(DelayedArray)
  library(devtools)
  library(tidyr)
  library(plotly)
  library(htmlwidgets)
  library(stringr)
  library(VGAM)
  
  DelayedArray:::set_verbose_block_processing(TRUE)
  options(DelayedArray.block.size=1e9)  
})

# Load a development checkout of monocle3 with load_all() instead of
# attaching an installed release. An alternative checkout is kept commented
# out for reference:
# load_all("/net/trapnell/vol1/home/lsaund11/bin/monocle3-dev")
load_all("/net/trapnell/vol1/home/duran/bin/monocle3_dev")

# Installed-package alternative, currently disabled:
# library(monocle3)

# Project helper functions. add_gap_time(), which is called near the end of
# this file but not defined in it, presumably comes from this script --
# TODO confirm.
source("/net/trapnell/vol1/home/sanjays/projects/GAP/bin/projection_utils_clean.R")

# set working dir on cluster; the relative output paths used further down
# ("ref_embryo_model", "R_objects/...", *.png) resolve against this directory
setwd("/net/trapnell/vol1/home/sanjays/projects/GAP/GAP21_eliza-syd/")


# Temporarily disable OpenMP threading in functions to be run in parallel.
# The previous value is read from the environment first (falling back to 1
# when the variable is unset or not numeric), presumably so that it can be
# restored later.
old_omp_num_threads <- as.numeric(Sys.getenv("OMP_NUM_THREADS"))
if (is.na(old_omp_num_threads)) {
  old_omp_num_threads <- 1
}
RhpcBLASctl::omp_set_num_threads(1)

# Temporarily set the number of threads the BLAS library can use to be 1.
old_blas_num_threads <- as.numeric(Sys.getenv("OPENBLAS_NUM_THREADS"))
# The fallback must test the BLAS value itself: checking the OpenMP variable
# (already replaced by 1 above) would leave old_blas_num_threads as NA
# whenever OPENBLAS_NUM_THREADS is unset.
if (is.na(old_blas_num_threads)) {
  old_blas_num_threads <- 1
}
RhpcBLASctl::blas_set_num_threads(1)

In [None]:
# Re-assert the project working directory for this cell (the same location as
# the setwd() call during session setup, written without the trailing slash).
setwd("/net/trapnell/vol1/home/sanjays/projects/GAP/GAP21_eliza-syd")

# load files

In [None]:
# Reference dataset; used below to build the reference embryo trajectory.
ref_cds <- readRDS(
  file = "/net/trapnell/vol1/home/sanjays/projects/GAP/COMB_GAP/R_objects/full_gap_hf_ctrl_ref_mito-filt_1.25M_model-update_anno_cds.RDS"
)

# Query dataset that is later summarised per embryo and projected.
# NOTE(review): the original author flagged this file as possibly outdated
# ("maybe this is old") -- confirm it is the intended input.
hf_cds <- readRDS(
  file = "/net/trapnell/vol1/home/dorrity/analysis/nobackup/hotfish/210509_projection/hf4_no-ctrls_projected_major-group-anno_clean_cds.RDS"
)


# functions

# make reference embryo trajectory

In [None]:
# Drop cells without an embryo label, then take the per-cell metadata of the
# remaining cells as a plain data.frame.
ref_cds <- ref_cds[, !is.na(colData(ref_cds)$embryo)]
coldata_ref <- as.data.frame(colData(ref_cds))

Let's first set up a wide matrix that contains the number of cells of each type (columns) in each embryo (rows).  

In [None]:
# One row of covariates per embryo, keyed (row-named) by embryo id.
covariates_df <- coldata_ref %>%
  dplyr::select(embryo, timepoint) %>%
  distinct() %>%
  as.data.frame()

rownames(covariates_df) <- covariates_df$embryo

# Long table: number of cells of each cell type within each embryo.
# ungroup() so the leftover grouping from summarize() does not leak into
# later steps.
coldata_ref_summary <- coldata_ref %>%
  group_by(embryo, cell_type_sub) %>%
  summarize(cells = n()) %>%
  ungroup()

# Wide table: one row per cell type, one column per embryo, zero-filled.
cell_counts_wide <- spread(coldata_ref_summary, embryo, cells, fill = 0)
cell_types <- as.character(cell_counts_wide %>% pull(cell_type_sub))
# Drop the leading cell_type_sub column; the names are re-attached as row names.
cell_counts_wide <- as.matrix(cell_counts_wide[, -1])
row.names(cell_counts_wide) <- cell_types

# Transpose to embryos (rows) x cell types (columns) and put the rows in the
# same order as covariates_df. drop = FALSE keeps a matrix even when a single
# row or column is selected.
cell_counts_wide <- t(cell_counts_wide)
cell_counts_wide <- cell_counts_wide[as.character(covariates_df$embryo), ,
                                     drop = FALSE]

thresh_cells_per_cluster <- 1500
thresh_cells_per_cluster_upper <- 40000

# Filter on column totals. Columns are cell types at this point, so this keeps
# the cell types whose total count over all embryos lies strictly between the
# two thresholds.
# NOTE(review): the previous comment here read "filter embs with low counts",
# but no rows (embryos) are removed by this step -- confirm that filtering
# cell types is the intent.
cell_counts_wide <- cell_counts_wide[
  , colSums(cell_counts_wide) < thresh_cells_per_cluster_upper, drop = FALSE
]
cell_counts_wide <- cell_counts_wide[
  , colSums(cell_counts_wide) > thresh_cells_per_cluster, drop = FALSE
]


# normalize by size factors, make a cds to get them
# (cell types act as the "genes", embryos as the "cells")
all_cell_count_cds <- new_cell_data_set(t(cell_counts_wide),
                                        cell_metadata = covariates_df)

# Embryos with no remaining counts are removed before estimating size factors.
all_cell_count_cds <- all_cell_count_cds[
  , Matrix::colSums(exprs(all_cell_count_cds)) != 0
]
all_cell_count_cds <- detect_genes(all_cell_count_cds)
all_cell_count_cds <- estimate_size_factors(all_cell_count_cds)
# Only the size factors of this helper cds are used in the code that follows;
# the PCA/UMAP steps are kept as they were.
all_cell_count_cds <- preprocess_cds(all_cell_count_cds, num_dim = 10,
                                     norm_method = "size_only", method = "PCA")
all_cell_count_cds <- reduce_dimension(all_cell_count_cds,
                                       preprocess_method = "PCA")

# Restrict both tables to the embryos that have a size factor, then divide
# each embryo's (row's) counts by its size factor and round to whole cells.
sf <- size_factors(all_cell_count_cds)
cell_counts_wide <- cell_counts_wide[names(sf), , drop = FALSE]
cell_counts_wide <- round(cell_counts_wide / sf, 0)
covariates_df <- covariates_df[names(sf), ]



Next, let's fit a multinomial regression model that describes the changes in cell type frequencies as a smooth function of time.

In [None]:
# Degrees of freedom of the natural spline on timepoint; the same value is
# reused by the total-cell-count model fitted further down.
spline_df <- 3

# Multinomial model of the embryo x cell-type count matrix against a spline
# of timepoint.
spt_mult_fit <- VGAM::vglm(
  formula = cell_counts_wide ~ sm.ns(timepoint, df = spline_df),
  family = "multinomial",
  data = covariates_df,
  trace = FALSE
)

Next, we will trace out a curve that describes how cell type proportions change over time by emitting the predictions of this model from 18 to 96 hpf (in steps of 0.2 h).

In [None]:
# Synthetic "embryos": one per timepoint on a regular grid, named dummy-<row>.
time_span_dummy <- data.frame(timepoint = seq(from = 18, to = 96, by = 0.2))
time_span_dummy[["embryo"]] <- stringr::str_c("dummy-",
                                              row.names(time_span_dummy))
row.names(time_span_dummy) <- time_span_dummy[["embryo"]]

# Per-embryo covariates plus the total (normalised) cell count of each embryo.
xxx_df <- covariates_df
xxx_df[["total_cells"]] <- rowSums(cell_counts_wide)

# Poisson model of the total cell count as a spline of timepoint.
cell_count_fit <- glm(
  formula = total_cells ~ splines::ns(timepoint, df = spline_df),
  family = "poisson",
  data = xxx_df,
  trace = FALSE
)

# Predicted total count (one value per dummy embryo) times the predicted
# cell-type proportions (one row per dummy embryo) gives predicted counts.
emb_trajectory <-
  predict(cell_count_fit, newdata = time_span_dummy, type = "response") *
  predict(spt_mult_fit, newdata = time_span_dummy, type = "response")


Now, let's plot the trajectory predicted by the model along with the actual embryo counts in a low-dimensional space with UMAP:

In [None]:
# Cell count trajectory over embryos: combine the observed embryos with the
# model-predicted ("fake") embryos into a single cds.
row.names(covariates_df) <- rownames(cell_counts_wide)

# Metadata: observed embryos are flagged fake = FALSE, predicted ones TRUE.
cell_count_trajectory_cell_metadata <- covariates_df
cell_count_trajectory_cell_metadata$fake <- FALSE
time_span_dummy$fake <- TRUE
cell_count_trajectory_cell_metadata <- rbind(
  cell_count_trajectory_cell_metadata,
  time_span_dummy
)

# Counts: cell types in rows; observed embryos first, then predicted ones.
# (An earlier standalone assignment of t(cell_counts_wide) was removed because
# it was overwritten immediately by this cbind().)
#cell_count_trajectory_counts = cell_count_trajectory_counts / rowSums(cell_count_trajectory_counts)
cell_count_trajectory_counts <- cbind(t(cell_counts_wide), t(emb_trajectory))
cell_count_cds <- new_cell_data_set(
  cell_count_trajectory_counts,
  cell_metadata = cell_count_trajectory_cell_metadata
)


In [None]:
# Derive a batch label ("expt") from the embryo id: after turning "-" into
# ".", everything before the first "." becomes expt and the rest goes to
# "other".
cell_count_coldata <- as.data.frame(colData(cell_count_cds))
cell_count_coldata$embryo <- gsub("-", ".", cell_count_coldata$embryo)
cell_count_coldata <- separate(
  cell_count_coldata,
  col = embryo,
  into = c("expt", "other"),
  sep = "\\.",
  extra = "merge"
)
colData(cell_count_cds)$expt <- cell_count_coldata$expt

In [None]:
# Preprocess (10 dimensions, size-factor-only normalisation), align on the
# expt batch label, then run reduce_dimension on the aligned coordinates and
# build the nearest-neighbour index.
cell_count_cds <- preprocess_cds(cell_count_cds, num_dim = 10,
                                 norm_method = "size_only")
cell_count_cds <- align_cds(cell_count_cds, alignment_group = "expt")
cell_count_cds <- reduce_dimension(cell_count_cds,
                                   preprocess_method = "Aligned",
                                   build_nn_index = TRUE)


# Persist the fitted transform models; they are loaded into the query cds
# later in this file.
save_transform_models(cell_count_cds, dir = "ref_embryo_model")

# check to make sure this doesn't look like garbage
ggsave(
  filename = "embryo_traj.png",
  plot = plot_cells(cell_count_cds, color_cells_by = "timepoint", cell_size = 1)
)

In [None]:
# Cluster the embryos, then learn the principal graph that is used below for
# pseudotime ordering.
cell_count_cds <- cell_count_cds %>%
  cluster_cells() %>%
  learn_graph()

# Choose the principal-graph node to use as the pseudotime root.
#
# Arguments:
#   cds        cell_data_set with a "timepoint" column in colData and a
#              learned "UMAP" principal graph.
#   start_time value of timepoint that marks the earliest samples
#              (default 18).
#
# Returns the name of the principal-graph vertex that is the closest vertex
# for the largest number of columns whose timepoint equals start_time.
# Stops with an error if no column has that timepoint, instead of silently
# returning an empty character vector.
get_earliest_principal_node <- function(cds, start_time = 18) {
  cell_ids <- which(colData(cds)[, "timepoint"] == start_time)
  if (length(cell_ids) == 0) {
    stop("no cells with timepoint == ", start_time,
         "; cannot choose a root principal node", call. = FALSE)
  }

  # Closest principal-graph vertex of every column, in colnames(cds) order.
  closest_vertex <-
    cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])

  # Most frequent closest-vertex index among the starting columns; table()
  # stores the index as a name, hence the as.numeric() round trip.
  top_vertex <- names(which.max(table(closest_vertex[cell_ids, ])))
  root_pr_nodes <-
    igraph::V(principal_graph(cds)[["UMAP"]])$name[as.numeric(top_vertex)]

  root_pr_nodes
}

# Order the embryos in pseudotime, rooted at the node picked by
# get_earliest_principal_node() with its default start time.
cell_count_cds <- order_cells(
  cell_count_cds,
  root_pr_nodes = get_earliest_principal_node(cell_count_cds)
)

# plot by pseudotime: no legend, transparent backgrounds, no facet strips
p <- plot_cells(cell_count_cds, color_cells_by = "pseudotime", cell_size = 1) +
  theme(
    legend.position = "none",
    rect = element_rect(fill = "transparent"),
    panel.background = element_rect(fill = "transparent", colour = NA),
    plot.background = element_rect(fill = "transparent", colour = NA),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

ggsave(filename = "embryo_traj_pseudotime.png", plot = p)

In [None]:
# Store the pseudotime of every embryo as a "pseudostage" metadata column.
colData(cell_count_cds)$pseudostage <- pseudotime(cell_count_cds)

# save RDS
saveRDS(object = cell_count_cds,
        file = "R_objects/final_reference_embryoTraj.rds")

In [None]:
# Reload the saved reference trajectory and keep its metadata as a plain
# data.frame for the label-transfer calls further down.
ref_cell_count_cds <- readRDS("R_objects/final_reference_embryoTraj.rds")
ref_cell_count_coldata <- as.data.frame(colData(ref_cell_count_cds))

# make HF query embryo cds

In [None]:
# Keep only cells that have both an embryo label and a cell_type_sub label.
hf_cds <- hf_cds[
  , !(is.na(colData(hf_cds)$embryo) | is.na(colData(hf_cds)$cell_type_sub))
]


In [None]:
# Summarise a cell-level cds into an embryo-level "cell count" cds.
#
# Arguments:
#   cds  cell_data_set whose colData has the columns embryo, timepoint, temp
#        and cell_type_sub.
#
# Returns a cell_data_set with one row per cell type and one column per
# embryo. Entries are the number of cells of that type in that embryo,
# divided by the embryo's size factor and rounded. Column metadata holds
# embryo, timepoint and temp. detect_genes() and estimate_size_factors() have
# been run on the returned object.
#
# Stops with an error if an embryo has more than one (timepoint, temp)
# combination, because embryo ids are used as row names.
make_cell_count_cds <- function(cds) {
  coldata_query <- as.data.frame(colData(cds))

  # One row of covariates per embryo, keyed by embryo id.
  covariates_df <- coldata_query %>%
    dplyr::select(embryo, timepoint, temp) %>%
    distinct() %>%
    as.data.frame()
  if (anyDuplicated(covariates_df$embryo) > 0) {
    stop("each embryo must map to a single (timepoint, temp) combination",
         call. = FALSE)
  }
  rownames(covariates_df) <- covariates_df$embryo

  # Long table of cells per (embryo, cell type); ungroup() so the grouping
  # left behind by summarize() does not leak into later steps.
  coldata_query_summary <- coldata_query %>%
    group_by(embryo, cell_type_sub) %>%
    summarize(cells = n()) %>%
    ungroup()

  # Wide table: one row per cell type, one column per embryo, zero-filled.
  cell_counts_wide <- spread(coldata_query_summary, embryo, cells, fill = 0)
  cell_types <- as.character(cell_counts_wide %>% pull(cell_type_sub))
  # Drop the leading cell_type_sub column; names are re-attached as row names.
  cell_counts_wide <- as.matrix(cell_counts_wide[, -1])
  row.names(cell_counts_wide) <- cell_types

  # Embryos (rows) x cell types (columns), rows in covariates_df order.
  # drop = FALSE keeps a matrix when there is only a single embryo.
  cell_counts_wide <- t(cell_counts_wide)
  cell_counts_wide <- cell_counts_wide[as.character(covariates_df$embryo), ,
                                       drop = FALSE]

  # # normalize by size factors, make a cds to get them
  all_cell_count_cds <- new_cell_data_set(t(cell_counts_wide),
                                          cell_metadata = covariates_df)
  # all_cell_count_cds <- all_cell_count_cds[,Matrix::colSums(exprs(all_cell_count_cds)) != 0]
  all_cell_count_cds <- detect_genes(all_cell_count_cds)
  all_cell_count_cds <- estimate_size_factors(all_cell_count_cds)
  sf <- size_factors(all_cell_count_cds)

  # Divide each embryo's (row's) counts by its size factor and round.
  cell_counts_wide <- cell_counts_wide[names(sf), , drop = FALSE]
  cell_counts_wide <- round(cell_counts_wide / sf, 0)
  covariates_df <- covariates_df[names(sf), ]
  row.names(covariates_df) <- rownames(cell_counts_wide)

  # Final embryo-level cds built from the normalised counts.
  cell_count_cds <- new_cell_data_set(t(cell_counts_wide),
                                      cell_metadata = covariates_df)
  cell_count_cds <- detect_genes(cell_count_cds)
  cell_count_cds <- estimate_size_factors(cell_count_cds)
  cell_count_cds
}

In [None]:
# Summarise the filtered query cds into an embryo-level cell-count cds.
hf_cell_count_cds <- hf_cds %>% make_cell_count_cds()

In [None]:
# Checkpoint the query cell-count cds to disk.
saveRDS(object = hf_cell_count_cds, file = "R_objects/hf_cell_count_cds.rds")

# project

In [None]:
# Reload the checkpointed query cell-count cds.
hf_cell_count_cds <- readRDS("R_objects/hf_cell_count_cds.rds")

In [None]:

# load the models into the query
hf_cell_count_cds <- load_transform_models(hf_cell_count_cds,
                                           "ref_embryo_model")
# This currently runs without error. An earlier attempt failed with:
# Error in load_transform_models(hf_cell_count_cds, "ref_embryo_model") :
# md5sum mismatch for file 'ref_embryo_model/rdd_umap_transform_model.rds

# Apply the saved preprocess transform, then the saved UMAP transform, to the
# query.
# NOTE(review): the reference reduce_dimension() call used
# preprocess_method = "Aligned", while the query is transformed from "PCA"
# here -- confirm this is intended.
hf_cell_count_cds <- hf_cell_count_cds %>%
  preprocess_transform(method = "PCA") %>%
  reduce_dimension_transform(preprocess_method = "PCA")

In [None]:
# Save the query cds again after projection; this overwrites the checkpoint
# written earlier under the same path.
saveRDS(object = hf_cell_count_cds, file = "R_objects/hf_cell_count_cds.rds")

In [None]:
# Tag every query embryo with the batch label "HF".
colData(hf_cell_count_cds)$expt <- "HF"

In [None]:
# Combine query and reference embryo-level cds objects, keeping their reduced
# dimensions, and plot them together coloured by batch label.
ref_hf_cell_count_cds <- combine_cds(
  list(hf_cell_count_cds, ref_cell_count_cds),
  keep_reduced_dims = TRUE
)

ggsave(
  filename = "ref_hf_plot.png",
  plot = plot_cells(ref_hf_cell_count_cds, color_cells_by = "expt")
)

In [None]:
# Transfer the reference "timepoint" onto the query embryos with
# add_gap_time() (k = 15); the result is requested under the column name
# "mean_nn_time".
hf_cell_count_cds <- add_gap_time(
  hf_cell_count_cds,
  ref_cell_count_coldata,
  transfer_type = "timepoint",
  colname = "mean_nn_time",
  k = 15
)

In [None]:
# Transfer the reference "pseudostage" onto the query embryos with
# add_gap_time() (k = 10); the result is requested under the column name
# "mean_nn_pseudostage".
hf_cell_count_cds <- add_gap_time(
  hf_cell_count_cds,
  ref_cell_count_coldata,
  transfer_type = "pseudostage",
  colname = "mean_nn_pseudostage",
  k = 10
)

In [None]:
# or learn graph in the combined cds

In [None]:
# plot_cells(query_embryo_cds, color_cells_by = "ref_label_time", 
#            label_cell_groups = T, label_groups_by_cluster = F, cell_size = 1.5) +
#   scale_color_viridis_c() +
#   ggsave("plots/hf4_15nn-celltype_x1y2_umap3D.png")

# plot_cells(query_embryo_cds, color_cells_by = "pseudostage", 
#            label_cell_groups = T, label_groups_by_cluster = F, cell_size = 1.5) +
#   scale_color_viridis_c() 
