In [None]:
library(SPARSim)
library(dplyr)
library(Seurat)
library(qusage)
library(stringr)
library(ggplot2)
library('biomaRt')

# Show the R version and the attached package versions so that a run of this
# notebook can be reproduced later.
sessionInfo()

In [None]:
# One-time setup of the packages this notebook relies on. The first two lines
# are kept commented out and are meant to be run manually when needed.
# devtools::install_gitlab("sysbiobig/sparsim", build_opts = c("--no-resave-data", "--no-manual"), build_vignettes = TRUE)
# install.packages(c("Seurat", "dplyr", "stringr"))
bioc_packages <- c("qusage", "edgeR", "mclust", "scater", "scran")
BiocManager::install(bioc_packages)

In [None]:
# Gene-set collection (GMT file) from which candidate pathways are drawn.
use_file <- "go_human.bp.gmt"

# Alternative source (disabled): fetch the current Reactome gene sets.
# if (!file.exists("ReactomePathways.gmt")) {
#   # download_and_unzip("https://reactome.org/download/current/ReactomePathways.gmt.zip", ".", "ReactomePathways.gmt.zip", ".")
#   temp <- tempfile()
#   download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", temp)
#   Reactome <- qusage::read.gmt(unzip(temp, exdir = "./"))
#   unlink(temp)
# }

# Later cells use `pathways` as a named collection: names(pathways) are the
# pathway names and pathways[[name]] holds the member genes.
pathways <- qusage::read.gmt(use_file)



In [None]:
# List the pathway names that mention "receptor".
grep("receptor", names(pathways), value = TRUE)

In [None]:
# Presets shipped with SPARSim:
# c("Tung_param_preset", "Camp_param_preset", "Engel_param_preset", "Chu_param_preset", "Horning_param_preset", "Bacher_param_preset", "Brain_10X_param_preset", "T_10X_param_preset", "PBMC_10X_param_preset", "Zheng_param_preset", "Macosko_param_preset", "Saunders_param_preset")
# The gene identifiers must end up matching the gene names used in `pathways`.
data("PBMC_10X_param_preset")
preset <- PBMC_10X_param_preset[[1]]
intensity <- preset$intensity

genes_df <- data.frame(names = names(intensity))

# Translate the preset's Ensembl gene IDs into HGNC symbols.
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
G_list <- getBM(
  filters = "ensembl_gene_id",
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  values = genes_df$names,
  mart = mart
)

# Look the symbols up with match() instead of merge(): merge() sorts its
# result by the key column and emits one row per (ID, symbol) pair, so the
# symbol column would no longer line up element-by-element with `intensity`
# (or would even have a different length). match() preserves both order and
# length and takes the first symbol reported for each ID.
genes_df$hgnc_symbol <-
  G_list$hgnc_symbol[match(genes_df$names, G_list$ensembl_gene_id)]

# A symbol is unusable when it is missing, empty (biomaRt typically reports
# genes without an HGNC symbol as ""), or a repeat of an earlier symbol.
# Unusable entries get a unique "NA_<k>" placeholder and are dropped below.
unusable <- is.na(genes_df$hgnc_symbol) |
  !nzchar(genes_df$hgnc_symbol) |
  duplicated(genes_df$hgnc_symbol)
genes_df$hgnc_symbol[unusable] <- paste0("NA_", seq_len(sum(unusable)))

genes <- genes_df$hgnc_symbol
names(intensity) <- genes

# Keep only genes that received a real, unique symbol.
keepIndices <- grep("^NA_", names(intensity), invert = TRUE)
intensity <- intensity[keepIndices]
variability <- preset$variability[keepIndices]

library_size <- preset$lib_size * 10 ### here lib factor
n_cells <- length(library_size)
genes <- names(intensity)
# Counted after filtering so that it agrees with `intensity` and `genes`.
n_genes <- length(genes)

In [None]:
# Distribution of gene intensities, zoomed in on the range [0, 3].
hist(
  intensity,
  breaks = 2000,
  xlim = c(0, 3)
)

In [None]:
# Genes whose baseline intensity exceeds 0.05.
is_high_intensity <- intensity > 0.05
highIntensityGenes <- names(intensity)[is_high_intensity]

In [None]:
# Select pathways that have more than one member gene and in which more than
# half of all member genes are high-intensity genes. Each selected pathway is
# printed together with its size and its number of high-intensity genes.
keepPWs <- c()

for (pw in names(pathways)) {
  members <- pathways[[pw]]
  numGenes <- length(members)
  numHighIntensityGenes <- length(intersect(highIntensityGenes, members))
  highIntensityRatio <- numHighIntensityGenes / numGenes

  # Skip pathways that are too small or not sufficiently expressed.
  if ((numGenes <= 1) || (highIntensityRatio <= 0.5)) {
    next
  }

  print(paste(pw, numGenes, numHighIntensityGenes))
  keepPWs <- c(keepPWs, pw)
}

In [None]:
# Preview up to the first ten selected pathways. head() does not pad the
# result with NA when fewer than ten pathways were kept.
head(keepPWs, 10)

In [None]:
# Set states
states <- c("wildtype", "knockout01", "knockout02", "knockout03")

# Maximum number of pathways that may be assigned to one state.
num_pws_per_state <- 10

# state -> assigned pathway names / genes of those pathways / pathway count
state2pw <- list()
state2genes <- list()
state2count <- setNames(as.list(rep(0, length(states))), states)

# Greedy assignment, in the order of keepPWs:
#  * a pathway sharing genes with exactly one state joins that state,
#  * a pathway sharing genes with no state joins the first state with room,
#  * a pathway sharing genes with several states is skipped,
# and in every case only while the target state is below its quota.
for (pw in keepPWs) {
  pwGenes <- intersect(pathways[[pw]], names(intensity))

  # States whose already collected genes overlap with this pathway.
  overlapping <- Filter(
    function(s) {
      (s %in% names(state2genes)) &&
        (length(intersect(pwGenes, state2genes[[s]])) > 0)
    },
    states
  )

  curstate <- NULL
  if (length(overlapping) == 1) {
    if (state2count[[overlapping]] < num_pws_per_state) {
      curstate <- overlapping
    }
  } else if (length(overlapping) == 0) {
    has_room <- vapply(
      states,
      function(s) state2count[[s]] < num_pws_per_state,
      logical(1)
    )
    if (any(has_room)) {
      curstate <- states[has_room][1]
    }
  }

  if (is.null(curstate)) {
    next
  }

  # c(NULL, x) is x, so first and subsequent assignments share one code path.
  state2genes[[curstate]] <- c(state2genes[[curstate]], pwGenes)
  state2pw[[curstate]] <- c(state2pw[[curstate]], pw)
  state2count[[curstate]] <- state2count[[curstate]] + 1
}

print(state2pw)

In [None]:


hist(log(intensity + 1))

# Set scaling for changes
# Every changing pathway gets one multiplicative factor per state (in the
# order of `states`); the factor is applied to every gene of the pathway.
changes <- list()

set.seed(1)

# One fold-change pattern per group of pathways; the i-th pattern is applied
# to the pathways collected under the i-th entry of state2pw. Groups without
# a pattern (there are fewer patterns than states) remain unchanged.
trends <- list(c(1, 1, 4, 4), c(1, 8, 8, 1), c(2, 2, 1, 1))
stopifnot(all(lengths(trends) == length(states)))

for (i in seq_len(min(length(trends), length(state2pw)))) {
  statename <- names(state2pw)[i]

  for (pw in state2pw[[statename]]) {
    changes[[pw]] <- trends[[i]]
  }
}

#changes[["Glycerophospholipid catabolism"]]=c(1,1,4,4)
#changes[["Glutathione synthesis and recycling"]]=c(1,1,4,4)
#changes[["G1 Phase"]]=c(1,8,8,1)
#changes[["Glutathione synthesis and recycling"]]=c(1,8,8,1)
#changes[["Formation of apoptosome"]]=c(1,1,.25,.25)
#changes[["Downstream signaling of activated FGFR4"]]=c(1,1,.25,.25)
if (length(changes) == 0) {
  stop("No pathway was assigned a trend; nothing to simulate.", call. = FALSE)
}

# do.call(rbind, ...) yields a matrix even for a single pathway, whereas
# Reduce(rbind, ...) would return a bare vector in that case.
changes_df <- data.frame(do.call(rbind, unname(changes)))
rownames(changes_df) <- names(changes)
colnames(changes_df) <- states
write.table(changes_df, file = "simulated_changingPathways_random.tsv", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)

# One SPARSim input parameter set per state, all starting from the baseline.
parameter_list <- lapply(states, function(x) {
  list(
    intensity = intensity,
    variability = variability,
    library_size = library_size,
    feature_names = genes,
    sample_names = paste0(x, "-Cell", 1:n_cells),
    condition_name = x
  )
})
names(parameter_list) <- states

# Scale relative to the *baseline* intensity. Pathways grouped under the same
# state share genes by construction, and multiplying the running value once
# per pathway would compound the factor for shared genes (4 -> 16 -> 64 ...),
# contradicting the fold changes written to the ground-truth table above.
for (pw_name in names(changes)) {
  scaling <- changes[[pw_name]]
  in_geneset <- names(intensity) %in% pathways[[pw_name]]
  for (i in seq_along(parameter_list)) {
    parameter_list[[i]]$intensity[in_geneset] <- intensity[in_geneset] * scaling[i]
  }
}

print(summary(parameter_list))
intensity_matrix <- do.call(cbind, lapply(parameter_list, function(x) x$intensity))
print("Changing genes:")
# Rows whose intensity differs between at least two states.
intensity_matrix[apply(intensity_matrix, 1, sd, na.rm = TRUE) != 0, , drop = FALSE]

In [None]:
# Sanity check: these are plain vectors, for which dim() is always NULL, so
# compare their lengths instead. All values should be equal.
length(intensity)
length(parameter_list[[1]]$variability)
length(parameter_list[[2]]$intensity)
length(parameter_list[[3]]$intensity)
length(parameter_list[[4]]$intensity)

In [None]:
# Wrap each state's parameters into a SPARSim simulation parameter object.
param_per_condition <- lapply(parameter_list, function(x) {
  SPARSim_create_simulation_parameter(
    intensity = x$intensity,
    variability = x$variability,
    library_size = x$library_size,
    feature_names = x$feature_names,
    sample_names = x$sample_names,
    condition_name = x$condition_name
  )
})
sim_result <- SPARSim_simulation(dataset_parameter = param_per_condition)

# Sample names were built as "<state>-Cell<k>", so the text before the first
# "-" is the state of each simulated cell. vapply() guarantees a character
# vector regardless of the input (sapply() may return a list).
condition_vector <- vapply(
  stringr::str_split(colnames(sim_result$count_matrix), "-"),
  function(parts) parts[[1]],
  character(1)
)

In [None]:

# Make gene and cell names syntactically valid ("-" becomes ".").
rownames(sim_result$count_matrix) <- make.names(rownames(sim_result$count_matrix))
colnames(sim_result$count_matrix) <- make.names(colnames(sim_result$count_matrix))

# Key the condition labels by the (renamed) cell names so they can be looked
# up by name after Seurat has filtered cells.
cell_conditions <- setNames(condition_vector, colnames(sim_result$count_matrix))

sObject <- CreateSeuratObject(counts = sim_result$count_matrix, project = "simulated", min.cells = 0, min.features = 200)
# make.names() above turned the "MT-" prefix into "MT.", so accept both forms;
# the original pattern "^MT-" could no longer match any gene.
sObject[["percent.mt"]] <- PercentageFeatureSet(sObject, pattern = "^MT[.-]")

# Quality-control plots.
VlnPlot(sObject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(sObject, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sObject, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# Normalisation and selection of highly variable genes.
sObject <- NormalizeData(sObject, normalization.method = "LogNormalize", scale.factor = 10000)
sObject <- FindVariableFeatures(sObject, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(sObject), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(sObject)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

all.genes <- rownames(sObject)
sObject <- ScaleData(sObject, features = all.genes)

# Dimensionality reduction.
sObject <- RunPCA(sObject, features = VariableFeatures(object = sObject))
print(sObject[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(sObject, dims = 1:2, reduction = "pca")
DimPlot(sObject, reduction = "pca")

DimHeatmap(sObject, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(sObject, dims = 1:15, cells = 500, balanced = TRUE)

# Clustering and embedding.
# NOTE(review): neighbours use PCs 1:5 while UMAP uses PCs 1:10 - confirm
# that the difference is intended.
sObject <- FindNeighbors(sObject, dims = 1:5)
sObject <- FindClusters(sObject, resolution = 2)
head(Idents(sObject), 5)

sObject <- RunUMAP(sObject, dims = 1:10)
DimPlot(sObject, reduction = "umap")

# Attach the simulated condition of each cell. Looking the labels up by cell
# name keeps them aligned even if min.features removed some cells, in which
# case a positional assignment of condition_vector would no longer fit.
sObject[["cell_names"]] <- unname(cell_conditions[colnames(sObject)])
DimPlot(sObject, group.by = "cell_names") + NoLegend()

In [None]:
# NOTE(review): this executes code downloaded from a remote URL at run time;
# consider pinning the URL to a specific commit or vendoring the file.
# getExtendedExpressionData() is not defined in this notebook and is
# presumably provided by this script - confirm.
source("https://raw.githubusercontent.com/mjoppich/FlowSets/main/seurat_util_functions.R")

# Persist the processed Seurat object.
saveRDS(object = sObject, file = "simulated_scdata_random.rds")

# Expression summary grouped by simulated condition.
summarised_data <- getExtendedExpressionData(sObject, group.by = "cell_names")
write.table(
  summarised_data,
  file = "summarised_simulated_scdata_random.tsv",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE,
  quote = FALSE
)

# Ground truth: genes whose intensity differs between conditions.
gene_sd <- apply(intensity_matrix, 1, sd, na.rm = TRUE)
write.table(
  intensity_matrix[gene_sd != 0, ],
  file = "simulated_changingGenes_random.tsv",
  sep = "\t",
  row.names = TRUE,
  col.names = TRUE,
  quote = FALSE
)


In [None]:
# Display the fold-change patterns (one factor per state) that were applied
# to the changing pathways.
trends

In [None]:
options(repr.plot.width = 8, repr.plot.height = 4)

# For every group of pathways, draw violin plots of its first few genes
# across the simulated conditions, labelled with the applied trend.
# Iterating over state2genes (not `states`) avoids NA names when fewer groups
# were filled than states exist.
for (stateindex in seq_along(state2genes)) {
  state <- names(state2genes)[stateindex]

  # The i-th trend belongs to the i-th pathway group; groups beyond the
  # number of trends were left unchanged. collapse (not sep) is what joins
  # the factors of one trend into "1->1->4->4".
  trend_label <- if (stateindex <= length(trends)) {
    paste(trends[[stateindex]], collapse = "->")
  } else {
    "no trend"
  }
  plotnamesuffix <- paste(state, trend_label)

  # unique(): genes shared by several pathways of a group occur repeatedly.
  for (gene in head(unique(state2genes[[state]]))) {
    # Gene names in the Seurat object went through make.names().
    gene <- make.names(gene)
    p <- VlnPlot(sObject, gene, group.by = "cell_names") +
      ggtitle(paste(gene, plotnamesuffix))
    plot(p)
  }
}