# `Seurat`: removal of cell cycle from virtual tumor in mESC 

Here we apply `Seurat` to remove cell-cycle effects from an mESC dataset with 600 cell-cycle genes and 1000 other genes whose expression is doubled in 40% of the cells.

**Note:** data taken from [Cyclum](https://github.com/KChen-lab/Cyclum/tree/master/old-version/data/mESC).

In [None]:
%load_ext rpy2.ipython


In [None]:
from pathlib import Path
import sys

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter

In [None]:
sys.path.append("../../")
from paths import DATA_DIR

In [None]:
# Input file prefix and the two output files; all of them sit in the
# cellcycle_virtualtumor folder under DATA_DIR.
input_file_name_mask = f"{DATA_DIR}/cellcycle_virtualtumor/perturbed-mesc-tpm-linear"
output_file_rds = f"{DATA_DIR}/cellcycle_virtualtumor/mesc_seurat.rds"
output_file_seurat = f"{DATA_DIR}/cellcycle_virtualtumor/mesc_seurat.h5Seurat"

In [None]:
%%R
library(Seurat)
library(biomaRt)

## Helper functions

In [None]:
%%R
#' Read a dense numeric matrix from a little-endian binary file.
#'
#' File layout as read here: two 4-byte integers (number of rows, then number
#' of columns) followed by `n_row * n_col` 8-byte doubles stored row by row.
#'
#' @param file_name Path of the binary file to read.
#' @return A numeric matrix with `n_row` rows and `n_col` columns.
read_binary <- function(file_name) {
  f <- file(file_name, "rb")
  # Release the connection even when one of the reads below fails.
  on.exit(close(f), add = TRUE)

  n_row <- readBin(f, what = "int", n = 1, size = 4, endian = "little")
  n_col <- readBin(f, what = "int", n = 1, size = 4, endian = "little")
  if (length(n_row) != 1 || length(n_col) != 1) {
    stop("'", file_name, "' is too short to contain the matrix dimensions",
         call. = FALSE)
  }

  # Multiply as doubles so very large matrices do not overflow integer math.
  n_values <- as.numeric(n_row) * n_col
  values <- readBin(f, what = "double", n = n_values, size = 8,
                    endian = "little")
  # A truncated file would otherwise be silently recycled into a wrong matrix.
  if (length(values) != n_values) {
    stop("'", file_name, "' holds ", length(values), " values but its header ",
         "announces ", n_row, " x ", n_col, call. = FALSE)
  }

  # The values are stored row by row, so fill by row instead of building the
  # transposed matrix and calling t() on it.
  matrix(values, nrow = n_row, ncol = n_col, byrow = TRUE)
}

#' Read a binary matrix together with its row and column names.
#'
#' Reads `<file_name_mask>-value.bin` with `read_binary()` and
#' `<file_name_mask>-name.txt` as tab-separated names. The first `nrow` names
#' label the rows and the remaining ones label the columns.
#'
#' @param file_name_mask Common path prefix of the `-value.bin` and
#'   `-name.txt` files.
#' @return The matrix from `read_binary()` with row and column names set.
read_binary_with_name <- function(file_name_mask) {
  m <- read_binary(paste0(file_name_mask, "-value.bin"))
  all_names <- scan(paste0(file_name_mask, "-name.txt"), what="character", sep = "\t")

  # Fail with a clear message instead of a cryptic dimnames error when the
  # names file does not match the matrix shape.
  n_expected <- nrow(m) + ncol(m)
  if (length(all_names) != n_expected) {
    stop("expected ", n_expected, " names (", nrow(m), " rows + ", ncol(m),
         " columns) but found ", length(all_names), call. = FALSE)
  }

  # seq_len() stays correct for a zero-row matrix, where 1:nrow(m) would
  # yield c(1, 0) and pick the wrong names.
  rownames(m) <- all_names[seq_len(nrow(m))]
  colnames(m) <- all_names[nrow(m) + seq_len(ncol(m))]
  m
}

## Load and pre-process data

In [None]:
%%R -i input_file_name_mask

# Read the expression matrix and its row/column names from the files that
# share the prefix passed in from Python.
exp.mat <- read_binary_with_name(input_file_name_mask)

# The matrix is transposed before it is handed to Seurat; presumably exp.mat
# is cells x genes while Seurat wants genes in rows -- TODO confirm.
mesc <- CreateSeuratObject(counts = t(exp.mat))
# Normalize, select variable features with the "vst" method, then scale all
# rows (features) of the object rather than only the variable ones.
mesc <- NormalizeData(mesc)
mesc <- FindVariableFeatures(mesc, selection.method = "vst")
mesc <- ScaleData(mesc, features = rownames(mesc))
# PCA on the variable features; the printed summary is restricted to
# components 6-10 with 10 features each.
mesc <- RunPCA(mesc, features = VariableFeatures(mesc), ndims.print = 6:10, nfeatures.print = 10)

In [None]:
%%R 
# Heatmaps for principal components 8 and 10 of the PCA computed above.
DimHeatmap(mesc, dims = c(8, 10))

In [None]:
%%R 
# Preview the first entries of the object (for a Seurat object this is
# presumably the per-cell metadata -- confirm).
head(mesc)

## Assign Cell-Cycle Scores

In [None]:
%%R 
library(stringr)

In [None]:
%%R 
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat
# (`cc.genes.updated.2019`). It is split into S phase markers (`s.genes`) and
# G2/M phase markers (`g2m.genes`).
# The symbols are lower-cased and then title-cased (first letter upper case),
# presumably to match the mouse gene naming of this mESC dataset -- confirm
# that the converted symbols actually occur in rownames(mesc).
m.s.genes <- str_to_title(tolower(cc.genes.updated.2019$s.genes)) 
m.g2m.genes <- str_to_title(tolower(cc.genes.updated.2019$g2m.genes)) 

# Score every cell for S and G2/M; set.ident = TRUE is passed so that the
# assigned phase is also used as the cell identity.
mesc <- CellCycleScoring(mesc, s.features = m.s.genes, g2m.features = m.g2m.genes, set.ident = TRUE)

# view cell cycle scores and phase assignments (first rows of the metadata)
head(mesc[[]])

In [None]:
%%R 
# PCA restricted to the S and G2/M marker genes, run before the cell cycle
# scores are regressed out. This overwrites the PCA stored in `mesc`.
mesc <- RunPCA(mesc, features = c(m.s.genes, m.g2m.genes))
# Plot the cells in the reduced space (presumably grouped by the phase
# identity set during cell cycle scoring -- confirm).
DimPlot(mesc)

## Regress out cell cycle scores during data scaling

In [None]:
%%R -i output_file_rds
# Re-scale all rows (features) of the object while regressing out the
# "S.Score" and "G2M.Score" variables, then redo the PCA on the variable
# features.
mesc <- ScaleData(mesc, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(mesc))
mesc <- RunPCA(mesc, features = VariableFeatures(mesc), nfeatures.print = 10)
# Persist the processed object to the RDS path passed in from Python.
saveRDS(mesc, file = output_file_rds)

In [None]:
%%R -i output_file_seurat
library(SeuratDisk)
# Export the object as h5Seurat, then convert that file to h5ad.
# overwrite = TRUE keeps the cell re-runnable: both functions default to
# overwrite = FALSE and stop when the destination file already exists,
# whereas saveRDS() for the .rds output silently replaces it.
SaveH5Seurat(mesc, filename = output_file_seurat, overwrite = TRUE)
Convert(output_file_seurat, dest = "h5ad", overwrite = TRUE)

In [None]:
%%R 
# Repeat the PCA on the S and G2/M marker genes, now on the data scaled with
# the cell cycle scores regressed out, for comparison with the earlier plot.
# This overwrites the PCA stored in `mesc` (the files saved in the previous
# cells are not affected).
mesc <- RunPCA(mesc, features = c(m.s.genes, m.g2m.genes))
DimPlot(mesc)