# `ccRemover`: removal of cell cycle from virtual tumor in mESC  

Here we apply `ccRemover` to remove cell-cycle effects from an mESC dataset in which the expression of 600 cell-cycle genes and 1000 other genes is doubled in 40% of the cells.
Data taken from [Cyclum](https://github.com/KChen-lab/Cyclum/tree/master/old-version/data/mESC).

## Import necessary packages

In [None]:
%load_ext rpy2.ipython

In [None]:
from pathlib import Path
from cyclum import writer
import sys

import pandas as pd
import numpy as np
import sklearn as skl
import sklearn.preprocessing
import scprep as scp
import scanpy as sc
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter


In [None]:
sys.path.append("../../")
from paths import DATA_DIR

In [None]:
# Common path prefix ("mask") of the input dataset; the loaders in this notebook
# append '-label.csv', '-value.bin' and '-name.txt' to it.
input_file_mask = str(DATA_DIR) + '/cellcycle_virtualtumor/perturbed-mesc-tpm-linear'
# Where the matrix returned by ccRemover is saved from R (RDS format).
output_file_rds = str(DATA_DIR) + '/cellcycle_virtualtumor/ccremover_xhat.rds'
# Where the AnnData object holding the corrected matrix is written.
output_file_h5ad = str(DATA_DIR) + '/cellcycle_virtualtumor/ccremover_adata.h5ad'

In [None]:
%%R
# knitr chunk option. NOTE(review): looks like a leftover from an R Markdown
# version of this analysis; confirm whether it is still needed in a notebook.
knitr::opts_chunk$set(echo = TRUE)
library(ccRemover)
# Fix the R random seed, presumably so that the ccRemover run is reproducible.
set.seed(10)


## Load data

In [None]:
def preprocess(input_file_mask):
    """
    Load the expression table and its labels, and standardize the expression values.

    The values are log-transformed (log2(x + 1)) and then passed to
    ``sklearn.preprocessing.scale`` with its default settings, i.e. each
    column is centered (mean = 0) and scaled (sd = 1).

    Parameters
    ----------
    input_file_mask : str
        Common path prefix of the dataset. The expression table is read with
        ``writer.read_df_from_binary(input_file_mask)`` and the labels from
        ``input_file_mask + '-label.csv'`` (tab-separated, first column used
        as the index).

    Returns
    -------
    sttpm : pandas.DataFrame
        Standardized expression values, carrying the index and columns of the
        table that was read.
    label : pandas.DataFrame
        Contents of the label file.
    """
    tpm = writer.read_df_from_binary(input_file_mask)

    # Log-transform first, then standardize the log values.
    log_tpm = np.log2(tpm.values + 1)
    standardized = skl.preprocessing.scale(log_tpm)
    sttpm = pd.DataFrame(data=standardized,
                         index=tpm.index, columns=tpm.columns,
                         dtype=float)

    label = pd.read_csv(input_file_mask + '-label.csv', sep="\t", index_col=0)
    return sttpm, label

# Load the standardized expression table and the labels used on the Python side.
sttpm, label = preprocess(input_file_mask)


In [None]:
%%R
# Read a dense numeric matrix from a little-endian binary file.
#
# File layout, as read here: two 4-byte integers (number of rows, then number
# of columns) followed by n_row * n_col 8-byte doubles stored row by row.
#
# Args:
#   file_name: path of the binary file.
#
# Returns:
#   A numeric matrix with n_row rows and n_col columns (no dimnames).
#
# Errors:
#   Stops if the header is incomplete or invalid, or if the file holds fewer
#   values than the header announces (instead of silently recycling them).
read_binary <- function(file_name) {
  con <- file(file_name, "rb")
  # Release the connection on every exit path, including errors.
  on.exit(close(con), add = TRUE)

  dims <- readBin(con, what = "integer", n = 2L, size = 4L, endian = "little")
  if (length(dims) != 2L || anyNA(dims) || any(dims < 0L)) {
    stop("Invalid or truncated header in '", file_name, "'", call. = FALSE)
  }
  n_row <- dims[1L]
  n_col <- dims[2L]

  # Multiply as doubles so that very large matrices do not overflow integers.
  n_values <- as.numeric(n_row) * n_col
  values <- readBin(con, what = "double", n = n_values, size = 8L,
                    endian = "little")
  if (length(values) != n_values) {
    stop("Expected ", n_values, " values in '", file_name, "' but read ",
         length(values), call. = FALSE)
  }

  # byrow = TRUE gives the same result as filling an n_col x n_row matrix
  # column-wise and transposing it.
  matrix(values, nrow = n_row, ncol = n_col, byrow = TRUE)
}

# Read a matrix together with its row and column names.
#
# Args:
#   file_name_mask: common path prefix. The values are read from
#     "<mask>-value.bin" (via read_binary) and the names from
#     "<mask>-name.txt", a tab-separated list holding the row names first,
#     followed by the column names.
#
# Returns:
#   The matrix with its rownames and colnames set.
#
# Errors:
#   Stops if the number of names differs from nrow + ncol of the matrix.
read_binary_with_name <- function(file_name_mask) {
  m <- read_binary(paste0(file_name_mask, "-value.bin"))
  all_names <- scan(paste0(file_name_mask, "-name.txt"),
                    what = "character", sep = "\t")

  n_row <- nrow(m)
  n_col <- ncol(m)
  if (length(all_names) != n_row + n_col) {
    stop("Expected ", n_row + n_col, " names (", n_row, " rows + ", n_col,
         " columns) but found ", length(all_names), call. = FALSE)
  }

  # seq_len() (unlike 1:n) stays correct when a dimension is zero.
  rownames(m) <- all_names[seq_len(n_row)]
  colnames(m) <- all_names[n_row + seq_len(n_col)]
  m
}

In [None]:
%%R -i input_file_mask

# Read the expression matrix and its row/column names from the files that
# share the prefix passed in from Python.
exp.mat <- read_binary_with_name(input_file_mask)


In [None]:
%%R
# log2(x + 1) transform, then scale() (column-wise centering and scaling by
# default), then transpose. After t(), the rows are what the following cells
# treat as genes (gene_names <- rownames(rdata)).
rdata <- t(scale(log2(exp.mat+1)))

# Replace NaN entries with 0; these arise e.g. for constant columns, where
# scale() divides 0 by a standard deviation of 0.
rdata[is.nan(rdata)] <- 0

In [None]:
%%R -o rdata
# Preview the first rows of the first five columns.
head(rdata[,1:5])

## The cell cycle genes

In [None]:
%%R

# The row names of rdata are used as gene names.
gene_names <- rownames(rdata)
# gene_indexer() is presumably provided by ccRemover (loaded earlier); it is
# queried for mouse and its result is used below as positions into gene_names.
cell_cycle_gene_indices <- gene_indexer(gene_names, species = "mouse")
# Show the first few gene names selected as cell-cycle genes.
head(gene_names[cell_cycle_gene_indices])

- Create the logical indicator vector (one entry per gene, `TRUE` for cell-cycle genes) that will be used in the main `ccRemover` procedure.

In [None]:
%%R

# One logical flag per row of rdata, initially all FALSE ...
if_cc <- rep(FALSE,nrow(rdata)) 
# ... then TRUE at the positions returned for the cell-cycle genes.
if_cc[cell_cycle_gene_indices] <- TRUE
# Report how many flags are TRUE / FALSE.
summary(if_cc)

In [None]:
%%R

# Bundle the matrix and the cell-cycle flags into the list passed to ccRemover().
dat <- list(x=rdata, if_cc=if_cc)

## ccRemover

In [None]:
%%R
# Run ccRemover on the bundled data. bar=TRUE presumably enables a progress
# bar; confirm against the ccRemover documentation.
xhat <- ccRemover(dat, bar=TRUE)

In [None]:
%%R -i output_file_rds
# Persist the ccRemover output to the RDS path passed in from Python.
saveRDS(xhat, file = output_file_rds)

In [None]:
%%R -o xhat

# Print the dimensions of the ccRemover output.
dim(xhat)

In [None]:
# Convert every label column to a pandas categorical.
label = label.astype("category")

# Build the AnnData from the standardized table, then replace its matrix with
# the ccRemover output. xhat is transposed, presumably because the R matrix is
# genes x cells while AnnData stores cells x genes. TODO confirm shapes match.
sttpm_adata_ccremover = sc.AnnData(sttpm)
sttpm_adata_ccremover.X = xhat.T
sttpm_adata_ccremover.obs['subcluster'] = label['subcluster']
sttpm_adata_ccremover.obs['stage'] = label['stage']
# NOTE(review): this relabels the existing 'stage' categories with the given
# names; verify that the existing category order corresponds to G1, G2M, S.
sttpm_adata_ccremover.rename_categories('stage', ['G1', 'G2M', 'S'])

In [None]:
# Write the AnnData object to the .h5ad output path.
sttpm_adata_ccremover.write(output_file_h5ad)