# `Cyclum`: removal of cell cycle from virtual tumor in mESC 


Here we apply `Cyclum` to remove the cell-cycle effect from an mESC dataset with 600 cell-cycle genes and 1000 other genes, where the expression of those other genes is doubled in 40% of the cells.

**Note:** data taken from [Cyclum](https://github.com/KChen-lab/Cyclum/tree/master/old-version/data/mESC).

## Import necessary packages

In [None]:
%load_ext autoreload
%autoreload 1

In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
import sklearn as skl
import sklearn.preprocessing
import scprep as scp
import scanpy as sc
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import sys


In [None]:
sys.path.append("../../")
from paths import DATA_DIR

In [None]:
import cyclum.tuning
import cyclum.models
from cyclum import writer
import cyclum.illustration
import cyclum.evaluation

TensorFlow may print warnings here. They can be safely ignored.

## Helper functions

In [None]:
def compute_projection_op(Temb):
    """
    Build the cell-by-cell projection operator from a probabilistic embedding.

    Temb: probabilistic embedding (cells X locations)

    Returns F = p(l|c) . p(c|l)^T, a (cells X cells) matrix.
    """
    per_cell_total = Temb.sum(axis=1)
    per_loc_total = Temb.sum(axis=0)

    # Normalize each cell across locations: p(l|c), every row sums to 1.
    loc_given_cell = Temb / per_cell_total[:, np.newaxis]
    # Normalize each location across cells: p(c|l), every column sums to 1.
    cell_given_loc = Temb / per_loc_total

    return loc_given_cell.dot(cell_given_loc.T)

def filter_recons(X, Temb):
    """
    Remove from X the component explained by the embedding.

    X: input dataset (cells X genes)
    Temb: probabilistic mapping to embedding manifold (cells X embedding dim)

    Returns (Xres, Xres_or, proj):
        Xres    -- residuals shifted per gene so that the minimum is 0
        Xres_or -- raw residuals X - proj
        proj    -- projection of X through p(l|c) . p(c|l)^T
    """
    per_cell_total = Temb.sum(axis=1)
    per_loc_total = Temb.sum(axis=0)

    # p(l|c): normalize each cell across locations.
    loc_given_cell = Temb / per_cell_total[:, np.newaxis]
    # p(c|l): normalize each location across cells.
    cell_given_loc = Temb / per_loc_total

    # Build the (cells X cells) operator first, then apply it to X.
    smoother = loc_given_cell.dot(cell_given_loc.T)
    proj = smoother.dot(X)

    Xres_or = X - proj
    # Ensure non-negativity: shift every gene (column) by its own minimum.
    gene_floor = Xres_or.min(axis=0)
    Xres = Xres_or - gene_floor

    return Xres, Xres_or, proj

def plot_round_distr_color(flat, label, color_dict, fig_name=None):
    """
    Draw cells and per-group density curves on a polar axis.

    flat: angular position of each cell (used as the polar angle)
    label: group label of each cell, aligned with flat
    color_dict: mapping from group label to plot color; its iteration order
        determines the order of the density bands and legend entries
    fig_name: optional output path; when given, the figure is saved at 300 dpi

    Returns (figure, xx), where xx holds, per group, the first element
    returned by cyclum.evaluation.periodic_parzen_estimate.
    """
    figure = plt.figure()
    ax = figure.subplots(subplot_kw={'projection': 'polar'})

    # Inner ring: one short radial tick (radius 1.5 to 2) per cell.
    cell_colors = [color_dict[l] for l in label]
    for angle, cell_color in zip(flat, cell_colors):
        ax.plot([angle, angle], [1.5, 2], color=cell_color, linewidth=0.5)

    # Per-group estimate over a period of 2*pi.
    # NOTE(review): presumably element 0 is the angle grid and element 1 the density -- confirm.
    xx = []
    pp = []
    max_p = 0
    for group in color_dict:
        estimate = cyclum.evaluation.periodic_parzen_estimate(flat[label == group], 2 * np.pi)
        xx.append(estimate[0])
        pp.append(estimate[1])
        # Track the global peak so all bands share one normalization.
        max_p = np.max([np.max(pp[-1]), max_p])

    # Outer ring: filled bands starting at radius 2, scaled by the global peak.
    for grid, density, group in zip(xx, pp, color_dict):
        ax.fill_between(grid, density / max_p + 2, 2, color=color_dict[group], alpha=0.5, linewidth=0.0, label=group)

    ax.legend(bbox_to_anchor=(1.4, 1.2))
    ax.set_yticks([])
    plt.tight_layout()

    if fig_name is not None:
        plt.savefig(fig_name, dpi=300)
    return figure, xx

def match_genes_idx(sc_gene_names, ref_gene_names):
    """
    Match reference genes to single-cell genes by case-insensitive name.

    sc_gene_names: iterable of gene names in the single-cell dataset
    ref_gene_names: iterable of reference gene names to look up

    Returns (idx_in_sc, idx_in_ref): two integer arrays of equal length.
    For every reference gene found in sc_gene_names, idx_in_ref holds its
    position in ref_gene_names and idx_in_sc the position of its FIRST
    occurrence in sc_gene_names. Entries are ordered by reference position.
    """
    # Index the single-cell names once; setdefault keeps the first occurrence
    # when the same name (ignoring case) appears more than once.
    first_pos_by_name = {}
    for pos, name in enumerate(sc_gene_names):
        first_pos_by_name.setdefault(name.upper(), pos)

    sc_hits = []
    ref_hits = []
    for i, gene in enumerate(ref_gene_names):
        pos = first_pos_by_name.get(gene.upper())
        if pos is not None:
            sc_hits.append(pos)
            ref_hits.append(i)

    # Build the arrays in one step instead of growing them with np.append.
    idx_in_sc = np.array(sc_hits, dtype='int')
    idx_in_ref = np.array(ref_hits, dtype='int')
    return idx_in_sc, idx_in_ref


## Read data
Here labels are available, so we load both the expression data and the labels. However, the labels are not used until evaluation.

In [None]:
input_file_name_mask = str(DATA_DIR) + '/cellcycle_virtualtumor/perturbed-mesc-tpm-linear'


def preprocess(input_file_mask):
    """
    Read the TPM table and its labels, then standardize the expression values.

    The values are log-transformed (log2(x + 1)) and passed through
    sklearn.preprocessing.scale, which by default centers each column
    (mean = 0) and scales it to unit variance (sd = 1).

    input_file_mask: path prefix of the input files. The expression table is
        read via writer.read_df_from_binary(input_file_mask); the labels are
        read from input_file_mask + '-label.csv' (tab-separated, first column
        used as index).

    Returns (sttpm, label): the standardized expression DataFrame, with the
    same index and columns as the raw table, and the label DataFrame.
    """
    tpm = writer.read_df_from_binary(input_file_mask)
    sttpm = pd.DataFrame(data=skl.preprocessing.scale(np.log2(tpm.values + 1)), index=tpm.index, columns=tpm.columns)

    label = pd.read_csv(input_file_mask + '-label.csv', sep="\t", index_col=0)
    return sttpm, label


# Pass the module-level path prefix defined above; `input_file_mask` only
# exists as the parameter name inside preprocess().
sttpm, label = preprocess(input_file_name_mask)

There is no convention whether cells should be columns or rows. Here we require cells to be rows.

In [None]:
sttpm.head()

In [None]:
label.head()

## Set up the model, fit and predict


In [None]:
# Build the auto-tuned Cyclum model on the standardized expression matrix.
# NOTE(review): the meaning of each keyword is defined by cyclum.tuning.CyclumAutoTune -- confirm there.
model = cyclum.tuning.CyclumAutoTune(
    sttpm.values,
    max_linear_dims=3,
    epochs=500,
    rate=2e-4,
    verbose=100,
    encoder_width=[40, 20],
)

In [None]:
# Show the elbow plot from auto-tuning, then print the summary of the selected model.
model.show_elbow()
model.model.summary()

In [None]:
# Train the selected model on the full standardized matrix.
model.train(
    sttpm.values,
    epochs=800,
    verbose=100,
    rate=2e-4,
)

In [None]:
## predictions
# Pseudotime for every cell, and the model weight matrix.
pseudotime = model.predict_pseudotime(sttpm.values)
weights = model.get_weight()

# Last two weight rows, taken in reverse order (last row first).
rotation = weights[[-1, -2], :]

# Treat the same two rows as one complex number per column
# (second-to-last row = real part, last row = imaginary part).
z = weights[-2, :] + 1j * weights[-1, :]
arg, mag = np.angle(z), np.abs(z)

weights.shape

In [None]:
## save predictions
# Store all three arrays in one compressed .npz archive under DATA_DIR.
np.savez_compressed(
    DATA_DIR / 'predictions',
    pseudotime=pseudotime,
    weights=weights,
    rotation=rotation,
)

## Illustrations
We illustrate the results on a circle, to show its circular nature. 
There is virtually no start and end of the circle.
Red, green and blue represents G0/G1, S and G2/M phase respectively.
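The inner lines represent single cells, each drawn at the angle given by its inferred pseudotime.
The filled areas outside the ring show, for each phase, the estimated density of cells along the circle.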

In [None]:
%aimport cyclum.illustration

In [None]:
# Plot colors for each annotation: cell-cycle stage and subcluster membership.
color_map = {
    'stage': {"g0/g1": "red", "s": "green", "g2/m": "blue"},
    'subcluster': {"intact": "cyan", "perturbed": "violet"},
}

# The notebook-local helper is used here; the library call it replaced was:
# cyclum.illustration.plot_round_distr_color(pseudotime, label['stage'], color_map['stage'])
plot_round_distr_color(pseudotime[:, 0], label['stage'], color_map['stage'])
# NOTE(review): presumably kept so the cell does not echo the returned (figure, xx) tuple.
pass

## Subclone analysis

The data corrected by Cyclum show better separation of the two subclones in the t-SNE plot.

In [None]:
# Cyclum filter: subtract the circular component, i.e. [cos(pseudotime), sin(pseudotime)]
# multiplied by the two `rotation` weight rows, from the standardized data.
sttpm_cycFlt = sttpm - (
    np.concatenate([np.cos(pseudotime), np.sin(pseudotime)], axis=1) @ rotation
)
