# What is this notebook about?

Analyse the cell cycle across the Broad Institute collection of single-cell RNA-seq datasets of cancer cell lines.

(c) A. Chervov, A. Zinovyev

See paper: https://arxiv.org/abs/2208.05229 for cell cycle analysis.

    "Computational challenges of cell cycle analysis using single cell transcriptomics"
    Alexander Chervov, Andrei Zinovyev

Data source:

GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE157220

BROAD: https://singlecell.broadinstitute.org/single_cell/study/SCP542/pan-cancer-cell-line-heterogeneity

Paper: Pan-cancer single cell RNA-seq uncovers recurring programs of cellular heterogeneity https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8135089/

Gabriela S. Kinker,,1,4 Alissa C. Greenwald,,1 Rotem Tal,1 Zhanna Orlova,1 Michael S. Cuoco,2 James M. McFarland,3 Allison Warren,3 Christopher Rodman,2 Jennifer A. Roth,3 Samantha A. Bender,3 Bhavna Kumar,5 James W. Rocco,5 Pedro ACM Fernandes,4 Christopher C. Mader,3 Hadas Keren-Shaul,6,7 Alexander Plotnikov,6 Haim Barr,6 Aviad Tsherniak,3 Orit Rozenblatt-Rosen,2 Valery Krizhanovsky,1 Sidharth V. Puram,8 Aviv Regev,2 and Itay Tirosh1,#



### Versions:

7 - cosmetic changes 

6 - added PCA plots on further axis pairs (1,3) and (2,3) from Tirosh genes
    
    # nothing interesting was seen

5 - added PCA plots from reactome-ext list and from ALL genes
    
    # Results are not very clear. Expected: Reactome gives roughly the same picture as Tirosh.
    # In fact, sometimes it shows the "circle + hole" worse, sometimes better, and many times more or less the same.
    

4 - added PCA plot from Tirosh genes

    # Only for 3 examples with a fast cycle does PCA show the cycle clearly (the hole is visible).
    # Actually these examples look four-cornered, and 2 more examples are four-cornered too - that is nice and should be investigated further.
    
3 - same as 2, with minor corrections

2 - phase plots for all processed datasets 

1 - just phase plot for one particular dataset  - no clear picture of the cell cycle 

    # path_and_filename = '/kaggle/input/scrnaseq-collection-of-cancer-cell-lines/processed_h5ad/HEC59_ENDOMETRIUM_proc.h5ad'
    # str_data_inf = 'HEC59 ENDOMETRIUM'


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import itertools
import os

# Preview the first few files under the input directory.
# The listing is generated lazily and capped with islice, so the directory walk
# stops as soon as the cap is reached instead of traversing the whole tree
# (a plain `break` would only leave the inner per-directory loop).
MAX_FILES_LISTED = 9
c = 0  # number of files printed; stays 0 when the input directory is empty or missing
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for c, input_path in enumerate(itertools.islice(input_paths, MAX_FILES_LISTED), start=1):
    print(c, input_path)
            
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install/import modules

In [None]:
!pip install scanpy 
import scanpy as sc

import numpy as np
import pandas as pd
import scipy 

import os
import sys
import time
t0start = time.time()


import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 70
#plt.style.use('dark_background')
import seaborn as sns

from sklearn.decomposition import PCA

# Cell cycle phase plots


In [None]:
# Two gene-symbol lists used by the plotting loop below: the mean expression over the
# first list is drawn on the axis labelled 'G1S' and the mean over the second on the axis
# labelled 'G2M'; their concatenation selects the columns for the "Tirosh genes" PCA panels.
# NOTE(review): presumably the S-phase / G2-M marker sets from Tirosh et al. — confirm the source.
S_phase_genes_Tirosh = ['MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8']
G2_M_genes_Tirosh = ['HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1', 'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2', 'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA']

In [None]:
# Download the extended Reactome cell-cycle gene file (read without a header row)
# and keep the distinct values of its first column as a plain Python list.
url = 'https://raw.githubusercontent.com/chervov/genes/main/cell_cycle_genes_reactome_extended.txt'
cell_cycle_genes_reactome_extended = list(
    pd.read_csv(url, header=None)[0].unique()
)
# Quick sanity check: number of genes and the first ten entries.
n_reactome_genes = len(cell_cycle_genes_reactome_extended)
print(n_reactome_genes, cell_cycle_genes_reactome_extended[:10])

In [None]:
n_x_subplots = 2  # number of subplot columns in each per-dataset figure
n_y_subplots = 3  # number of subplot rows in each per-dataset figure

import os
import time

from sklearn.decomposition import PCA


def _find_genes(genes_upper, wanted, verbose=0):
    """Return the integer positions in `genes_upper` whose value occurs in `wanted`.

    `genes_upper` is a pandas Series of upper-cased variable names; `wanted` is a
    list of gene symbols. With `verbose >= 100` the number of matches and the
    matched names are printed.
    """
    positions = np.where(genes_upper.isin(wanted))[0]
    if verbose >= 100:
        print(len(positions), np.array(genes_upper.tolist())[positions])
    return positions


def _row_means(matrix, columns):
    """Return the per-row mean over the selected columns as a flat 1-D array."""
    # np.asarray(...).ravel() flattens whatever 2-D result `.mean(axis=1)` returns.
    return np.asarray(matrix[:, columns].mean(axis=1)).ravel()


def _scatter_panel(x, y, obs, title, xlabel, ylabel, hue_col='total_counts'):
    """Draw one scatter plot on the current axes and label it.

    Points are coloured by `obs[hue_col]` (viridis palette, enlarged legend) when
    that column exists in `obs`; otherwise they are drawn without a hue.
    Returns the matplotlib axes produced by seaborn.
    """
    if hue_col in obs.columns:
        ax = sns.scatterplot(x=x, y=y, hue=obs[hue_col], palette="viridis")
        plt.setp(ax.get_legend().get_texts(), fontsize=20)  # legend entries
        plt.setp(ax.get_legend().get_title(), fontsize=20)  # legend title
    else:
        ax = sns.scatterplot(x=x, y=y)
    plt.title(title, fontsize=20)
    plt.xlabel(xlabel, fontsize=20)
    plt.ylabel(ylabel, fontsize=20)
    return ax


t00 = time.time()

verbose = 1
cc = 0   # number of files visited
IX = 0   # row label of the latest entry in df_stat
df_stat = pd.DataFrame()
n_panels = n_x_subplots * n_y_subplots  # how many of the six panels fit in the grid
tirosh_genes = S_phase_genes_Tirosh + G2_M_genes_Tirosh

for dirname, _, filenames in os.walk('/kaggle/input'):
    # Only directories whose path contains 'processed_h5ad' are processed.
    if 'processed_h5ad' not in dirname:
        continue

    for filename in filenames:
        cc += 1

        path_and_filename = os.path.join(dirname, filename)
        str_data_inf = filename.replace('_', ' ')

        if verbose >= 1:
            print(cc, path_and_filename)

        adata_orig = sc.read(path_and_filename)
        adata = adata_orig.copy()
        if verbose >= 1000:
            print(adata.X.sum(axis=1).ravel()[:5])
            print(adata)

        # All-inclusive mask for now; kept so a cell subset can be selected later.
        mask = pd.Series(index=adata.obs.index, data=True)

        # Hoisted per-dataset invariants shared by every panel.
        adata_sel = adata[mask, :]
        X_sel = adata_sel.X
        obs_sel = adata_sel.obs
        genes_upper = pd.Series([t.upper() for t in adata.var.index])

        fig = plt.figure(figsize=(25, 20))
        c = 0  # index of the last subplot drawn in this figure
        cell_type = ''
        plt.suptitle(cell_type + ' ' + str(mask.sum()) + 'Cells' + ' ' + str_data_inf, fontsize=18)

        if n_panels >= 1:  # Phase plot: mean S-list expression vs mean G2M-list expression
            c += 1
            fig.add_subplot(n_y_subplots, n_x_subplots, c)

            v1 = _row_means(X_sel, _find_genes(genes_upper, S_phase_genes_Tirosh, verbose))
            v2 = _row_means(X_sel, _find_genes(genes_upper, G2_M_genes_Tirosh, verbose))

            corrcoef = np.round(np.corrcoef(v1, v2)[0, 1], 2)
            IX += 1
            df_stat.loc[IX, 'Correlation G1S vs G2M'] = corrcoef
            df_stat.loc[IX, 'n_cells'] = adata.X.shape[0]
            df_stat.loc[IX, 'Name'] = str_data_inf

            _scatter_panel(
                v1, v2, obs_sel,
                'Correlation G1SvsG2M=' + str(corrcoef) + ' Phase plot',
                'G1S', 'G2M',
            )

        if n_panels >= 2:  # PCA (components 1,2) on the Tirosh gene columns
            c += 1
            fig.add_subplot(n_y_subplots, n_x_subplots, c)

            X_tirosh = X_sel[:, _find_genes(genes_upper, tirosh_genes, verbose)]
            reducer = PCA(n_components=2)
            r = reducer.fit_transform(X_tirosh)
            _scatter_panel(
                r[:, 0], r[:, 1], obs_sel,
                str(reducer) + ' From Tirosh genes',
                str(reducer) + '1', str(reducer) + '2',
            )

        if n_panels >= 3:  # PCA (components 1,3) on the Tirosh gene columns
            c += 1
            fig.add_subplot(n_y_subplots, n_x_subplots, c)

            # Fitted once and reused by the next panel, so both panels show the
            # same component 3 even if the PCA solver is not deterministic.
            reducer = PCA(n_components=3)
            r = reducer.fit_transform(X_tirosh)
            _scatter_panel(
                r[:, 0], r[:, 2], obs_sel,
                str(reducer) + ' From Tirosh genes',
                str(reducer) + '1', str(reducer) + '3',
            )

        if n_panels >= 4:  # PCA (components 2,3) from the same 3-component fit
            c += 1
            fig.add_subplot(n_y_subplots, n_x_subplots, c)

            _scatter_panel(
                r[:, 1], r[:, 2], obs_sel,
                str(reducer) + ' From Tirosh genes',
                str(reducer) + '2', str(reducer) + '3',
            )

        if n_panels >= 5:  # PCA on the extended Reactome gene columns
            c += 1
            fig.add_subplot(n_y_subplots, n_x_subplots, c)

            reactome_idx = _find_genes(genes_upper, cell_cycle_genes_reactome_extended, verbose)
            reducer = PCA(n_components=2)
            r = reducer.fit_transform(X_sel[:, reactome_idx])
            _scatter_panel(
                r[:, 0], r[:, 1], obs_sel,
                str(reducer) + ' From Reactome genes',
                str(reducer) + '1', str(reducer) + '2',
            )

        if n_panels >= 6:  # PCA on all columns
            c += 1
            fig.add_subplot(n_y_subplots, n_x_subplots, c)

            reducer = PCA(n_components=2)
            r = reducer.fit_transform(X_sel)
            _scatter_panel(
                r[:, 0], r[:, 1], obs_sel,
                str(reducer) + ' From ALL genes',
                str(reducer) + '1', str(reducer) + '2',
            )

        plt.show()
        # Release the figure explicitly so figures do not pile up across datasets.
        plt.close(fig)

# Row-by-row enlargement leaves 'n_cells' as float; convert once at the end.
if 'n_cells' in df_stat.columns:
    df_stat['n_cells'] = df_stat['n_cells'].astype(int)

print('Finished. %.1f seconds passed total'%(-t00 + time.time()) ) 
df_stat


In [None]:
# Histogram (25 bins) of the per-dataset correlation values stored in df_stat.
corr_column = 'Correlation G1S vs G2M'
plt.hist(df_stat[corr_column], bins=25)
plt.title(corr_column)
plt.show()

In [None]:
# Display the per-dataset statistics table filled in by the plotting loop (notebook cell output).
df_stat

In [None]:
# Summary statistics of the numeric columns of df_stat (notebook cell output).
df_stat.describe()

In [None]:
# Work on a copy of df_stat and drop the 'proc.h5ad' file-name suffix from every
# dataset name; the original table is left untouched.
df_stat2 = df_stat.copy()
df_stat2['Name'] = [name.replace('proc.h5ad', '') for name in df_stat2['Name']]
df_stat2

In [None]:

# Write the cleaned statistics table to a CSV file in the current working directory.
df_stat2.to_csv('cell_lines_broad_institute_info.csv')

In [None]:
# Same histogram as earlier in the notebook: 25-bin distribution of the
# 'Correlation G1S vs G2M' column of df_stat.
hist_column = 'Correlation G1S vs G2M'
plt.hist(df_stat[hist_column], bins=25)
plt.title(hist_column)
plt.show()

In [None]:
# Display df_stat again (notebook cell output).
df_stat

In [None]:
# Exploratory PCA / UMAP of whatever dataset the plotting loop left in `adata`
# (the last file it read); `filename` is printed to show which file that was.
print(filename)
sc.tl.pca(adata, svd_solver='arpack')
# NOTE(review): 'CST3' is used as the colouring key — presumably a gene present in
# adata.var_names for these datasets; confirm, otherwise the plot call will fail.
sc.pl.pca(adata, color='CST3')
# The neighbour graph must be built before the UMAP embedding is computed.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['CST3'])#, 'PPBP'])

In [None]:
# Coefficient of variation (std / mean) of the per-row sums of `adata.X`, computed
# twice: on exp(X) - 1 and on X as stored.
# NOTE(review): presumably X holds log(1 + counts), so exp(X) - 1 recovers count-like
# values — confirm against how the processed files were produced.
expm1_row_sums = (np.exp(adata.X) - 1).sum(axis=1)
m = np.mean(expm1_row_sums)
print(np.std(expm1_row_sums) / m)

raw_row_sums = adata.X.sum(axis=1)
print(np.std(raw_row_sums) / np.mean(raw_row_sums))

# First ten row sums of exp(X) - 1 (notebook cell output).
expm1_row_sums[:10]

In [None]:
# NOTE(review): `nan` is not defined anywhere in the visible notebook, so running this
# cell would raise NameError; presumably an empty cell exported as NaN — confirm and remove.
nan

In [None]:
# NOTE(review): bare `nan` with no visible definition — presumably an empty notebook
# cell exported as NaN; it would raise NameError if executed. Confirm and remove.
nan