# Cell Cycle Scoring

This Jupyter notebooks uses the Seurat single-cell RNA-seq analysis package to compute cell cycle scores for each cell in the Gasperini et al. 2019 dataset. This code is based off the Seurat cell-cycle scoring and regression vignette available at: https://satijalab.org/seurat/articles/cell_cycle_vignette.html

Author: Karthik Guruvayurappan

In [119]:
library('Seurat')
library('Matrix')
library('biomaRt')
library(ggplot2)

In [2]:
# load in UMI count (expression) matrix
expression.matrix <- readMM('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.exprs.mtx')
head(expression.matrix)

6 x 207324 sparse Matrix of class "dgTMatrix"
                                                                               
[1,] . . . . . . . . . . . . . . . . . . . . . . . . 1 . 1 . . . . . . . ......
[2,] 1 . . . . . . . . . . . . . 2 . 1 . . . . . . . 1 . . . . . . 1 . . ......
[3,] 1 . 1 1 1 . . . . 1 1 . . . 3 . 4 1 . . . 1 . . 1 1 1 1 1 1 . . . . ......
[4,] . . . 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
[5,] . . 1 . . . . 1 . . . . . . . . . . . . . . . . . 1 . . . . 1 . . . ......
[6,] . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . . . . . . . ......

 .....suppressing 207290 columns in show(); maybe adjust 'options(max.print= *, width = *)'
 ..............................

In [3]:
# convert expression matrix from matrix to data frame format
expression.matrix <- as.data.frame(expression.matrix)
head(expression.matrix)

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V207315,V207316,V207317,V207318,V207319,V207320,V207321,V207322,V207323,V207324
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,1,0,0
3,1,0,1,1,1,0,0,0,0,1,⋯,0,0,0,0,1,0,0,2,0,2
4,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [5]:
# read in column names and add to expression matrix
cell.barcodes <- read.delim('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.cells.txt', header = FALSE)
cell.barcodes <- cell.barcodes$V1
colnames(expression.matrix) <- cell.barcodes
head(expression.matrix)

Unnamed: 0_level_0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,⋯,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,1,0,0
3,1,0,1,1,1,0,0,0,0,1,⋯,0,0,0,0,1,0,0,2,0,2
4,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [6]:
# read in row names and add to expression matrix
genes <- read.delim('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.genes.txt', header = FALSE)
genes <- genes$V1
rownames(expression.matrix) <- genes
head(expression.matrix)

Unnamed: 0_level_0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,⋯,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,1,0,0
ENSG00000228463,1,0,1,1,1,0,0,0,0,1,⋯,0,0,0,0,1,0,0,2,0,2
ENSG00000237094,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000235373,0,0,1,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000228327,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [27]:
print(length(genes))

[1] 13135


In [79]:
genes[10680]

In [96]:
# code snippet adapted from: https://stackoverflow.com/questions/28543517/how-can-i-convert-ensembl-id-to-gene-symbol-in-r
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
gene.symbols <- getBM(
    filters = "ensembl_gene_id",
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    values = genes,
    mart = mart,
)

In [98]:
gene.symbols <- merge(data.frame(genes), gene.symbols, all.x = TRUE, by.x = 'genes', by.y = 'ensembl_gene_id', sort = FALSE)

In [100]:
gene.symbols <- merge(data.frame(genes), gene.symbols, by = 'genes', sort = FALSE)
gene.symbols[10680, ]

Unnamed: 0_level_0,genes,hgnc_symbol
Unnamed: 0_level_1,<chr>,<chr>
10680,ENSG00000131747,TOP2A


In [102]:
gene.symbols[is.na(gene.symbols)] <- ''
sum(is.na(gene.symbols$hgnc_symbol))

In [103]:
print(nrow(gene.symbols))

[1] 13135


In [104]:
print(head(gene.symbols))

            genes hgnc_symbol
1 ENSG00000238009            
2 ENSG00000237683            
3 ENSG00000228463            
4 ENSG00000237094            
5 ENSG00000235373            
6 ENSG00000228327            


In [105]:
for (i in 1:nrow(gene.symbols)) {
    if (gene.symbols[i, 'hgnc_symbol'] == '') {
        gene.symbols[i, 'hgnc_symbol'] <- gene.symbols[i, 'genes']
    }
}
head(gene.symbols, 20)

Unnamed: 0_level_0,genes,hgnc_symbol
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000238009,ENSG00000238009
2,ENSG00000237683,ENSG00000237683
3,ENSG00000228463,ENSG00000228463
4,ENSG00000237094,ENSG00000237094
5,ENSG00000235373,ENSG00000235373
6,ENSG00000228327,ENSG00000228327
7,ENSG00000237491,LINC01409
8,ENSG00000225880,LINC00115
9,ENSG00000230368,FAM41C
10,ENSG00000188976,NOC2L


In [106]:
rownames(expression.matrix) <- gene.symbols[, 'hgnc_symbol']

In [107]:
head(expression.matrix)

Unnamed: 0_level_0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,⋯,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,1,0,0
ENSG00000228463,1,0,1,1,1,0,0,0,0,1,⋯,0,0,0,0,1,0,0,2,0,2
ENSG00000237094,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000235373,0,0,1,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000228327,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [108]:
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

In [109]:
gene.expression <- CreateSeuratObject(counts = expression.matrix)
gene.expression

An object of class Seurat 
13135 features across 207324 samples within 1 assay 
Active assay: RNA (13135 features, 0 variable features)

In [110]:
gene.expression <- NormalizeData(gene.expression)
gene.expression

An object of class Seurat 
13135 features across 207324 samples within 1 assay 
Active assay: RNA (13135 features, 0 variable features)

In [111]:
gene.expression <- FindVariableFeatures(gene.expression, selection.method  = "vst")
gene.expression

An object of class Seurat 
13135 features across 207324 samples within 1 assay 
Active assay: RNA (13135 features, 2000 variable features)

In [112]:
gene.expression <- ScaleData(gene.expression, features = rownames(gene.expression))
gene.expression

Centering and scaling data matrix



An object of class Seurat 
13135 features across 207324 samples within 1 assay 
Active assay: RNA (13135 features, 2000 variable features)

In [113]:
gene.expression <- RunPCA(gene.expression, features = VariableFeatures(gene.expression), ndims.print = 1:5, nfeatures.print = 10)
gene.expression

PC_ 1 
Positive:  NME1, EIF5A, SRM, CHCHD2, ODC1, PRELID1, TIMM13, RANBP1, EBNA1BP2, AURKAIP1 
Negative:  SNHG32, ZFAS1, EPB41L4A-AS1, ALAS2, FTH1, SLC25A37, YPEL3, ENSG00000117289, PNRC1, EIF4A2 
PC_ 2 
Positive:  VIM, PKM, S100A11, TPM4, CLIC1, SQSTM1, STX3, TMSB10, JUN, RHOC 
Negative:  HBZ, HBA1, HBA2, HBG1, HEMGN, HBG2, PRDX2, ALAS2, HBE1, HMBS 
PC_ 3 
Positive:  CCNB1, CDC20, HMMR, PLK1, AURKA, CCNB2, PTTG1, NEK2, HMGB2, CCNA2 
Negative:  SH3BGRL3, MT-CO2, MT-ND4, MT-CO1, MT-ND1, MT-ND2, MT-CYB, MT-CO3, FTL, NEAT1 
PC_ 4 
Positive:  PRDX1, HBG1, HBG2, EIF4A1, UBB, PRDX2, GYPA, CCT5, EIF2S1, ENO1 
Negative:  TOP2A, PIF1, CENPA, CENPF, ASPM, MKI67, ENSG00000227706, TPX2, TMSB10, MT-ND1 
PC_ 5 
Positive:  H1-2, H2AC6, H2BC4, H2BC12, LBH, HBG2, DYNLL1, HSPB1, HBA1, RNASE1 
Negative:  TRIB3, DDIT4, PSAT1, MTHFD2, ATF4, WARS1, MAP1B, GARS1, EIF4EBP1, ASNS 



An object of class Seurat 
13135 features across 207324 samples within 1 assay 
Active assay: RNA (13135 features, 2000 variable features)
 1 dimensional reduction calculated: pca

In [114]:
gene.expression <- CellCycleScoring(gene.expression, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

# view cell cycle scores and phase assignments
head(gene.expression[[]])

“The following features are not present in the object: UHRF1, MLF1IP, CASP8AP2, not searching for symbol synonyms”
“The following features are not present in the object: UBE2C, FAM64A, HN1, not searching for symbol synonyms”


Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,S.Score,G2M.Score,Phase,old.ident
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<dbl>,<chr>,<fct>
AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,SeuratProject,17566,3549,0.110732311,-0.1319208,S,SeuratProject
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,SeuratProject,8917,2543,-0.010290919,-0.1535426,G1,SeuratProject
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,SeuratProject,14626,3191,-0.17586013,-0.3084879,G1,SeuratProject
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,SeuratProject,22783,4539,0.003057281,-0.1574859,S,SeuratProject
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,SeuratProject,10124,2605,-0.144480961,-0.2362154,G1,SeuratProject
AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,SeuratProject,9743,2187,0.026418076,-0.1462899,S,SeuratProject


In [115]:
s.genes

In [116]:
g2m.genes

In [121]:
# Visualize the distribution of cell cycle markers across
cell.cycle.marker.ridge.plot <- RidgePlot(gene.expression, features = c("PCNA", "TOP2A", "MCM6", "MKI67"), ncol = 2)

In [123]:
ggsave(
    filename = '/iblm/netapp/home/karthik/crisprQTL/plots/cell_cycle_marker_gene_ridge_plot.tiff',
    cell.cycle.marker.ridge.plot,
    device = 'tiff'
)

Saving 6.67 x 6.67 in image

Picking joint bandwidth of 0.0365

Picking joint bandwidth of 0.0411

Picking joint bandwidth of 0.0198

Picking joint bandwidth of 0.0401



In [125]:
# Running a PCA on cell cycle genes reveals, unsurprisingly, that cells separate entirely by
# phase
# gene.expression <- RunPCA(gene.expression, features = c(s.genes, g2m.genes))
cell.cycle.pca <- DimPlot(gene.expression)

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`



In [126]:
ggsave(
    filename = '/iblm/netapp/home/karthik/crisprQTL/plots/cell_cycle_pca.tiff',
    plot = cell.cycle.pca,
    device = 'tiff'
)

Saving 6.67 x 6.67 in image



In [127]:
gene.expression <- ScaleData(gene.expression, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(gene.expression))

Regressing out S.Score, G2M.Score



In [None]:
# Now, a PCA on the variable genes no longer returns components associated with cell cycle
gene.expression <- RunPCA(gene.expression, features = VariableFeatures(gene.expression), nfeatures.print = 10)

In [None]:
# When running a PCA on only cell cycle genes, cells no longer separate by cell-cycle phase
gene.expression <- RunPCA(gene.expression, features = c(s.genes, g2m.genes))
cell.cycle.regressed.pca <- DimPlot(marrow)

In [None]:
ggsave(
    filename = '/iblm/netapp/home/karthik/crisprQTL/plots/cell_cycle_regressed_pca.tiff',
    plot = cell.cycle.regressed.pca,
    device = 'tiff'
)

In [None]:
# write scores to CSV files
s.scores <- gene.expression[[]]['S.Score']
g2m.scores <- gene.expression[[]]['G2M.Score']

write.csv(s.scores, '/iblm/netapp/home/karthik/crisprQTL/gasperini_data/s_scores.csv')
write.csv(g2m.scores, '/iblm/netapp/home/karthik/crisprQTL/gasperini_data/g2m_scores.csv')