# Pilot Cell Cycle Scoring

This notebook contains the code required to compute the cell cycle scores for the pilot dataset at high MOI. 

Author: Karthik Guruvayurappan

In [1]:
library('Seurat')
library('Matrix')
library('biomaRt')
library(ggplot2)

Attaching SeuratObject

Attaching sp



In [3]:
# load in UMI count (expression) matrix
expression.matrix <- readMM('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_pilot_highmoi_screen.exprs.mtx.gz')
head(expression.matrix)

6 x 47650 sparse Matrix of class "dgTMatrix"
                                                                               
[1,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
[2,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
[3,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
[4,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
[5,] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
[6,] . . . . . . . 1 . . . . . 1 . . . . . . . . . . . . . . 1 . . . . . ......

 .....suppressing 47616 columns in show(); maybe adjust 'options(max.print= *, width = *)'
 ..............................

In [4]:
# convert expression matrix from matrix to data frame format
expression.matrix <- as.data.frame(expression.matrix)
head(expression.matrix)

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V47641,V47642,V47643,V47644,V47645,V47646,V47647,V47648,V47649,V47650
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [5]:
# read in column names and add to expression matrix
cell.barcodes <- read.delim('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_pilot_highmoi_screen.cells.txt.gz', header = FALSE)
cell.barcodes <- cell.barcodes$V1
colnames(expression.matrix) <- cell.barcodes
head(expression.matrix)

Unnamed: 0_level_0,AAACCTGAGCGAGAAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGCTGTCTA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGTTATCGC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGTTCCACA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCAAATACAG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCACTCGACG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCAGTCTTCC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCATGCCTAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGGTCGGCATC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGGTCTCTCGT-1_K1000_CRISPRi_cells_r1_SI-GA-G1,⋯,TTTGTCACAAGGACAC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACACAAGACG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACACTCTGTC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACATTCCTCG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCAGTCAGCTAT-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCAGTGTTTGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCACAAACC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCAGAGGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCGCGATCG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCTGGCGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [6]:
# read in row names and add to expression matrix
genes <- read.delim('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_pilot_highmoi_screen.genes.txt.gz', header = FALSE)
genes <- genes$V1
rownames(expression.matrix) <- genes
head(expression.matrix)

Unnamed: 0_level_0,AAACCTGAGCGAGAAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGCTGTCTA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGTTATCGC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGTTCCACA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCAAATACAG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCACTCGACG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCAGTCTTCC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCATGCCTAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGGTCGGCATC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGGTCTCTCGT-1_K1000_CRISPRi_cells_r1_SI-GA-G1,⋯,TTTGTCACAAGGACAC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACACAAGACG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACACTCTGTC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACATTCCTCG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCAGTCAGCTAT-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCAGTGTTTGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCACAAACC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCAGAGGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCGCGATCG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCTGGCGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000186092,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000239945,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,0,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [52]:
# code snippet adapted from: https://stackoverflow.com/questions/28543517/how-can-i-convert-ensembl-id-to-gene-symbol-in-r
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
gene.symbols <- getBM(
    filters = "ensembl_gene_id",
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    values = genes,
    mart = mart,
)

In [53]:
gene.symbols <- merge(data.frame(genes), gene.symbols, all.x = TRUE, by.x = 'genes', by.y = 'ensembl_gene_id', sort = FALSE)

In [54]:
gene.symbols <- merge(data.frame(genes), gene.symbols, by = 'genes', sort = FALSE)

In [55]:
gene.symbols[is.na(gene.symbols)] <- ''
sum(is.na(gene.symbols$hgnc_symbol))

In [56]:
gene.symbols <- gene.symbols[!duplicated(gene.symbols$gene), ]

In [57]:
for (i in 1:nrow(gene.symbols)) {
    if (gene.symbols[i, 'hgnc_symbol'] == '') {
        gene.symbols[i, 'hgnc_symbol'] <- gene.symbols[i, 'genes']
    }

    if (sum(gene.symbols$hgnc_symbol == gene.symbols$hgnc_symbol[i]) >= 2) {
        gene.symbols[i, 'hgnc_symbol'] <- gene.symbols[i, 'genes']
    }
}
head(gene.symbols)

Unnamed: 0_level_0,genes,hgnc_symbol
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000243485,MIR1302-2HG
2,ENSG00000237613,FAM138A
3,ENSG00000186092,OR4F5
4,ENSG00000238009,ENSG00000238009
5,ENSG00000239945,ENSG00000239945
6,ENSG00000237683,ENSG00000237683


In [58]:
gene.symbols[duplicated(gene.symbols$hgnc_symbol), ]

genes,hgnc_symbol
<chr>,<chr>


In [59]:
rownames(expression.matrix) <- gene.symbols[, 'hgnc_symbol']

In [60]:
head(expression.matrix)

Unnamed: 0_level_0,AAACCTGAGCGAGAAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGCTGTCTA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGTTATCGC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGAGTTCCACA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCAAATACAG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCACTCGACG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCAGTCTTCC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGCATGCCTAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGGTCGGCATC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,AAACCTGGTCTCTCGT-1_K1000_CRISPRi_cells_r1_SI-GA-G1,⋯,TTTGTCACAAGGACAC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACACAAGACG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACACTCTGTC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCACATTCCTCG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCAGTCAGCTAT-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCAGTGTTTGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCACAAACC-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCAGAGGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCGCGATCG-1_K1000_CRISPRi_cells_r6_SI-GA-G6,TTTGTCATCTGGCGTG-1_K1000_CRISPRi_cells_r6_SI-GA-G6
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000239945,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,0,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [61]:
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

In [62]:
gene.expression <- CreateSeuratObject(counts = expression.matrix)
gene.expression

An object of class Seurat 
32738 features across 47650 samples within 1 assay 
Active assay: RNA (32738 features, 0 variable features)

In [63]:
gene.expression <- NormalizeData(gene.expression)
gene.expression

An object of class Seurat 
32738 features across 47650 samples within 1 assay 
Active assay: RNA (32738 features, 0 variable features)

In [64]:
gene.expression <- FindVariableFeatures(gene.expression, selection.method  = "vst")
gene.expression

An object of class Seurat 
32738 features across 47650 samples within 1 assay 
Active assay: RNA (32738 features, 2000 variable features)

In [65]:
gene.expression <- ScaleData(gene.expression, features = rownames(gene.expression))
gene.expression

Centering and scaling data matrix



An object of class Seurat 
32738 features across 47650 samples within 1 assay 
Active assay: RNA (32738 features, 2000 variable features)

In [66]:
gene.expression <- RunPCA(gene.expression, features = VariableFeatures(gene.expression), ndims.print = 1:5, nfeatures.print = 10)
gene.expression

PC_ 1 
Positive:  SRM, ATP5MC3, CENPX, PRELID1, MRPL41, ZNF706, SDF2L1, EBNA1BP2, PPP1R14B, PHF19 
Negative:  ALAS2, HBZ, GYPA, ENSG00000117289, SNHG32, GYPB, EIF1, ASNS, DDIT4, BTG2 
PC_ 2 
Positive:  HBA1, HBZ, HBA2, HBG1, HBG2, KLF1, HMBS, HBE1, PITX1, GYPA 
Negative:  VIM, LGALS1, TMSB10, CLIC1, TMSB4X, S100A11, KRT8, SOCS1, TPM4, PKM 
PC_ 3 
Positive:  ENO1, EIF4A1, TUBA1B, CCT5, UBB, CCNB1, NUDC, HSP90AA1, LDHA, HSP90AB1 
Negative:  MT-CO2, MT-ND4, NEAT1, MT-ND1, C1orf56, MT-ND2, MT-ATP6, FTL, HNRNPH1, EIF2S3 
PC_ 4 
Positive:  HMGB2, NUCKS1, PSAT1, TOP2A, CENPF, HSPA8, PHGDH, MTHFD2, ASPM, HMMR 
Negative:  HBG2, HBG1, H1-2, H2AC6, HBA1, H4C8, H2BC12, HBA2, ENSG00000269600, H2BC4 
PC_ 5 
Positive:  AURKA, TOP2A, CENPA, CKS2, TPX2, CENPF, PIF1, ASPM, UBE2S, CDCA8 
Negative:  NPM1, ENO1, CTSC, HSP90AB1, CYBA, LDHA, HSPB1, EIF4A1, QPRT, UBB 



An object of class Seurat 
32738 features across 47650 samples within 1 assay 
Active assay: RNA (32738 features, 2000 variable features)
 1 dimensional reduction calculated: pca

In [67]:
gene.expression <- CellCycleScoring(gene.expression, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

# view cell cycle scores and phase assignments
head(gene.expression[[]])

“The following features are not present in the object: UHRF1, MLF1IP, CASP8AP2, not searching for symbol synonyms”
“The following features are not present in the object: FAM64A, HN1, not searching for symbol synonyms”


Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,S.Score,G2M.Score,Phase,old.ident
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<dbl>,<chr>,<fct>
AAACCTGAGCGAGAAA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,SeuratProject,14090,3676,-0.29817225,0.127110492,G2M,SeuratProject
AAACCTGAGCTGTCTA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,SeuratProject,11933,3053,0.04276115,-0.207796487,S,SeuratProject
AAACCTGAGTTATCGC-1_K1000_CRISPRi_cells_r1_SI-GA-G1,SeuratProject,21239,3902,-0.05601193,-0.006468491,G1,SeuratProject
AAACCTGAGTTCCACA-1_K1000_CRISPRi_cells_r1_SI-GA-G1,SeuratProject,29482,4940,-0.08736339,-0.015361547,G1,SeuratProject
AAACCTGCAAATACAG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,SeuratProject,10811,3307,-0.09298147,0.169837374,G2M,SeuratProject
AAACCTGCACTCGACG-1_K1000_CRISPRi_cells_r1_SI-GA-G1,SeuratProject,9344,2297,-0.03381389,0.004478619,G2M,SeuratProject


In [68]:
# Visualize the distribution of cell cycle markers across
cell.cycle.marker.ridge.plot <- RidgePlot(gene.expression, features = c("PCNA", "TOP2A", "MCM6", "MKI67"), ncol = 2)

In [69]:
ggsave(
    filename = '/iblm/netapp/home/karthik/crisprQTL/plots/cell_cycle_marker_gene_ridge_plot_pilot.tiff',
    cell.cycle.marker.ridge.plot,
    device = 'tiff'
)

Saving 6.67 x 6.67 in image

Picking joint bandwidth of 0.057

Picking joint bandwidth of 0.0557

Picking joint bandwidth of 0.0452

Picking joint bandwidth of 0.0609



In [72]:
gene.expression <- RunPCA(gene.expression, features = c(s.genes, g2m.genes))
cell.cycle.pca <- DimPlot(gene.expression)

“The following 5 features requested have not been scaled (running reduction without them): UHRF1, MLF1IP, CASP8AP2, FAM64A, HN1”
“You're computing too large a percentage of total singular values, use a standard svd instead.”
PC_ 1 
Positive:  AURKA, CENPF, HMGB2, CKS2, TOP2A, TPX2, CKS1B, CENPA, CDCA8, MKI67 
	   PSRC1, HMMR, NUSAP1, BIRC5, KIF2C, CDK1, GTSE1, CENPE, CDCA3, AURKB 
	   CDC20, NEK2, TACC3, NUF2, TUBB4B, DLGAP5, BUB1, NDC80, CKAP2, CDCA2 
Negative:  UNG, CCNE2, CDC6, DTL, MCM5, WDR76, HELLS, CDCA7, POLD3, CHAF1B 
	   MCM6, GINS2, MCM4, MCM2, BRIP1, UBE2C, FEN1, POLA1, E2F8, ATAD2 
	   EXO1, SLBP, TIPIN, RPA2, CLSPN, RFC2, UBR7, PCNA, DSCC1, CDC45 
PC_ 2 
Positive:  FEN1, GINS2, PCNA, TYMS, SLBP, CLSPN, CDC45, RFC2, GMNN, NASP 
	   CDC6, RPA2, UNG, MCM4, MCM6, ATAD2, MCM5, CHAF1B, EXO1, USP1 
	   RRM2, MCM2, DTL, CDCA7, RAD51, POLD3, HELLS, CCNE2, CKS1B, WDR76 
Negative:  AURKA, PSRC1, TOP2A, CENPA, CENPE, GTSE1, CDCA8, HMMR, G2E3, CKS2 
	   CKAP2L, TPX2, CENPF, CKAP2, GAS

In [74]:
ggsave(
    filename = '/iblm/netapp/home/karthik/crisprQTL/plots/cell_cycle_pca_pilot.tiff',
    plot = cell.cycle.pca,
    device = 'tiff'
)

Saving 6.67 x 6.67 in image



In [75]:
# write scores to CSV files
s.scores <- gene.expression[[]]['S.Score']
g2m.scores <- gene.expression[[]]['G2M.Score']

write.csv(s.scores, '/iblm/netapp/home/karthik/crisprQTL/gasperini_data/s_scores_pilot.csv')
write.csv(g2m.scores, '/iblm/netapp/home/karthik/crisprQTL/gasperini_data/g2m_scores_pilot.csv')