In [1]:
source("source/RaceID3_StemID2_class.R")

Loading required package: tsne
Loading required package: pheatmap
Loading required package: MASS
Loading required package: cluster
Loading required package: mclust
Package 'mclust' version 5.4
Type 'citation("mclust")' for citing this R package in publications.
Loading required package: flexmix
Loading required package: lattice
Loading required package: fpc
Loading required package: amap
Loading required package: RColorBrewer
Loading required package: locfit
locfit 1.5-9.1 	 2013-03-22
Loading required package: vegan
Loading required package: permute
This is vegan 2.4-6
Loading required package: Rtsne
Loading required package: scran
Loading required package: BiocParallel
Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘pack

In [2]:
x <- read.csv("source/E725.matrix.Seb_NewData_E725.3.quantif",sep="\t",header=TRUE, row.name=1)
prdata <- x[grep("ERCC",rownames(x),invert=TRUE),]

In [3]:
dim(x)
dim(prdata)

In [4]:
require(biomaRt)
# Annotate the data before adding SC experiment

mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl")
k <- getBM(
    filters = "ensembl_gene_id",
    attributes = c(
            "ensembl_gene_id",              # Gene stable ID
            "external_gene_name",           # Casual name
            "external_transcript_name",     # Transcript-specific name
            "gene_biotype",                 # Gene biotype
            "transcript_biotype",           # Trans type
            "description",                  # Gene description
            "band",                         # Karyotype band
            "refseq_mrna",
            "go_id",                        # Go Term accession (cellular domains)
            "go_linkage_type",              # Go Term evidence code
            "name_1006",                    # Go Term name
            "definition_1006",              # Go Term definition
            "namespace_1003"                # Go domain                  
    ),
    values = rownames(prdata),
    mart = mart
)

Loading required package: biomaRt


In [13]:
# Update rownames with external_gene_names
matrix_names <- rownames(prdata)
lookup_table <- unique(k[,c(1,3)]) # ensembl, external transcript

new_names <- c()
for (name in matrix_names){
    gene_name <- lookup_table[lookup_table$ensembl_gene_id == name,]$external_transcript_name
    # Not using external_gene_name due to isoforms in data
    if (length(gene_name) != 0){
        new_names <- c(new_names, gene_name)
    } else {
        new_names <- c(new_names, name)
    }
}

In [28]:
dim(lookup_table)
length(rownames(prdata))
length(new_names)

loki <- unique(k[,c(1,3)])
names_only <- unique(loki$ensembl_gene_id)

head(
    merge(names_only, loki, by.x = 1, by.y = 2, all.x = T)
)

#rownames(prdata) <- new_names
#sc <- SCseq(prdata)
#head(unique(k[,c(1,3)]))


x,ensembl_gene_id
ENSMUSG00000000001,
ENSMUSG00000000003,
ENSMUSG00000000028,
ENSMUSG00000000031,
ENSMUSG00000000037,
ENSMUSG00000000049,


In [None]:
head(rownames(sc@expdata),100)


In [None]:
# The data comes from the same batch, so we do not forsee any batch
# effects, therefore downsampling will not be used
sc <- filterdata(
    sc, mintotal=3000, minexpr=5, maxexpr=500, 
    downsample = F, sfn = F, hkn = F,
    dsn = 1, rseed = 17000, CGenes = NULL, FGenes = NULL
)
# Using defaults, we are left with 1000 genes
#dim(sc_defaults@fdata)


In [None]:
# regress out the batch effect
# optional:
#vars <- data.frame(row.names=names(sc@fdata),batch=sub("(_|)\\d.+","",names(sc@fdata)))
#sc@fdata <- varRegression(sc@fdata,vars)

# correct for cell cycle, proliferation, and expression of degradation markers by PCA
# optional:

gCC <- name2id( k$external_gene_name[k$name_1006 == "cell cycle"],rownames(sc@fdata))
gCP <- name2id( k$external_gene_name[k$name_1006 == "cell proliferation"],rownames(sc@fdata))
vset <- list(gCC,gCP)
#g   <- sub("__chr.+","",rownames(sc@fdata));
#k   <- getBM(attributes = c("external_gene_name", "go_id","name_1006"),filters="external_gene_name",values=g,mart=mart)
#gCC <- name2id( k$external_gene_name[k$name_1006 == "cell cycle"],rownames(sc@fdata)) 
#gCP <- name2id( k$external_gene_name[k$name_1006 == "cell proliferation"],rownames(sc@fdata))
#vset <- list(gCC,gCP)

In [None]:
k$external_gene_name[k$name_1006 == "cell cycle"]
dim(k)
dim(sc@fdata)
#x <- CCcorrect(sc@fdata,vset=vset,CGenes=NULL,ccor=.4,nComp=NULL,pvalue=.05,quant=.01,mode="pca")
#x$n


In [None]:
# loadings of the first principal component that has been removed
y <- x$pca$rotation[,x$n[1]]
# genes from vset are either enriched in the head or the tail of this list
tail(y[order(y,decreasing=TRUE)],10)
# reassign the corrected expression matrix to sc@fdata
sc@fdata <- x$xcor

In [None]:
# k-medoids clustering
sc <- clustexp(sc,clustnr=30,bootnr=50,metric="pearson",do.gap=T,sat=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000,FUNcluster="kmedoids",FSelect=TRUE)
# compute t-SNE map
sc <- comptsne(sc,rseed=15555,sammonmap=FALSE,initial_cmd=TRUE,fast=TRUE,perplexity=30)
# detect outliers and redefine clusters
sc <- findoutliers(sc, outminc=5,outlg=2,probthr=1e-3,thr=2**-(1:40),outdistquant=.95)
# reassign clusters based on random forest
sc <- rfcorrect(sc,rfseed=12345,final=TRUE,nbfactor=5)

In [None]:
plotgap(sc)

plotsaturation(sc,disp=TRUE)
# plot change of the within-cluster dispersion as a function of the cluster number: only if sat == TRUE
plotsaturation(sc)
# silhouette of k-medoids clusters
plotsilhouette(sc)
# Jaccard's similarity of k-medoids clusters
plotjaccard(sc)
# barchart of outlier probabilities
plotoutlierprobs(sc)
# regression of background model
plotbackground(sc)
# dependence of outlier number on probability threshold (probthr)
plotsensitivity(sc)
# heatmap of k-medoids cluster
clustheatmap(sc,final=FALSE,hmethod="single")
# heatmap of final cluster
clustheatmap(sc,final=TRUE,hmethod="single")
# highlight k-medoids clusters in t-SNE map
plottsne(sc,final=FALSE)
# highlight final clusters in t-SNE map
plottsne(sc,final=TRUE)