In [2]:
library(genefu)
library(SingleCellExperiment)
library(rhdf5)
library(Matrix)
library(scran)

Loading required package: survcomp

Loading required package: survival

Loading required package: prodlim

Loading required package: biomaRt

Loading required package: iC10

Loading required package: pamr

Loading required package: cluster

Loading required package: impute

Loading required package: iC10TrainingData

Loading required package: AIMS

Loading required package: e1071

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, u

In [3]:
# Load an AnnData .h5ad file into a SingleCellExperiment.
#
# Arguments:
#   scmat  Path to the .h5ad file; it is read in full with h5read().
#   scgnm  Unused. Kept so that existing calls keep working.
#
# Returns a SingleCellExperiment with
#   * a `counts` assay (genes in rows, cells in columns),
#   * the decoded `obs` columns as colData,
#   * the decoded `var` columns as rowData, when every column has one value
#     per gene.
# The expression matrix and `var` come from the `raw` group when the file has
# one, otherwise from the top level. `obs` always comes from the top level.
Load_h5adsc_to_SCE <- function(scmat, scgnm = NA){
  h5ad <- h5read(scmat, '/')

  # anndata stores categoricals as 0-based integer codes (-1 = missing) plus
  # a category table. Pinning the levels to 0..(k-1) keeps codes and labels
  # aligned even when a category is unused or a code is missing.
  decode_factor <- function(codes, categories) {
    factor(as.vector(codes),
           levels = seq_along(categories) - 1L,
           labels = as.vector(categories))
  }

  # Turn an `obs`/`var` group into a named list of plain columns.
  decode_columns <- function(h5_group) {
    columns <- list()
    fields <- names(h5_group)
    if ("__categories" %in% fields) { # old anndata
      categories <- h5_group[["__categories"]]
      for (name in names(categories)) {
        if (length(h5_group[[name]]) >= length(categories[[name]])) {
          columns[[name]] <- decode_factor(h5_group[[name]], categories[[name]])
        }
      }
    } else { # new anndata
      for (name in setdiff(fields, "_index")) {
        field <- h5_group[[name]]
        if (!inherits(field, "list")) {
          columns[[name]] <- field
        } else if (all(c("codes", "categories") %in% names(field))) {
          columns[[name]] <- decode_factor(field$codes, field$categories)
        } else {
          warning("skipping column '", name, "': unsupported encoding",
                  call. = FALSE)
        }
      }
    }
    columns
  }

  # Prefer the unfiltered `raw` matrix and its gene table when present.
  src <- if ("raw" %in% names(h5ad)) h5ad[["raw"]] else h5ad
  X <- src[["X"]]
  h5ad.var <- src[["var"]]
  h5ad.obs <- h5ad[["obs"]]

  # Use one identifier vector per axis for both the matrix and the metadata.
  gene_ids <- as.character(h5ad.var[["_index"]])
  cell_ids <- h5ad.obs[["_index"]]
  if (is.null(cell_ids)) {
    cell_ids <- h5ad.obs[["barcodes"]]
  }
  cell_ids <- as.character(cell_ids)
  if (length(gene_ids) == 0 || length(cell_ids) == 0) {
    stop("could not find gene/cell identifiers in ", scmat, call. = FALSE)
  }
  dims <- c(length(gene_ids), length(cell_ids))

  if (inherits(X, "list")) {
    # Sparse group. NOTE(review): assumes X is stored as CSR (cells x genes),
    # so its row pointers can be read as the column pointers of a
    # genes x cells matrix -- confirm for CSC-encoded files.
    dat <- sparseMatrix(i = as.vector(X$indices) + 1,
                        p = as.vector(X$indptr),
                        x = as.numeric(X$data),
                        dims = dims,
                        repr = "C")
  } else {
    dat <- as(Matrix(X), "CsparseMatrix")
    if (!identical(dim(dat), dims)) {
      stop("expression matrix is ", nrow(dat), " x ", ncol(dat),
           " but the file lists ", dims[1], " genes and ", dims[2], " cells",
           call. = FALSE)
    }
  }
  dimnames(dat) <- list(gene_ids, cell_ids)

  obs <- decode_columns(h5ad.obs)
  obs <- if (length(obs) > 0) {
    data.frame(obs)
  } else {
    data.frame(row.names = seq_along(cell_ids))
  }
  rownames(obs) <- cell_ids

  sce <- SingleCellExperiment(assays = list(counts = dat),
                              colData = DataFrame(obs))

  # Attach gene annotations only when they line up with the genes.
  var <- decode_columns(h5ad.var)
  if (length(var) > 0 && all(lengths(var) == length(gene_ids))) {
    rowData(sce) <- DataFrame(data.frame(var))
  }

  sce
}

In [3]:
  # Read the whole .h5ad file into a nested list.
  scmat <- h5read("adata-Fepi2.h5ad", '/')

  # Prefer the `raw` group (matrix and gene table) when the file has one.
  h5ad.src <- if ("raw" %in% names(scmat)) scmat[["raw"]] else scmat
  X <- h5ad.src[["X"]]
  h5ad.var <- h5ad.src[["var"]]

  # X is either a dense array or a list holding a sparse encoding
  # (data / indices / indptr). Both layouts are handled for both sources, so
  # `dat` is always defined after this cell.
  if (inherits(X, "list")) {
    # NOTE(review): assumes a CSR (cells x genes) encoding, whose row pointers
    # are read here as column pointers of a genes x cells matrix -- confirm
    # for CSC-encoded files.
    dat <- sparseMatrix(i = X$indices[] + 1,
                        p = X$indptr[],
                        x = as.numeric(X$data[]),
                        repr = "C")
  } else {
    dat <- as(Matrix(X), "CsparseMatrix")
  }

In [5]:
# generate factors using categories
  # anndata stores categoricals as 0-based integer codes (-1 = missing) plus
  # a category table. Pinning the levels to 0..(k-1) keeps codes and labels
  # aligned even when a category is unused or a code is missing.
  decode_h5ad_factor <- function(codes, categories) {
    factor(as.vector(codes),
           levels = seq_along(categories) - 1L,
           labels = as.vector(categories))
  }

  # Turn an `obs`/`var` group into a named list of plain columns.
  decode_h5ad_columns <- function(h5_group) {
    columns <- list()
    fields <- names(h5_group)
    if ("__categories" %in% fields) { # old anndata
      categories <- h5_group[["__categories"]]
      for (name in names(categories)) {
        if (length(h5_group[[name]]) >= length(categories[[name]])) {
          columns[[name]] <- decode_h5ad_factor(h5_group[[name]],
                                                categories[[name]])
        }
      }
    } else { # new anndata
      for (name in setdiff(fields, "_index")) {
        # Inspect the column read from the file, not the output list.
        field <- h5_group[[name]]
        if (!inherits(field, "list")) {
          columns[[name]] <- field
        } else if (all(c("codes", "categories") %in% names(field))) {
          columns[[name]] <- decode_h5ad_factor(field$codes, field$categories)
        } else {
          warning("skipping column '", name, "': unsupported encoding",
                  call. = FALSE)
        }
      }
    }
    columns
  }

  var <- decode_h5ad_columns(h5ad.var)
  h5ad.obs <- scmat[["obs"]]
  obs <- decode_h5ad_columns(h5ad.obs)

  # Use one identifier vector per axis for the matrix and the metadata alike.
  gene_ids <- as.character(h5ad.var[["_index"]])
  cell_ids <- h5ad.obs[["_index"]]
  if (is.null(cell_ids)) {
    cell_ids <- h5ad.obs[["barcodes"]]
  }
  cell_ids <- as.character(cell_ids)

  dims <- c(length(gene_ids), length(cell_ids))
  # The sparse matrix was built without explicit dimensions, so its row count
  # may stop at the last gene with a non-zero entry. It may only grow, and the
  # number of cells has to match exactly.
  if (ncol(dat) != dims[2] || nrow(dat) > dims[1]) {
    stop("matrix is ", nrow(dat), " x ", ncol(dat), " but the file lists ",
         dims[1], " genes and ", dims[2], " cells", call. = FALSE)
  }
  dat@Dim <- dims
  dat@Dimnames <- list(gene_ids, cell_ids)
  validObject(dat)
  obs <- data.frame(obs)
  rownames(obs) <- cell_ids

In [6]:
  # Bundle the count matrix and the per-cell annotations into a single
  # SingleCellExperiment; `dat` becomes the `counts` assay and `obs` the
  # column (cell) metadata.
  sce <- SingleCellExperiment(
    assays = list(counts = dat),
    colData = DataFrame(obs)
  )

In [23]:
# Add log-transformed normalized expression values to `sce`; the single-cell
# PAM50 cell further down reads them back as the `logcounts` assay.
sce <- logNormCounts(sce)

In [27]:
# Collapse the cells into one profile per `batch` value (pseudo-bulk), then
# print the aggregated object to check its dimensions and assays.
sce_agg <- scater::aggregateAcrossCells(sce, sce$batch)
sce_agg

class: SingleCellExperiment 
dim: 39735 18 
metadata(0):
assays(1): counts
rownames(39735): TNFRSF4 TNFRSF18 ... CR381653.1 ITGB2-AS1
rowData names(0):
colnames(18): 1T 2T ... 21T 22T
colData names(38): Her2.old.score Her2.score ... ids ncells
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [52]:
# pseudo bulk data
library(org.Hs.eg.db)

# molecular.subtyping() takes samples in rows and genes in columns, hence the
# transpose of the genes x samples count matrix.
ddata <- t(counts(sce_agg))

# Map gene symbols to Entrez gene IDs; symbols without a match give NA.
s2g <- toTable(org.Hs.egSYMBOL)
gene_symbols <- colnames(ddata)
g <- s2g$gene_id[match(gene_symbols, s2g$symbol)]
dannot <- data.frame(probe = gene_symbols,
                     "Gene.Symbol" = gene_symbols,
                     "EntrezGene.ID" = g)

# Drop unmapped genes from the matrix AND the annotation so that row i of
# `dannot` keeps describing column i of `ddata`.
has_entrez <- !is.na(dannot$EntrezGene.ID)
ddata <- ddata[, has_entrez, drop = FALSE]
dannot <- dannot[has_entrez, , drop = FALSE]
rownames(dannot) <- NULL

s <- molecular.subtyping(sbt.model = "pam50", data = ddata, annot = dannot,
                         do.mapping = TRUE)
sce_agg$PAM50 <- s$subtype
data.frame(sce_agg$PAM50)

Unnamed: 0_level_0,sce_agg.PAM50
Unnamed: 0_level_1,<fct>
1T,Basal
2T,Basal
3T,Basal
5N,Normal
5T,LumB
6N,Normal
6T,LumB
8T,Basal
9T,Basal
10T,LumB


In [48]:
# Export the current `ddata` matrix as a tab-separated file (the pseudo-bulk
# table when this runs right after the pseudo-bulk cell).
write.table(ddata, sep='\t', file='Fepi-agg.tsv')

In [53]:
library(org.Hs.eg.db)

# Per-cell PAM50 calls on the log-normalized expression values.
# molecular.subtyping() takes samples (here: cells) in rows and genes in
# columns, hence the transpose.
# NOTE(review): the captured output reports a ~10 GiB sparse->dense coercion
# for this cell; restricting `ddata` to the PAM50 genes first would avoid it.
ddata <- t(logcounts(sce))

# Map gene symbols to Entrez gene IDs; symbols without a match give NA.
s2g <- toTable(org.Hs.egSYMBOL)
gene_symbols <- colnames(ddata)
g <- s2g$gene_id[match(gene_symbols, s2g$symbol)]
dannot <- data.frame(probe = gene_symbols,
                     "Gene.Symbol" = gene_symbols,
                     "EntrezGene.ID" = g)

# Drop unmapped genes from the matrix AND the annotation so that row i of
# `dannot` keeps describing column i of `ddata`.
has_entrez <- !is.na(dannot$EntrezGene.ID)
ddata <- ddata[, has_entrez, drop = FALSE]
dannot <- dannot[has_entrez, , drop = FALSE]
rownames(dannot) <- NULL

s <- molecular.subtyping(sbt.model = "pam50", data = ddata, annot = dannot,
                         do.mapping = TRUE)
sce$PAM50 <- s$subtype
table(sce$batch_mol, sce$PAM50)

“sparse->dense coercion: allocating vector of size 10.5 GiB”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard deviation is zero”
“the standard d

               
                Basal Her2 LumB LumA Normal
  1T_TNBC         145    8    9  396   2302
  2T_TNBC         833   13   59   86   1306
  3T_TNBC         290   19   59  994   2858
  5N_Normal       140    6   20 1453   2911
  5T_Luminal B     43    9   71  823    125
  6N_Normal       172   10   29 2122   3132
  6T_Luminal B     33   47  629 3366    346
  8T_TNBC        1002   85  142  533   4424
  9T_TNBC         253   36   78  482    357
  10N_Normal      152    6   28  766   2239
  10T_Luminal B     1    7  485 6399     11
  12N_Normal       24   12   13 1131   1349
  15T_Luminal B    36   14   80  564     49
  16N_Normal        6    0    3  446     72
  17T_HER2         44  113   30  155     30
  20T_LumB-HER2    18   13   63  519     26
  21T_LumB-HER2    10   57  408 3803    196
  22T_LumB-HER2   679   77  275   47    307

In [55]:
# Save the per-cell PAM50 calls as a tab-separated file.
# NOTE(review): only the `PAM50` vector is written, so the row labels in the
# file come from whatever names that vector carries -- confirm that the cell
# identifiers actually end up in the output.
write.table(sce$PAM50, sep='\t', file='single-cell_PAM50.tsv')

In [129]:
# Cross-tabulate the `epi.subtype` labels against the per-cell PAM50 calls.
table(sce$epi.subtype,sce$PAM50)

                         
                          Basal Her2 LumB LumA Normal
  Basal APOE+ Normal          9    0    0   16    450
  Basal Cancer              110   15   21  142   6247
  Basal KRTDAP+ Normal       44    2    6  195    646
  Basal S100A2+ Normal        2    0    0   30    902
  LumHR Active Cancer       515   92  633 6777    959
  LumHR Active Normal         3    1   26 1987     24
  LumHR Cycling Cancer      407   96  610  349    218
  LumHR Fibro Cancer        161   30  176  797    598
  LumHR Fibro Normal          2    3    1  157     15
  LumHR IFI6-1 Normal         0    0    2  131     13
  LumHR IFI6-2 Normal         0    0    0  174      1
  LumHR KIT Normal            3    0    0  219     49
  LumHR Major Cancer        662  133  435 4838   1790
  LumHR Major Normal         15   11   44 3053     99
  LumHR Myo Normal            1    0    0   33     11
  LumHR Plasma Normal         2    0    1  118      6
  LumHR SCGB Normal           2    2    6 1422     28
  

In [130]:
# Cross-tabulate the `epi.subtype` labels against the `molecular_type`
# annotation of each cell.
table(sce$epi.subtype,sce$molecular_type)

                         
                          HER2 LumB-HER2 Luminal B Normal TNBC
  Basal APOE+ Normal         0         1         3      7  464
  Basal Cancer              32       157       317   3905 2124
  Basal KRTDAP+ Normal       0         1         2      2  888
  Basal S100A2+ Normal       0         0         2      2  930
  LumHR Active Cancer       67      1930      5299     40 1640
  LumHR Active Normal        0         0         0   1555  486
  LumHR Cycling Cancer      56       311       666    108  539
  LumHR Fibro Cancer         5       116       736     33  872
  LumHR Fibro Normal         0         0         1    139   38
  LumHR IFI6-1 Normal        0         0         0      1  145
  LumHR IFI6-2 Normal        0       121        53      1    0
  LumHR KIT Normal           0         0         7    208   56
  LumHR Major Cancer        97      2035      2504   1502 1720
  LumHR Major Normal         0         3       806   1816  597
  LumHR Myo Normal           

In [131]:
# Cross-tabulate the `epi.subtype` labels against the `batch_mol` label
# (sample ID combined with its molecular type, e.g. "1T_TNBC").
table(sce$epi.subtype,sce$batch_mol)

                         
                          1T_TNBC 2T_TNBC 3T_TNBC 5N_Normal 5T_Luminal B
  Basal APOE+ Normal          461       0       1         3            1
  Basal Cancer                124     117    1231       847           58
  Basal KRTDAP+ Normal        886       0       2         0            0
  Basal S100A2+ Normal        924       0       0         0            0
  LumHR Active Cancer           0     393      88        11          487
  LumHR Active Normal           0       0     300       728            0
  LumHR Cycling Cancer          0      70     109        31           32
  LumHR Fibro Cancer            2      16     725        13          167
  LumHR Fibro Normal            0       0      34        47            0
  LumHR IFI6-1 Normal           0       0       0         0            0
  LumHR IFI6-2 Normal           0       0       0         1           38
  LumHR KIT Normal              0       0       9        68            0
  LumHR Major Cancer     

In [1]:
library(copykat)

In [7]:
# Dense genes x cells count matrix for CopyKAT. The counts are fetched
# through the public accessor instead of the object's internal slots, whose
# layout is not part of the SingleCellExperiment API.
# The captured output reports an allocation of about 15.7 GiB for this step.
exp.rawdata <- as.matrix(counts(sce))

“sparse->dense coercion: allocating vector of size 15.7 GiB”


In [8]:

# Run CopyKAT on the dense count matrix, then pull the per-cell predictions
# and the copy-number matrix out of the result as data frames.
copykat.test <- copykat(
  rawmat = exp.rawdata,
  id.type = "S",
  cell.line = "no",
  ngene.chr = 5,
  win.size = 25,
  KS.cut = 0.2,
  sam.name = "test",
  distance = "euclidean",
  n.cores = 24
)
pred.test <- data.frame(copykat.test$prediction)
CNA.test <- data.frame(copykat.test$CNAmat)


[1] "running copykat v1.1.0"
[1] "step1: read and filter data ..."
[1] "39735 genes, 53019 cells in raw data"
[1] "filtered out 85 cells with less than 200 genes; remaining 52937 cells"
[1] "11057 genes past LOW.DR filtering"
[1] "step 2: annotations gene coordinates ..."
[1] "start annotation ..."
[1] "step 3: smoothing data with dlm ..."
[1] "step 4: measuring baselines ..."
number of iterations= 300 
number of iterations= 336 
number of iterations= 2061 
number of iterations= 315 
number of iterations= 312 
number of iterations= 337 
[1] "low confidence in classification"
[1] "cell: 1"
number of iterations= 486 
[1] "cell: 2"
number of iterations= 500 
[1] "cell: 3"
number of iterations= 466 
[1] "cell: 4"
number of iterations= 500 
[1] "cell: 5"
number of iterations= 500 
[1] "cell: 6"
number of iterations= 408 
[1] "cell: 7"
number of iterations= 500 
[1] "cell: 8"
number of iterations= 500 
[1] "cell: 9"
number of iterations= 500 
[1] "cell: 10"
number of iterations= 226 
[1] "ce

In [1]:
# Print the CopyKAT result. `copykat.test` only exists after the CopyKAT cell
# has run in the current session; the captured run below failed with
# "object 'copykat.test' not found".
copykat.test

ERROR: Error in eval(expr, envir, enclos): object 'copykat.test' not found
