### Installation

`devtools::install_github("zji90/SCRATdatahg19")`  
`source("https://raw.githubusercontent.com/zji90/SCRATdata/master/installcode.R")`  

###  Import packages

In [1]:
library(devtools)
library(GenomicAlignments)
library(Rsamtools)
library(SCRATdatahg19)
library(SCRAT)

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Loading required package: S4Vectors
Loading required package: s

### Obtain Feature Matrix

In [2]:
# Capture the wall-clock time at which this run starts.
start_time <- Sys.time()

In [3]:
# Read the tab-separated metadata table: the first row is the header, the
# first column supplies the row names, quoting is disabled and strings stay
# character vectors.
metadata <- read.table(
    "../../input/metadata.tsv",
    header = TRUE,
    row.names = 1,
    quote = "",
    stringsAsFactors = FALSE
)

In [4]:
#' Summarise read coverage of genomic feature sets across BAM files
#'
#' Reads every BAM file, optionally drops reads overlapping the blacklist
#' shipped in the `SCRATdata<genome>` package, and counts how many reads
#' overlap each feature of the requested feature families. Counts are
#' divided by the number of retained reads of the BAM file and multiplied by
#' 10000, then optionally `log2(x + 1)` transformed, then optionally divided
#' by the feature width and multiplied by 1e6 (in that order). Features whose
#' row sum is not positive are dropped.
#'
#' @param dir Directory containing the BAM files.
#' @param genome Suffix of the data package name (`SCRATdata<genome>`); the
#'   third TRANSFAC set is only used for `"hg19"` and `"hg38"`.
#' @param bamfile File names inside `dir`; when `NULL`, every file in `dir`
#'   whose name ends in `.bam` is used.
#' @param singlepair `"automated"` (detect paired-end per file), `"single"`
#'   or `"pair"`.
#' @param removeblacklist Drop reads overlapping the blacklist regions?
#' @param log2transform Apply `log2(x + 1)` after library-size scaling?
#' @param adjustlen Divide by feature width (in bp) and multiply by 1e6?
#' @param featurelist Any of `"GENE"`, `"ENCL"`, `"MOTIF_TRANSFAC"`,
#'   `"MOTIF_JASPAR"`, `"GSEA"`, `"Custom"`.
#' @param customfeature Path of a tab-separated file (chromosome, start, end
#'   in the first three columns); read only when `"Custom"` is requested.
#' @param Genestarttype,Geneendtype,GSEAstarttype,GSEAendtype One of
#'   `"TSSup"`, `"TSSdown"`, `"TESup"`, `"TESdown"`: the anchor each window
#'   edge is measured from.
#' @param Genestartbp,Geneendbp,GSEAstartbp,GSEAendbp Offsets (bp) applied to
#'   the corresponding anchor.
#' @param ENCLclunum Number used to pick the `ENCL<n>.rda` data file.
#' @param Motifflank Flank width (bp) added on both sides of motif sites.
#' @param GSEAterm One or more names used to pick `GSEA<term>.rda` files.
#' @return A numeric matrix with one row per retained feature and one column
#'   per BAM file (blocks of rows are stacked in the order the feature
#'   families are processed), or `NULL` when no feature family was processed.
SCRATsummary <- function(dir = "", genome, bamfile = NULL, singlepair = "automated",
    removeblacklist = TRUE, log2transform = TRUE, adjustlen = TRUE,
    featurelist = c("GENE", "ENCL", "MOTIF_TRANSFAC", "MOTIF_JASPAR", "GSEA"),
    customfeature = NULL, Genestarttype = "TSSup", Geneendtype = "TSSdown",
    Genestartbp = 3000, Geneendbp = 1000, ENCLclunum = 2000, Motifflank = 100,
    GSEAterm = "c5.bp", GSEAstarttype = "TSSup", GSEAendtype = "TSSdown",
    GSEAstartbp = 3000, GSEAendbp = 1000) {
    if (is.null(bamfile)) {
        # Escape the dot so that only a literal ".bam" extension matches.
        bamfile <- list.files(dir, pattern = "\\.bam$")
    }
    datapath <- system.file("extdata", package = paste0("SCRATdata", genome))

    # ---- local helpers ---------------------------------------------------

    # Load "<datapath>/gr/<name>.rda" and return the object it stores as `gr`.
    load_gr <- function(name) {
        env <- new.env()
        load(paste0(datapath, "/gr/", name, ".rda"), envir = env)
        get("gr", envir = env, inherits = FALSE)
    }

    # Collapse each read pair into one range spanning both mates.
    fragments_from_pairs <- function(pairs) {
        chrom <- as.character(seqnames(pairs))
        keep <- which(!is.na(chrom))
        frag_start <- pmin(start(first(pairs)), start(last(pairs)))
        frag_end <- pmax(end(first(pairs)), end(last(pairs)))
        GRanges(seqnames = chrom[keep],
            IRanges(start = frag_start[keep], end = frag_end[keep]))
    }

    # Read one BAM file as ranges according to `singlepair`.
    read_fragments <- function(path) {
        if (singlepair == "automated") {
            bam <- BamFile(path)
            # Test the layout first so the file is only read once.
            if (testPairedEndBam(bam)) {
                fragments_from_pairs(readGAlignmentPairs(bam))
            } else {
                GRanges(readGAlignments(bam))
            }
        } else if (singlepair == "single") {
            GRanges(readGAlignments(path))
        } else if (singlepair == "pair") {
            fragments_from_pairs(readGAlignmentPairs(path))
        } else {
            stop("'singlepair' must be \"automated\", \"single\" or \"pair\"",
                call. = FALSE)
        }
    }

    # Strand-aware coordinate `offset` bp away from the TSS/TES of each region.
    anchor_position <- function(regions, type, offset) {
        offset <- as.numeric(offset)
        on_plus <- as.character(strand(regions)) == "+"
        switch(type,
            TSSup = ifelse(on_plus, start(regions) - offset, end(regions) + offset),
            TSSdown = ifelse(on_plus, start(regions) + offset, end(regions) - offset),
            TESup = ifelse(on_plus, end(regions) - offset, start(regions) + offset),
            TESdown = ifelse(on_plus, end(regions) + offset, start(regions) - offset),
            # Fail loudly instead of reusing a stale or undefined coordinate.
            stop("Unknown anchor type '", type,
                "'; expected TSSup, TSSdown, TESup or TESdown", call. = FALSE))
    }

    # Replace each region by the window delimited by its two anchors.
    window_around <- function(regions, start_type, start_bp, end_type, end_bp) {
        edge_a <- anchor_position(regions, start_type, start_bp)
        edge_b <- anchor_position(regions, end_type, end_bp)
        windows <- GRanges(seqnames = seqnames(regions),
            IRanges(start = pmin(edge_a, edge_b), end = pmax(edge_a, edge_b)))
        names(windows) <- names(regions)
        windows
    }

    # Width of every individual range.
    range_width <- function(regions) {
        end(regions) - start(regions) + 1
    }

    # Total width of the ranges in every element of a grouped feature set.
    grouped_width <- function(groups) {
        sapply(groups, function(g) sum(end(g) - start(g) + 1))
    }

    # Count overlaps per BAM file and apply the shared normalisation steps.
    summarise_counts <- function(features, feature_width) {
        if (length(bamdata) == 0L) {
            stop("No BAM files to summarise", call. = FALSE)
        }
        counts <- sapply(bamdata, function(reads) countOverlaps(features, reads))
        if (!is.matrix(counts)) {
            # sapply() does not return a matrix for zero or one feature;
            # rebuild the features x BAM layout so sweep() keeps working.
            counts <- matrix(as.numeric(unlist(counts)),
                nrow = length(features), ncol = length(bamdata),
                dimnames = list(names(features), names(bamdata)))
        }
        counts <- sweep(counts, 2, bamsummary, "/") * 10000
        if (log2transform) {
            counts <- log2(counts + 1)
        }
        if (adjustlen) {
            counts <- sweep(counts, 1, feature_width(features), "/") * 1e+06
        }
        counts[rowSums(counts) > 0, , drop = FALSE]
    }

    # Motif sets are extended by `Motifflank` on both sides before counting.
    summarise_motifs <- function(name) {
        sites <- flank(load_gr(name), as.numeric(Motifflank), both = TRUE)
        summarise_counts(sites, grouped_width)
    }

    # ---- read the BAM files ----------------------------------------------

    # The blacklist is only needed (and only loaded) when there is a BAM file.
    blacklist <- NULL
    if (removeblacklist && length(bamfile) > 0L) {
        blacklist <- load_gr("blacklist")
    }
    bamdata <- list()
    for (bam_name in bamfile) {
        reads <- read_fragments(file.path(dir, bam_name))
        if (!is.null(blacklist)) {
            hit_idx <- as.matrix(findOverlaps(reads, blacklist))[, 1]
            # Indexing with -integer(0) selects nothing, so only subset when
            # at least one read actually overlaps the blacklist.
            if (length(hit_idx) > 0L) {
                reads <- reads[-hit_idx]
            }
        }
        bamdata[[bam_name]] <- reads
    }

    # Library size per BAM file. An empty library is given size 1 so the
    # scaling below divides 0 by 1 instead of 0 by 0; other sizes are kept.
    bamsummary <- vapply(bamdata, length, integer(1))
    bamsummary[bamsummary == 0L] <- 1L

    # ---- summarise each requested feature family --------------------------

    pieces <- list()
    if ("GENE" %in% featurelist) {
        print("Processing GENE features")
        genes <- window_around(load_gr("generegion"), Genestarttype,
            Genestartbp, Geneendtype, Geneendbp)
        pieces[[length(pieces) + 1L]] <- summarise_counts(genes, range_width)
    }
    if ("ENCL" %in% featurelist) {
        print("Processing ENCL features")
        clusters <- load_gr(paste0("ENCL", ENCLclunum))
        pieces[[length(pieces) + 1L]] <- summarise_counts(clusters, grouped_width)
    }
    if ("MOTIF_TRANSFAC" %in% featurelist) {
        print("Processing MOTIF_TRANSFAC features")
        transfac_sets <- c("transfac1", "transfac2")
        if (genome %in% c("hg19", "hg38")) {
            transfac_sets <- c(transfac_sets, "transfac3")
        }
        for (set_name in transfac_sets) {
            pieces[[length(pieces) + 1L]] <- summarise_motifs(set_name)
        }
    }
    if ("MOTIF_JASPAR" %in% featurelist) {
        print("Processing MOTIF_JASPAR features")
        for (set_name in c("jaspar1", "jaspar2")) {
            pieces[[length(pieces) + 1L]] <- summarise_motifs(set_name)
        }
    }
    if ("GSEA" %in% featurelist) {
        print("Processing GSEA features")
        for (term in GSEAterm) {
            gene_sets <- load_gr(paste0("GSEA", term))
            for (set_name in names(gene_sets)) {
                gene_sets[[set_name]] <- window_around(gene_sets[[set_name]],
                    GSEAstarttype, GSEAstartbp, GSEAendtype, GSEAendbp)
            }
            pieces[[length(pieces) + 1L]] <- summarise_counts(gene_sets,
                grouped_width)
        }
    }
    if ("Custom" %in% featurelist) {
        print("Processing custom features")
        custom <- read.table(customfeature, as.is = TRUE, sep = "\t")
        custom <- GRanges(seqnames = custom[, 1],
            IRanges(start = custom[, 2], end = custom[, 3]))
        pieces[[length(pieces) + 1L]] <- summarise_counts(custom, range_width)
    }

    # Stack the per-family blocks; yields NULL when nothing was processed.
    do.call(rbind, pieces)
}

In [None]:
# Build the feature matrix from the BAM files in the input directory,
# requesting only the JASPAR motif features and switching off both the
# log2 transformation and the feature-length adjustment.
df_out <- SCRATsummary(
    dir = "../../input/sc-bams_nodup/",
    genome = "hg19",
    featurelist = "MOTIF_JASPAR",
    log2transform = FALSE,
    adjustlen = FALSE
)

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000199.1, GL000195.1, GL000212.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000201.1, GL000197.1, GL000206.1, GL000241.1, GL000191.1, GL000228.1, GL000209.1, GL000220.1, GL000216.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000222.1, GL000194.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000237.1, GL000191.1, GL000220.1, GL000199.1, GL000219.1, GL000223.1, GL000195.1, GL000222.1, GL000200.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL0002

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000241.1, GL000220.1, GL000205.1, GL000223.1, GL000195.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000243.1, GL000233.1, GL000198.1, GL000228.1, GL000221.1, GL000220.1, GL000211.1, GL000199.1, GL000217.1, GL000216.1, GL000205.1, GL000223.1, GL000195.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000227.1, GL000209.1, GL000220.1, GL000211.1, GL000216.1, GL000205.1, GL000219.1, GL000195.1, GL000222.1, GL000200.1, GL000194.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000214.1, GL000220.1, GL000199.1, GL000216.1, GL000219.1, GL000224.1, GL000222.1, GL000194.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000199.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000212.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000231.1, GL000247.1, GL000244.1, GL000234.1, GL000206.1, GL000241.1, GL000242.1, GL000237.1, GL000191.1, GL000227.1, GL000228.1, GL000214.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000212.1

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000195.1, GL000222.1, GL000192.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000243.1, GL000191.1, GL000209.1, GL000220.1, GL000199.1, GL000217.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000212.1, GL000200.1, GL000194.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000209.1, GL000220.1, GL000199.1, GL000205.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000206.1, GL000191.1, GL000228.1, GL000209.1, GL000220.1, GL000199.1, GL000205.1, GL000219.1, GL0001

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000225.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000205.1, GL000219.1, GL000195.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000206.1, GL000198.1, GL000191.1, GL000228.1, GL000221.1, GL000209.1, GL000220.1, GL000216.1, GL000205.1, GL000219.1, GL000224.1, GL000195.1, GL000200.1, GL000194.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has 

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000227.1, GL000220.1, GL000195.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000228.1, GL000220.1, GL000216.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000229.1, GL000240.1, GL000214.1, GL000209.1, GL000220.1, GL000216.1, GL000205.1, GL000195.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000209.1, GL000220.1, GL000205.1
  - in 'y': chrM, chrY
  Make sure to always combine/compare objects based on the same reference
“Each 

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000191.1, GL000214.1, GL000209.1, GL000220.1, GL000217.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000205.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000196.1, GL000206.1, GL000191.1, GL000221.1, GL000220.1, GL000211.1, GL000216.1, GL000205.1, GL000223.1, GL000195.1, GL000222.1, GL000200.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000234.1, GL000199.1, GL000205.1, GL000219.1, GL000195.1, GL000192.1
  - in 

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000228.1, GL000220.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000191.1, GL000228.1, GL000220.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000222.1, GL000193.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000191.1, GL000209.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000193.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1
  - in 'y': chrM, chrY
  Make sure to always combine/compare object

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000246.1, GL000234.1, GL000209.1, GL000220.1, GL000199.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000194.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000220.1
  - in 'y': chrM, chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000200.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in '

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000243.1, GL000220.1, GL000199.1, GL000223.1, GL000195.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000216.1, GL000205.1, GL000194.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000196.1, GL000206.1, GL000191.1, GL000220.1, GL000199.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000222.1, GL000200.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000229.1, GL000232.1, GL000206.1, GL000228.1, GL000209.1, GL000220.1, GL000217.1, GL000205.1, GL000223.1, GL000195.1, GL0001

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000210.1, GL000201.1, GL000240.1, GL000243.1, GL000230.1, GL000237.1, GL000233.1, GL000191.1, GL000221.1, GL000218.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000194.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000239.1, GL000220.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000243.1, GL000191.1, GL000228.1, GL000214.1, GL000221.1, GL000209.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000224.1, GL000223.1, GL000195.1, GL000200.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Eac

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000221.1, GL000220.1, GL000205.1, GL000219.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000209.1, GL000220.1, GL000205.1, GL000212.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000195.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000205.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1
  - in 'y': chrM, chrY
  Make sure to a

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000209.1, GL000220.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000240.1, GL000191.1, GL000221.1, GL000220.1, GL000216.1, GL000205.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000221.1, GL000220.1, GL000199.1, GL000205.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000219.1
  - in 'y': chrM, chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000209.1, GL000220

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000197.1, GL000240.1, GL000191.1, GL000228.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000248.1, GL000244.1, GL000206.1, GL000241.1, GL000191.1, GL000228.1, GL000221.1, GL000209.1, GL000220.1, GL000199.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000222.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000241.1, GL000191.1, GL000227.1, GL000228.1, GL000218.1, GL000220.1, GL000199.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000193.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare obje

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000249.1, GL000191.1, GL000221.1, GL000209.1, GL000220.1, GL000216.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000205.1, GL000192.1
  - in 'y': chrM, chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000209.1, GL000220.1, GL000216.1, GL000205.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000220.1, GL000205.1, GL000195.1, GL000200.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on t

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000191.1, GL000209.1, GL000220.1, GL000199.1, GL000217.1, GL000216.1, GL000205.1, GL000223.1, GL000195.1, GL000212.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000231.1, GL000241.1, GL000208.1, GL000191.1, GL000228.1, GL000221.1, GL000209.1, GL000220.1, GL000199.1, GL000205.1, GL000195.1, GL000212.1, GL000194.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000192.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000220.1, GL000195.1
  - in 'y': chrY
  

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000221.1, GL000220.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000212.1, GL000194.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000227.1, GL000221.1, GL000209.1, GL000220.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000200.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000221.1, GL000209.1, GL000220.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000235.1, GL000206.1, GL000208.1, GL000228.1, GL000214.1, GL000209.1, GL000220.1, GL000211.1, GL0001

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000224.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000241.1, GL000191.1, GL000214.1, GL000209.1, GL000218.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000224.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000194.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000209.1, GL000220.1, GL000205.1, GL000223.1, GL000195.1, GL000212.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000220.1, GL000216.1, GL000225.1
  - in 'y': chrM
  Make sure to always comb

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000223.1, GL000195.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000205.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000209.1, GL000220.1, GL000213.1, GL000205.1, GL000219.1, GL000195.1, GL000200.1
  - in 'y': chrM, chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000199.1, GL000205.1, GL000222.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000221.1, GL000220.1, GL000195.1
  - in 'y

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000220.1, GL000216.1, GL000205.1, GL000219.1, GL000195.1, GL000222.1, GL000194.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000239.1, GL000191.1, GL000227.1, GL000220.1, GL000216.1, GL000205.1, GL000223.1, GL000195.1, GL000222.1, GL000194.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000206.1, GL000228.1, GL000214.1, GL000220.1, GL000211.1, GL000199.1, GL000216.1, GL000205.1, GL000219.1, GL000223.1, GL000195.1, GL000212.1, GL000194.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL0002

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000241.1, GL000191.1, GL000221.1, GL000209.1, GL000220.1, GL000205.1, GL000219.1, GL000195.1, GL000212.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000237.1, GL000223.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000220.1, GL000192.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000241.1, GL000242.1, GL000220.1, GL000216.1, GL000205.1, GL000224.1, GL000223.1, GL000195.1, GL000194.1, GL000225.1, GL000192.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000226.1, GL000239.1, GL000235.1, GL000245.1, GL000230.1, GL000237.1, GL000233.1, GL000208.1, GL000228.1, GL000214.1, GL000218.1, GL000220.1, GL000199.1, GL000217.1, GL000216.1, GL000205.1, GL000224.1, GL000212.1, GL000194.1, GL000225.1
  - in 'y': chrM
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000198.1, GL000220.1, GL000216.1, GL000219.1, GL000225.1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000246.1, GL000244.1, GL000238.1, GL000241.1, GL000243.1, GL000204.1, GL000198.1, GL000208.1, GL000214.1, GL000218.1, GL000220.1, GL000211.1, GL000199.1, GL000217.1, GL000216.1, GL000219.1, GL000224.1, GL000222.1, GL000200.1, GL000194.1, GL000225.1
  - in 'y': chrM
  Make

[1] "Processing MOTIF_JASPAR features"


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': chrY
  - in 'y': chrM, GL000220.1, GL000195.1, GL000192.1
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': chrY
  - in 'y': chrM, GL000220.1, GL000205.1, GL000195.1
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': chrY
  - in 'y': chrM, GL000220.1, GL000195.1
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': chrY
  - in 'y': chrM, GL000191.1, GL000221.1, GL000220.1, GL000205.1, GL000224.1, GL000195.1
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': chrY
  - in 'y': chrM, GL000220.1, GL0

In [None]:
# Capture the wall-clock time once processing has finished; it is compared
# with start_time (set earlier in the notebook) to report the elapsed time.
end_time <- Sys.time()

In [8]:
# Elapsed wall-clock time between start_time and end_time; the display unit
# (secs/mins/hours/...) is chosen automatically.
difftime(end_time, start_time)

Time difference of 13.36512 hours

In [9]:
# Checkpoint the feature matrix in the working directory before its column
# names are rewritten further down.
saveRDS(object = df_out, file = "df_out.rds")

In [10]:
# Sanity check: overall dimensions, then the top-left 5 x 5 corner.
dim(df_out)
df_out[seq_len(5L), seq_len(5L)]

Unnamed: 0,atac_v1_pbmc_5k_possorted_bam.AAACGAAAGCGCAATG-1.dedup.st.bam,atac_v1_pbmc_5k_possorted_bam.AAACGAAAGGGTATCG-1.dedup.st.bam,atac_v1_pbmc_5k_possorted_bam.AAACGAAAGTAACATG-1.dedup.st.bam,atac_v1_pbmc_5k_possorted_bam.AAACGAAAGTTACACC-1.dedup.st.bam,atac_v1_pbmc_5k_possorted_bam.AAACGAACAGAGATGC-1.dedup.st.bam
MOTIF:MA0002.2:RUNX1,1894.3454,1889.3308,1897.71,2096.5129,1997.4528
MOTIF:MA0003.3:TFAP2A,3910.0952,3538.5673,4016.57,3996.1254,3594.7782
MOTIF:MA0004.1:Arnt,1824.8934,1703.2454,1840.303,1830.5741,1750.1592
MOTIF:MA0006.1:Ahr::Arnt,3102.9203,2925.8114,3189.053,3167.3124,2942.0505
MOTIF:MA0007.3:Ar,148.2008,128.6395,125.579,134.2022,142.2203


In [11]:
# Preview the values that will become the new column names: the second
# "."-separated field of each current column name. vapply() always yields a
# character vector, whereas sapply() returns a list when there are no columns.
head(vapply(
  strsplit(colnames(df_out), "\\."),
  function(parts) parts[2],
  character(1)
))

In [12]:
# Replace each column name with its second "."-separated field (the preview
# output shows this is the cell barcode, e.g. "AAACGAAAGCGCAATG-1").
barcodes <- vapply(
  strsplit(colnames(df_out), "\\."),
  function(parts) parts[2],
  character(1)
)

# A name without a second field yields NA (e.g. when this cell is run a second
# time on already-renamed columns). Fail loudly instead of silently replacing
# the column names with NA.
if (anyNA(barcodes)) {
  stop(
    "Some column names of df_out have no second '.'-separated field; ",
    "refusing to overwrite them with NA.",
    call. = FALSE
  )
}

colnames(df_out) <- barcodes
dim(df_out)
df_out[1:5, 1:5]

Unnamed: 0,AAACGAAAGCGCAATG-1,AAACGAAAGGGTATCG-1,AAACGAAAGTAACATG-1,AAACGAAAGTTACACC-1,AAACGAACAGAGATGC-1
MOTIF:MA0002.2:RUNX1,1894.3454,1889.3308,1897.71,2096.5129,1997.4528
MOTIF:MA0003.3:TFAP2A,3910.0952,3538.5673,4016.57,3996.1254,3594.7782
MOTIF:MA0004.1:Arnt,1824.8934,1703.2454,1840.303,1830.5741,1750.1592
MOTIF:MA0006.1:Ahr::Arnt,3102.9203,2925.8114,3189.053,3167.3124,2942.0505
MOTIF:MA0007.3:Ar,148.2008,128.6395,125.579,134.2022,142.2203


In [15]:
# Put the columns of df_out in the same order as the rows of metadata.
#
# Every metadata row name must be present as a column; check this up front so
# the failure is reported with the offending names rather than as a generic
# subscript error from the subsetting below.
missing_cells <- setdiff(rownames(metadata), colnames(df_out))
if (length(missing_cells) > 0) {
  stop(
    length(missing_cells), " metadata row name(s) not found in df_out, e.g. ",
    paste(head(missing_cells), collapse = ", "),
    call. = FALSE
  )
}

# identical() compares length and order in one step; `all(a == b)` would
# recycle (or warn) when the two name vectors differ in length.
if (!identical(colnames(df_out), rownames(metadata))) {
  # drop = FALSE keeps the two-dimensional shape even for a single column.
  df_out <- df_out[, rownames(metadata), drop = FALSE]
  # Values inside braces are not auto-printed, so print the dimensions
  # explicitly; the final expression is the value of the `if` and is shown.
  print(dim(df_out))
  df_out[1:5, 1:5]
}

In [16]:
# Write the final feature matrix (columns renamed and ordered as in metadata)
# to the shared output directory.
saveRDS(
  object = df_out,
  file = "../../output/feature_matrices/FM_SCRAT_10xpbmc5k_motifs.rds"
)

In [17]:
# Report the R version, platform, locale and attached package versions used
# for this run.
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_SCRAT/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] SCRAT_0.99.0                SCRATdatahg19_0.99.1       
 [3] GenomicAlignments_1.18.1    Rsamtools_1.34.0           
 [5] Biostrings_2.50.2           XVector_0.22.0             
 [7] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
 [9] BiocPa

In [18]:
# Snapshot every object in the current workspace to an .RData file in the
# working directory.
save.image(file = "SCRAT_10xpbmc5k.RData")