In [None]:
### List everything in the sequencing run directory
list.files(path = "/data/hts2018_pilot/Granek_4837_180427A5/")

In [None]:
library(ShortRead)
library(tidyverse)
library(Biostrings)
set.seed(100)

In [None]:
### Specify FASTQ file
myfqdir <- "/data/hts2018_pilot/Granek_4837_180427A5/"
myfqname <- "35_MA_P_S39_L002_R1_001.fastq.gz"
myfqfile <- file.path(myfqdir, myfqname)

### Directory that receives the QA reports generated further down.
### recursive = TRUE creates missing parent directories, and
### showWarnings = FALSE keeps re-running this cell quiet when the
### directory already exists.
outdir <- path.expand("~/work/scratch/shortread")
dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

### Checksum of the input file (NA is returned if the file cannot be read)
tools::md5sum(myfqfile)

In [None]:
### Load every record of the FASTQ file into memory
myfq <- readFastq(dirPath = myfqfile)

### Inspect the class of the loaded object
myfq %>% class

In [None]:
### Display the FASTQ object loaded above; auto-printing shows its summary
myfq

In [None]:
### Number of reads held in the FASTQ object
myfq %>% length

In [None]:
### Keep only the first three records
myfq[seq_len(3)]

In [None]:
### Pick three records at random, without replacement
myfq[sample.int(length(myfq), size = 3, replace = FALSE)]

In [None]:
### Take reads 1, 2, 3 and then 1 again (the duplicate is deliberate),
### and pull out their sequences and phred scores
myrecs <- myfq[c(1, 2, 3, 1)]
myseqs <- sread(myrecs)
myphreds <- quality(myrecs)

### Compare the classes of the full object and the two extracted parts
myfq %>% class
myseqs %>% class
myphreds %>% class

In [None]:
### Inspect how the phred scores are encoded
encoding(myphreds)

In [None]:
### First element of the quality-score object (qualities of the first selected read)
myphreds[1]

In [None]:
### Read identifiers of the selected records; the ShortRead:: prefix is kept
### so the call cannot resolve to a same-named function from another package
myrecs %>% ShortRead::id()

In [None]:
### Sequences of the selected reads as plain character strings
myseqs %>% as.character

In [None]:
### Display the quality-score object extracted from the selected records
### (shown as the object itself; no conversion to character is done here)
myphreds

In [None]:
### Flag reads whose sequence already occurred earlier in the set
duplicated(myseqs)

In [None]:
### Keep one copy of each distinct read sequence
unique(myseqs)

In [None]:
### Reverse each read
reverse(myseqs)

In [None]:
### Complement each read
complement(myseqs)

In [None]:
### Reverse-complement each read
reverseComplement(myseqs)

In [None]:
### Letter frequency counting: the full alphabet first, then only A and T
myseqs %>% alphabetFrequency
myseqs %>% letterFrequency(letters = c("A", "T"))

In [None]:
# alphabetByCycle(myrecs)
### Per-read quality score summary of the selected records
myrecs %>% alphabetScore

# QC Report

In [None]:
### Open the help page for list.files()
help("list.files")

In [None]:
### QA report for the RZ FASTQ files whose names contain "10" or "11"
### before "RZ". The dots of the extension are escaped and the pattern is
### anchored at the end, so only names that really end in ".fastq.gz"
### are selected (an unescaped "." matches any character).
list.files(myfqdir, pattern = "1[0-1].*RZ.*\\.fastq\\.gz$", full.names = TRUE) %>%
    qa(BPPARAM = MulticoreParam(workers = 1)) %>%
    report(dest = file.path(outdir, "rz_10_11"))

In [None]:
### QA report for the single FASTQ file selected above, written to its own
### sub-directory of outdir
report(
    qa(myfqfile, BPPARAM = MulticoreParam(workers = 1)),
    dest = file.path(outdir, "35_MA_L002")
)

In [None]:
### Open the help page for qa()
help("qa")

In [None]:
### Open the help page for report()
help("report")

In [None]:
### Show the parallel back-ends BiocParallel currently has registered
BiocParallel::registered()

In [None]:
### Open the help page for the BiocParallelParam class
help("BiocParallelParam")

In [None]:
# myfqfile.qa = qa(myfqdir,pattern = "1[0-1].*RZ.*.fastq.gz",BPPARAM = MulticoreParam(workers=4))

In [None]:
### Write a QA report into outdir.
### The only assignment of myfqfile.qa in this notebook is commented out, so
### the object is built here when it does not exist yet. It is computed from
### the single file in myfqfile with one worker, matching the other live QA
### cells -- NOTE(review): swap in the directory/pattern form if the report
### was meant to cover the RZ files instead.
if (!exists("myfqfile.qa")) {
    myfqfile.qa <- qa(myfqfile, BPPARAM = MulticoreParam(workers = 1))
}
report(myfqfile.qa, dest = outdir)

In [None]:
### Open the help page for qa()
help("qa")

In [None]:
### Some pattern matching
### myseqs holds several reads (a DNAStringSet), so the vectorised
### vmatchPattern() is used: matchPattern() only accepts a single sequence
### as subject. The result lists the match ranges separately for each read.
mypattern <- "AGTTGG"
### Exact matches only
vmatchPattern(mypattern, myseqs, max.mismatch = 0)
### Allow up to one mismatching base
vmatchPattern(mypattern, myseqs, max.mismatch = 1)

In [None]:
### Count occurrences of the pattern in each read
### The subject is myseqs (the original referred to an undefined `myseq`);
### because it is a set of reads, the vectorised vcountPattern() is used and
### returns one count per read.
vcountPattern(mypattern, myseqs, max.mismatch = 0)
vcountPattern(mypattern, myseqs, max.mismatch = 1)

In [None]:
### Alignment
### Align the pattern against the first selected read. The original referred
### to an undefined `myseq`; myseqs[[1]] extracts a single sequence, which is
### a valid subject for pairwiseAlignment().
### NOTE(review): pick a different index if another read was intended.
pairwiseAlignment(mypattern, myseqs[[1]])

In [None]:
### How to stream a FASTQ file (rather than loading it all into memory)
### The streamer hands back records in chunks of 5
mystream <- FastqStreamer(myfqfile, 5)
mystream

### The first yield will retrieve records 1 through 5
yield(mystream)
### The second yield will retrieve records 6 through 10
yield(mystream)

### Be sure to close the stream
close(mystream)



### Random sampling (without reading in the entire file upfront)

mysampler <- FastqSampler(myfqfile, 3)

### Each yield call will sample three records at random
yield(mysampler)
yield(mysampler)

### Release the sampler's file connection as well
close(mysampler)



sessionInfo()
### Deliberately no q(save = "no") here: quitting would shut down the R
### session in the middle of the notebook, and the Streaming and Sampling
### cells that follow could never run.

# Streaming

In [None]:
### Walk through the whole FASTQ file chunk by chunk
strm <- FastqStreamer(myfqfile)
repeat {
    fq <- yield(strm)
    ## An empty chunk means the end of the file has been reached
    if (length(fq) == 0) {
        break
    }
    ## Do something with the records in `fq`
}
### Release the file connection once the loop is done
close(strm)

# Sampling

In [None]:
### Draw one random sample of records using the sampler's default sample size
sampler <- FastqSampler(myfqfile)
fq <- yield(sampler)
### The sample is now held in `fq`, so the file connection can be released
close(sampler)
fq

In [None]:
### Read identifiers of the records currently held in fq
fq %>% ShortRead::id()