# Biostrings and ShortRead

## Setup

In [None]:
# Packages used throughout this notebook
library(ShortRead)
library(tidyverse) # for %>%
library(gdata) # for humanReadable
# Fix the RNG seed so the random draws below (e.g. sample()) are repeatable
set.seed(100)

In [None]:
### Specify FASTQ file
myfqdir <- "/data/hts2018_pilot/Granek_4837_180427A5"
myfqname <- "35_MA_P_S39_L002_R1_001.fastq.gz"
myfqfile <- file.path(myfqdir, myfqname)

### Fail early with a clear message if the input is missing; md5sum() would
### otherwise just return NA for a file that does not exist
stopifnot(file.exists(myfqfile))

### Output directory for the QC reports. recursive = TRUE creates missing
### parent directories; showWarnings = FALSE keeps re-runs quiet when the
### directory already exists
outdir <- path.expand("~/work/scratch/shortread")
dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

### Checksum of the input file, so the exact input can be verified later
tools::md5sum(myfqfile)

## FASTQ Basics

In [None]:
### Read entire FASTQ file into memory
### (every record of myfqfile is loaded at once; see "Streaming and Sampling"
### further down for alternatives that avoid this)
myfq <- readFastq(myfqfile)

### Check class of the object returned by readFastq()
class(myfq)

In [None]:
### Get summary for FASTQ file (auto-printing the object shows its summary)
myfq

In [None]:
### Get number of reads (one element of myfq per FASTQ record)
length(myfq)

In [None]:
### Keep only the first three records
myfq[seq_len(3)]

In [None]:
### Draw three distinct records at random (sampling without replacement)
myfq[sample.int(length(myfq), size = 3)]

In [None]:
### Pull out reads 1, 2, 3 and then 1 again (the duplicate is deliberate)
myrecs <- myfq[c(1, 2, 3, 1)]

### Split the records into their sequences and their phred scores
myseqs <- sread(myrecs)
myphreds <- quality(myrecs)

### Compare classes
class(myfq)
class(myseqs)
class(myphreds)

In [None]:
### Read identifiers of the selected records; id() is namespace-qualified so
### the ShortRead version is used regardless of what other packages export
ShortRead::id(myrecs)

### Read Quality Scores

In [None]:
### Show how the phred scores are encoded
encoding(myphreds)

In [None]:
### Convert the quality scores of the selected records to a matrix
### (presumably one row per read and one column per cycle -- confirm against
### the ShortRead documentation)
as(quality(myrecs), "matrix")


### Read Sequences

In [None]:
### Get the sequences as character strings (one string per read in myseqs)
as.character(myseqs)

In [None]:
### Flag reads whose sequence already appeared earlier in the set
duplicated(myseqs)

In [None]:
### Show the distinct read sequences
unique(myseqs)

In [None]:
### Overwrite myseqs with its de-duplicated version
myseqs <- unique(myseqs)

In [None]:
### Count letters per read: the full alphabet first, then only A and T
alphabetFrequency(myseqs)
letterFrequency(myseqs, letters = c("A", "T"))

### Sequence Transformations

In [None]:
### Reverse the base order of every read
reverse(myseqs)

In [None]:
### Complement every read (base order unchanged)
complement(myseqs)

In [None]:
### Reverse-complement every read
reverseComplement(myseqs)

In [None]:
### Translate each read into an amino-acid sequence
### NOTE(review): translation presumably starts at the first base of each
### read and may warn when a read length is not a multiple of 3 -- confirm
translate(myseqs)

In [None]:
### Extract a 22-base window from every read, starting at position 10
### (i.e. positions 10 through 31)
subseq(myseqs, start=10, width=22)

## Searching

In [None]:
# look for exact occurrences of the pattern in the first read
matchPattern("CTCCAGT", subject = myseqs[[1]], max.mismatch = 0)

In [None]:
# same search with one base of the pattern altered (G -> T); exact matching
# is still required
matchPattern("CTCCATT", subject = myseqs[[1]], max.mismatch = 0)

In [None]:
# repeat the altered-pattern search, now tolerating one mismatch
matchPattern("CTCCATT", subject = myseqs[[1]], max.mismatch = 1)

### Search multiple sequences
Use `vmatchPattern` to search multiple sequences at once.

In [None]:
# exact search for the pattern across every read in myseqs
vmatchPattern("CTCCAGT", subject = myseqs, max.mismatch = 0)

If we allow mismatches in our pattern, we find hits in other sequences.

In [None]:
# same search across all reads, tolerating up to two mismatches
vmatchPattern("CTCCAGT", subject = myseqs, max.mismatch = 2)

### Count sequence matches

In [None]:
# number of exact pattern hits in each read
vcountPattern("CTCCAGT", subject = myseqs, max.mismatch = 0)

In [None]:
# number of hits per read when one mismatch is tolerated
vcountPattern("CTCCAGT", subject = myseqs, max.mismatch = 1)

In [None]:
# number of hits per read when two mismatches are tolerated
vcountPattern("CTCCAGT", subject = myseqs, max.mismatch = 2)

In [None]:
# number of hits per read when three mismatches are tolerated
vcountPattern("CTCCAGT", subject = myseqs, max.mismatch = 3)

### Pairwise Alignment

In [None]:
# align a short 8-base pattern against the first read
pairwiseAlignment("CTCCAGTC", subject = myseqs[1])

In [None]:
# align a longer 20-base pattern against the first read
pairwiseAlignment("ACACGTCTGAACTCCAGTCA", subject = myseqs[1])

In [None]:
# align the previous pattern with three bases ("TCC") removed from its middle
pairwiseAlignment("ACACGTCTGAACAGTCA", subject = myseqs[1])

## Streaming and Sampling

In [None]:
# memory taken by the fully loaded FASTQ object, in human-readable units
humanReadable(object.size(myfq))

In [None]:
# size of the compressed FASTQ file on disk, in human-readable units
humanReadable(file.size(myfqfile))

### Streaming
Stream a FASTQ rather than loading the whole thing into memory.

After initializing the streamer, each `yield` call returns the next `n=5` reads from the FASTQ file.

In [None]:
# open a streamer over the FASTQ file that hands back 5 reads per yield()
mystream <- FastqStreamer(con = myfqfile, n = 5)
mystream

In [None]:
# memory taken by the streamer object itself
humanReadable(object.size(mystream))

In [None]:
### First yield: records 1 through 5
cur_reads <- yield(mystream)
cur_reads

In [None]:
# memory taken by the chunk of reads just yielded
humanReadable(object.size(cur_reads))

In [None]:
### Identifiers of the reads in the current chunk
ShortRead::id(cur_reads)

In [None]:
### Second yield: records 6 through 10
cur_reads <- yield(mystream)
ShortRead::id(cur_reads)

In [None]:
### Display the streamer object again, after two yield() calls
mystream

In [None]:
### Be sure to close the stream once you are done yielding from it
close(mystream)

#### Double Check

In [None]:
# re-open the stream from scratch and look at the ids of its first 5 reads
mystream <- FastqStreamer(con = myfqfile, n = 5)
cur_reads <- yield(mystream)
ShortRead::id(cur_reads)
close(mystream)

In [None]:
# do exactly the same again, so the ids can be compared with the previous cell
mystream <- FastqStreamer(con = myfqfile, n = 5)
cur_reads <- yield(mystream)
ShortRead::id(cur_reads)
close(mystream)

### Sampling
Random sampling without reading in entire file upfront.

After initializing the sampler, each `yield` call returns `n=5` random reads from the FASTQ file.

In [None]:
# open a sampler that returns 5 randomly chosen reads per yield()
mysampler <- FastqSampler(con = myfqfile, n = 5)

# draw one sample and list the ids of the reads it contains
sample_reads <- yield(mysampler)
ShortRead::id(sample_reads)

close(mysampler)

In [None]:
# open a fresh sampler and draw again, so the ids can be compared with the
# previous cell
mysampler <- FastqSampler(con = myfqfile, n = 5)

sample_reads <- yield(mysampler)
ShortRead::id(sample_reads)

close(mysampler)

In [None]:
# compare the memory taken by the sampler object with that of the reads it
# yields
mysampler <- FastqSampler(con = myfqfile, n = 5)
sample_reads <- yield(mysampler)
humanReadable(object.size(mysampler))
humanReadable(object.size(sample_reads))
close(mysampler)

## QC Report

In [None]:
# 55 seconds
# QC report for the MA FASTQ files of samples 27 and 3.
# The dots in ".fastq.gz" are escaped and the pattern is anchored at the end,
# so only files that really end in ".fastq.gz" are selected (an unescaped "."
# matches any character, and without "$" a name such as "x.fastq.gz.md5"
# would also be picked up).
list.files(myfqdir, pattern = "^(27|3)_.*MA.*\\.fastq\\.gz$", full.names = TRUE) %>%
    # quality assessment of the selected files, using 3 parallel workers
    qa(BPPARAM = MulticoreParam(workers = 3)) %>%
    # write the report into <outdir>/ma_3_27
    report(dest = file.path(outdir, "ma_3_27"))

In [None]:
# 15 seconds
# QC report for the R1 files of sample 35_MA_P_S39, lanes L001 through L004.
# The pattern is anchored at both ends and its dots are escaped, so it matches
# exactly these file names (an unescaped "." matches any character, and an
# unanchored pattern also matches longer names that merely contain it).
list.files(
    myfqdir,
    pattern = "^35_MA_P_S39_L00[1-4]_R1_001\\.fastq\\.gz$",
    full.names = TRUE
) %>%
    # quality assessment of the selected files, using 3 parallel workers
    qa(BPPARAM = MulticoreParam(workers = 3)) %>%
    # write the report into <outdir>/ma_35
    report(dest = file.path(outdir, "ma_35"))

## The End

In [None]:
# Record the R version and loaded packages used for this run
sessionInfo()
# Quit the R session without saving the workspace
q(save = "no")