# ShortRead

## Setup

In [None]:
# Attach ShortRead first, then tidyverse.
# NOTE(review): later cells call ShortRead::id() with an explicit namespace,
# presumably because a package attached afterwards masks id() -- confirm.
library(ShortRead)
library(tidyverse)
# Biostrings is deliberately not attached here.
# NOTE(review): the sequence helpers used later (e.g. reverseComplement) are
# presumably made available by library(ShortRead) -- confirm.
# library(Biostrings)
# Fix the RNG seed so that random draws made later in the notebook are repeatable.
set.seed(100)

In [None]:
### Specify the FASTQ file
myfqdir <- "/data/hts2018_pilot/Granek_4837_180427A5/"
myfqname <- "35_MA_P_S39_L002_R1_001.fastq.gz"
myfqfile <- file.path(myfqdir, myfqname)

### Output directory for the QC reports written later in the notebook.
### recursive = TRUE also creates any missing parent directories, and
### showWarnings = FALSE keeps re-runs quiet when the directory already exists.
outdir <- path.expand("~/work/scratch/shortread")
dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

### MD5 checksum of the input file, as a record of exactly which data was used
tools::md5sum(myfqfile)

## FASTQ Basics

In [None]:
### Load every record of the FASTQ file into memory in one go
myfq <- myfqfile %>% readFastq()

### Inspect the class of the loaded object
myfq %>% class()

In [None]:
### Get a summary of the FASTQ data: evaluating the object on its own at top
### level displays it
myfq

In [None]:
### Count the reads held in the object
myfq %>% length()

In [None]:
### Keep only the first three records
myfq[seq_len(3)]

In [None]:
### Keep three records picked at random, without replacement
myfq[sample.int(length(myfq), size = 3, replace = FALSE)]

In [None]:
### Take reads 1, 2, 3 and then 1 again (read 1 is duplicated on purpose)
myrecs <- myfq[c(1, 2, 3, 1)]

### Sequences and phred scores of those records
myseqs <- sread(myrecs)
myphreds <- quality(myrecs)

### Compare the classes of the three objects
myfq %>% class()
myseqs %>% class()
myphreds %>% class()

In [None]:
### Look up the encoding used by the phred scores
encoding(myphreds)

In [None]:
### Read identifiers; the ShortRead:: prefix pins which id() gets called
myrecs %>% ShortRead::id()

In [None]:
### Coerce the quality scores of the selected records to a matrix
myrecs %>%
    quality() %>%
    as("matrix")


In [None]:
### Convert the sequences to plain character strings
myseqs %>% as.character()

In [None]:
### Flag duplicate reads
duplicated(myseqs)

In [None]:
### Keep one copy of each distinct read
unique(myseqs)

In [None]:
### Reverse each read
reverse(myseqs)

In [None]:
### Complement each read
complement(myseqs)

In [None]:
### Reverse-complement each read
reverseComplement(myseqs)

In [None]:
### Letter frequency counting: the full alphabet first, then only A and T
myseqs %>% alphabetFrequency()
myseqs %>% letterFrequency(letters = c("A", "T"))

## QC Report

In [None]:
### QC report for the files in myfqdir whose names contain "10" or "11"
### followed later by "RZ".
### The dots of the extension are escaped and the pattern is anchored with `$`,
### so only names that really end in ".fastq.gz" are selected (a companion
### file such as "*.fastq.gz.md5" no longer matches).
list.files(
    myfqdir,
    pattern = "1[0-1].*RZ.*\\.fastq\\.gz$",
    full.names = TRUE
) %>%
    qa(BPPARAM = MulticoreParam(workers = 1)) %>%
    report(dest = file.path(outdir, "rz_10_11"))

In [None]:
### QC report for the single FASTQ file chosen at the top of the notebook
report(
    qa(myfqfile, BPPARAM = MulticoreParam(workers = 1)),
    dest = file.path(outdir, "35_MA_L002")
)

## Search for subsequences

In [None]:
### Some pattern matching.
### `myseqs` holds several sequences, so the vectorised vmatchPattern() is
### used; matchPattern() only works on a single subject sequence.
mypattern <- "AGTTGG"
### Exact matches only
vmatchPattern(mypattern, myseqs, max.mismatch = 0)
### Allow up to one mismatching letter
vmatchPattern(mypattern, myseqs, max.mismatch = 1)

In [None]:
### Count occurrences of the pattern in each read.
### The reads live in `myseqs` (a set of sequences), so the vectorised
### vcountPattern() is used; countPattern() takes a single subject sequence.
### Exact matches only
vcountPattern(mypattern, myseqs, max.mismatch = 0)
### Allow up to one mismatching letter
vcountPattern(mypattern, myseqs, max.mismatch = 1)

In [None]:
### Alignment of the pattern against the first read.
### `myseqs[[1]]` extracts a single sequence from the set to act as the
### alignment subject.
pairwiseAlignment(mypattern, myseqs[[1]])

## Streaming and Sampling

### Streaming
Stream a FASTQ rather than loading the whole thing into memory.

In [None]:
### Open a streamer on the FASTQ file with a chunk size of five records
mystream <- FastqStreamer(myfqfile, n = 5)
mystream

In [None]:
### First yield: records 1 through 5
cur_reads <- yield(mystream)
cur_reads

In [None]:
### Identifiers of the records currently held in cur_reads
cur_reads %>% ShortRead::id()

In [None]:
### Second yield: records 6 through 10
cur_reads <- yield(mystream)
cur_reads %>% ShortRead::id()

In [None]:
### Display the streamer object again
mystream

In [None]:
### Be sure to close the stream once you have finished reading from it
close(mystream)

### Double Check

In [None]:
### Open a fresh streamer, take its first chunk of five records, show their
### ids, then close it
mystream <- FastqStreamer(myfqfile, n = 5)
cur_reads <- yield(mystream)
cur_reads %>% ShortRead::id()
close(mystream)

In [None]:
### Same steps once more: open a streamer, take the first chunk of five
### records, show their ids, then close it
mystream <- FastqStreamer(myfqfile, n = 5)
cur_reads <- yield(mystream)
cur_reads %>% ShortRead::id()
close(mystream)

### Sampling
Random sampling without reading the entire file into memory up front.
After a sampler has been initialized, each `yield` call samples `n=5` records at random from the FASTQ file.

In [None]:
### Open a sampler that draws five records per yield
mysampler <- FastqSampler(myfqfile, n = 5)

### Draw one sample and show the ids of the records drawn
sample_reads <- yield(mysampler)
sample_reads %>% ShortRead::id()

### Close the sampler when done
close(mysampler)

In [None]:
### Open a new sampler that draws five records per yield
mysampler <- FastqSampler(myfqfile, n = 5)

### Draw one sample and show the ids of the records drawn
sample_reads <- yield(mysampler)
sample_reads %>% ShortRead::id()

### Close the sampler when done
close(mysampler)

## The End

In [None]:
### Record the session details, then end the R session without saving the
### workspace
sessionInfo()
quit(save = "no")