# Set up environment

In [None]:
# Run the shared course configuration script.
# NOTE(review): OUTDIR, METADTFILE and IMGDIR are used below but never defined in
# this notebook, so they are presumably set here (along with the package loads) - confirm.
source("course_config.R")

Recall that in tutorial one we created the file `hts-pilot-2018.RData`.

```
scratch
└── analysis_output  
    ├── out  
    │   └── hts-pilot-2018.RData        
    └── img  
```

In [None]:
# Display the value of OUTDIR, the directory used below for the .RData files
OUTDIR

In [None]:
# Path of the .RData file with the count data, located inside OUTDIR
# NOTE(review): the text above calls this file hts-pilot-2018.RData, while the
# name used here is hts-course-2018.RData - confirm which one is current.
cntfile <- file.path(OUTDIR, "hts-course-2018.RData")

# Read in results

In [None]:
### Import count data
# Stop with a clear error if the file from the previous tutorial is missing
stopifnot(file.exists(cntfile))

# load() restores the saved objects into the workspace. Unlike attach(), it does
# not add a new entry to the search path every time this cell is re-run.
# NOTE(review): the later cells expect this file to provide `genecounts` and
# `mapresults` - confirm against the previous tutorial.
load(cntfile)

# Checksum of the input file, shown for provenance
tools::md5sum(cntfile)

In [None]:
### Import metadata
# Checksum of the metadata file, shown for provenance
tools::md5sum(METADTFILE)

# Read the metadata table, capitalise the `label` and `media` column names,
# and convert the listed descriptive columns to factors
mtdf <- METADTFILE %>%
    readr::read_csv() %>%
    dplyr::rename(Label = label, Media = media) %>%
    mutate_at(vars(Label, Strain, Media, group, experiment_person), factor)

Recall that there are 204 samples and 8498 genes

In [None]:
# Preview the first rows of the metadata table
head(mtdf)

# Check the label between metadata and mapping results

Recall that we got two data frames in the previous tutorial:  
- genecounts: gene counts for each **sample**
    - Note: We will need to convert it into gene counts for each **library** later
- mapresults: the mapping results 
    - Note: There are 204 samples

The metadata (`mtdf`) contains the information for each sample. Here we need to make sure that the labels in the metadata match the sample names we have in `mapresults` and `genecounts`.

The code chunk below checks whether the labels derived from the mapping results can be matched to those in the metadata file. Both set differences should be empty.

In [None]:
### Compare the labels derived from the mapping results with the metadata labels
# Pattern for the trailing part of an expid that is stripped to obtain the label:
# "_", one capital letter, a number from 1 to 100, "_L001".."_L004", and the rest
myregex <- "_[A-Z](100|[1-9][0-9]?)_L00[1-4].*"

# Labels present in the mapping results but absent from the metadata
setdiff(str_replace(mapresults$expid, myregex, ""), mtdf$Label)

# Labels present in the metadata but absent from the mapping results
setdiff(mtdf$Label, str_replace(mapresults$expid, myregex, ""))

# Construct gene count matrix for each library

Add the "Label" to the count matrix and mapping results, then merge in phenotype data (by Label)

In [None]:
### Add "Label" to genecounts
# Derive the label of each row by stripping the suffix matched by myregex from expid
annogenecnts <- mutate(genecounts, Label = str_replace(expid, myregex, ""))

In [None]:
### Collapse the gene counts within each label
# Sum every remaining column over the rows that share a Label, then reshape so
# that the result has one row per gene and one column per Label
annogenecnts0 <- annogenecnts %>%
    select(-expid) %>%
    group_by(Label) %>%
    summarize_all(sum) %>%
    gather(key = gene, value = value, -Label) %>%
    spread(key = Label, value = value)

Show the resulting data frame in each step

In [None]:
# Input table: first six rows and first six columns
genecounts[1:6, 1:6]

In [None]:
# After adding Label: first six rows, showing the first six columns plus "Label"
annogenecnts[1:6, c(colnames(annogenecnts)[1:6], "Label")]

In [None]:
# After collapsing by Label and reshaping: first six rows and first six columns
annogenecnts0[1:6, 1:6]

# Metadata

Add "Label" to read map results and merge in phenotype data (-> annomapres)

In [None]:
# Derive Label from expid, then merge in the metadata by Label; the full join
# keeps rows from both tables even when a Label has no match in the other one
annomapres <- full_join(
    mutate(mapresults, Label = str_replace(expid, myregex, "")),
    mtdf,
    by = "Label"
)

In [None]:
# Display the mapping results merged with the metadata
annomapres

In [None]:
# Columns that define a group, and the mapping statistics to average per group
grpvars <- vars(Label, Strain, Media, experiment_person, group)
sumvars <- vars(prop.gene, prop.nofeat, prop.unique, depth)

# Mean of each statistic over all rows that share the same grouping values
annomapres0 <- summarize_at(group_by_at(annomapres, grpvars), sumvars, mean)

In [None]:
# Preview the merged table before averaging
head(annomapres)

In [None]:
# Preview the table of per-group means
head(annomapres0)

# Store the results

In [None]:
# Destination file for the annotated objects, inside OUTDIR
outfile <- file.path(OUTDIR, "HTS-Course-Annotated-STAR-counts.RData")

# Write the metadata, the count tables and the mapping tables to a single file
save(
    list = c("mtdf", "annogenecnts0", "annomapres0", "annogenecnts", "annomapres"),
    file = outfile
)

# Checksum of the file just written, shown for provenance
tools::md5sum(outfile)

# Visualize the mapping results

In [None]:
### Figures for mapping results
# Shared plot components reused by the mapping-result figures below

# Points jittered horizontally only, so the y values are drawn exactly
mygeom <- geom_point(position = position_jitter(width = 0.3, height = 0))

# Manual colour scale (three colours from the "Set1" palette) with no legend title
mypal <- scale_colour_manual(name = "", values = brewer.pal(3, "Set1"))

# theme_bw() is a complete theme and replaces any settings added before it, so
# it must come first; the x-axis label rotation is then applied on top of it
mytheme <- theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# One panel per Strain (rows) and Media (columns); each panel keeps only its own x values
myfacet <- facet_grid(Strain ~ Media, drop = TRUE, scales = "free_x", space = "free")

### Show the fraction of uniquely mapped reads among all reads (prop.unique)

In [None]:
# Size of the figure rendered in the notebook
options(repr.plot.width = 9, repr.plot.height = 6)

# prop.unique for every row of annomapres, plotted against its Label
p1 <- annomapres %>%
    ggplot(aes(x = factor(Label), y = prop.unique, shape = Strain, color = Media)) +
    myfacet +
    mygeom +
    mytheme +
    mypal

print(p1)

### Show the fraction of reads mapped to genes (prop.gene)

In [None]:
# Size of the figure rendered in the notebook
options(repr.plot.width = 9, repr.plot.height = 6)

# prop.gene for every row of annomapres, plotted against its Label
p2 <- annomapres %>%
    ggplot(aes(x = Label, y = prop.gene, shape = Strain, color = Media)) +
    myfacet +
    mygeom +
    mytheme +
    mypal

print(p2)

### Show the fraction of reads categorized as "no feature" (prop.nofeat)

In [None]:
# Size of the figure rendered in the notebook
options(repr.plot.width = 9, repr.plot.height = 6)

# prop.nofeat for every row of annomapres, plotted against its Label
p3 <- annomapres %>%
    ggplot(aes(x = Label, y = prop.nofeat, shape = Strain, color = Media)) +
    myfacet +
    mygeom +
    mytheme +
    mypal

print(p3)

### Show the number of all the reads in each sample 
(Note: depth = ngenemap + namb + nmulti + nnofeat + nunmap)

In [None]:
# Size of the figure rendered in the notebook
options(repr.plot.width = 9, repr.plot.height = 6)

# depth for every row of annomapres, plotted against its Label
p4 <- annomapres %>%
    ggplot(aes(x = Label, y = depth, shape = Strain, color = Media)) +
    myfacet +
    mygeom +
    mytheme +
    mypal

print(p4)

### Store the plots

In [None]:
# Write each figure to IMGDIR as <name>.png at 960 x 960 pixels
plots_to_save <- list(p1 = p1, p2 = p2, p3 = p3, p4 = p4)

for (plot_name in names(plots_to_save)) {
    png(file.path(IMGDIR, paste0(plot_name, ".png")), height = 480 * 2, width = 480 * 2)
    # dev.off() closes only the PNG device opened above, whereas graphics.off()
    # would close every open device; `finally` closes it even if plotting fails
    tryCatch(plot(plots_to_save[[plot_name]]), finally = dev.off())
}