# Analyse the quality score of reads within `.fastq` files
Written by Jason A. Hendry

In [None]:
import os
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# inline stylization
%matplotlib inline
sns.set_style('white')
sns.set_style('ticks')
#sns.set_style("ticks")
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 100

## Settings

In [None]:
# Notebook-wide settings: a save-figures flag and the figure output directory
savefigs, output_dir = False, "../figs"

# Create the output directory on first use and report that it was created
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print("Making output directory: %s" % (output_dir,))

## How to load a `.fastq` file

In [None]:
# Directory and file name of the example .fastq, joined into a single path
input_dir, fq_fn = "data", "example.fastq"
fq_path = os.path.join(input_dir, fq_fn)
print(".fastq path: %s" % (fq_path,))

In [None]:
# Open the file for reading in text mode; the handle stays open until it is
# closed explicitly
fq = open(fq_path, "r")

In [None]:
# Read a single line and advance the file position past it
fq.readline()

- We can see that we have loaded a `.fastq` file, and we are able to read it line by line
- We can also save **all** the lines into a list

In [None]:
# readlines() collects every line from the current file position to the end
# of the file (lines already consumed by earlier reads are not included)
lines = fq.readlines()
lines[:5] # showing the first five collected lines

- Once we have read all the lines, the variable `fq` is 'emptied'; we would have to `open()` the file again to populate `fq`

In [None]:
# The handle is already at the end of the file, so this returns an empty string
fq.readline()

In [None]:
# Re-opening the file gives a fresh handle positioned at the start
fq = open(fq_path, "r")
fq.readline()  # back to the first line

- The `open()` function also allows us to loop over all of the lines in the file

In [None]:
# Rewind first: the readline() in the previous cell already consumed line 1,
# so iterating from the current position would under-count by one
fq.seek(0)
i = 0  # running line count
for line in fq:  # iterate over the lines
    i += 1

In [None]:
# Report the counter accumulated by the loop above
print("Number of lines: %d" % i)

- What if we wanted the number of *reads* rather than the number of lines?
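- Recall the `.fastq` formatting: each new read begins with the @ symbol (caution: a quality line can occasionally start with @ as well, so counting these lines can slightly over-count)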

In [None]:
fq = open(fq_path, "r")  # re-open the file
i = 0  # running count of lines that start with "@"
for l in fq:
    # NOTE(review): a quality line can also begin with "@" (it is a valid
    # quality character under the offset of 33 used later in this notebook),
    # so this check may over-count reads -- confirm against lines / 4
    if l[0] == "@":
        i += 1

In [None]:
# Report the number of "@"-prefixed lines counted above
print("Number of reads: %d" % i)

- There is one issue with the approach that we are taking
- What happens if we have a bug while we are looping over the file?
- The loop will stop, and `fq` will contain whatever lines are left (who knows what those are?)
- We can have better control using the `with` statement

In [None]:
# Like fq = open(fq_path, "r"), except that the file is closed automatically
# when the block exits, including when an exception is raised inside it
with open(fq_path, "r") as fq:
    i = 0  # running count of lines that start with "@"
    for l in fq:
        # NOTE(review): quality lines may also start with "@", which would
        # inflate this count -- confirm against lines / 4
        if l[0] == "@":
            i += 1

In [None]:
# Report the count from the `with` block above
print("Number of reads: %d" % i)

- Using `with`, the file is closed automatically as soon as the block ends, even if the code inside it breaks (the name `fq` still exists afterwards, but it refers to a closed file)
- Now we understand the basics of loading `.fastq` files and iterating over their lines in python

## Extracting quality scores from `.fastq` files
- We want to loop over the `.fastq` file as before, but this time we want to extract quality information
- How?
    - Recall the `.fastq` format:
        - identifier, beginning with "@"
        - nucleotide sequence
        - "+"
        - quality scores
    - So the quality score occurs three lines after the "@"
- Let's make a small loop to get the idea

In [None]:
# Fresh handle positioned at the first line; it is not closed automatically
fastq = open(fq_path, "r")

In [None]:
i = 0  # number of lines read so far
j = None  # lines since the last header; None until a header has been seen
for line in fastq:

    # A leading `@` starts a new read -- except on the quality line (j == 3),
    # because `@` is itself a valid quality character
    if line[0] == "@" and j != 3:
        j = 0  # reset the index to 0
    if j == 3:  # if the index is 3...
        print(line)  # print the line
    if j is not None:
        j += 1  # add one to the index

    # Terminate loop after 16 lines (four complete reads)
    i += 1
    if i >= 16:
        break

- The quality scores!
- Now, we want to save the scores, not print them...

In [None]:
i = 0  # number of lines read so far
j = None  # lines since the last header; None until a header has been seen
ascii_scores = []  # initiate an empty list where we will put the scores

# Re-open the file so this cell starts from the first read and does not rely
# on the file position or the value of `j` left behind by an earlier cell
with open(fq_path, "r") as fastq:
    for line in fastq:

        # A leading `@` starts a new read -- except on the quality line
        # (j == 3), because `@` is itself a valid quality character
        if line[0] == "@" and j != 3:
            j = 0  # reset the index to 0
        if j == 3:  # if the index is 3...
            ascii_scores.append(line.rstrip())  # append to the list of scores
        if j is not None:
            j += 1  # add one to the index

        # Terminate loop after 16 lines (four complete reads)
        i += 1
        if i >= 16:
            break

In [None]:
# Display the collected quality strings (one string per read)
ascii_scores

- The scores are stored in the variable `ascii_scores`
- But how would we convert these to error probabilities?


- We have to reverse the encoding...
- Recall..
    
$$ Q = -10 \log_{10}(p) $$

- Where $ Q $ is the quality score, and $ p $ is the probability of error.
- The $ Q $ scores (represented as integers) are then converted to ASCII characters by adding an offset of 33


$$ Q \rightarrow A $$

- We have to reverse this process
- We can do this using the built-in `ord()` function in python

In [None]:
ord("!")  # 33 -- with the offset of 33 used below, "!" encodes Q = 0

In [None]:
ord("5")  # 53, the code point of the character "5"

In [None]:
ord("5") - 33  # 20: subtracting the offset recovers the Q score

In [None]:
10**((ord("5") - 33)/-10)  # invert Q = -10 * log10(p) to get the error probability p

In [None]:
ascii_scores[0]  # quality string of the first collected read, still ASCII-encoded

In [None]:
# Decode the first read: take each character's code point, then remove the
# offset of 33 from the whole array at once
q_scores = np.array(list(map(ord, ascii_scores[0]))) - 33
q_scores  # here it is as Q scores!

- Finally, we can convert to probabilities...

In [None]:
# Invert Q = -10 * log10(p) element-wise: p = 10 ** (-Q / 10)
exponents = q_scores / -10
p_error = 10 ** exponents
p_error # error probabilities!

- Now we know everything we have to do
- Let's fold it into a nice function

## Writing a python function to extract q scores

In [None]:
def extract_error_probs(fq_path):
    """
    Extract the error probabilities from
    a .fastq file

    Each read is expected to span four lines: an identifier
    starting with "@", the sequence, a "+" separator and the
    quality string. Quality characters are decoded with an
    offset of 33 and converted with p = 10 ** (-Q / 10).

    params
        fq_path : str
            Path to the fastq file.

    returns
        ps : list of arrays
            List of arrays containing error
            probabilities for each read

    """

    ps = []
    with open(fq_path, "r") as fastq:
        # Lines seen since the last identifier line; None until the first
        # identifier, so any leading non-record lines are skipped instead
        # of raising a NameError
        j = None
        for line in fastq:
            # "@" is also a valid quality character (Q = 31), so a leading
            # "@" only marks a new read when the quality line is not the
            # one we are expecting; otherwise that read would be dropped
            if line[0] == "@" and j != 3:
                j = 0
            elif j is None:
                continue
            if j == 3:
                ascii_score = line.rstrip()
                q_score = np.array([ord(c) for c in ascii_score]) - 33
                p_error = 10 ** (q_score / -10.0)
                ps.append(p_error)
            j += 1

    return ps

In [None]:
# One array of per-base error probabilities for each read in the file
ps = extract_error_probs(fq_path)

In [None]:
len(ps)  # number of reads extracted

## Plot the results

### A single read

In [None]:
fig, ax = plt.subplots(1, 1)

# Error probability at each base of the first read; the x-axis is the
# index within the read
ax.plot(ps[0], color='darkgrey', alpha=1)
ax.set_xlabel("Read position (bp)")
ax.set_ylabel("Error Probability")
ax.set_title("First Read Only", loc="left")

### 100 reads

In [None]:
fig, ax = plt.subplots(1, 1)

# Number of reads to overlay; used for both the slice and the title so the
# two cannot drift apart
n_view = 100
for p in ps[:n_view]:
    # Very low alpha so that only regions where many reads overlap show up
    ax.plot(p, color='darkgrey', alpha=0.01)
ax.set_xlabel("Read position (bp)")
ax.set_ylabel("Error Probability")
ax.set_title("First %d Reads" % n_view, loc="left")

- We want a trend line through this mess
- We don't want to make any assumptions
    - Use a non-parametric smoothing method
    - That's LOWESS
- For LOWESS we need to give x-values to each read (position)

### Smooth with LOWESS

In [None]:
from statsmodels.nonparametric.smoothers_lowess import lowess

In [None]:
# For every read, the 0-based position of each of its bases
positions = [np.arange(n_bases) for n_bases in map(len, ps)]

In [None]:
# Flatten the per-read arrays into two aligned 1-D arrays:
# error probability and within-read position of every base
concat_ps, concat_positions = np.concatenate(ps), np.concatenate(positions)

In [None]:
len(concat_ps)  # total number of bases across all reads (about 35 million in the original run)

In [None]:
# Smooth only the first million bases of the concatenated arrays
n_smooth = 1000000
ps_smooth = lowess(
    concat_ps[:n_smooth],         # endog: error probabilities
    concat_positions[:n_smooth],  # exog: positions within the read
    frac=0.1,
)

In [None]:
fig, ax = plt.subplots(1, 1)

# Column 0 of the LOWESS output is used as x (position) and column 1 as y
# (smoothed error probability)
ax.plot(ps_smooth[:, 0], ps_smooth[:, 1], color='firebrick')
# Label the axes like the other plots in this notebook
ax.set_xlabel("Read position (bp)")
ax.set_ylabel("Error Probability")

- Let's look at the average error at each position along the read

### Per-position mean

In [None]:
# Long-format table: one row per base with its error probability and position
df = pd.DataFrame(dict(error=concat_ps, position=concat_positions))

In [None]:
# Group once and reuse the grouping for every statistic
by_position = df.groupby("position")

# Mean error probability at each read position (DataFrame with an "error" column)
per_position_means = by_position.mean()
# Number of bases observed at each position
per_position_ns = np.array(by_position.size())
# Standard error of the mean at each position. The "error" column is selected
# so that both operands are 1-D: dividing the (n, 1) array of a one-column
# DataFrame by an (n,) array would broadcast to an (n, n) matrix.
# Positions observed only once have an undefined (NaN) standard deviation.
se = np.array(by_position["error"].std()) / np.sqrt(per_position_ns)

In [None]:
fig, ax = plt.subplots(1, 1)

# per_position_means is indexed by position, so position is on the x-axis
# and the mean error probability on the y-axis
ax.plot(per_position_means, color='steelblue', alpha=1.0)
ax.set_xlabel("Read position (bp)")
ax.set_ylabel("Error Probability")

## Questions:

1. Compute the mean error probability per read. Plot a histogram of these values across all reads. Indicate the mean error probability across *all* reads with a vertical bar and in the title of the plot.
2. Compute the length of every read. Plot the distribution of lengths as a histogram. Indicate the mean read length with a vertical bar and in the title of the plot, as above.
3. Is the mean error probability of a read a function of the read length? Produce at least one statistic and one visualisation to support your reasoning.
4. **Bonus:** Compute the GC-content of each read and determine whether the error probability is a function of the GC content. Produce a visualisation and statistic to support your argument, as above.