Looking at the poly-A tails of human RNA

[FASTA file used](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/mrna.fa.gz)

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pysam import FastaFile as fasta

def fasta_adenylation(path):
    """Measure the trailing poly-A run of every sequence in a FASTA file.

    Args:
        path: Path to the FASTA file; passed straight to pysam's FastaFile.

    Returns:
        A DataFrame indexed by reference name, one row per sequence in the
        file, with two columns:
          - "length": total length of the sequence
          - "adenylation": number of consecutive 'a' characters at the very
            end of the sequence (0 if it does not end in 'a')

    Note:
        Only lowercase 'a' is counted, so a sequence stored in uppercase
        reports an adenylation of 0.
    """
    ff = fasta(path)
    try:
        data = {}
        for ref in ff.references:
            rna = ff.fetch(ref)
            # rstrip stops at the start of the string, so a sequence made up
            # entirely of 'a' (or an empty one) yields its full length instead
            # of indexing past the beginning and raising IndexError.
            adenylation = len(rna) - len(rna.rstrip("a"))
            data[ref] = {"length": len(rna), "adenylation": adenylation}
    finally:
        # Release the file handle even if a fetch fails part-way through.
        ff.close()
    return pd.DataFrame.from_dict(data, orient="index")


In [39]:
# Build the per-sequence length/adenylation table from data/mrna.fa
# and preview its first 10 rows.
df = fasta_adenylation("data/mrna.fa")
df.head(10)

Unnamed: 0,length,adenylation
A00118,135,2
A00119,135,0
A00127,2368,1
A00129,252,9
A00149,567,0
A00209,641,0
A00469,814,16
A00501,556,0
A01046,1367,0
A02076,977,0


In [40]:
# Summary statistics for the length and adenylation columns across all sequences.
df.describe()

Unnamed: 0,length,adenylation
count,2683747.0,2683747.0
mean,205.4898,1.215645
std,699.1804,5.291671
min,2.0,0.0
25%,19.0,0.0
50%,19.0,0.0
75%,32.0,1.0
max,205012.0,216.0


In [45]:
%matplotlib notebook

masks = [(df.adenylation <= 3),
         (df.adenylation > 3) & (df.adenylation <= 50),
         (df.adenylation > 50) & (df.adenylation <= 100),
         (df.adenylation > 100) & (df.adenylation <= 150),
         (df.adenylation > 150)]

plt.figure(figsize=(12,9))
for idx, mask in enumerate(masks):
    plt.subplot(len(masks), 1, idx+1)
    df[mask]["adenylation"].plot.hist(bins=50, alpha=0.5, color='k')
plt.show()

<IPython.core.display.Javascript object>

Most of the sequences (about 59%: 1,592,318 of 2,683,747) do not end in a single `a`, i.e. they have no poly-A tail at all.

In [42]:
# Summary statistics restricted to sequences with no trailing 'a' at all.
df.loc[df["adenylation"] == 0].describe()

Unnamed: 0,length,adenylation
count,1592318.0,1592318.0
mean,194.4552,0.0
std,654.8029,0.0
min,2.0,0.0
25%,19.0,0.0
50%,19.0,0.0
75%,32.0,0.0
max,74474.0,0.0


In [43]:
%matplotlib notebook
# Remove a few outliers and put them into a table instead
df[(df.length < 20000)].plot.scatter(x='length', y='adenylation', figsize=(12,9))
plt.show()

<IPython.core.display.Javascript object>

In [44]:
# The outliers left out of the scatter plot: sequences of length 20000 or more.
df.loc[df["length"] >= 20000]

Unnamed: 0,length,adenylation
AB537889,21055,7
AF357236,20478,1
AF361486,21112,37
AF414442,66765,15
AF435011,21794,18
AF495910,27652,0
AF495911,21779,18
AF535142,27435,0
AJ002535,20435,0
CS329402,49020,0
