In [34]:
from __future__ import division
import os
import sys

from collections import defaultdict

import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline

import gffutils as gfu

## 1) Get some general info on the yeast transcriptome. 

#### 1b) Create a database using gffutils (you can install gffutils via pip). Remember to only make a database once!

In [2]:
%%time
db = gfu.create_db('saccharomyces_cerevisiae.gff', dbfn='sacCer.db')

CPU times: user 1.93 s, sys: 8.69 ms, total: 1.94 s
Wall time: 2.67 s


#### 1c) Use gffutils to figure out all the types of feature that are in the gff

In [4]:
## create_db will return FeatureDB object, but incase this is not the first time, load db
db = gfu.FeatureDB(dbfn='sacCer.db')

In [6]:
featureTypes = [x for x in db.featuretypes()]
print ', '.join(featureTypes)

ARS, ARS_consensus_sequence, CDS, LTR_retrotransposon, W_region, X_element, X_element_combinatorial_repeat, X_region, Y_prime_element, Y_region, Z1_region, Z2_region, blocked_reading_frame, centromere, centromere_DNA_Element_I, centromere_DNA_Element_II, centromere_DNA_Element_III, chromosome, external_transcribed_spacer_region, five_prime_UTR_intron, gene, intein_encoding_region, internal_transcribed_spacer_region, intron, long_terminal_repeat, mRNA, mating_type_region, matrix_attachment_site, ncRNA_gene, non_transcribed_region, noncoding_exon, origin_of_replication, plus_1_translational_frameshift, pseudogene, rRNA_gene, region, silent_mating_type_cassette_array, snRNA_gene, snoRNA_gene, tRNA_gene, telomerase_RNA_gene, telomere, telomeric_repeat, transposable_element_gene


#### 1d) Use gffutils to figure out which which genes have introns. What fraction of genes have introns? Print a list of gene names for genes that have introns. Protip: note that there is an intron type, and that introns have mRNAs as parents.

In [33]:
allGenes = []
genesWithIntrons = []
for parentChildren in db.iter_by_parent_childs(featuretype='mRNA'):
    geneName = parentChildren[0].attributes['Name'][0].split('_')[0]
    allGenes.append(geneName)
    for child in parentChildren[1:]:
        if child.featuretype == 'intron':
            genesWithIntrons.append(geneName)

print '{} / {} genes have introns.'.format(len(genesWithIntrons), len(allGenes))
print
print ', '.join(genesWithIntrons)

314 / 6600 genes have introns.

YAL030W, YAL003W, YAL001C, YBL111C, YBL091C-A, YBL087C, YBL059C-A, YBL059W, YBL050W, YBL040C, YBL027W, YBL026W, YBL018C, YBR048W, YBR062C, YBR078W, YBR082C, YBR084C-A, YBR090C, YBR111W-A, YBR111W-A, YBR119W, YBR181C, YBR186W, YBR189W, YBR191W, YBR215W, YBR219C, YBR230C, YBR255C-A, YCL012C, YCL005W-A, YCL005W-A, YCL002C, YCR028C-A, YCR031C, YCR097W, YCR097W, YDL219W, YDL191W, YDL136W, YDL130W, YDL125C, YDL115C, YDL108W, YDL083C, YDL082W, YDL079C, YDL075W, YDL064W, YDL029W, YDL012C, YDR005C, YDR025W, YDR059C, YDR064W, YDR092W, YDR129C, YDR139C, YDR305C, YDR318W, YDR367W, YDR381W, YDR381C-A, YDR397C, YDR424C, YDR424C, YDR447C, YDR450W, YDR471W, YDR500C, YDR535C, YEL076C-A, YEL012W, YEL003W, YER003C, YER007C-A, YER014C-A, YER044C-A, YER056C-A, YER074W, YER074W-A, YER074W-A, YER093C-A, YER117W, YER133W, YER179W, YFL039C, YFL034C-B, YFL034C-A, YFL031W, YFR024C-A, YFR031C-A, YFR045W, YGL251C, YGL232W, YGL226C-A, YGL183C, YGL178W, YGL137W, YGL103W, YGL087C, YGL0

#### 1e) Compute the length of every gene in the yeast genome, and output a file where the first column is gene name, and the second column is length. Warning: you have to be careful for the genes with introns! You only want the sum of the CDS lengths, and you donâ€™t want to count the intron lengths.

In [42]:
geneLengths = defaultdict(int)
for parentChildren in db.iter_by_parent_childs(featuretype='mRNA'):
    geneName = parentChildren[0].attributes['Name'][0].split('_')[0]
    for child in parentChildren[1:]:
        if child.featuretype == 'CDS':
            geneLengths[geneName] += (child.end - child.start)+1

with open('sacCer_geneLength.txt', 'w') as fh:
    for (g,l) in geneLengths.items():
        fh.write('{}\t{}\n'.format(g,l))

## 2) Map a yeast RNAseq experiment.

## 3) Quantify expression using gffutils and pysam

## 4) Find differential expression expression between two yeast strains.

#### 4c) Using gffutils and pysam, test for differential expression of every gene in the genome