In [1]:
from __future__ import division
import os
import sys

import roman

import numpy as np
import pandas as pd

from Bio import SeqIO

In [2]:
# Load the S. cerevisiae gene annotations from a headerless, tab-separated BED
# file.  Only file columns 0-3 and 5 are read (index 4 is not loaded), and the
# rows are ordered by gene name.
scerGenes = pd.read_csv(
    'data/scer/S_cerevisiae_genes.bed',
    sep='\t',
    header=None,
    usecols=[0,1,2,3,5],
    names=['chrom','chrom_start','chrom_end','gene','strand'],
).sort_values(by='gene')


def _scer_chrom_to_ref_id(chrom):
    """Convert a BED chromosome label such as 'chrIV' to 'Scer_4'.

    The 'chr' prefix is removed explicitly.  ``str.strip('chr')`` would instead
    strip any run of the characters c/h/r from *both* ends of the label, which
    only gives the right answer by accident.

    Raises whatever ``roman.fromRoman`` raises when the remainder is not a
    valid Roman numeral.
    """
    numeral = chrom[len('chr'):] if chrom.startswith('chr') else chrom
    return 'Scer_{}'.format(roman.fromRoman(numeral))


# Rename chromosomes so that they can be used as keys into the reference
# genome dictionary (scerRef[row['chrom']] in the transcriptome cell).
scerGenes['chrom'] = scerGenes['chrom'].map(_scer_chrom_to_ref_id)

In [3]:
# Preview the first five rows to check the renamed chromosome labels.
scerGenes.head(n=5)

Unnamed: 0,chrom,chrom_start,chrom_end,gene,strand
496,Scer_1,147593,151166,YAL001C,-
2715,Scer_1,143706,147531,YAL002W,+
2926,Scer_1,142173,143160,YAL003W,+
5244,Scer_1,139502,141431,YAL005C,-
4584,Scer_1,137697,138345,YAL007C,-


In [4]:
# Load the S. paradoxus gene annotations from a headerless, tab-separated BED
# file.  Only file columns 0-3 and 5 are read (index 4 is not loaded), and the
# rows are ordered by gene name.
sparGenes = pd.read_csv(
    'data/spar/S_paradoxus_genes.bed',
    sep='\t',
    header=None,
    usecols=[0,1,2,3,5],
    names=['chrom','chrom_start','chrom_end','gene','strand'],
).sort_values(by='gene')

# Keep only the part of each gene name before the first '.'.  This is a
# single-column transform, so map over that column instead of building a
# Series for every row with DataFrame.apply(axis=1).
sparGenes['gene'] = sparGenes['gene'].map(lambda geneName: geneName.split('.')[0])

In [5]:
# Preview the first five rows to check the trimmed gene names.
sparGenes.head(n=5)

Unnamed: 0,chrom,chrom_start,chrom_end,gene,strand
496,Spar_1,122842,126415,YAL001C,-
2715,Spar_1,118955,122774,YAL002W,+
2926,Spar_1,117437,118422,YAL003W,+
5244,Spar_1,114759,116688,YAL005C,-
4584,Spar_1,112857,113505,YAL007C,-


In [6]:
# Upper-case complement of each supported nucleotide code.  Besides the four
# bases and N, the IUPAC ambiguity codes are included so that a reference
# containing them can still be reverse-complemented.
COMPLEMENTS = {
    'A':'T',
    'T':'A',
    'C':'G',
    'G':'C',
    'N':'N',
    # two-base ambiguity codes
    'R':'Y',
    'Y':'R',
    'S':'S',
    'W':'W',
    'K':'M',
    'M':'K',
    # three-base ambiguity codes
    'B':'V',
    'V':'B',
    'D':'H',
    'H':'D',
}

def get_reverse_complement(seq):
    """Return the reverse complement of ``seq`` as an upper-case ``str``.

    Parameters
    ----------
    seq : str or sequence-like
        Nucleotide sequence.  It must support ``.upper()``, slicing and
        iteration over single characters (a plain string or the sliced
        ``.seq`` of a reference record, as passed by the transcriptome cells).
        Lower-case input is accepted and upper-cased first.

    Returns
    -------
    str
        The complemented sequence in reverse order; ``''`` for empty input.

    Raises
    ------
    ValueError
        If ``seq`` contains a character with no entry in ``COMPLEMENTS``.
        Previously ``COMPLEMENTS.get`` returned ``None`` for such a character
        and ``''.join`` failed with an uninformative ``TypeError``.
    """
    complemented = []
    for base in seq.upper()[::-1]:
        try:
            complemented.append(COMPLEMENTS[base])
        except KeyError:
            raise ValueError(
                'Cannot complement unsupported nucleotide code {!r}'.format(base)
            )
    return ''.join(complemented)

In [7]:
# Parse each reference genome FASTA into a dictionary of records; the
# transcriptome cells look records up by the 'chrom' column of the gene tables.
scerRef, sparRef = [
    SeqIO.to_dict(SeqIO.parse(fastaPath, 'fasta'))
    for fastaPath in ('data/scer/S_cerevisiae.fa', 'data/spar/S_paradoxus.fa')
]

In [13]:
# Write one FASTA record per S. cerevisiae gene: the header is the gene name
# and the body is the reference sequence over the annotated interval,
# reverse-complemented for genes that are not on the '+' strand.
with open('data/scer/scer_transcriptome.fa', 'w') as fastaOut:
    geneFields = zip(
        scerGenes['chrom'],
        scerGenes['chrom_start'],
        scerGenes['chrom_end'],
        scerGenes['gene'],
        scerGenes['strand'],
    )
    for chrom, start, end, gene, strand in geneFields:
        # chrom_start/chrom_end are used directly as a Python slice
        # (start inclusive, end exclusive).
        region = scerRef[chrom].seq[start:end]
        if strand == '+':
            transcript = region
        else:
            transcript = get_reverse_complement(region)
        fastaOut.write('>{}\n'.format(gene))
        fastaOut.write('{}\n'.format(transcript))

In [14]:
# Write one FASTA record per S. paradoxus gene: the header is the gene name
# and the body is the reference sequence over the annotated interval,
# reverse-complemented for genes that are not on the '+' strand.
with open('data/spar/spar_transcriptome.fa', 'w') as fastaOut:
    geneFields = zip(
        sparGenes['chrom'],
        sparGenes['chrom_start'],
        sparGenes['chrom_end'],
        sparGenes['gene'],
        sparGenes['strand'],
    )
    for chrom, start, end, gene, strand in geneFields:
        # chrom_start/chrom_end are used directly as a Python slice
        # (start inclusive, end exclusive).
        region = sparRef[chrom].seq[start:end]
        if strand == '+':
            transcript = region
        else:
            transcript = get_reverse_complement(region)
        fastaOut.write('>{}\n'.format(gene))
        fastaOut.write('{}\n'.format(transcript))