In [139]:
from BCBio import GFF
import operator
from Bio.SeqRecord import SeqRecord
import glob, os
from Bio import SeqIO
import time
import pysam

In [140]:
# Directory searched (non-recursively) for GFF files by get_gff_files().
sourcebase= '/mnt/test_data/hepavac34/rna_benign/spladdrout'
#sourcebase = '/tmp/'
# Reference genome FASTA that load_reffile() reads into memory.
reffile = '/mnt/test_data/refs/GRCh37.primary_assembly.genome.fa'
# File extension (without the dot) used by get_gff_files() to select inputs.
fileending = 'gff3'
# NOTE(review): vcffile is not referenced by any cell visible here - confirm it is still needed.
vcffile= '/tmp/vcf.vcf.gz' #must be indexed!

In [136]:
#Getting all gff-files in base directory
def get_gff_files(sourcebase, ending=None):
    """Return the paths of all files directly inside ``sourcebase`` that have
    the requested extension.

    The process working directory is left untouched: every returned entry is
    ``sourcebase`` joined with the file name, so it can be opened no matter
    what the current directory is.

    Parameters:
        sourcebase: directory to search (sub-directories are not descended).
        ending: file extension without the leading dot. When omitted, the
            module-level ``fileending`` setting is used.

    Returns:
        A list of matching paths, sorted so the processing order is the same
        on every run.
    """
    if ending is None:
        ending = fileending
    print("Checking for GFF Files in directory...")
    pattern = os.path.join(sourcebase, "*.%s" % ending)
    gfffiles = sorted(glob.glob(pattern))
    print("Found %i file(s) in %s ending with %s." % (len(gfffiles), sourcebase, ending))
    return gfffiles

In [128]:
#ref_recs = ''
def load_reffile(reffile):
    """Read a FASTA reference into a dict and report how much was loaded.

    Parameters:
        reffile: path of the FASTA file to read.

    Returns:
        The dictionary built by ``SeqIO.to_dict`` from the parsed records.

    A one-line summary (sequence count, total length, elapsed seconds) is
    printed before returning.
    """
    print("Loading %s into memory" % reffile)
    # time.time() gives wall-clock seconds, which is what the message reports.
    t0 = time.time()
    with open(reffile) as in_handle:
        ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))
    t1 = time.time()
    total_ref_bases = sum(len(ref_recs[key]) for key in ref_recs)
    print("Loaded %i sequences with %i bases in %i seconds" % (len(ref_recs), total_ref_bases, t1 - t0))
    # Return only after the summary so the statistics above actually run.
    return ref_recs

In [129]:
def get_gffrecords(in_handle, ref_recs):
    """Lazily yield each record that ``GFF.parse`` produces for ``in_handle``.

    ``ref_recs`` is passed to the parser as ``base_dict`` and the parser is
    called with ``target_lines=1000``.
    """
    parsed_records = GFF.parse(in_handle, base_dict=ref_recs, target_lines=1000)
    for parsed in parsed_records:
        yield parsed


In [141]:
def get_records(infile,ref_recs):
    """Translate the features of a GFF file into protein records.

    For every feature of every record yielded by ``get_gffrecords``, the
    reference slices covered by the feature's sub-features are concatenated
    in the order the sub-features are listed, reverse-complemented when the
    feature is on strand -1, and translated twice: once in full and once
    only up to the first stop codon.

    Parameters:
        infile: GFF input, forwarded unchanged to ``get_gffrecords``.
        ref_recs: reference records, forwarded unchanged to ``get_gffrecords``.

    Returns:
        A tuple ``(records, records_until_stop)`` of two parallel lists of
        ``SeqRecord`` objects whose id is the feature's first ``ID`` qualifier.

    Features without any sub-features are skipped, because there is nothing
    to splice together for them.
    """
    records = []
    records_until_stop = []
    for rec in get_gffrecords(infile, ref_recs):
        for feature in rec.features:
            # NOTE(review): assumes sub_features are listed in ascending
            # genomic order - confirm against the GFF producer.
            seq_exons = [
                rec.seq[cds.location.nofuzzy_start:cds.location.nofuzzy_end]
                for cds in feature.sub_features
            ]
            if not seq_exons:
                # Folding an empty list has no defined result; skip the feature
                # rather than abort the whole file.
                continue
            gene_seq = seq_exons[0]
            for exon_seq in seq_exons[1:]:
                gene_seq = gene_seq + exon_seq
            if feature.strand == -1:
                gene_seq = gene_seq.reverse_complement()
            protein_seq = gene_seq.translate()
            protein_stopseq = gene_seq.translate(to_stop=True)
            feature_id = feature.qualifiers["ID"][0]
            records.append(SeqRecord(protein_seq, feature_id, "", ""))
            records_until_stop.append(SeqRecord(protein_stopseq, feature_id, "", ""))
    return records, records_until_stop

In [131]:
def write_records(records,records_until_stop,basename,outdir='/tmp'):
    """Write both record lists as FASTA files named after ``basename``.

    Parameters:
        records: records written to ``predicted_proteins-<basename>.fa``.
        records_until_stop: records written to
            ``predicted_proteins_until_stop-<basename>.fa``.
        basename: label inserted into both output file names.
        outdir: directory that receives the two files; defaults to ``/tmp``.

    Returns:
        None. The number of sequences written to the first file is printed.
    """
    print("Writing records to files...")
    outfile = os.path.join(outdir, 'predicted_proteins-%s.fa' % basename)
    outfile2 = os.path.join(outdir, 'predicted_proteins_until_stop-%s.fa' % basename)
    with open(outfile, "w") as out_handle:
        print("Wrote %i sequences" % SeqIO.write(records, out_handle, "fasta"))
    with open(outfile2, "w") as out_handle:
        SeqIO.write(records_until_stop, out_handle, "fasta")


In [132]:
def run():
    """Drive the pipeline: list the GFF inputs under ``sourcebase``, load the
    reference from ``reffile``, then translate each input and write its
    records under a label taken from the input's base name.
    """
    gff_inputs = get_gff_files(sourcebase)
    reference = load_reffile(reffile)
    for gff_input in gff_inputs:
        translated = get_records(gff_input, reference)
        proteins, proteins_to_stop = translated
        label = os.path.basename(gff_input)
        write_records(proteins, proteins_to_stop, label)
        

In [138]:
# Execute the pipeline defined by run() using the module-level sourcebase and reffile settings.
run()

Checking for GFF Files in directory...
Found 5 file(s) in /mnt/test_data/hepavac34/rna_benign/spladdrout ending with gff3.
Loading /mnt/test_data/refs/GRCh37.primary_assembly.genome.fa into memory




Writing records to files...
Wrote 5183 sequences
Writing records to files...
Wrote 4847 sequences
Writing records to files...
Wrote 7030 sequences
Writing records to files...
Wrote 9779 sequences
Writing records to files...
Wrote 1777 sequences


In [121]:
# NOTE(review): scratch cell - no preceding cell assigns `infile` at module level
# (inside run() it is only a loop-local name), so this relies on leftover kernel state.
os.path.basename(infile)

'ensg228794.gff'

In [161]:
# Scratch check: print every record the parser returns for a single GFF file.
# NOTE(review): `ref_recs` is not assigned at module level by any cell visible
# here (it is local to run() and load_reffile()); this relies on leftover
# kernel state - confirm before re-running on a fresh kernel.
infile='/tmp/ensg228794.gff'
with open(infile) as in_handle:  # make sure the handle is closed after parsing
    for rec in GFF.parse(in_handle, base_dict=ref_recs):
        print(rec)


ID: GL000191.1
Name: GL000191.1
Description: GL000191.1 GL000191.1
Number of features: 0
Seq('GATCCACCTGCCTCAGCCTCCCAGAGTGCTGGGATTATAGGTGTGAGCCACTGC...ATC', SingleLetterAlphabet())
ID: GL000192.1
Name: GL000192.1
Description: GL000192.1 GL000192.1
Number of features: 0
Seq('GAATTCATTCACCATTATTCTTTTATAATATTGCTATTTTATTATTCTTGATCA...TTC', SingleLetterAlphabet())
ID: GL000193.1
Name: GL000193.1
Description: GL000193.1 GL000193.1
Number of features: 0
Seq('GAATTCATTTCTTTGATTTGTAGAGTGGCAGGGTACCTAAAGTTAACATTTGTC...TTC', SingleLetterAlphabet())
ID: GL000194.1
Name: GL000194.1
Description: GL000194.1 GL000194.1
Number of features: 0
Seq('GATCCCTGCCCTAAAACTTTCCCCCCTCATGTCCAGCAAATGCTGCATGGAGCC...ATC', SingleLetterAlphabet())
ID: GL000195.1
Name: GL000195.1
Description: GL000195.1 GL000195.1
Number of features: 0
Seq('GAATTCCTCGTTCACACAGTTTCTTAAGCTTCCTGGGATGCGACCTGTGATGGC...TTC', SingleLetterAlphabet())
ID: GL000196.1
Name: GL000196.1
Description: GL000196.1 GL000196.1
Number of features: 0
Seq('G

In [160]:
# NOTE(review): scratch cell - `a` is only assigned in a cell that appears further down
# (a=gfrecs.next()), so this fails when the notebook is run top to bottom.
len(a)

794826

In [144]:
# NOTE(review): scratch cell - `gfrecs` is not defined in any cell visible here; verify where it
# comes from. `.next()` is the Python 2 iterator method.
a=gfrecs.next()

{'GL000191.1': SeqRecord(seq=Seq('GATCCACCTGCCTCAGCCTCCCAGAGTGCTGGGATTATAGGTGTGAGCCACTGC...ATC', SingleLetterAlphabet()), id='GL000191.1', name='GL000191.1', description='GL000191.1 GL000191.1', dbxrefs=[]),
 'GL000192.1': SeqRecord(seq=Seq('GAATTCATTCACCATTATTCTTTTATAATATTGCTATTTTATTATTCTTGATCA...TTC', SingleLetterAlphabet()), id='GL000192.1', name='GL000192.1', description='GL000192.1 GL000192.1', dbxrefs=[]),
 'GL000193.1': SeqRecord(seq=Seq('GAATTCATTTCTTTGATTTGTAGAGTGGCAGGGTACCTAAAGTTAACATTTGTC...TTC', SingleLetterAlphabet()), id='GL000193.1', name='GL000193.1', description='GL000193.1 GL000193.1', dbxrefs=[]),
 'GL000194.1': SeqRecord(seq=Seq('GATCCCTGCCCTAAAACTTTCCCCCCTCATGTCCAGCAAATGCTGCATGGAGCC...ATC', SingleLetterAlphabet()), id='GL000194.1', name='GL000194.1', description='GL000194.1 GL000194.1', dbxrefs=[]),
 'GL000195.1': SeqRecord(seq=Seq('GAATTCCTCGTTCACACAGTTTCTTAAGCTTCCTGGGATGCGACCTGTGATGGC...TTC', SingleLetterAlphabet()), id='GL000195.1', name='GL000195.1', descriptio