#### calc attributes
- Look at physical attributes of the API genes
- transcript length, intron length, UTR lengths
- GC content
- codon optimality

In [None]:
import sys
import pandas as pd
import seaborn as sns
import os
import gffutils
import numpy as np
from collections import defaultdict
import scipy.stats as stats
from Bio import SeqIO
from Bio.Seq import Seq
import pickle

sys.path.append('../scripts')
from plot_helpers import *

# Open the gffutils annotation database used for all feature lookups below.
# NOTE(review): `gffutils_db` is not defined in this notebook; presumably it is
# a path exported by the star-import from plot_helpers -- TODO confirm.
db = gffutils.FeatureDB(gffutils_db)

# All outputs of this notebook are written under this directory.
outdir = '../Figures/gene_attributes/'
os.makedirs(outdir, exist_ok=True)

%load_ext autoreload
%autoreload 2

In [None]:
# Load the filtered synthesis-rate table; its index supplies the background
# gene set used for every attribute calculation in this notebook.
bg_file1 = '../Figures/summary_files/inspect/synth_rates_filt.csv'
synth_df = pd.read_csv(bg_file1, index_col=0)
bg_genes = synth_df.index

#### Part I: Examine UTR, transcript, and gene lengths

In [None]:
# Get UTR lengths of ASR genes.
# Background set should be all genes that passed the filter (bg_genes).
#
# Each dict maps gene ID -> list of (length_bp, transcript_id) tuples, one per
# transcript of that gene, so that max() can later select the longest isoform
# together with its transcript ID.
tutr_d = defaultdict(list)  # 3' UTR bp per transcript
futr_d = defaultdict(list)  # 5' UTR bp per transcript
txt_d = defaultdict(list)   # total exon bp per transcript
p = 0  # number of genes processed so far
for i in bg_genes:
    txts = db.children(i, featuretype='transcript')
    for t in txts:
        tlen = db.children_bp(t, child_featuretype='three_prime_utr')
        flen = db.children_bp(t, child_featuretype='five_prime_utr')
        txtlen = db.children_bp(t, child_featuretype='exon')
        tutr_d[i].append((tlen, t.id))
        futr_d[i].append((flen, t.id))
        # Store the exon-based transcript length computed above; the previous
        # code referenced an undefined name here and raised NameError.
        txt_d[i].append((txtlen, t.id))
    p += 1
    # Debugging aid: uncomment to stop after the first gene.
    #if p>0:
    #    break

In [None]:
# Select the longest isoform per gene for each feature type. Entries are
# (length_bp, transcript_id) tuples, so max() compares length first and
# breaks ties on the transcript ID string.
max_tutr = {k: max(v) for (k, v) in tutr_d.items()}
max_futr = {k: max(v) for (k, v) in futr_d.items()}
max_txt = {k: max(v) for (k, v) in txt_d.items()}

# Split the winning tuples into separate length and transcript-ID lookups,
# reusing the maxima computed above instead of calling max() again.
max_tutr_len = {k: v[0] for (k, v) in max_tutr.items()}
max_futr_len = {k: v[0] for (k, v) in max_futr.items()}
max_txt_len = {k: v[0] for (k, v) in max_txt.items()}
max_tutr_id = {k: v[1] for (k, v) in max_tutr.items()}
max_futr_id = {k: v[1] for (k, v) in max_futr.items()}
max_txt_id = {k: v[1] for (k, v) in max_txt.items()}

# One row per gene; columns hold the longest 3' UTR, 5' UTR and transcript.
df = pd.DataFrame.from_dict(max_tutr_len, orient='index', columns=['tutr'])
df['futr'] = df.index.map(max_futr_len)
# Map the transcript-length lookup built above; the previous code referenced
# an undefined name here and raised NameError.
df['txt'] = df.index.map(max_txt_len)
# Genomic span of the gene feature (inclusive coordinates, hence the +1).
df['gene_length'] = df.index.map(lambda x: db[x].end - db[x].start + 1)
# Optional log10 transforms (currently disabled):
#df['log_tutr'] = df['tutr'].apply(np.log10)
#df['log_futr'] = df['futr'].apply(np.log10)
#df['log_txt'] = df['txt'].apply(np.log10)
#df['log_gene'] = df['gene_length'].apply(np.log10)
#log_df = df.replace([np.inf, -np.inf], np.nan).dropna(how='any')

In [None]:
# Persist the gene -> transcript-ID lookups (transcript with the longest
# 3' UTR, 5' UTR, or exon length per gene) for use in the motif analysis.
for fname, id_map in (
    ('max_tutr_gene.p', max_tutr_id),
    ('max_futr_gene.p', max_futr_id),
    ('max_txt_gene.p', max_txt_id),
):
    with open(os.path.join(outdir, fname), 'wb') as f:
        pickle.dump(id_map, f)

#### Part II: Examine the UTR GC content and structure

In [None]:
def parse_fb_fasta(infile):
    """Parse a FASTA file whose headers carry ``key=value`` attributes.

    Each record description is split on ``'; '``. The first
    whitespace-delimited token of the description is used as the transcript
    ID, and every field is split on its first ``=`` into an attribute
    name and value.

    Args:
        infile: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        dict mapping transcript ID -> ``{'gene': <parent attribute>,
        'seq': <sequence as str>}``. If a transcript ID occurs more than
        once, the last record wins.

    Raises:
        KeyError: if a record header has no ``parent`` attribute.
    """
    seq_dict = {}
    # Stream the records instead of materialising the whole file as a list.
    for record in SeqIO.parse(infile, "fasta"):
        fields = record.description.split('; ')
        txt_id = fields[0].split(' ')[0]
        attributes = {}
        for field in fields:
            if not field:
                # Tolerate an empty field (e.g. from a trailing separator).
                continue
            # partition() keeps values that themselves contain '=' intact and
            # does not fail on a field that has no '=' at all.
            key, _, value = field.partition('=')
            attributes[key] = value
        seq_dict[txt_id] = {
            'gene': attributes['parent'],
            'seq': str(record.seq),
        }
    return seq_dict

def count_gc(genelist, seq_dict, gene2txt):
    """Compute the GC fraction of the selected transcript sequence per gene.

    Args:
        genelist: Iterable of gene IDs to process.
        seq_dict: Mapping of transcript ID -> dict with a ``'seq'`` string.
        gene2txt: Mapping of gene ID -> transcript ID whose sequence to use.

    Returns:
        dict mapping gene ID -> GC fraction in ``[0, 1]``. Genes that cannot
        be resolved to a sequence (missing from ``gene2txt`` or ``seq_dict``,
        e.g. non-coding RNAs) and genes whose sequence is empty are omitted.
    """
    gc_dict = {}
    for gene_id in genelist:
        # Keep the try body limited to the lookups that can legitimately miss.
        try:
            longest_utr = seq_dict[gene2txt[gene_id]]['seq']
        except KeyError:
            # KeyError for non-coding RNAs (no UTR record for the gene).
            continue
        if not longest_utr:
            # An empty sequence has no defined GC content; skip it rather
            # than dividing by zero.
            continue
        # Upper-case so soft-masked (lower-case) bases are counted too.
        bases = longest_utr.upper()
        gc_dict[gene_id] = (bases.count('G') + bases.count('C')) / len(bases)
    return gc_dict

In [None]:
# Parse the 5' UTR FASTA into a transcript ID -> {'gene', 'seq'} lookup.
futr_file = '../../resources/genome_seqs/dmel-all-five_prime_UTR-r6.28.fasta'
futr_dict = parse_fb_fasta(futr_file)

In [None]:
# Parse the 3' UTR FASTA into a transcript ID -> {'gene', 'seq'} lookup.
tutr_file = '../../resources/genome_seqs/dmel-all-three_prime_UTR-r6.28.fasta'
tutr_dict = parse_fb_fasta(tutr_file)

In [None]:
# GC fraction of each gene's longest 3' / 5' UTR, using the transcript IDs
# selected in Part I to pick which UTR sequence represents the gene.
tutr_gc = count_gc(df.index, tutr_dict, max_tutr_id)
futr_gc = count_gc(df.index, futr_dict, max_futr_id)

In [None]:
# Attach the GC fractions as columns; genes absent from the GC dicts
# (skipped by count_gc) should come through as NaN.
df['futr_gc'] = df.index.map(futr_gc)
df['tutr_gc'] = df.index.map(tutr_gc)

#### Part III: Add in the CAI values (see calc_CAI.ipynb)

In [None]:
# Load the CAI table (see calc_CAI.ipynb) and rename its 'length' column to
# 'cds_length'.
cai_file = os.path.join(outdir, 'CAI/CAI_values.csv')
cai_df = pd.read_csv(cai_file, index_col=0).rename(
    columns={'length': 'cds_length'}
)

In [None]:
# Join the length/GC attributes with the CAI values on the gene index
# (default inner join, so only genes present in both tables are kept) and
# write the combined table to disk.
gene_attributes = df.merge(cai_df, left_index=True, right_index=True)
gene_attributes.to_csv(os.path.join(outdir, 'gene_attributes.csv'))