#### calc attributes
- Look at physical attributes of the genes
- transcript length, CDS, intron length, UTR lengths
- GC content
- codon optimality

In [None]:
import sys
import pandas as pd
import seaborn as sns
import os
import gffutils
import numpy as np
from collections import defaultdict
import scipy.stats as stats
from Bio import SeqIO
from Bio.Seq import Seq
import pickle
from pyfaidx import Fasta

sys.path.append('../scripts')
from plot_helpers import *

# Open the gffutils annotation database used throughout this notebook.
# NOTE(review): `gffutils_db` is not defined in this notebook; presumably it is
# provided by the `plot_helpers` star-import above — verify.
db = gffutils.FeatureDB(gffutils_db)

# Directory for the gene-attribute table written at the end of the notebook
# (the CAI table is also read from a subfolder of it); created if missing.
outdir = '../Figures/gene_attributes/'
os.makedirs(outdir, exist_ok=True)

%load_ext autoreload
%autoreload 2

In [None]:
# Directory holding the per-region FASTA files (5'UTR, 3'UTR, CDS) loaded below.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
seqs_dir = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/region_fastas'

#### Part I: Examine UTR, CDS, transcript and gene lengths

In [None]:
# Collect per-transcript feature lengths for every gene in the annotation.
# The background set is every gene in the database (i.e. all genes that passed
# the filter). Each dict maps gene ID -> [(summed_length_bp, transcript_id), ...].
bg_genes = db.features_of_type('gene')
tutr_d = defaultdict(list)
futr_d = defaultdict(list)
txt_d = defaultdict(list)
cds_d = defaultdict(list)

# Which child feature type feeds which dictionary.
length_stores = (
    (tutr_d, 'three_prime_utr'),
    (futr_d, 'five_prime_utr'),
    (txt_d, 'exon'),
    (cds_d, 'CDS'),
)

p = 0  # number of genes processed (handy for stopping early while debugging)
for gene in bg_genes:
    for txt in db.children(gene.id, featuretype='transcript'):
        for store, featuretype in length_stores:
            total_bp = db.children_bp(txt, child_featuretype=featuretype)
            store[gene.id].append((total_bp, txt.id))
    p += 1

In [None]:
# For every gene keep the (length, transcript_id) tuple that compares largest:
# the longest feature, with ties broken by the transcript ID string.
max_tutr = {gene: max(pairs) for gene, pairs in tutr_d.items()}
max_futr = {gene: max(pairs) for gene, pairs in futr_d.items()}
max_txt = {gene: max(pairs) for gene, pairs in txt_d.items()}
max_cds = {gene: max(pairs) for gene, pairs in cds_d.items()}

# Longest length per gene (first element of the winning tuple).
max_tutr_len = {gene: best[0] for gene, best in max_tutr.items()}
max_futr_len = {gene: best[0] for gene, best in max_futr.items()}
max_txt_len = {gene: best[0] for gene, best in max_txt.items()}
max_cds_len = {gene: best[0] for gene, best in max_cds.items()}

# Transcript ID carrying that longest length (second element of the tuple).
# Note: the CDS variant is named `max_txt_cds`, not `max_cds_id`.
max_tutr_id = {gene: best[1] for gene, best in max_tutr.items()}
max_futr_id = {gene: best[1] for gene, best in max_futr.items()}
max_txt_id = {gene: best[1] for gene, best in max_txt.items()}
max_txt_cds = {gene: best[1] for gene, best in max_cds.items()}

# Assemble one row per gene: start from the 3'UTR lengths (which fixes the row
# index), then attach the remaining per-gene maximum lengths in column order.
df = pd.DataFrame.from_dict(max_tutr_len, orient='index', columns=['tutr_len'])
for column, lengths in (
    ('futr_len', max_futr_len),
    ('cds_len', max_cds_len),
    ('txt_len', max_txt_len),
):
    df[column] = df.index.map(lengths)


def _gene_span(gene_id):
    '''Return the genomic span of a gene (end - start + 1) from the annotation db.'''
    gene = db[gene_id]
    return gene.end - gene.start + 1


df['gene_length'] = df.index.map(_gene_span)

#df['log_tutr'] = df['tutr'].apply(np.log10)
#df['log_futr'] = df['futr'].apply(np.log10)
#df['log_txt'] = df['txt'].apply(np.log10)
#df['log_gene'] = df['gene_length'].apply(np.log10)
#log_df = df.replace([np.inf, -np.inf], np.nan).dropna(how='any')

#### Part II: Examine the UTR GC content and structure

In [None]:
def get_txt_id(s):
    '''Extract the transcript ID from a CDS FASTA header.

    The header is expected to contain a field of the form
    ``parent=<gene_id>,<transcript_id>`` among its ``;``-separated fields; the
    second parent (the transcript ID) is returned. Used as the pyfaidx
    ``key_function`` for the CDS file.

    Fields are split on ``;`` and stripped, so the parent field is recognised
    whether or not a space follows the separator, and a trailing ``;`` never
    ends up in the returned ID.

    Raises
    ------
    IndexError
        If the header has no ``parent`` field or the field lists fewer than
        two parents.
    '''
    fields = [field.strip() for field in s.split(';')]
    parent_fields = [field for field in fields if field.startswith('parent')]
    if not parent_fields:
        raise IndexError(f'no parent= field in FASTA header: {s!r}')
    # Keep only the text after the last '=', i.e. the comma-separated parent list.
    parents = parent_fields[0].rpartition('=')[2].split(',')
    if len(parents) < 2:
        raise IndexError(f'parent= field lacks a transcript ID in FASTA header: {s!r}')
    return parents[1]

def parse_fb_fasta(infile, extract_ids=False):
    '''Open a FASTA file with pyfaidx and return the indexed ``Fasta`` object.

    With ``extract_ids=True`` the full header line is read and each record is
    keyed by the transcript ID that ``get_txt_id`` pulls out of it (needed for
    the CDS file); otherwise pyfaidx's default record names are used.
    '''
    if not extract_ids:
        return Fasta(infile)
    return Fasta(infile, read_long_names=True, key_function=get_txt_id)

def count_gc(genelist, seq_dict, gene2txt):
    '''Compute the GC fraction of one chosen sequence per gene.

    Parameters
    ----------
    genelist : iterable
        Gene IDs to process.
    seq_dict : mapping
        Transcript ID -> sequence record. Each record is sliced with ``[:]``;
        the ``.seq`` attribute of the slice is used when present (pyfaidx),
        otherwise the slice itself is treated as the sequence string.
    gene2txt : mapping
        Gene ID -> transcript ID whose sequence should be measured.

    Returns
    -------
    dict
        Gene ID -> (G + C) / length, counted case-insensitively. Genes are left
        out when they have no entry in ``gene2txt``, when their transcript has
        no sequence in ``seq_dict`` (e.g. non-coding RNAs), or when the
        sequence is empty.
    '''
    gc_dict = {}
    for gene in genelist:
        try:
            region = seq_dict[gene2txt[gene]][:]
        except KeyError:
            # No transcript recorded for the gene, or no sequence for that
            # transcript (expected for non-coding RNAs).
            continue
        # Upper-case so soft-masked (lowercase) bases are counted too.
        seq = str(getattr(region, 'seq', region)).upper()
        if not seq:
            # An empty sequence has no defined GC fraction; avoid dividing by zero.
            continue
        gc_dict[gene] = (seq.count('G') + seq.count('C')) / len(seq)
    return gc_dict

In [None]:
# Paths to the per-region FASTA files under `seqs_dir`.
futr_file = os.path.join(seqs_dir, 'five_prime_utr_all.fa')
tutr_file = os.path.join(seqs_dir, 'three_prime_utr_all.fa')
cds_file = os.path.join(seqs_dir, 'CDS_all.fa')
# The UTR files are opened with pyfaidx's default record names; the CDS file is
# opened with extract_ids=True so records are keyed by get_txt_id(header).
futr_dict = parse_fb_fasta(futr_file)
tutr_dict = parse_fb_fasta(tutr_file)
cds_dict = parse_fb_fasta(cds_file, extract_ids=True)

In [None]:
# GC content of each gene's longest 3' and 5' UTR. count_gc looks the sequence
# up by the transcript ID stored for the gene in max_tutr_id / max_futr_id, so
# it needs the ID of the transcript that carries the maximum length.
tutr_gc = count_gc(df.index, tutr_dict, max_tutr_id)
futr_gc = count_gc(df.index, futr_dict, max_futr_id)
# Genes that count_gc skipped have no entry in the dicts and map to NaN here.
df['futr_gc'] = df.index.map(futr_gc)
df['tutr_gc'] = df.index.map(tutr_gc)

#### Part III: Add in the CAI values (see calc_CAI.ipynb)

In [None]:
# Load the CAI table (see calc_CAI.ipynb) from the CAI subfolder of `outdir`.
# The first CSV column becomes the index; presumably gene IDs, since the table
# is merged with `df` on the index below — verify.
cai_file = os.path.join(outdir, 'CAI/CAI_values.csv')
cai_df = pd.read_csv(cai_file, index_col=0)

In [None]:
# Join the gene attributes with the CAI values on the index and write the
# combined table to `outdir`. pd.merge defaults to an inner join, so genes
# missing from either table are dropped from gene_attributes.csv.
pd.merge(df, cai_df, left_index=True, right_index=True).to_csv(os.path.join(outdir, 'gene_attributes.csv'))