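#### Calculate the codon adaptation index (CAI)
- Calculate the CAI of all Drosophila protein-coding genes
- Try 2 methods for building the reference codon-usage index:
1) Use the RSCU (relative synonymous codon usage) values from all CDS
2) Use the RSCU values from the top 1000 most highly expressed genes in the dataset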

In [None]:
import pandas as pd
import sys
import gffutils
from Bio import SeqIO

sys.path.append('../scripts')
from plot_helpers import *
from cai import CodonAdaptationIndex as CAI

# Open the gffutils annotation database queried by the cells below.
# NOTE(review): `gffutils_db` and `os` are not defined/imported in the visible
# cells -- presumably they arrive via `from plot_helpers import *`; confirm.
db = gffutils.FeatureDB(gffutils_db)

# Directory that receives the FASTA files and the CAI table written below;
# created on first run, and left alone if it already exists.
outdir = '../Figures/gene_attributes/CAI'
os.makedirs(outdir, exist_ok=True)

%load_ext autoreload
%autoreload 2

In [None]:
# Identify the most highly expressed genes to use for RSCU calculations.
# Load the expression table; the first CSV column becomes the DataFrame index
# (presumably gene IDs, since the index is later matched against gene ids -- confirm).
# NOTE(review): `results_dir` is not defined in the visible cells -- presumably
# provided by `from plot_helpers import *`; confirm.
exp_file = os.path.join(results_dir, 'inspect/tot_levels.csv')
exp_df = pd.read_csv(exp_file, index_col=0)

In [None]:
# Rank the expression table by the 'total_wt' column, highest first, and keep
# only the resulting index (one entry per row of the table).
exp_by_level = exp_df.sort_values(by='total_wt', ascending=False)
ordered_genes = exp_by_level.index

In [None]:
# For every protein-coding gene, select the transcript with the longest
# in-frame CDS. Result: longest_cds maps transcript id -> gene id.
# The highly expressed subset used for the second CAI index is derived from
# this mapping afterwards.
longest_cds = {}
for gene_feature in db.features_of_type('gene'):
    gene = gene_feature.id
    # The feature yielded by the iterator already carries its attributes, so
    # there is no need to query the database again by id.
    attrs = gene_feature.attributes
    # Features without a gene_biotype attribute are labelled 'spike'
    # (presumably spike-in controls -- confirm); they are never protein coding.
    biotype = attrs['gene_biotype'][0] if 'gene_biotype' in attrs.keys() else 'spike'
    if biotype != 'protein_coding':
        continue

    # (total CDS length in bp, transcript id) for each usable transcript
    lens = []
    for txt in db.children(gene, featuretype='transcript'):
        cds_len = db.children_bp(txt.id, child_featuretype='CDS')
        # Skip transcripts with no CDS at all (0 bp would otherwise pass the
        # frame check) and CDS that are not a whole number of codons.
        if cds_len == 0 or cds_len % 3 != 0:
            continue
        lens.append((cds_len, txt.id))

    if not lens:
        continue

    # max() compares the tuples: the longest CDS wins and ties are broken by
    # transcript id.
    longest_cds[max(lens)[1]] = gene

In [None]:
# Now get a dict which is only the top 1000 most highly expressed genes.
# ordered_genes holds every gene of the expression table (sorted from high to
# low expression), so a plain membership test would keep all expressed genes;
# walk the ranking instead and stop after the first `top_n` genes that have a
# usable CDS.
top_n = 1000
txt_by_gene = {gene_id: txt_id for txt_id, gene_id in longest_cds.items()}
top_gene_ids = [gene_id for gene_id in ordered_genes if gene_id in txt_by_gene][:top_n]
# Same layout as longest_cds: transcript id -> gene id
top_genes = {txt_by_gene[gene_id]: gene_id for gene_id in top_gene_ids}

In [None]:
#Write fasta file with all passed CDS sequences or top expressed ones
from utilities import parse_fb_fasta

def write_fasta(fasta, outfile, ids):
    '''
    Write the selected records to a FASTA file, one header line and one
    sequence line per record.

    fasta = indexable collection of records (a pyfaidx-style Fasta() object
            according to the original author's note); each record must expose
            .name and return an object with .seq when sliced with [:]
    outfile = path of the file to create (overwritten if it exists)
    ids = iterable of record ids to write, in output order

    The full-range slice [:] is what yields the complete sequence of a record.
    '''
    with open(outfile, 'w') as handle:
        for seq_id in ids:
            entry = fasta[seq_id]
            # print() supplies the trailing newline for each line
            print('>{}'.format(entry.name), file=handle)
            print(entry[:].seq, file=handle)

# Source CDS sequences: absolute path to the region FASTA directory
seqs_dir = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/region_fastas'
cds_fasta = os.path.join(seqs_dir, 'CDS_all.fa')
cds_d = parse_fb_fasta(cds_fasta, extract_ids=True)

# Reference FASTA files, written into outdir: one for the highly expressed
# subset and one for every gene with a selected CDS.
hiexp_file = os.path.join(outdir, 'CDS_top1000.fa')
all_file = os.path.join(outdir, 'CDS_all.fa')
for fasta_path, txt_ids in ((hiexp_file, top_genes), (all_file, longest_cds)):
    write_fasta(cds_d, fasta_path, txt_ids)

In [None]:
# Build one codon adaptation index per reference set: highly expressed genes
# and all genes.
def _build_cai_index(fasta_path):
    '''Create a CAI object and generate its index from the given FASTA file.'''
    index = CAI()
    index.generate_index(fasta_path)
    return index


cai_hiexp = _build_cai_index(hiexp_file)
cai_all = _build_cai_index(all_file)

In [None]:
# Calculate the CAI for all protein-coding genes using their longest CDS.
# scores maps gene id -> {'CAI_hiexp': ..., 'CAI_all': ...}
gene_2_txt = {gene_id: txt_id for txt_id, gene_id in longest_cds.items()}
all_genes = db.features_of_type('gene')
scores = {}
for gene_feature in all_genes:
    # Use the attributes of the feature already in hand rather than querying
    # the database again by id.
    attrs = gene_feature.attributes
    if 'gene_biotype' not in attrs.keys():
        continue
    if attrs['gene_biotype'][0] != 'protein_coding':
        continue

    gene_id = gene_feature.id
    # NOTE(review): the record is created before the lookup, so a gene that is
    # skipped below stays in `scores` with an empty record.
    gene_scores = scores[gene_id] = {}
    try:
        # Extract the CDS once and score it against both indexes.
        cds_seq = cds_d[gene_2_txt[gene_id]][:].seq
        gene_scores['CAI_hiexp'] = cai_hiexp.cai_for_gene(cds_seq)
        gene_scores['CAI_all'] = cai_all.cai_for_gene(cds_seq)
    except KeyError:
        # A gene with no selected transcript (e.g. no CDS length divisible
        # by 3) is missing from gene_2_txt and raises KeyError.
        continue

In [None]:
# Turn the per-gene score records into a table (one row per gene id) and save
# it as CSV in the output directory.
cai_csv = os.path.join(outdir, 'CAI_values.csv')
codon_df = pd.DataFrame.from_dict(scores, orient='index')
codon_df.to_csv(cai_csv)