#### calc CAI
- calculate the CAI of all Drosophila genes
- try 2 methods: 1) Use the RSCU values from all CDS 2) Use the RSCU values from the top 1000 most highly expressed genes in the NMJ at t0

In [None]:
import os
import sys

import gffutils
import pandas as pd
from Bio import SeqIO

sys.path.append('../scripts')
from plot_helpers import *
from cai import CodonAdaptationIndex as CAI

# Directory that receives the FASTA files and CAI table written below;
# created on first run, left alone if it already exists.
outdir = '../Figures/gene_attributes/CAI'
os.makedirs(outdir, exist_ok=True)

# Handle on the annotation database used for the gene/transcript lookups.
# NOTE(review): `gffutils_db` is not defined in this notebook — presumably it
# comes from the star import of plot_helpers; confirm.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

In [None]:
# Identify the most highly expressed genes to use for RSCU calculations.
# The first CSV column becomes the DataFrame index; those index labels are
# later used as keys into the gffutils database, so they are presumably gene
# IDs (verify against the file).
# NOTE(review): `results_dir` is not defined in this notebook — presumably it
# comes from the star import of plot_helpers; confirm.
exp_file = os.path.join(results_dir, 'inspect_recalc_tpm/tot_levels.csv')
exp_df = pd.read_csv(exp_file, index_col=0)

In [None]:
# Index labels of exp_df ranked by the 'total_0' column, highest value first
# (presumably the total expression level at t0 — confirm the column meaning).
ordered_genes = exp_df.sort_values(by='total_0', ascending=False).index

In [None]:
# Pick the longest in-frame coding transcript for each of the most highly
# expressed protein-coding genes. The resulting transcript ids form the
# reference set for the "high expression" codon-usage index.
TOP_N_GENES = 1000  # number of genes contributing a reference transcript

hiexp_cds = []
for gene in ordered_genes:
    # Genes are visited from highest to lowest expression; stop once the
    # reference set is full.
    if len(hiexp_cds) >= TOP_N_GENES:
        break

    if db[gene].attributes['gene_biotype'][0] != 'protein_coding':
        continue

    # (summed CDS length, transcript id) for each usable transcript.
    candidates = []
    for txt in db.children(gene, featuretype='transcript'):
        cds_len = db.children_bp(txt.id, child_featuretype='CDS')
        # Skip transcripts without any CDS (length 0 would otherwise pass the
        # modulo test) and those whose CDS length is not a whole number of
        # codons.
        if cds_len > 0 and cds_len % 3 == 0:
            candidates.append((cds_len, txt.id))

    if not candidates:
        continue

    # Tuple comparison: longest CDS wins; equal lengths fall back to the
    # greater transcript id, so the choice is deterministic.
    hiexp_cds.append(max(candidates)[1])

In [None]:
# Write two FASTA files: one with every CDS whose length is a multiple of 3,
# and one with the subset belonging to the highly expressed reference
# transcripts collected in `hiexp_cds`.
CDS_file = '../../resources/genome_seqs/dmel-all-CDS-r6.28.fasta'
passed_cds_file = os.path.join(outdir, 'passed_cds.fasta')
hiexp_cds_file = os.path.join(outdir, 'hiexp_cds.fasta')

# Set lookup instead of scanning the id list once per FASTA record.
hiexp_txt_ids = set(hiexp_cds)

records = list(SeqIO.parse(CDS_file, 'fasta'))
passed_cds_records = []
hiexp_cds_records = []

for record in records:
    # The description is parsed as '; '-separated key=value fields; the
    # 'parent' field is expected to hold exactly "<gene id>,<transcript id>".
    fields = record.description.split('; ')
    attributes = {field.split('=')[0]: field.split('=')[1] for field in fields}
    gene_id, txt_id = attributes['parent'].split(',')

    # Some sequences are not divisible by 3 and are excluded.
    if len(record.seq) % 3 != 0:
        continue

    passed_cds_records.append(record)
    if txt_id in hiexp_txt_ids:
        hiexp_cds_records.append(record)

n = len(passed_cds_records)
print('number of coding transcripts', n)

SeqIO.write(passed_cds_records, passed_cds_file, 'fasta')
SeqIO.write(hiexp_cds_records, hiexp_cds_file, 'fasta')

In [None]:
# Build one codon-usage index per reference FASTA: first from the highly
# expressed CDS, then from all in-frame CDS. Each index object is created and
# generated before the next one is constructed.
cai_by_reference = {}
for label, fasta_path in (('hiexp', hiexp_cds_file), ('all', passed_cds_file)):
    cai_index = CAI()
    cai_index.generate_index(fasta_path)
    cai_by_reference[label] = cai_index

cai_hiexp = cai_by_reference['hiexp']
cai_all = cai_by_reference['all']

In [None]:
# Score every in-frame CDS in the FASTA file against both codon-usage
# indices, keyed by transcript id.
scores_by_txt = {}
records = list(SeqIO.parse(CDS_file, "fasta"))
n = 0
for record in records:
    # '; '-separated key=value fields; 'parent' holds "<gene id>,<transcript id>".
    fields = record.description.split('; ')
    attributes = {field.split('=')[0]: field.split('=')[1] for field in fields}
    gene_id, txt_id = attributes['parent'].split(',')

    # Sequences whose length is not divisible by 3 are excluded.
    cds_len = len(record.seq)
    if cds_len % 3 != 0:
        continue

    cds_seq = str(record.seq)
    scores_by_txt[txt_id] = {
        'CAI_hiexp': cai_hiexp.cai_for_gene(cds_seq),
        'CAI_all': cai_all.cai_for_gene(cds_seq),
        'gene': gene_id,
        'length': cds_len,
    }
    n += 1
print('number of coding transcripts', n)

In [None]:
# One row per transcript id; columns come from the per-transcript score dicts.
codon_df = pd.DataFrame.from_dict(scores_by_txt, orient='index')

# Collapse to one row per gene and write the table to the output directory.
# NOTE(review): max() is applied to each column separately, so the CAI values
# and the length in a gene's row can originate from different transcripts —
# confirm this is the intended summary.
per_gene_max = codon_df.groupby('gene').max()
cai_csv_path = os.path.join(outdir, 'CAI_values.csv')
per_gene_max.to_csv(cai_csv_path)