In [None]:
import Bio
import numpy as np
from collections import Counter
import collections
import copy
from matplotlib import pyplot as plt

In [None]:
from Bio import SeqIO
from Bio import Data

In [None]:
# Forward table of Biopython's standard DNA codon table (keys are codons).
# NOTE(review): the forward table presumably lists sense codons only, so stop
# codons are not pre-seeded in the template below — confirm.
codon_table = Data.CodonTable.standard_dna_table.forward_table
# Zero-initialised Counter keyed by those codons; template for per-CDS counts.
empty_codon_table = Counter(dict.fromkeys(codon_table, 0))

In [None]:
# Parse the GenBank file at ./U00096_3.gb (path is relative to the working
# directory) into `record`.
# NOTE(review): the file name suggests accession U00096.3 — confirm the source.
record = SeqIO.read("./U00096_3.gb", "genbank")

In [None]:
# Keep only the features typed "CDS", preserving their order in the record.
CDS_list = list(filter(lambda feat: feat.type == "CDS", record.features))

In [None]:
def get_CDS_codon_table(record, CDS_feature, empty_codon_table):
    """Count codon usage within a single CDS feature.

    Parameters
    ----------
    record
        Parent sequence record; passed to ``CDS_feature.location.extract``.
    CDS_feature
        Feature whose ``location.extract(record).seq`` yields the coding
        sequence.
    empty_codon_table
        Mapping of codon -> 0 used as a template. It is copied, never modified.

    Returns
    -------
    collections.Counter
        The template's keys plus every complete triplet found in the CDS,
        mapped to its number of occurrences. Triplets absent from the
        template are added as new keys.
    """
    CDS_seq = str(CDS_feature.location.extract(record).seq)
    # Walk complete triplets only: a CDS whose length is not a multiple of 3
    # would otherwise contribute a 1-2 base fragment as a bogus "codon",
    # which also inflates the per-CDS total used as a denominator elsewhere.
    n_complete = len(CDS_seq) - len(CDS_seq) % 3
    # Counter(mapping) copies the template, zero entries included.
    CDS_codon_table = Counter(empty_codon_table)
    CDS_codon_table.update(CDS_seq[i : i + 3] for i in range(0, n_complete, 3))
    return CDS_codon_table

In [None]:
# One codon-count table per CDS, index-aligned with CDS_list.
all_CDS_codon_tables = [
    get_CDS_codon_table(record, cds, empty_codon_table) for cds in CDS_list
]

In [None]:
# Pool the per-CDS counts into one table. Counter addition keeps only
# positive counts, so codons never observed drop out here.
genome_codon_table = sum(all_CDS_codon_tables, Counter())
# Total number of codons tallied across all CDS features.
ttl_codons = sum(genome_codon_table.values())
# Rebind the name to relative frequencies (count / total) as a plain dict.
genome_codon_table = dict(
    (codon, count / ttl_codons) for codon, count in genome_codon_table.items()
)

In [None]:
# Cutoffs for this selection: minimum GAA:GAG ratio and minimum GAA + GAG total.
bias_thr = 5.0
glu_sum_thr = 25

# Per-CDS GAA:GAG ratio; the +1.0 keeps the denominator non-zero when a CDS
# has no GAG codons.
GAA_to_GAG_bias = np.array(
    [table["GAA"] / (table["GAG"] + 1.0) for table in all_CDS_codon_tables]
)
# Per-CDS combined count of GAA and GAG.
glu_sum = np.array([table["GAA"] + table["GAG"] for table in all_CDS_codon_tables])

# CDS features exceeding both cutoffs (the tables are index-aligned with CDS_list).
GAA_biased_features = [
    feature
    for feature, keep in zip(
        CDS_list, (GAA_to_GAG_bias > bias_thr) & (glu_sum > glu_sum_thr)
    )
    if keep
]
# Alphabetically sorted gene names of the selected features.
GAA_biased_gene_names = sorted(
    feature.qualifiers["gene"][0] for feature in GAA_biased_features
)

In [None]:
# Histogram (20 bins) of the per-CDS GAA + GAG codon totals.
plt.hist(glu_sum, bins=20)

In [None]:
# Histogram (20 bins) of the per-CDS GAA:GAG ratios.
plt.hist(GAA_to_GAG_bias, bins=20)

In [None]:
# Display the sorted gene names selected by the most recently run GAA cell.
GAA_biased_gene_names

In [None]:
# Rebinds the shared bias_thr name: minimum GAG:GAA ratio for this selection.
bias_thr = 10.0

# Per-CDS GAG:GAA ratio; the +1 keeps the denominator non-zero when a CDS has
# no GAA codons.
GAG_to_GAA_bias = np.array(
    [table["GAG"] / (table["GAA"] + 1) for table in all_CDS_codon_tables]
)
# CDS features whose ratio exceeds the cutoff (index-aligned with CDS_list).
GAG_biased_features = [
    feature for feature, ratio in zip(CDS_list, GAG_to_GAA_bias) if ratio > bias_thr
]
# Alphabetically sorted gene names of the selected features.
GAG_biased_gene_names = sorted(
    feature.qualifiers["gene"][0] for feature in GAG_biased_features
)

In [None]:
# Histogram (20 bins, log-scaled counts) of the per-CDS GAG:GAA ratios.
plt.hist(GAG_to_GAA_bias, bins=20, log=True)

In [None]:
# Total number of codons tallied for the first CDS in all_CDS_codon_tables.
sum(all_CDS_codon_tables[0].values())

In [None]:
# Rebinds the shared bias_thr name: minimum fraction of a CDS's codons that
# must be GAA.
bias_thr = 0.1

# Per-CDS fraction of all tallied codons that are GAA.
GAA_bias = np.array(
    [table["GAA"] / sum(table.values()) for table in all_CDS_codon_tables]
)
# Overwrites the earlier ratio-based GAA selection with this fraction-based one.
GAA_biased_features = [
    feature for feature, fraction in zip(CDS_list, GAA_bias) if fraction > bias_thr
]
# Alphabetically sorted gene names of the selected features.
GAA_biased_gene_names = sorted(
    feature.qualifiers["gene"][0] for feature in GAA_biased_features
)

In [None]:
# Histogram (20 bins) of the per-CDS GAA codon fractions.
plt.hist(GAA_bias, bins=20)

In [None]:
# Display the sorted gene names selected by the most recently run GAA cell.
GAA_biased_gene_names

In [None]:
# Rebinds the shared bias_thr name: minimum fraction of a CDS's codons that
# must be GAG.
bias_thr = 0.07

# Per-CDS fraction of all tallied codons that are GAG.
GAG_bias = np.array(
    [table["GAG"] / sum(table.values()) for table in all_CDS_codon_tables]
)
# Overwrites the earlier ratio-based GAG selection with this fraction-based one.
GAG_biased_features = [
    feature for feature, fraction in zip(CDS_list, GAG_bias) if fraction > bias_thr
]
# Alphabetically sorted gene names of the selected features.
GAG_biased_gene_names = sorted(
    feature.qualifiers["gene"][0] for feature in GAG_biased_features
)

In [None]:
# Histogram (20 bins) of the per-CDS GAG codon fractions.
plt.hist(GAG_bias, bins=20)

In [None]:
# Display the sorted gene names selected by the most recently run GAG cell.
GAG_biased_gene_names

In [None]:
# Position in CDS_list of the first CDS whose gene qualifier is "mreB".
# The trailing [0] raises IndexError when no CDS carries that name.
mreb_loc = np.flatnonzero(
    [cds.qualifiers["gene"][0] == "mreB" for cds in CDS_list]
)[0]
# Show the codon counts for that CDS.
all_CDS_codon_tables[mreb_loc]

In [None]:
# Position in CDS_list of the first CDS whose gene qualifier is "glmM".
# NOTE(review): this reuses the name `mreb_loc`, so after this cell the
# variable holds the glmM index, not the mreB one.
# The trailing [0] raises IndexError when no CDS carries that name.
mreb_loc = np.flatnonzero(
    [cds.qualifiers["gene"][0] == "glmM" for cds in CDS_list]
)[0]
# Show the codon counts for that CDS.
all_CDS_codon_tables[mreb_loc]