**Purpose**
* Aggregate conditional mutual information tables by conditioned variable category
* Aggregate mutual information values across variable categories

# Setup

In [1]:
import pandas as pd
import numpy as np
import glob

pd.set_option('display.max_columns', None)

In [2]:
import importlib.util

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
ccv_spec = importlib.util.spec_from_file_location("codon_context_variables", 
                                                  "../codon_context_variables.py")
ccv = importlib.util.module_from_spec(ccv_spec)
ccv_spec.loader.exec_module(ccv)

## Files - read

CMI files:

In [5]:
cmi_filenames = glob.glob("../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var*")
cmi_var_file_map = {(x.split("_var_")[1].split("_")[0]):x
                    for x in cmi_filenames}
cmi_var_file_map

{'tAIavg': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_tAIavg_12cod_20bins.tsv',
 'Ccount': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_Ccount_20bins.tsv',
 'mfe': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_mfe_20bins.tsv',
 'efe': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_efe_20bins.tsv',
 'TpAcount': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_TpAcount_20bins.tsv',
 'cfe': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_cfe_20bins.tsv',
 'ApTcount': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_ApTcount_20bins.tsv',
 'meafe': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_meafe_20bins.tsv',
 'end': '../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_end_20bins.tsv',
 'cd': '../../data/2_conditional_mutual_informati

MI with context variables

In [6]:
mi_var_filenames = glob.glob("../../data/2_conditional_mutual_information/mut_inf_codon_var_*")
mi_var_file_map = {(x.split("_var_")[1].split("_")[0]):x
                    for x in mi_var_filenames}
mi_var_file_map

{'CSCavg': '../../data/2_conditional_mutual_information/mut_inf_codon_var_CSCavg_12cod_20bins.tsv',
 'end': '../../data/2_conditional_mutual_information/mut_inf_codon_var_end_20bins.tsv',
 'cd': '../../data/2_conditional_mutual_information/mut_inf_codon_var_cd_20bins.tsv',
 'GCcount': '../../data/2_conditional_mutual_information/mut_inf_codon_var_GCcount_20bins.tsv',
 'CpGcount': '../../data/2_conditional_mutual_information/mut_inf_codon_var_CpGcount_20bins.tsv',
 'ApTcount': '../../data/2_conditional_mutual_information/mut_inf_codon_var_ApTcount_20bins.tsv',
 'TpAcount': '../../data/2_conditional_mutual_information/mut_inf_codon_var_TpAcount_20bins.tsv',
 'cfe': '../../data/2_conditional_mutual_information/mut_inf_codon_var_cfe_20bins.tsv',
 'meafe': '../../data/2_conditional_mutual_information/mut_inf_codon_var_meafe_20bins.tsv',
 'efe': '../../data/2_conditional_mutual_information/mut_inf_codon_var_efe_20bins.tsv',
 'mfe': '../../data/2_conditional_mutual_information/mut_inf_codon_v

## Files - written

Per category CMI tables (long)

In [7]:
sequence_comp_filename = "../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_SeqComp.tsv"
rna_stab_filename = "../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_RNAStab.tsv"
codon_att_filename = "../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_CodonAttrib.tsv"

Combined MI tables with context variables

In [8]:
mi_var_combined_filename = "../../data/2_conditional_mutual_information/mut_inf_codon_var_combined.tsv"

## Functions

In [9]:
def assign_cp_col (p_col) :
    
    return "CP"+((p_col % 3 == 0)*1 + (p_col % 3 == 1)*2 + (p_col % 3 == 2)*3).astype(str)

# Aggregate CMI tables

Code the different variable categories:

In [11]:
sequence_comp = ["GCcount","CpGcount","ApTcount","TpAcount","Ccount"]
rna_stab = ["cfe", "meafe", "efe", "mfe", "end", "cd"]
codon_att = ["tAIavg", "CSCavg"]

Read in each file and add `CP` descriptions:

In [12]:
cmi_var_df_map = {y:pd.read_csv(x, sep="\t", index_col=0)
                  for y,x in cmi_var_file_map.items()}
cmi_var_df_map["efe"].head()

Unnamed: 0,cmi_p0,cmi_p1,cmi_p2,cmi_p3,cmi_p4,cmi_p5,cmi_p6,cmi_p7,cmi_p8,cmi_p9,cmi_p10,cmi_p11,cmi_p12,cmi_p13,cmi_p14,cmi_p15,cmi_p16,cmi_p17,cmi_p18,cmi_p19,cmi_p20,cmi_p21,cmi_p22,cmi_p23,cmi_p24,cmi_p25,cmi_p26,cmi_p27,cmi_p28,cmi_p29,cmi_p30,cmi_p31,cmi_p32,cmi_p33,cmi_p34,cmi_p35,cmi_p36,cmi_p37,cmi_p38,cmi_p39,cmi_p40,cmi_p41,cmi_p42,cmi_p43,cmi_p44,cmi_p45,cmi_p46,cmi_p47,cmi_p48,cmi_p49,cmi_p50,cmi_p51,cmi_p52,cmi_p53,cmi_p54,cmi_p55,cmi_p56,cmi_p57,cmi_p58,cmi_p59,cmi_p60,cmi_p61,cmi_p62,cmi_p63,cmi_p64,cmi_p65,cmi_p66,cmi_p67,cmi_p68,cmi_p69,cmi_p70,cmi_p71,cmi_p72,cmi_p73,cmi_p74,cmi_p75,cmi_p76,cmi_p77,cmi_p78,cmi_p79,cmi_p80,cmi_p81,cmi_p82,cmi_p83,cmi_p84,cmi_p85,cmi_p86,cmi_p87,cmi_p88,cmi_p89,cmi_p90,cmi_p91,cmi_p92,cmi_p93,cmi_p94,cmi_p95,cmi_p96,cmi_p97,cmi_p98,cmi_p99,cmi_p100
F,0.000258,0.000222,0.003091,0.000274,0.000199,0.003402,0.000252,0.000219,0.003373,0.000349,0.000221,0.003731,0.000331,0.000248,0.003591,0.000297,0.000268,0.003547,0.000368,0.000196,0.003419,0.000287,0.000235,0.003352,0.000344,0.000196,0.003753,0.000308,0.000242,0.003774,0.00043,0.000216,0.004142,0.000347,0.000179,0.004103,0.000356,0.000335,0.004072,0.000466,0.0003,0.00416,0.00057,0.000209,0.005086,0.000746,0.000308,0.009683,,,,0.095154,0.002298,0.007663,0.000172,0.000268,0.005154,0.000432,0.000262,0.004488,0.000485,0.000259,0.004649,0.000391,0.000337,0.004607,0.000462,0.000333,0.004517,0.000248,0.000179,0.00387,0.000343,0.000202,0.004058,0.000358,0.000296,0.003921,0.00031,0.000224,0.003627,0.000316,0.000161,0.003586,0.000272,0.000185,0.003309,0.000327,0.000244,0.003698,0.000299,0.00016,0.003357,0.00024,0.000219,0.003404,0.000255,0.000248,0.003227,0.000252,0.000163
L2,0.00017,0.000221,0.000587,0.000197,0.000198,0.000777,0.000162,0.000225,0.000703,0.000261,0.000337,0.000666,0.000244,0.000211,0.000716,0.000187,0.000193,0.000552,0.00026,0.000292,0.000685,0.000279,0.000357,0.00082,0.000234,0.000223,0.000643,0.000273,0.00034,0.000797,0.000326,0.00031,0.000926,0.000192,0.000231,0.000879,0.000387,0.000376,0.001114,0.00029,0.000474,0.000842,0.000154,0.000315,0.000793,0.000219,0.000723,0.002357,,,,0.002036,0.00037,0.000905,0.000666,0.000553,0.000613,0.000226,0.000329,0.000781,0.00016,0.000228,0.000842,0.000263,0.000235,0.000652,0.00031,0.000218,0.000757,0.000286,0.000223,0.000698,0.000203,0.000278,0.000659,0.000151,0.000271,0.000609,0.00018,0.000233,0.000911,0.000322,0.000292,0.000666,0.000259,0.000174,0.000766,0.000287,0.000208,0.000702,0.000211,0.000224,0.000507,0.000218,0.000169,0.0006,0.000223,0.000203,0.000723,0.000211,0.000168
L4,0.000332,0.000393,0.002574,0.000487,0.000341,0.002734,0.000365,0.000444,0.002918,0.000574,0.000598,0.002963,0.000457,0.000491,0.002929,0.000348,0.000381,0.003139,0.000423,0.000645,0.003277,0.000483,0.000583,0.002992,0.000359,0.000481,0.00345,0.000546,0.000603,0.002958,0.000637,0.000601,0.003723,0.000382,0.000529,0.003822,0.000448,0.00062,0.004127,0.000542,0.000529,0.004138,0.000562,0.000791,0.005138,0.000383,0.001165,0.004714,,,,0.05444,0.005634,0.007532,0.001301,0.000663,0.004283,0.000708,0.000564,0.003971,0.000373,0.000415,0.003651,0.000477,0.000457,0.003708,0.00042,0.000384,0.00339,0.000299,0.00037,0.003425,0.000364,0.000402,0.003174,0.000369,0.000469,0.003199,0.000363,0.000394,0.003218,0.000328,0.000379,0.003194,0.000337,0.000382,0.002844,0.000466,0.000389,0.002997,0.000349,0.000392,0.002871,0.000369,0.000396,0.003142,0.000355,0.000351,0.002809,0.000322,0.000307
I,0.000439,0.000351,0.004848,0.000422,0.000442,0.004445,0.000344,0.000311,0.00499,0.000544,0.000471,0.004749,0.00042,0.000323,0.00515,0.000417,0.000351,0.004836,0.000429,0.000347,0.005012,0.000376,0.00041,0.0054,0.000445,0.000383,0.005213,0.000507,0.000289,0.004907,0.000411,0.000432,0.005627,0.000429,0.000295,0.005857,0.000548,0.000452,0.005904,0.0005,0.000452,0.005841,0.000414,0.000561,0.006019,0.000526,0.000646,0.013107,,,,0.054421,0.001814,0.009882,0.000333,0.000486,0.006835,0.000623,0.000773,0.006305,0.000522,0.000572,0.005883,0.000367,0.000454,0.005851,0.000287,0.000433,0.00557,0.000392,0.000477,0.005113,0.000466,0.000353,0.005423,0.000578,0.000434,0.005127,0.000513,0.000403,0.005317,0.000468,0.000431,0.004883,0.000437,0.000347,0.005039,0.000513,0.000396,0.005178,0.000368,0.000349,0.00468,0.00037,0.000354,0.00486,0.000448,0.00034,0.004345,0.000271,0.000259
V,0.000305,0.000336,0.002803,0.000326,0.00045,0.002646,0.0003,0.000317,0.002746,0.000428,0.0005,0.002636,0.000383,0.000373,0.003033,0.000409,0.000437,0.00293,0.000361,0.000533,0.003033,0.000409,0.000436,0.002891,0.000391,0.000444,0.003043,0.000349,0.000543,0.002964,0.000424,0.000468,0.003258,0.000355,0.000476,0.003535,0.000425,0.000586,0.003533,0.000501,0.000508,0.003728,0.000428,0.00063,0.005044,0.000839,0.001533,0.005511,,,,0.053616,0.005102,0.007215,0.000652,0.000546,0.004173,0.000472,0.000467,0.003656,0.000455,0.000434,0.003869,0.000434,0.000471,0.003441,0.000358,0.000436,0.003431,0.000418,0.000351,0.00332,0.000352,0.000405,0.003074,0.00038,0.000403,0.003138,0.000313,0.000372,0.003055,0.000316,0.000344,0.003054,0.000352,0.000434,0.003102,0.000456,0.000426,0.002788,0.0004,0.000352,0.002909,0.000378,0.000366,0.002754,0.000338,0.000431,0.003002,0.00024,0.000235


In [13]:
cmi_var_long_df_map = {y:pd.wide_to_long(x.reset_index(), stubnames='cmi_p', 
                             i=['index'], 
                             j='position_index').\
                                 reset_index().\
                                 rename(columns={"index":"amino_acid"}).\
                                     assign(variable=y).\
                                         assign(CP=lambda z: assign_cp_col(z["position_index"]))
                                         for y,x in cmi_var_df_map.items()}
cmi_var_long_df_map["GCcount"].head()

Unnamed: 0,amino_acid,position_index,cmi_p,variable,CP
0,F,0,0.000197,GCcount,CP1
1,L2,0,0.000234,GCcount,CP1
2,L4,0,0.000404,GCcount,CP1
3,I,0,0.000435,GCcount,CP1
4,V,0,0.000345,GCcount,CP1


Write per-category tables

In [14]:
sequence_comp_df = pd.concat([x for y,x in cmi_var_long_df_map.items()
                              if y in sequence_comp], ignore_index=True,
                             axis=0)
rna_stab_df = pd.concat([x for y,x in cmi_var_long_df_map.items()
                              if y in rna_stab], ignore_index=True,
                             axis=0)
codon_att_df = pd.concat([x for y,x in cmi_var_long_df_map.items()
                              if y in codon_att], ignore_index=True,
                             axis=0)

sequence_comp

print(sequence_comp_df.shape)
print(rna_stab_df.shape)
print(codon_att_df.shape)

(10605, 5)
(12726, 5)
(4242, 5)


In [15]:
sequence_comp_df.to_csv(sequence_comp_filename, sep="\t",
                        index=False)
rna_stab_df.to_csv(rna_stab_filename, sep="\t",
                   index=False)
codon_att_df.to_csv(codon_att_filename, sep="\t",
                    index=False)

# Combine MI of context variables

In [16]:
mi_var_file_df_map = {x:pd.read_csv(y,sep="\t",index_col=0).assign(variable=x).reset_index().rename(columns={"index":"amino_acid"})
                      for x,y in mi_var_file_map.items()}
mi_var_file_df_map["mfe"].head()

Unnamed: 0,amino_acid,mi,variable
0,F,0.035835,mfe
1,L2,0.039856,mfe
2,L4,0.05997,mfe
3,I,0.055959,mfe
4,V,0.063422,mfe


In [17]:
mi_var_combined_df = pd.concat(mi_var_file_df_map.values(),
                           ignore_index=True,
                           axis=0)
mi_var_combined_df.head()

Unnamed: 0,amino_acid,mi,variable
0,F,0.048565,CSCavg
1,L2,0.030982,CSCavg
2,L4,0.067254,CSCavg
3,I,0.081848,CSCavg
4,V,0.070078,CSCavg


Write combined table

In [18]:
mi_var_combined_df.to_csv(mi_var_combined_filename,
                          sep="\t",index=False)