**Purpose**

* Assemble some amino acid attributes into a table
* Add some codon-nucleotide mutual information metrics to table (both with observed and sub-sampled values)
* Generate a codon-contextual codon mutual information approximation from codon-nucleotide mutual information table
* Aggregate mutual information tables from repeated samplings of shuffled data

# Setup

In [1]:
import pandas as pd
import numpy as np
import glob

pd.set_option('display.max_columns', None)

In [2]:
import importlib.util

In [3]:
ccv_spec = importlib.util.spec_from_file_location("codon_context_variables", 
                                                  "../codon_context_variables.py")
ccv = importlib.util.module_from_spec(ccv_spec)
ccv_spec.loader.exec_module(ccv)

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

## Files - read

Load amino acid property files:

In [5]:
aa_polarity_filename = "../../data/0_data_processing/aa_polarity.tsv"
aa_degeneracy_filename = "../../data/0_data_processing/aa_sub_degeneracy.tsv"
codon_content_filename = "../../data/0_data_processing/codon_content.tsv"

For `codon_frequency_filename`:
* Source: http://doi.org/10.1093/hmg/ddw207
* Table 1

Load the $H(C)$ files:

In [7]:
sh_entropy_codon_filename = "../../data/1_mutual_information/shannon_entropy_codon_AAsub.tsv"

Mutual information files:

In [5]:
mi_codon_nuc_pos_filename = "../../data/1_mutual_information/mut_info_codon_nuc_pos_101bp_AAsub.tsv"
mi_codon_nuc_pos_equal_AAsubsampled_filename = "../../data/1_mutual_information/mut_info_codon_nuc_pos_101bp_AAsub_subsampled.tsv"
mi_codon_nuc_cxtCodon_filename = "../../data/1_mutual_information/mut_info_codon_cxtCodon_33cod_AAsub.tsv"

Shuffled context mutual information files:

In [9]:
mi_shuffled_filenames = glob.glob("../../data/1_mutual_information/mut_info_context_shuffled/*")
len(mi_shuffled_filenames)

100

## Files - written

In [10]:
aa_atribute_filename = "../../data/1_mutual_information/aa_atributes_mut_info_AAsub.tsv"

In [6]:
mi_codon_3nuc_pos_filename = "../../data/1_mutual_information/mut_info_codon_sum3nuc_33cod_AAsub.tsv"

In [12]:
mi_shuffled_long_filename = "../../data/1_mutual_information/mut_info_codon_nuc_pos_context_permutation_100n_long.tsv"

# AA attributes

Consider amino acid polarity, degeneracy, GC content, and shannon entropy.

In [13]:
aa_polarity_df = pd.read_csv(aa_polarity_filename,
                             sep="\t")
aa_polarity_df.head()

Unnamed: 0,AminoAcid,Polarity
0,F,Nonpolar
1,L,Nonpolar
2,I,Nonpolar
3,V,Nonpolar
4,S,Polar


In [14]:
aa_degeneracy_df = pd.read_csv(aa_degeneracy_filename,
                               sep="\t",
                               index_col=0)
aa_degeneracy_df.head()

Unnamed: 0_level_0,Degeneracy
AminoAcid_sub,Unnamed: 1_level_1
F,2
L2,2
L4,4
I,3
V,4


In [15]:
codon_content_df = pd.read_csv(codon_content_filename,
                               sep="\t")
codon_content_df.head()

Unnamed: 0,Codon,GC_bases,Third_base
0,TTT,0,T
1,TTC,1,C
2,TTA,0,A
3,TTG,1,G
4,TCT,1,T


In [16]:
codon_content_df["AminoAcid_sub"] = codon_content_df["Codon"].map(ccv.codon_aa_sub_dict)
codon_content_df.head()

Unnamed: 0,Codon,GC_bases,Third_base,AminoAcid_sub
0,TTT,0,T,F
1,TTC,1,C,F
2,TTA,0,A,L2
3,TTG,1,G,L2
4,TCT,1,T,S4


In [17]:
aa_content_df = codon_content_df.groupby("AminoAcid_sub").\
    agg({"GC_bases":np.mean})
aa_content_df.head()

Unnamed: 0_level_0,GC_bases
AminoAcid_sub,Unnamed: 1_level_1
A,2.5
C,1.5
D,1.5
E,1.5
F,0.5


In [18]:
codon_frequency_df = pd.read_csv(codon_frequency_filename,
                                 sep="\t")
codon_frequency_df.head()

Unnamed: 0,Codon,AminoCode,Freq_per_k
0,UUU,Phe,17.6
1,UUC,Phe,20.3
2,UUA,Leu,7.7
3,UUG,Leu,12.9
4,CUU,Leu,13.2


In [19]:
codon_frequency_df["AminoAcid_sub"] = codon_frequency_df["Codon"].str.replace("U","T").\
    map(ccv.codon_aa_sub_dict)
aa_frequency_df = codon_frequency_df.groupby("AminoAcid_sub").agg({"Freq_per_k":np.sum})
aa_frequency_df.head()

Unnamed: 0_level_0,Freq_per_k
AminoAcid_sub,Unnamed: 1_level_1
A,69.3
C,23.2
D,46.9
E,68.6
F,37.9


In [20]:
sh_entropy_codon_df = pd.read_csv(sh_entropy_codon_filename,
                                  sep="\t",
                                  index_col=0)
sh_entropy_codon_df.head()

Unnamed: 0,h
F,0.691474
L2,0.664786
L4,1.211468
I,1.031249
V,1.266124


Combine amino acid attribute tables:

In [21]:
aa_attrib_df = pd.concat([aa_degeneracy_df, aa_content_df,
                          aa_frequency_df,
                          sh_entropy_codon_df],
                         ignore_index=False,
                         axis=1)
aa_attrib_df["AminoAcid"] = [x[0] for x in aa_attrib_df.index]
aa_attrib_df = aa_attrib_df.reset_index().merge(aa_polarity_df,
                                  on="AminoAcid",
                                  how="left").\
                                      rename(columns={"index":"AminoAcid_sub"})
aa_attrib_df.head()

Unnamed: 0,AminoAcid_sub,Degeneracy,GC_bases,Freq_per_k,h,AminoAcid,Polarity
0,F,2.0,0.5,37.9,0.691474,F,Nonpolar
1,L2,2.0,0.5,20.6,0.664786,L,Nonpolar
2,L4,4.0,1.5,79.6,1.211468,L,Nonpolar
3,I,3.0,0.333333,44.3,1.031249,I,Nonpolar
4,V,4.0,1.5,60.7,1.266124,V,Nonpolar


## Match with mutual information

In [22]:
mi_codon_nuc_pos_df = pd.read_csv(mi_codon_nuc_pos_filename,
                    sep="\t",
                    index_col=0)
mi_codon_nuc_pos_df.head()

Unnamed: 0,mi_p0,mi_p1,mi_p2,mi_p3,mi_p4,mi_p5,mi_p6,mi_p7,mi_p8,mi_p9,mi_p10,mi_p11,mi_p12,mi_p13,mi_p14,mi_p15,mi_p16,mi_p17,mi_p18,mi_p19,mi_p20,mi_p21,mi_p22,mi_p23,mi_p24,mi_p25,mi_p26,mi_p27,mi_p28,mi_p29,mi_p30,mi_p31,mi_p32,mi_p33,mi_p34,mi_p35,mi_p36,mi_p37,mi_p38,mi_p39,mi_p40,mi_p41,mi_p42,mi_p43,mi_p44,mi_p45,mi_p46,mi_p47,mi_p48,mi_p49,mi_p50,mi_p51,mi_p52,mi_p53,mi_p54,mi_p55,mi_p56,mi_p57,mi_p58,mi_p59,mi_p60,mi_p61,mi_p62,mi_p63,mi_p64,mi_p65,mi_p66,mi_p67,mi_p68,mi_p69,mi_p70,mi_p71,mi_p72,mi_p73,mi_p74,mi_p75,mi_p76,mi_p77,mi_p78,mi_p79,mi_p80,mi_p81,mi_p82,mi_p83,mi_p84,mi_p85,mi_p86,mi_p87,mi_p88,mi_p89,mi_p90,mi_p91,mi_p92,mi_p93,mi_p94,mi_p95,mi_p96,mi_p97,mi_p98,mi_p99,mi_p100
F,0.000898,0.000489,0.009987,0.001003,0.000359,0.01064,0.000931,0.000365,0.010564,0.000725,0.000419,0.011088,0.000963,0.000489,0.010566,0.001002,0.000422,0.011028,0.000925,0.000443,0.010698,0.000909,0.000391,0.010388,0.00112,0.000528,0.0112,0.000808,0.000342,0.011028,0.000986,0.000635,0.011698,0.000833,0.000504,0.011783,0.000962,0.000566,0.011691,0.001004,0.000433,0.011142,0.001151,0.000401,0.012995,0.00044,0.000426,0.019747,0.0,0.0,0.0,0.079302,0.002875,0.016938,0.000619,0.000724,0.013166,0.001017,0.000413,0.012028,0.000908,0.000336,0.012353,0.000905,0.000386,0.012572,0.000792,0.000384,0.012184,0.000828,0.000448,0.011508,0.0009,0.000451,0.011779,0.00092,0.000447,0.011388,0.000901,0.000543,0.010595,0.001046,0.000441,0.011051,0.000803,0.000529,0.010391,0.0009,0.000319,0.010848,0.000779,0.000408,0.010414,0.000811,0.000316,0.010542,0.00097,0.000494,0.010054,0.000934,0.000355
L2,0.000539,0.000212,0.003999,0.00059,0.00029,0.004325,0.000498,0.000304,0.004262,0.000558,0.000198,0.004015,0.000545,0.000351,0.004373,0.00057,0.000403,0.003712,0.000563,0.000242,0.00411,0.000404,0.00016,0.004367,0.000677,0.000399,0.003955,0.000418,0.000122,0.004506,0.000456,0.000251,0.00464,0.000499,0.000322,0.004485,0.000475,0.000193,0.00516,0.00043,7.5e-05,0.004513,0.000476,0.000527,0.004432,0.000679,0.000783,0.008151,0.0,0.0,0.0,0.004859,0.000318,0.004713,0.000706,0.000836,0.003862,0.000305,0.000407,0.004198,0.000664,0.00031,0.004445,0.000607,0.000331,0.004084,0.000413,0.000323,0.004092,0.000527,0.00025,0.004218,0.000594,0.000213,0.003869,0.000442,0.00036,0.004073,0.000466,0.000279,0.004263,0.000702,0.000329,0.003739,0.000386,0.00029,0.004056,0.000335,0.00036,0.00401,0.000578,0.000249,0.003645,0.000523,0.00028,0.003733,0.000508,0.000203,0.004014,0.000479,0.00019
L4,0.000987,0.000427,0.011659,0.001517,0.000601,0.012052,0.001353,0.000704,0.01254,0.00162,0.000365,0.01261,0.001504,0.000714,0.012711,0.001351,0.000674,0.012898,0.001616,0.000372,0.013479,0.001363,0.000406,0.012698,0.001466,0.000723,0.01381,0.001293,0.00039,0.012474,0.001729,0.000746,0.014175,0.001525,0.000644,0.014679,0.001221,0.000595,0.014705,0.001718,0.000473,0.014637,0.001856,0.001003,0.0167,0.001071,0.000878,0.017092,0.0,0.0,0.0,0.051991,0.005053,0.018732,0.00175,0.001014,0.014966,0.001566,0.000915,0.014623,0.001497,0.000861,0.013631,0.002018,0.00075,0.014492,0.001992,0.000676,0.013794,0.001316,0.000826,0.013616,0.001515,0.000528,0.013235,0.001502,0.000461,0.013534,0.001454,0.000737,0.012742,0.001594,0.000689,0.013112,0.001002,0.000564,0.012315,0.001499,0.000749,0.012379,0.001438,0.000573,0.012421,0.001457,0.000351,0.0128,0.001292,0.00066,0.012302,0.001159,0.000642
I,0.001146,0.000506,0.014883,0.001026,0.000408,0.014158,0.001087,0.000484,0.015167,0.001115,0.000634,0.014609,0.001166,0.000515,0.015609,0.001141,0.000452,0.015048,0.000951,0.000588,0.015302,0.001004,0.000554,0.016013,0.001053,0.00049,0.015899,0.001154,0.00045,0.014995,0.000979,0.000617,0.016382,0.001064,0.000478,0.016954,0.00113,0.000651,0.016597,0.00077,0.000604,0.016623,0.000616,0.000575,0.016253,0.00057,0.000877,0.030607,0.0,0.0,0.0,0.043454,0.002786,0.023063,0.000601,0.000856,0.018227,0.001057,0.000679,0.017351,0.000974,0.000592,0.016523,0.000922,0.0008,0.016712,0.00075,0.000537,0.016033,0.000936,0.000309,0.015111,0.001032,0.000496,0.015973,0.001193,0.000615,0.015593,0.001376,0.00047,0.015802,0.000834,0.000525,0.015084,0.000978,0.000511,0.015495,0.000967,0.000413,0.015123,0.000907,0.000351,0.014378,0.00096,0.000475,0.014742,0.001083,0.00045,0.013722,0.000965,0.000325
V,0.001151,0.00048,0.012516,0.001129,0.000423,0.012237,0.001092,0.000598,0.012349,0.00112,0.00052,0.012468,0.001169,0.000555,0.013212,0.001276,0.000452,0.013229,0.001088,0.000467,0.013312,0.001172,0.000591,0.01293,0.00122,0.000511,0.013484,0.001124,0.000479,0.013294,0.001165,0.000549,0.013437,0.001117,0.000576,0.014391,0.001182,0.000587,0.014479,0.00108,0.000543,0.01485,0.000998,0.000994,0.017043,0.000792,0.000928,0.019085,0.0,0.0,0.0,0.049554,0.004303,0.018739,0.001068,0.000999,0.015423,0.001137,0.000637,0.014742,0.001376,0.000709,0.014815,0.00159,0.000654,0.014231,0.001423,0.000595,0.014251,0.001288,0.000619,0.013858,0.001222,0.000498,0.013373,0.001234,0.000572,0.013446,0.001224,0.000585,0.013363,0.001083,0.000618,0.013392,0.001195,0.000545,0.013444,0.000984,0.000473,0.012735,0.00128,0.000552,0.01285,0.001201,0.000514,0.012598,0.001207,0.000445,0.012853,0.001141,0.000353


Read in the sub-sampled version, with each AAsub class getting equal depth

In [23]:
mi_codon_nuc_pos_eqsampled_df = pd.read_csv(mi_codon_nuc_pos_equal_AAsubsampled_filename,
                    sep="\t",
                    index_col=0)
mi_codon_nuc_pos_eqsampled_df.head()

Unnamed: 0,mi_p0,mi_p1,mi_p2,mi_p3,mi_p4,mi_p5,mi_p6,mi_p7,mi_p8,mi_p9,mi_p10,mi_p11,mi_p12,mi_p13,mi_p14,mi_p15,mi_p16,mi_p17,mi_p18,mi_p19,mi_p20,mi_p21,mi_p22,mi_p23,mi_p24,mi_p25,mi_p26,mi_p27,mi_p28,mi_p29,mi_p30,mi_p31,mi_p32,mi_p33,mi_p34,mi_p35,mi_p36,mi_p37,mi_p38,mi_p39,mi_p40,mi_p41,mi_p42,mi_p43,mi_p44,mi_p45,mi_p46,mi_p47,mi_p48,mi_p49,mi_p50,mi_p51,mi_p52,mi_p53,mi_p54,mi_p55,mi_p56,mi_p57,mi_p58,mi_p59,mi_p60,mi_p61,mi_p62,mi_p63,mi_p64,mi_p65,mi_p66,mi_p67,mi_p68,mi_p69,mi_p70,mi_p71,mi_p72,mi_p73,mi_p74,mi_p75,mi_p76,mi_p77,mi_p78,mi_p79,mi_p80,mi_p81,mi_p82,mi_p83,mi_p84,mi_p85,mi_p86,mi_p87,mi_p88,mi_p89,mi_p90,mi_p91,mi_p92,mi_p93,mi_p94,mi_p95,mi_p96,mi_p97,mi_p98,mi_p99,mi_p100
F,0.000832,0.000533,0.009951,0.000908,0.000305,0.010509,0.00092,0.000332,0.010284,0.000802,0.000422,0.011014,0.000997,0.000472,0.010762,0.00104,0.000456,0.010913,0.000933,0.000412,0.0106,0.000894,0.000397,0.010333,0.000995,0.000525,0.011167,0.000914,0.00031,0.011155,0.001029,0.000631,0.011772,0.000825,0.000525,0.011787,0.001052,0.000572,0.011625,0.001044,0.000415,0.011272,0.00095,0.000453,0.012711,0.00039,0.000407,0.019897,0.0,0.0,0.0,0.079904,0.002724,0.016992,0.000548,0.000567,0.013271,0.000972,0.000406,0.011828,0.000842,0.000318,0.012114,0.000848,0.000414,0.012673,0.00076,0.000283,0.012189,0.000688,0.000448,0.011519,0.000775,0.000497,0.011554,0.000776,0.000449,0.011273,0.000976,0.000481,0.010305,0.001037,0.000376,0.01103,0.000805,0.000486,0.010429,0.001005,0.000299,0.011005,0.000787,0.000395,0.010604,0.000881,0.000314,0.010305,0.001035,0.000463,0.009804,0.000843,0.000405
L2,0.000539,0.000212,0.003999,0.00059,0.00029,0.004325,0.000498,0.000304,0.004262,0.000558,0.000198,0.004015,0.000545,0.000351,0.004373,0.00057,0.000403,0.003712,0.000563,0.000242,0.00411,0.000404,0.00016,0.004367,0.000677,0.000399,0.003955,0.000418,0.000122,0.004506,0.000456,0.000251,0.00464,0.000499,0.000322,0.004485,0.000475,0.000193,0.00516,0.00043,7.5e-05,0.004513,0.000476,0.000527,0.004432,0.000679,0.000783,0.008151,0.0,0.0,0.0,0.004859,0.000318,0.004713,0.000706,0.000836,0.003862,0.000305,0.000407,0.004198,0.000664,0.00031,0.004445,0.000607,0.000331,0.004084,0.000413,0.000323,0.004092,0.000527,0.00025,0.004218,0.000594,0.000213,0.003869,0.000442,0.00036,0.004073,0.000466,0.000279,0.004263,0.000702,0.000329,0.003739,0.000386,0.00029,0.004056,0.000335,0.00036,0.00401,0.000578,0.000249,0.003645,0.000523,0.00028,0.003733,0.000508,0.000203,0.004014,0.000479,0.00019
L4,0.001019,0.000452,0.011491,0.001379,0.000623,0.012388,0.001248,0.000738,0.012703,0.001277,0.000384,0.012375,0.001351,0.000629,0.012269,0.001294,0.000659,0.012795,0.001651,0.000476,0.013324,0.001285,0.000321,0.01209,0.001319,0.00064,0.013385,0.001328,0.000444,0.012455,0.001722,0.000844,0.013905,0.00148,0.00061,0.014229,0.00122,0.000602,0.014777,0.001795,0.000508,0.014638,0.001754,0.001028,0.017174,0.001011,0.000922,0.016662,0.0,0.0,0.0,0.05194,0.005228,0.018799,0.00167,0.001127,0.015002,0.001432,0.000831,0.014633,0.001539,0.000739,0.013005,0.002007,0.000895,0.013905,0.002026,0.000592,0.013864,0.001414,0.000701,0.013205,0.001523,0.000471,0.013817,0.001537,0.000509,0.013013,0.001345,0.000716,0.012928,0.001623,0.000693,0.012866,0.001067,0.000635,0.011646,0.001526,0.000669,0.012555,0.001415,0.000655,0.011896,0.001447,0.000396,0.012369,0.001351,0.000659,0.011864,0.001112,0.000704
I,0.001014,0.000549,0.014678,0.000994,0.000487,0.014362,0.00115,0.000493,0.015268,0.001019,0.000705,0.014337,0.001201,0.000526,0.015575,0.001164,0.000358,0.015035,0.000975,0.000618,0.015226,0.00101,0.000607,0.015485,0.001057,0.000438,0.016168,0.001168,0.00048,0.01555,0.001002,0.000585,0.016277,0.001144,0.000439,0.016433,0.001185,0.000708,0.016491,0.000741,0.000678,0.016416,0.00063,0.00058,0.016394,0.000572,0.000827,0.031165,0.0,0.0,0.0,0.042626,0.002855,0.023398,0.000499,0.000905,0.018099,0.001069,0.000682,0.017113,0.001007,0.000632,0.016549,0.000887,0.000825,0.016682,0.000783,0.000526,0.015764,0.001033,0.000298,0.014576,0.000885,0.000487,0.016061,0.0014,0.000639,0.015673,0.001438,0.000446,0.015573,0.000822,0.000582,0.015027,0.001019,0.000558,0.015343,0.000983,0.000369,0.015147,0.000922,0.000283,0.014308,0.000974,0.000443,0.014688,0.001045,0.00043,0.013786,0.000931,0.000283
V,0.001139,0.000548,0.012262,0.001141,0.000449,0.012501,0.001016,0.000599,0.012202,0.00112,0.000519,0.012041,0.00128,0.000648,0.012831,0.001304,0.000533,0.013031,0.001062,0.000459,0.013438,0.001207,0.000613,0.013147,0.001166,0.00056,0.013495,0.00116,0.00048,0.013244,0.001132,0.000462,0.01371,0.001257,0.000529,0.014564,0.001282,0.000614,0.014668,0.001061,0.00052,0.014642,0.001035,0.000948,0.016582,0.000838,0.000994,0.019098,0.0,0.0,0.0,0.049745,0.004402,0.019226,0.000971,0.000927,0.015577,0.001043,0.00064,0.014453,0.001346,0.000718,0.014307,0.001607,0.000676,0.014304,0.001514,0.000612,0.01482,0.001319,0.000635,0.014132,0.001206,0.000549,0.01317,0.001234,0.000531,0.012932,0.001189,0.000551,0.013544,0.001117,0.000595,0.013291,0.001036,0.000424,0.013587,0.001008,0.000432,0.012709,0.001316,0.000571,0.012723,0.001299,0.000639,0.012583,0.001229,0.000457,0.012652,0.000991,0.000349


Summarize with average:

In [24]:
aa_sub_mi_nuc_avg = pd.concat([mi_codon_nuc_pos_df.loc[:,"mi_p0":"mi_p48"],
                               mi_codon_nuc_pos_df.loc[:,"mi_p51":]],
                              axis=1).\
                                  mean(axis=1)
                                  
aa_sub_mi_nuc_eq_avg = pd.concat([mi_codon_nuc_pos_eqsampled_df.loc[:,"mi_p0":"mi_p48"],
                                  mi_codon_nuc_pos_eqsampled_df.loc[:,"mi_p51":]],
                              axis=1).\
                                  mean(axis=1)

In [25]:
#Calculation without +1 MI values
aa_sub_mi_nuc_nop1_avg = pd.concat([mi_codon_nuc_pos_df.loc[:,"mi_p0":"mi_p48"],
                                    mi_codon_nuc_pos_df.loc[:,"mi_p52":]],
                                   axis=1).\
                                       mean(axis=1)
                                       
aa_sub_mi_nuc_nop1_eq_avg = pd.concat([mi_codon_nuc_pos_eqsampled_df.loc[:,"mi_p0":"mi_p48"],
                                       mi_codon_nuc_pos_eqsampled_df.loc[:,"mi_p52":]],
                                       axis=1).\
                                        mean(axis=1)

In [26]:
aa_attrib_combined_df = aa_attrib_df.set_index("AminoAcid_sub")

aa_attrib_combined_df["MI_p1"] = mi_codon_nuc_pos_df.loc[:,"mi_p51"]
aa_attrib_combined_df["MI_avg"] = aa_sub_mi_nuc_avg
aa_attrib_combined_df["MI_nop1_avg"] = aa_sub_mi_nuc_nop1_avg

aa_attrib_combined_df["MI_eq_p1"] = mi_codon_nuc_pos_eqsampled_df.loc[:,"mi_p51"]
aa_attrib_combined_df["MI_eq_avg"] = aa_sub_mi_nuc_eq_avg
aa_attrib_combined_df["MI_eq_nop1_avg"] = aa_sub_mi_nuc_nop1_eq_avg

In [27]:
aa_attrib_combined_df

Unnamed: 0_level_0,Degeneracy,GC_bases,Freq_per_k,h,AminoAcid,Polarity,MI_p1,MI_avg,MI_nop1_avg,MI_eq_p1,MI_eq_avg,MI_eq_nop1_avg
AminoAcid_sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
F,2.0,0.5,37.9,0.691474,F,Nonpolar,0.079302,0.005042,0.004284,0.079904,0.005022,0.004258
L2,2.0,0.5,20.6,0.664786,L,Nonpolar,0.004859,0.001716,0.001684,0.004859,0.001716,0.001684
L4,4.0,1.5,79.6,1.211468,L,Nonpolar,0.051991,0.005683,0.00521,0.05194,0.005619,0.005146
I,3.0,0.333333,44.3,1.031249,I,Nonpolar,0.043454,0.006253,0.005874,0.042626,0.006236,0.005864
V,4.0,1.5,60.7,1.266124,V,Nonpolar,0.049554,0.005621,0.005172,0.049745,0.005616,0.005166
S2,2.0,1.5,31.6,0.670166,S,Polar,0.068942,0.004801,0.004146,0.069519,0.004799,0.004139
S4,4.0,1.5,49.5,1.292105,S,Polar,0.072003,0.005417,0.004738,0.072213,0.005477,0.004796
P,4.0,2.5,61.1,1.325112,P,Nonpolar,0.0527,0.00553,0.005049,0.052911,0.005595,0.005113
T,4.0,1.5,53.2,1.318265,T,Polar,0.065148,0.005225,0.004614,0.064347,0.005249,0.004646
A,4.0,2.5,69.3,1.291111,A,Nonpolar,0.069737,0.006613,0.005969,0.069473,0.006602,0.00596


In [28]:
aa_attrib_syn_df = aa_attrib_combined_df.dropna(axis=0, how="any")

In [29]:
aa_attrib_syn_df

Unnamed: 0_level_0,Degeneracy,GC_bases,Freq_per_k,h,AminoAcid,Polarity,MI_p1,MI_avg,MI_nop1_avg,MI_eq_p1,MI_eq_avg,MI_eq_nop1_avg
AminoAcid_sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
F,2.0,0.5,37.9,0.691474,F,Nonpolar,0.079302,0.005042,0.004284,0.079904,0.005022,0.004258
L2,2.0,0.5,20.6,0.664786,L,Nonpolar,0.004859,0.001716,0.001684,0.004859,0.001716,0.001684
L4,4.0,1.5,79.6,1.211468,L,Nonpolar,0.051991,0.005683,0.00521,0.05194,0.005619,0.005146
I,3.0,0.333333,44.3,1.031249,I,Nonpolar,0.043454,0.006253,0.005874,0.042626,0.006236,0.005864
V,4.0,1.5,60.7,1.266124,V,Nonpolar,0.049554,0.005621,0.005172,0.049745,0.005616,0.005166
S2,2.0,1.5,31.6,0.670166,S,Polar,0.068942,0.004801,0.004146,0.069519,0.004799,0.004139
S4,4.0,1.5,49.5,1.292105,S,Polar,0.072003,0.005417,0.004738,0.072213,0.005477,0.004796
P,4.0,2.5,61.1,1.325112,P,Nonpolar,0.0527,0.00553,0.005049,0.052911,0.005595,0.005113
T,4.0,1.5,53.2,1.318265,T,Polar,0.065148,0.005225,0.004614,0.064347,0.005249,0.004646
A,4.0,2.5,69.3,1.291111,A,Nonpolar,0.069737,0.006613,0.005969,0.069473,0.006602,0.00596


## Save table

In [30]:
aa_attrib_syn_df.to_csv(aa_atribute_filename,
                        sep="\t",
                        index=True)

# Comparing codon-nucleotide and codon-cxtCodon distributions

In [7]:
mi_codon_nuc_cxtCodon_df = pd.read_csv(mi_codon_nuc_cxtCodon_filename,
                                       sep="\t",
                                       index_col=0)
mi_codon_nuc_cxtCodon_df.head()

Unnamed: 0,mi_p0,mi_p1,mi_p2,mi_p3,mi_p4,mi_p5,mi_p6,mi_p7,mi_p8,mi_p9,mi_p10,mi_p11,mi_p12,mi_p13,mi_p14,mi_p15,mi_p16,mi_p17,mi_p18,mi_p19,mi_p20,mi_p21,mi_p22,mi_p23,mi_p24,mi_p25,mi_p26,mi_p27,mi_p28,mi_p29,mi_p30,mi_p31,mi_p32
F,0.012635,0.013239,0.013034,0.0134,0.013162,0.013461,0.013205,0.013001,0.013864,0.013426,0.01451,0.014387,0.014466,0.013615,0.015658,0.021577,0.0,0.102561,0.015819,0.014798,0.014966,0.015061,0.01454,0.013868,0.014451,0.013781,0.013291,0.013704,0.012951,0.013113,0.0127,0.012737,0.012444
L2,0.005253,0.005883,0.00579,0.005353,0.00592,0.005321,0.005702,0.005568,0.005681,0.005738,0.006107,0.0061,0.006538,0.005745,0.006206,0.010281,0.0,0.011692,0.005935,0.005611,0.005998,0.005756,0.005451,0.005708,0.005392,0.005561,0.00564,0.005558,0.005343,0.005304,0.005191,0.005202,0.005367
L4,0.014579,0.015569,0.016113,0.01606,0.016259,0.016543,0.01729,0.016028,0.017357,0.015644,0.018047,0.018316,0.018244,0.018352,0.020854,0.022219,0.0,0.079854,0.019718,0.019443,0.017835,0.018838,0.01805,0.017654,0.016813,0.017054,0.016193,0.016757,0.015391,0.015838,0.015676,0.016052,0.015596
I,0.018323,0.017503,0.0186,0.017773,0.018944,0.018486,0.018506,0.019336,0.019368,0.018364,0.019968,0.020075,0.020228,0.020337,0.01945,0.033572,0.0,0.075037,0.021681,0.020952,0.019639,0.020195,0.019167,0.018286,0.019126,0.019155,0.019253,0.018213,0.018999,0.018282,0.017305,0.017685,0.016867
V,0.015449,0.015113,0.015479,0.015599,0.01641,0.016319,0.016343,0.016358,0.016759,0.016354,0.016758,0.017713,0.017742,0.018245,0.020529,0.02426,0.0,0.078504,0.019338,0.018462,0.018781,0.018223,0.017773,0.017379,0.016513,0.016701,0.016711,0.016456,0.016651,0.01565,0.015986,0.015648,0.015762


In [8]:
mi_codon_nuc_pos_df = pd.read_csv(mi_codon_nuc_pos_filename,
                    sep="\t",
                    index_col=0)
mi_codon_nuc_pos_df.head()

Unnamed: 0,mi_p0,mi_p1,mi_p2,mi_p3,mi_p4,mi_p5,mi_p6,mi_p7,mi_p8,mi_p9,mi_p10,mi_p11,mi_p12,mi_p13,mi_p14,mi_p15,mi_p16,mi_p17,mi_p18,mi_p19,mi_p20,mi_p21,mi_p22,mi_p23,mi_p24,mi_p25,mi_p26,mi_p27,mi_p28,mi_p29,mi_p30,mi_p31,mi_p32,mi_p33,mi_p34,mi_p35,mi_p36,mi_p37,mi_p38,mi_p39,mi_p40,mi_p41,mi_p42,mi_p43,mi_p44,mi_p45,mi_p46,mi_p47,mi_p48,mi_p49,mi_p50,mi_p51,mi_p52,mi_p53,mi_p54,mi_p55,mi_p56,mi_p57,mi_p58,mi_p59,mi_p60,mi_p61,mi_p62,mi_p63,mi_p64,mi_p65,mi_p66,mi_p67,mi_p68,mi_p69,mi_p70,mi_p71,mi_p72,mi_p73,mi_p74,mi_p75,mi_p76,mi_p77,mi_p78,mi_p79,mi_p80,mi_p81,mi_p82,mi_p83,mi_p84,mi_p85,mi_p86,mi_p87,mi_p88,mi_p89,mi_p90,mi_p91,mi_p92,mi_p93,mi_p94,mi_p95,mi_p96,mi_p97,mi_p98,mi_p99,mi_p100
F,0.000898,0.000489,0.009987,0.001003,0.000359,0.01064,0.000931,0.000365,0.010564,0.000725,0.000419,0.011088,0.000963,0.000489,0.010566,0.001002,0.000422,0.011028,0.000925,0.000443,0.010698,0.000909,0.000391,0.010388,0.00112,0.000528,0.0112,0.000808,0.000342,0.011028,0.000986,0.000635,0.011698,0.000833,0.000504,0.011783,0.000962,0.000566,0.011691,0.001004,0.000433,0.011142,0.001151,0.000401,0.012995,0.00044,0.000426,0.019747,0.0,0.0,0.0,0.079302,0.002875,0.016938,0.000619,0.000724,0.013166,0.001017,0.000413,0.012028,0.000908,0.000336,0.012353,0.000905,0.000386,0.012572,0.000792,0.000384,0.012184,0.000828,0.000448,0.011508,0.0009,0.000451,0.011779,0.00092,0.000447,0.011388,0.000901,0.000543,0.010595,0.001046,0.000441,0.011051,0.000803,0.000529,0.010391,0.0009,0.000319,0.010848,0.000779,0.000408,0.010414,0.000811,0.000316,0.010542,0.00097,0.000494,0.010054,0.000934,0.000355
L2,0.000539,0.000212,0.003999,0.00059,0.00029,0.004325,0.000498,0.000304,0.004262,0.000558,0.000198,0.004015,0.000545,0.000351,0.004373,0.00057,0.000403,0.003712,0.000563,0.000242,0.00411,0.000404,0.00016,0.004367,0.000677,0.000399,0.003955,0.000418,0.000122,0.004506,0.000456,0.000251,0.00464,0.000499,0.000322,0.004485,0.000475,0.000193,0.00516,0.00043,7.5e-05,0.004513,0.000476,0.000527,0.004432,0.000679,0.000783,0.008151,0.0,0.0,0.0,0.004859,0.000318,0.004713,0.000706,0.000836,0.003862,0.000305,0.000407,0.004198,0.000664,0.00031,0.004445,0.000607,0.000331,0.004084,0.000413,0.000323,0.004092,0.000527,0.00025,0.004218,0.000594,0.000213,0.003869,0.000442,0.00036,0.004073,0.000466,0.000279,0.004263,0.000702,0.000329,0.003739,0.000386,0.00029,0.004056,0.000335,0.00036,0.00401,0.000578,0.000249,0.003645,0.000523,0.00028,0.003733,0.000508,0.000203,0.004014,0.000479,0.00019
L4,0.000987,0.000427,0.011659,0.001517,0.000601,0.012052,0.001353,0.000704,0.01254,0.00162,0.000365,0.01261,0.001504,0.000714,0.012711,0.001351,0.000674,0.012898,0.001616,0.000372,0.013479,0.001363,0.000406,0.012698,0.001466,0.000723,0.01381,0.001293,0.00039,0.012474,0.001729,0.000746,0.014175,0.001525,0.000644,0.014679,0.001221,0.000595,0.014705,0.001718,0.000473,0.014637,0.001856,0.001003,0.0167,0.001071,0.000878,0.017092,0.0,0.0,0.0,0.051991,0.005053,0.018732,0.00175,0.001014,0.014966,0.001566,0.000915,0.014623,0.001497,0.000861,0.013631,0.002018,0.00075,0.014492,0.001992,0.000676,0.013794,0.001316,0.000826,0.013616,0.001515,0.000528,0.013235,0.001502,0.000461,0.013534,0.001454,0.000737,0.012742,0.001594,0.000689,0.013112,0.001002,0.000564,0.012315,0.001499,0.000749,0.012379,0.001438,0.000573,0.012421,0.001457,0.000351,0.0128,0.001292,0.00066,0.012302,0.001159,0.000642
I,0.001146,0.000506,0.014883,0.001026,0.000408,0.014158,0.001087,0.000484,0.015167,0.001115,0.000634,0.014609,0.001166,0.000515,0.015609,0.001141,0.000452,0.015048,0.000951,0.000588,0.015302,0.001004,0.000554,0.016013,0.001053,0.00049,0.015899,0.001154,0.00045,0.014995,0.000979,0.000617,0.016382,0.001064,0.000478,0.016954,0.00113,0.000651,0.016597,0.00077,0.000604,0.016623,0.000616,0.000575,0.016253,0.00057,0.000877,0.030607,0.0,0.0,0.0,0.043454,0.002786,0.023063,0.000601,0.000856,0.018227,0.001057,0.000679,0.017351,0.000974,0.000592,0.016523,0.000922,0.0008,0.016712,0.00075,0.000537,0.016033,0.000936,0.000309,0.015111,0.001032,0.000496,0.015973,0.001193,0.000615,0.015593,0.001376,0.00047,0.015802,0.000834,0.000525,0.015084,0.000978,0.000511,0.015495,0.000967,0.000413,0.015123,0.000907,0.000351,0.014378,0.00096,0.000475,0.014742,0.001083,0.00045,0.013722,0.000965,0.000325
V,0.001151,0.00048,0.012516,0.001129,0.000423,0.012237,0.001092,0.000598,0.012349,0.00112,0.00052,0.012468,0.001169,0.000555,0.013212,0.001276,0.000452,0.013229,0.001088,0.000467,0.013312,0.001172,0.000591,0.01293,0.00122,0.000511,0.013484,0.001124,0.000479,0.013294,0.001165,0.000549,0.013437,0.001117,0.000576,0.014391,0.001182,0.000587,0.014479,0.00108,0.000543,0.01485,0.000998,0.000994,0.017043,0.000792,0.000928,0.019085,0.0,0.0,0.0,0.049554,0.004303,0.018739,0.001068,0.000999,0.015423,0.001137,0.000637,0.014742,0.001376,0.000709,0.014815,0.00159,0.000654,0.014231,0.001423,0.000595,0.014251,0.001288,0.000619,0.013858,0.001222,0.000498,0.013373,0.001234,0.000572,0.013446,0.001224,0.000585,0.013363,0.001083,0.000618,0.013392,0.001195,0.000545,0.013444,0.000984,0.000473,0.012735,0.00128,0.000552,0.01285,0.001201,0.000514,0.012598,0.001207,0.000445,0.012853,0.001141,0.000353


Write new table by taking codon-nucleotide MI table and summing per-position values by the codon they compose in the context

In [9]:
num_cpos = mi_codon_nuc_cxtCodon_df.shape[1]
mi_codon_nuc3_pos_df = pd.DataFrame(np.add.reduceat(mi_codon_nuc_pos_df.values, 
                                                    np.arange(len(mi_codon_nuc_pos_df.columns))[:(3*num_cpos):3], 
                                                    axis=1),
                                    index=mi_codon_nuc_pos_df.index)
mi_codon_nuc3_pos_df.columns = mi_codon_nuc_cxtCodon_df.columns
mi_codon_nuc3_pos_df.head()


Unnamed: 0,mi_p0,mi_p1,mi_p2,mi_p3,mi_p4,mi_p5,mi_p6,mi_p7,mi_p8,mi_p9,mi_p10,mi_p11,mi_p12,mi_p13,mi_p14,mi_p15,mi_p16,mi_p17,mi_p18,mi_p19,mi_p20,mi_p21,mi_p22,mi_p23,mi_p24,mi_p25,mi_p26,mi_p27,mi_p28,mi_p29,mi_p30,mi_p31,mi_p32
F,0.011375,0.012002,0.011859,0.012232,0.012018,0.012452,0.012066,0.011689,0.012847,0.012178,0.013319,0.01312,0.013219,0.012579,0.014548,0.020612,0.0,0.099115,0.014509,0.013459,0.013597,0.013862,0.01336,0.012784,0.01313,0.012755,0.012039,0.012539,0.011723,0.012066,0.011602,0.01167,0.012809
L2,0.00475,0.005205,0.005064,0.004771,0.005269,0.004684,0.004915,0.004931,0.005032,0.005046,0.005347,0.005306,0.005828,0.005018,0.005434,0.009613,0.0,0.009889,0.005405,0.00491,0.005419,0.005022,0.004829,0.004995,0.004676,0.004876,0.005008,0.00477,0.004732,0.004705,0.004472,0.004536,0.005395
L4,0.013073,0.01417,0.014596,0.014595,0.014929,0.014923,0.015467,0.014467,0.015999,0.014157,0.01665,0.016847,0.016521,0.016827,0.019559,0.019041,0.0,0.075777,0.01773,0.017104,0.01599,0.01726,0.016462,0.015759,0.015278,0.015497,0.014934,0.015395,0.013881,0.014627,0.014432,0.014608,0.016057
I,0.016535,0.015592,0.016737,0.016358,0.017289,0.016642,0.016841,0.017571,0.017443,0.016599,0.017978,0.018496,0.018378,0.017997,0.017444,0.032054,0.0,0.069303,0.019684,0.019087,0.018089,0.018435,0.017321,0.016356,0.017501,0.017401,0.017648,0.016443,0.016983,0.016503,0.015637,0.016177,0.016546
V,0.014147,0.013789,0.014039,0.014108,0.014937,0.014957,0.014868,0.014692,0.015215,0.014897,0.015151,0.016084,0.016248,0.016473,0.019035,0.020805,0.0,0.072596,0.01749,0.016516,0.016901,0.016475,0.016269,0.015765,0.015093,0.015252,0.015173,0.015093,0.015184,0.014191,0.014682,0.014313,0.015999


Write modified codon-nucleotide table:

In [10]:
mi_codon_nuc3_pos_df.to_csv(mi_codon_3nuc_pos_filename,
                            sep="\t",
                            index=True)

# Aggregating results on shuffled data

In [34]:
#Load set of all tables
mi_shuffled_dfs = [pd.read_csv(x, sep="\t", index_col=0).reset_index() 
                   for x in mi_shuffled_filenames]

In [35]:
mi_shuffled_dfs[10].head()

Unnamed: 0,index,mi_p0,mi_p1,mi_p2,mi_p3,mi_p4,mi_p5,mi_p6,mi_p7,mi_p8,mi_p9,mi_p10,mi_p11,mi_p12,mi_p13,mi_p14,mi_p15,mi_p16,mi_p17,mi_p18,mi_p19,mi_p20,mi_p21,mi_p22,mi_p23,mi_p24,mi_p25,mi_p26,mi_p27,mi_p28,mi_p29,mi_p30,mi_p31,mi_p32,mi_p33,mi_p34,mi_p35,mi_p36,mi_p37,mi_p38,mi_p39,mi_p40,mi_p41,mi_p42,mi_p43,mi_p44,mi_p45,mi_p46,mi_p47,mi_p48,mi_p49,mi_p50,mi_p51,mi_p52,mi_p53,mi_p54,mi_p55,mi_p56,mi_p57,mi_p58,mi_p59,mi_p60,mi_p61,mi_p62,mi_p63,mi_p64,mi_p65,mi_p66,mi_p67,mi_p68,mi_p69,mi_p70,mi_p71,mi_p72,mi_p73,mi_p74,mi_p75,mi_p76,mi_p77,mi_p78,mi_p79,mi_p80,mi_p81,mi_p82,mi_p83,mi_p84,mi_p85,mi_p86,mi_p87,mi_p88,mi_p89,mi_p90,mi_p91,mi_p92,mi_p93,mi_p94,mi_p95,mi_p96,mi_p97,mi_p98,mi_p99,mi_p100
0,F,4e-06,3.209375e-07,2e-06,2e-06,3e-06,9.538165e-07,6e-06,7e-06,8e-06,1.1e-05,5e-06,4e-06,1.4e-05,2e-06,2e-06,9e-06,1e-05,3e-06,5e-06,4e-06,5.427969e-06,2e-06,6.41927e-07,6.298585e-07,8e-06,2e-06,4.805664e-07,4e-06,6e-06,1.564219e-05,8e-06,8e-06,4e-06,9.669201e-07,3e-06,2e-06,1e-06,5.61956e-07,2.962721e-07,3e-06,8e-06,1.1e-05,5e-06,5.965733e-07,9.565296e-07,4e-06,4e-06,3e-06,0.0,0.0,0.0,6e-06,6e-06,2e-06,1e-06,3e-06,2e-06,4e-06,3e-06,7e-06,6e-06,2e-06,6e-06,9e-06,7e-06,8e-06,5e-06,2e-06,3e-06,2e-06,2e-06,4e-06,3e-06,4.656436e-07,2e-06,1e-06,3e-06,1.808782e-06,6e-06,9.887212e-07,2e-06,9e-06,2e-06,5.907694e-07,1e-05,1.5e-05,7e-06,3e-06,1e-06,5e-06,2.226484e-06,2e-06,2e-06,3e-06,1e-06,5e-06,8.148896e-06,4.329316e-07,5e-06,4e-06,3e-06
1,L2,2.2e-05,1.333883e-05,6e-06,2e-06,4e-06,4.353046e-06,2e-06,7e-06,1.4e-05,2e-06,5e-06,5e-06,3e-06,4e-06,1.6e-05,4e-06,5e-06,7e-06,7e-06,2.3e-05,4.84072e-07,1e-06,1.18825e-05,6.671089e-06,4e-06,3e-06,8.38775e-06,9e-06,5e-06,7.604478e-07,3e-06,1.9e-05,1.2e-05,3.989273e-06,7e-06,9e-06,8e-06,4.351895e-06,8.160709e-06,1e-06,1.4e-05,1.4e-05,4e-06,9.356943e-06,1.367973e-05,6e-06,3e-06,1.2e-05,0.0,0.0,0.0,4e-06,7e-06,2e-06,8e-06,1e-06,5e-06,5e-06,2.8e-05,1.5e-05,2e-06,2e-05,2e-06,3e-06,1e-05,4e-06,7e-06,1.1e-05,4e-06,4e-06,3e-06,9e-06,7e-06,8.119894e-06,1.5e-05,9e-06,3e-06,9.72984e-07,3e-06,2.867274e-06,2.1e-05,1e-05,4e-06,6.345449e-06,1.7e-05,3e-06,4e-06,4e-06,2e-06,4e-06,7.04193e-07,7e-06,9e-06,1.1e-05,1.2e-05,1e-06,9.72239e-07,9.816336e-06,1e-05,4e-06,5e-06
2,L4,6e-06,7.87945e-06,4e-06,2e-06,5e-06,5.674426e-06,3e-06,5e-06,5e-06,4e-06,5e-06,6e-06,5e-06,9e-06,3e-06,5e-06,8e-06,9e-06,7e-06,3e-06,2.556082e-06,7e-06,5.892078e-06,7.909011e-06,7e-06,3e-06,4.057815e-06,4e-06,7e-06,4.20567e-06,6e-06,4e-06,5e-06,6.04641e-06,4e-06,3e-06,9e-06,6.988698e-06,4.068942e-06,7e-06,8e-06,8e-06,1e-05,4.229697e-06,5.96725e-06,4e-06,8e-06,4e-06,0.0,0.0,0.0,1e-05,4e-06,5e-06,4e-06,5e-06,7e-06,2e-06,1e-06,8e-06,8e-06,3e-06,2e-06,8e-06,3e-06,3e-06,7e-06,3e-06,5e-06,4e-06,3e-06,9e-06,4e-06,2.856274e-06,6e-06,4e-06,8e-06,8.654112e-06,3e-06,9.633103e-06,4e-06,5e-06,9e-06,9.19686e-06,4e-06,5e-06,4e-06,5e-06,9e-06,1.3e-05,6.203641e-06,1.1e-05,1e-05,5e-06,8e-06,9e-06,8.812974e-06,4.682384e-06,3e-06,2e-06,8e-06
3,I,3e-06,6.897066e-06,9e-06,8e-06,3e-06,1.009146e-05,1.1e-05,5e-06,1.1e-05,4e-06,3e-06,8e-06,4e-06,6e-06,5e-06,1.1e-05,5e-06,1e-06,5e-06,9e-06,7.803661e-06,3.3e-05,5.316826e-06,7.498001e-06,1.1e-05,4e-06,1.841373e-06,9e-06,5e-06,3.003665e-06,4e-06,6e-06,4e-06,9.817693e-06,7e-06,3e-06,2e-06,4.303896e-06,6.259928e-06,5e-06,3e-06,4e-06,1.5e-05,1.02898e-05,7.388242e-06,8e-06,1.6e-05,2e-06,0.0,0.0,0.0,3e-06,1.1e-05,1e-05,6e-06,1.3e-05,6e-06,8e-06,4e-06,6e-06,6e-06,3e-06,1e-06,4e-06,2e-06,4e-06,9e-06,2e-06,2e-06,1.4e-05,9e-06,8e-06,1e-05,5.154198e-06,3e-06,6e-06,2e-06,7.083426e-06,3e-06,2.685056e-06,2e-06,9e-06,3e-06,1.304164e-06,6e-06,1e-05,6e-06,1e-05,5e-06,5e-06,3.612371e-06,9e-06,5e-06,8e-06,4e-06,4e-06,3.643022e-06,1.327186e-05,6e-06,1.1e-05,4e-06
4,V,2e-06,7.191717e-06,8e-06,1.6e-05,7e-06,2.895873e-06,6e-06,5e-06,1.3e-05,1.3e-05,1.2e-05,4e-06,6e-06,3e-06,1.1e-05,4e-06,1e-05,4e-06,5e-06,4e-06,2.105933e-06,2e-06,8.391722e-06,7.574882e-06,5e-06,5e-06,1.81371e-06,5e-06,7e-06,1.423904e-06,6e-06,8e-06,6e-06,9.574959e-06,7e-06,7e-06,5e-06,5.532069e-06,5.667002e-06,5e-06,8e-06,4e-06,3e-06,1.11521e-05,7.855856e-06,4e-06,8e-06,6e-06,0.0,0.0,0.0,5e-06,9e-06,1e-05,1e-05,9e-06,9e-06,7e-06,6e-06,4e-06,9e-06,5e-06,7e-06,8e-06,8e-06,5e-06,9e-06,1.6e-05,4e-06,5e-06,1e-05,1e-05,7e-06,6.599615e-06,4e-06,1.2e-05,1.2e-05,5.119298e-06,8e-06,7.014963e-06,5e-06,1.2e-05,9e-06,3.961944e-06,7e-06,1.2e-05,2e-06,1.2e-05,5e-06,7e-06,7.662609e-06,5e-06,7e-06,8e-06,9e-06,8e-06,9.327832e-06,1.172104e-05,1e-05,1.8e-05,3e-06


In [36]:
#convert to long format table
mi_shuffled_long_dfs = [pd.wide_to_long(x, stubnames='mi_p', 
                             i=['index'], 
                             j='position_index').\
                                 reset_index().\
                                 rename(columns={"index":"amino_acid"}).\
                                     assign(shuffle=i)
                                 for i,x in enumerate(mi_shuffled_dfs)]

In [37]:
mi_shuffled_long_df = pd.concat(mi_shuffled_long_dfs,
                                ignore_index=True,
                                axis=0)
print(mi_shuffled_long_df.shape)
mi_shuffled_long_df.head()

(212100, 4)


Unnamed: 0,amino_acid,position_index,mi_p,shuffle
0,F,0,4e-06,0
1,L2,0,9e-06,0
2,L4,0,2e-06,0
3,I,0,9e-06,0
4,V,0,1.2e-05,0


Write collated shuffled MI data:

In [38]:
mi_shuffled_long_df.to_csv(mi_shuffled_long_filename,
                           index=False,
                           sep="\t")