**Purpose**

Read in CMI table for $I_A(C,N_i|GC)$, measure range for dropoff in CMI per amino acid, write out table, and print average for upstream and downstream ranges.

# Setup

In [1]:
import pandas as pd
import numpy as np

## Files read

In [2]:
cond_mut_inf_gc_filename = "../../data/2_conditional_mutual_information/cond_mut_inf_codon_nuc_pos_var_GCcount_20bins.tsv"

## Files written

In [3]:
cmi_range_gc_filename = "../../data/2_conditional_mutual_information/cmi_codon_nuc_pos_GCcount_20bins_range.tsv"

# Parameters

Define the indices for the context sequence

In [4]:
seq_length = 101 #total context length (should correspond with CMI ranges)

upstream_start = 0 #site furthest from central codon, upstream
downstream_start = 99 #site furthest from central codon, downstream (last full codon)

upstream_end = 48 #C1
downstream_end = 51 #+1

Set parameters for how many sites to average and advance through, and threshold set against the CMI mean

In [5]:
std_factor = 2 #number of standard deviations to use as threshold
cp3_baseline = 8 #index of CP3 position to measure average after (inclusive)

# Measure CMI range against mean

In [6]:
cond_mut_inf_gc_df = pd.read_csv(cond_mut_inf_gc_filename,
                                 sep="\t",
                                 index_col=0)
cond_mut_inf_gc_df.head()

Unnamed: 0,cmi_p0,cmi_p1,cmi_p2,cmi_p3,cmi_p4,cmi_p5,cmi_p6,cmi_p7,cmi_p8,cmi_p9,...,cmi_p91,cmi_p92,cmi_p93,cmi_p94,cmi_p95,cmi_p96,cmi_p97,cmi_p98,cmi_p99,cmi_p100
F,0.000197,0.000333,0.000693,0.000113,0.000304,0.000865,0.00017,0.000288,0.000843,0.000223,...,0.000228,0.000828,0.00016,0.000288,0.000826,0.000135,0.000267,0.000835,0.000157,0.000357
L2,0.000234,0.00032,0.000432,0.000159,0.00023,0.000525,0.000195,0.000304,0.000476,0.000238,...,0.000339,0.000375,0.000174,0.000342,0.000338,0.000183,0.00032,0.000446,0.000225,0.000401
L4,0.000404,0.000633,0.000861,0.000334,0.000477,0.000917,0.000228,0.000472,0.001036,0.000371,...,0.000526,0.001014,0.000295,0.000597,0.00111,0.000256,0.000392,0.00105,0.000262,0.00057
I,0.000435,0.000473,0.001349,0.000315,0.00044,0.001165,0.000263,0.000469,0.001395,0.000353,...,0.000556,0.001202,0.000283,0.000519,0.001344,0.000324,0.00051,0.00117,0.000269,0.000528
V,0.000345,0.000582,0.001055,0.000229,0.000672,0.001003,0.000291,0.000559,0.001048,0.000346,...,0.000495,0.001138,0.000271,0.000649,0.001093,0.000235,0.000651,0.001194,0.00024,0.000653


In [7]:
cmi_range_gc_rows = []

for amin in cond_mut_inf_gc_df.index :
    print(amin)
    #-upstream
    #--reverse sequence order so smaller index is closer to central codon
    vals_up = cond_mut_inf_gc_df.loc[amin][upstream_start:upstream_end][::-1]
    #
    vals_cp3_up = vals_up[slice(0,len(vals_up),3)]
    avg_up = np.mean(vals_cp3_up[cp3_baseline:])
    std_up = np.std(vals_cp3_up[cp3_baseline:])
    threshold_up = avg_up+(std_factor*std_up)
    
    last_outlier_up = 0
    for i, w_i in enumerate(vals_cp3_up[:cp3_baseline]) :
        if w_i > threshold_up :
            last_outlier_up = i
        else :
            break
    print(" up:", threshold_up, "- last above:", vals_cp3_up[:cp3_baseline][last_outlier_up])

    #-downstream
    vals_dn = cond_mut_inf_gc_df.loc[amin][downstream_end:downstream_start]
    vals_cp3_dn = vals_dn[slice(2,upstream_end,3)]
    avg_dn = np.mean(vals_cp3_dn[cp3_baseline:])
    std_dn = np.std(vals_cp3_dn[cp3_baseline:])
    threshold_dn = avg_dn+(std_factor*std_dn)
    
    last_outlier_dn = 0
    for i, w_i in enumerate(vals_cp3_dn[:cp3_baseline]) :
        if w_i > threshold_dn :
            last_outlier_dn = i
        else :
            break
    print(" down:", threshold_dn, "- last above:", vals_cp3_dn[:cp3_baseline][last_outlier_dn])

    
    cmi_range_gc_row = {"AminoAcid":amin,
                        "CMI_base_avg_up":avg_up,
                        "CMI_base_sd_up":std_up,
                        "CMI_base_avg_down":avg_dn,
                        "CMI_base_sd_down":std_dn,
                        "cp3_range_up":last_outlier_up,
                        "cp3_range_down":last_outlier_dn}
    
    
    cmi_range_gc_rows.append(cmi_range_gc_row)

F
 up: 0.0009855310360932976 - last above: 0.0010561598293782
 down: 0.0011022029342183713 - last above: 0.0013424615971593
L2
 up: 0.0005717289036917257 - last above: 0.0006215996916948
 down: 0.0005448353775185549 - last above: 0.0006955600158316
L4
 up: 0.001185099578838532 - last above: 0.0016269444835871
 down: 0.0012015131315230728 - last above: 0.0014067634383998
I
 up: 0.0016115369286952438 - last above: 0.0017331500566912
 down: 0.001600864257127963 - last above: 0.0016870269146048
V
 up: 0.0012281296335950125 - last above: 0.0012309584858742
 down: 0.0012849682836102276 - last above: 0.0013864718209946
S4
 up: 0.00180907499955456 - last above: 0.0019203396056789
 down: 0.0018059393512874894 - last above: 0.0020488145808716
S2
 up: 0.001051730903907567 - last above: 0.0009759908907763
 down: 0.001115660028216264 - last above: 0.0017272313091495
P
 up: 0.0024661242302758146 - last above: 0.002705434549237
 down: 0.002592784474619938 - last above: 0.002601656625295
T
 up: 0.0017

Convert to data frame

In [8]:
cmi_range_gc_df = pd.DataFrame(cmi_range_gc_rows)
cmi_range_gc_df.head()

Unnamed: 0,AminoAcid,CMI_base_avg_up,CMI_base_sd_up,CMI_base_avg_down,CMI_base_sd_down,cp3_range_up,cp3_range_down
0,F,0.000847,6.9e-05,0.000919,9.2e-05,5,5
1,L2,0.000437,6.7e-05,0.000439,5.3e-05,1,0
2,L4,0.001007,8.9e-05,0.001079,6.1e-05,5,6
3,I,0.001367,0.000122,0.001359,0.000121,4,5
4,V,0.001095,6.6e-05,0.00117,5.8e-05,6,6


In [9]:
cmi_range_gc_df.query("AminoAcid == 'G'")

Unnamed: 0,AminoAcid,CMI_base_avg_up,CMI_base_sd_up,CMI_base_avg_down,CMI_base_sd_down,cp3_range_up,cp3_range_down
20,G,0.002041,0.000159,0.001979,0.000201,3,5


Calculate mean ranges for upstream and downstream directions

In [10]:
cmi_range_gc_df["cp3_range_up"].mean()

3.761904761904762

In [11]:
cmi_range_gc_df["cp3_range_down"].mean()

3.9523809523809526

## Save CMI range table

In [12]:
cmi_range_gc_df.to_csv(cmi_range_gc_filename,
                       sep="\t",
                       index=False)