In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

import statsmodels.formula.api as smf

from matplotlib import pyplot as plt

# Import data

This data comes from this paper: https://www.nature.com/articles/s41467-019-13483-w

It's a big excel file with expression and growth rate data in different sheets and this code will use a few of these sheets.

In [3]:
df = pd.ExcelFile(r'../Data/raw_data/41467_2019_13483_MOESM4_ESM.xlsx')

## Basic metadata clean up and data subsetting

After some manual inspection, looking at distributions, etc. we decided to exclude samples where:

1. The growth rate data was unknown


2. The growth rate data was reported as zero. This one might seem strange but it's is a little unclear in general if that's possible/true to have zero growth rate. It is possible that these were stationary phase cultures but equally likely from my stand-point that these are errors in the table.


3. Really poor alignment (perhaps indicating some overall contamination)

In [4]:
meta_df = df.parse('Metadata') ###This grabs the sheet that contains information about the samples
print(meta_df.shape)
meta_df = meta_df[meta_df['Growth Rate (1/hr)'].isnull() == False]
print(meta_df.shape)
meta_df = meta_df[meta_df['Growth Rate (1/hr)'] > 0.0]
print(meta_df.shape)
meta_df = meta_df[meta_df['Alignment'] > 80]
print(meta_df.shape)
meta_df.head()

(278, 26)
(195, 26)
(179, 26)
(173, 26)


Unnamed: 0,Sample ID,Study,Project ID,Condition ID,Replicate #,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),...,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,Alignment,DOI,GEO
4,fur__wt_fe__1,Fur,fur,wt_fe,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,1.060606,No,,MiSeq,,2,93.35,doi.org/10.1038/ncomms5910,GSE54900
5,fur__wt_fe__2,Fur,fur,wt_fe,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,1.060606,No,,MiSeq,,2,92.38,doi.org/10.1038/ncomms5910,GSE54900
8,fur__delfur_fe2__1,Fur,fur,delfur_fe2,1,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,0.619469,No,,MiSeq,,2,92.8,doi.org/10.1038/ncomms5910,GSE54900
9,fur__delfur_fe2__2,Fur,fur,delfur_fe2,2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,0.619469,No,,MiSeq,,2,93.24,doi.org/10.1038/ncomms5910,GSE54900
55,omics__bw_ac__1,Omics,omics,bw_ac,1,Escherichia coli BW25113,BW25113,M9,acetate(3.5),NH4Cl(1),...,Batch,0.203,No,,MiSeq,,2,97.8,doi.org/10.1038/ncomms13091,GSE59759


## Read in the expression data

In [5]:
exp_df = df.parse('Expression Data', index_col='log-TPM')
print(exp_df.shape)
exp_df = exp_df[meta_df['Sample ID']] ###Only grab the columns corresponding to the samples identified above
print(exp_df.shape)
assert list(meta_df['Sample ID']) == list(exp_df.columns) ###Check our work
exp_df.head()

(3923, 278)
(3923, 173)


Unnamed: 0_level_0,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_fe2__1,fur__delfur_fe2__2,omics__bw_ac__1,omics__bw_ac__2,omics__bw_fum__1,omics__bw_fum__2,omics__bw_glc__1,omics__bw_glc__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0002,11.06481,10.779071,11.229767,11.214065,9.257348,9.182322,9.709213,9.672126,10.208587,10.218351,...,10.271327,10.276565,11.148538,11.170578,11.676604,11.726097,11.881529,11.923237,11.49641,11.552762
b0003,10.776984,10.59781,10.897938,10.861157,8.983408,8.943151,9.436004,9.394573,9.609637,9.677931,...,10.160291,10.116861,10.314322,10.392251,10.916426,10.909277,11.023924,11.038426,10.624301,10.764195
b0004,10.394971,10.11395,10.185151,10.164655,8.76169,8.77992,9.532673,9.53437,9.883558,9.870356,...,10.475069,10.434352,10.679541,10.723953,11.14331,11.112721,11.184795,11.241845,10.953206,11.001006
b0005,6.716069,6.410864,6.527653,6.136168,4.474204,4.72049,5.782102,5.326669,5.846675,5.972022,...,5.979079,5.705586,6.30612,6.29134,5.058537,4.83555,5.448097,5.757951,5.873964,5.808618
b0006,6.761813,6.816532,6.862147,6.81748,6.536457,6.439917,6.408731,6.276017,6.9102,6.843384,...,8.371287,8.32239,8.137515,8.071837,7.354131,7.365536,7.328101,7.314761,8.05775,8.105213


**Manual inspection found some weird and highly implausible/impossible duplicate column/s, so we'll make a note of that here and deal with it later**

In [6]:
exp_df[[col for col in exp_df.columns if 'pal__lyx_ale' in col]].head()

Unnamed: 0_level_0,pal__lyx_ale2_f6__1,pal__lyx_ale2__1,pal__lyx_ale2__2,pal__lyx_ale4__1,pal__lyx_ale4__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b0002,9.627287,10.130315,10.155462,10.130315,10.155462
b0003,9.250534,9.708944,9.831138,9.708944,9.831138
b0004,9.203814,10.049444,10.190627,10.049444,10.190627
b0005,4.807384,5.772047,5.933463,5.772047,5.933463
b0006,6.398236,6.435048,6.220552,6.435048,6.220552


# Averaging gene expression values across replicates to clean up/simplify the data

## First getting gene expression averages between replicates and creating a new `dataframe` to hold this information

Thus just exploits the fact that replicates are denoted by "__x" in the sample names so we identify these and average them when possible. 

In [7]:
unique_cols = set(exp_df.columns.str[:-3])
new_exp_df = pd.DataFrame()

for i in unique_cols:
    new_exp_df[i] = exp_df[[col for col in exp_df.columns if col[:-3]==i]].mean(axis=1)
print(new_exp_df.shape)
new_exp_df.head()

(3923, 105)


Unnamed: 0_level_0,ssw__glc_ac_glc2,omics__bw_fum,ssw__glc_glyc_glc,efeU__menFentC_ale39,ssw__glc_glyc_glyc,ytf__delyafC,42c__wt_42c,omics__bw_glc,ytf__delyheO,42c__42c_ale3,...,pgi__pgi_ale8,ssw__xyl_ale2,ssw__glc_xyl_glc,pgi__glu4_delpgi,ica__thm_gal,pgi__pgi_ale6,glu__glu_ale3,ica__wt_glc,rpoB__rpoBE672K_lb,ica__cytd_rib
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0002,8.38384,9.690669,10.535606,10.344302,10.306998,9.312825,11.499706,10.213469,9.401156,10.915428,...,10.482452,9.954253,10.079777,9.846088,11.09047,9.859055,10.90357,11.172912,8.145485,7.162862
b0003,8.012263,9.415289,10.18765,10.265772,9.952324,8.680662,10.770397,9.643784,8.933743,10.372598,...,9.744255,9.577094,9.672696,9.25333,10.402663,9.214803,10.319803,10.525032,7.709501,7.805904
b0004,8.472842,9.533522,10.075001,10.712971,9.755426,8.735598,10.785005,9.876957,9.164724,10.165121,...,9.761211,9.595546,9.735267,9.273931,10.633509,9.12837,10.452338,10.84,7.808036,7.978095
b0005,4.976962,5.554385,6.59052,6.245881,6.057605,4.190217,5.455772,5.909349,4.21512,6.416271,...,6.083744,6.859922,6.280117,4.381576,4.686068,6.575517,6.308882,5.977584,3.922764,3.161854
b0006,6.55353,6.342374,6.633852,7.161943,6.597041,6.769807,7.316602,6.876792,6.748235,6.742684,...,6.296399,6.718642,6.89557,7.260331,6.663905,6.865126,7.039349,6.852355,6.849328,6.193035


**Double checking the work**

Just making sure somethings add up here by taking an example column/condition and looking at the replicate values

In [8]:
example_col = list(unique_cols)[0]

exp_df[[col for col in exp_df.columns if col[:-3]==example_col]].head()

Unnamed: 0_level_0,ssw__glc_ac_glc2__1
log-TPM,Unnamed: 1_level_1
b0002,8.38384
b0003,8.012263
b0004,8.472842
b0005,4.976962
b0006,6.55353


And their average

In [9]:
new_exp_df[[example_col]].head()

Unnamed: 0_level_0,ssw__glc_ac_glc2
log-TPM,Unnamed: 1_level_1
b0002,8.38384
b0003,8.012263
b0004,8.472842
b0005,4.976962
b0006,6.55353


## Dealing with the weird duplicate column/s

Time to kill any completely identical columns, these must be bugs on the data end and even though I could in theory keep one, their growth rate value is unclear (since it differs)

In [10]:
###Get an all-to-all correlation matrix between gene expression values
temp_corr = new_exp_df.corr(method='spearman')

In [11]:
###And make this into a symmetric dataframe
temp_df = pd.DataFrame(
    np.where(np.equal(*np.indices(temp_corr.shape)), np.nan, temp_corr.values),
    temp_corr.index, temp_corr.columns
)
print(temp_df.shape)
temp_df.head()

(105, 105)


Unnamed: 0,ssw__glc_ac_glc2,omics__bw_fum,ssw__glc_glyc_glc,efeU__menFentC_ale39,ssw__glc_glyc_glyc,ytf__delyafC,42c__wt_42c,omics__bw_glc,ytf__delyheO,42c__42c_ale3,...,pgi__pgi_ale8,ssw__xyl_ale2,ssw__glc_xyl_glc,pgi__glu4_delpgi,ica__thm_gal,pgi__pgi_ale6,glu__glu_ale3,ica__wt_glc,rpoB__rpoBE672K_lb,ica__cytd_rib
ssw__glc_ac_glc2,,0.798605,0.911234,0.908617,0.847677,0.91528,0.796595,0.823336,0.923189,0.795138,...,0.806305,0.814213,0.903232,0.777635,0.741072,0.799794,0.844985,0.793425,0.74387,0.692237
omics__bw_fum,0.798605,,0.873423,0.842448,0.901253,0.823685,0.876622,0.956253,0.826844,0.927336,...,0.924819,0.922801,0.889026,0.90903,0.903754,0.926953,0.865879,0.928363,0.858555,0.793033
ssw__glc_glyc_glc,0.911234,0.873423,,0.936165,0.946849,0.919569,0.869255,0.904691,0.920877,0.863067,...,0.877207,0.876801,0.945579,0.870993,0.802166,0.875471,0.912913,0.854626,0.804843,0.726135
efeU__menFentC_ale39,0.908617,0.842448,0.936165,,0.900025,0.938303,0.894847,0.867961,0.947682,0.848821,...,0.859305,0.852954,0.926893,0.849512,0.823608,0.845927,0.908139,0.859675,0.838525,0.735231
ssw__glc_glyc_glyc,0.847677,0.901253,0.946849,0.900025,,0.868861,0.867133,0.901341,0.872487,0.875549,...,0.888633,0.924119,0.913621,0.895194,0.829203,0.88981,0.890242,0.867806,0.832671,0.752474


**Identify columns that contain a value of "1." since this indicates they have a perfect correlation**

In [12]:
temp_df[temp_df.values==1]

Unnamed: 0,ssw__glc_ac_glc2,omics__bw_fum,ssw__glc_glyc_glc,efeU__menFentC_ale39,ssw__glc_glyc_glyc,ytf__delyafC,42c__wt_42c,omics__bw_glc,ytf__delyheO,42c__42c_ale3,...,pgi__pgi_ale8,ssw__xyl_ale2,ssw__glc_xyl_glc,pgi__glu4_delpgi,ica__thm_gal,pgi__pgi_ale6,glu__glu_ale3,ica__wt_glc,rpoB__rpoBE672K_lb,ica__cytd_rib
pal__lyx_ale2,0.768289,0.935901,0.849801,0.829072,0.904835,0.799936,0.857175,0.895297,0.8044,0.885634,...,0.905686,0.90877,0.859288,0.89333,0.883647,0.907159,0.829465,0.898642,0.856223,0.79395
pal__lyx_ale4,0.768289,0.935901,0.849801,0.829072,0.904835,0.799936,0.857175,0.895297,0.8044,0.885634,...,0.905686,0.90877,0.859288,0.89333,0.883647,0.907159,0.829465,0.898642,0.856223,0.79395


In [13]:
temp_df.loc[['pal__lyx_ale4', 'pal__lyx_ale2'], ['pal__lyx_ale4', 'pal__lyx_ale2']]

Unnamed: 0,pal__lyx_ale4,pal__lyx_ale2
pal__lyx_ale4,,1.0
pal__lyx_ale2,1.0,


Get rid of them both since there is obviously an error here somewhere

In [14]:
new_exp_df.drop(['pal__lyx_ale2', 'pal__lyx_ale4'], axis=1, inplace=True)
print(new_exp_df.shape)

(3923, 103)


# Average the growth rates across these replicates in the metadata as well

In [15]:
###Assign a unique id that removes the replicate information
meta_df['Simple_sample_id'] = meta_df['Sample ID'].str[:-3]
print(meta_df.shape)
###Group according to this new id
group_cols = ['Simple_sample_id']
###For these numeric columns I'll take the mean
metric_cols_a = ['Temperature (C)', 'pH', 'Growth Rate (1/hr)', 'Alignment']
aggs_a = meta_df.groupby(group_cols)[metric_cols_a].mean()
###And for these I'll just grab the count
metric_cols_b = ['Replicate #', 'Biological Replicates']
aggs_b = meta_df.groupby(group_cols)['Replicate #'].count()

###Drop the columns from the original dataframe (we'll add them back in later)
meta_df.drop(metric_cols_a, axis=1, inplace=True)
meta_df.drop(metric_cols_b, axis=1, inplace=True)
###And duplicates
meta_df.drop_duplicates(subset=group_cols, keep='first', inplace=True)

###Now merge the main dataframe with the grouped ones
meta_df = meta_df.merge(right=aggs_a, right_index=True, left_on=group_cols, how='right')
print(meta_df.shape)
meta_df = meta_df.merge(right=aggs_b, right_index=True, left_on=group_cols, how='right')
print(meta_df.shape)
meta_df.head()

(173, 27)
(105, 25)
(105, 26)


Unnamed: 0,Sample ID,Study,Project ID,Condition ID,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,...,Sequencing Machine,Additional Details,DOI,GEO,Simple_sample_id,Temperature (C),pH,Growth Rate (1/hr),Alignment,Replicate #
132,42c__42c_ale1__1,42C Evolution,42c,42c_ale1,Escherichia coli 42C.1.124.1,MG1655,M9,glucose(4),NH4Cl(1),O2,...,MiSeq,42C A1 F124 I1,doi.org/10.1093/molbev/msu209,GSE132442,42c__42c_ale1,42,7.0,0.95,98.4,1
140,42c__42c_ale10__1,42C Evolution,42c,42c_ale10,Escherichia coli 42C.10.153.1,MG1655,M9,glucose(4),NH4Cl(1),O2,...,MiSeq,42C A10 F153 I1,doi.org/10.1093/molbev/msu209,GSE132442,42c__42c_ale10,42,7.0,0.98,96.91,1
133,42c__42c_ale2__1,42C Evolution,42c,42c_ale2,Escherichia coli 42C.2.163.1,MG1655,M9,glucose(4),NH4Cl(1),O2,...,MiSeq,42C A2 F163 I1,doi.org/10.1093/molbev/msu209,GSE132442,42c__42c_ale2,42,7.0,0.97,97.51,1
134,42c__42c_ale3__1,42C Evolution,42c,42c_ale3,Escherichia coli 42C.3.120.1,MG1655,M9,glucose(4),NH4Cl(1),O2,...,MiSeq,42C A3 F120 I1,doi.org/10.1093/molbev/msu209,GSE132442,42c__42c_ale3,42,7.0,0.92,97.13,1
135,42c__42c_ale4__1,42C Evolution,42c,42c_ale4,Escherichia coli 42C.4.161.1,MG1655,M9,glucose(4),NH4Cl(1),O2,...,MiSeq,42C A4 F161 I1,doi.org/10.1093/molbev/msu209,GSE132442,42c__42c_ale4,42,7.0,1.03,97.07,1


**And get rid of those problematic samples from this dataframe as well**

In [16]:
print(meta_df.shape)
meta_df = meta_df[meta_df['Sample ID'].str.contains('pal__lyx_ale2__')==False]
print(meta_df.shape)
meta_df = meta_df[meta_df['Sample ID'].str.contains('pal__lyx_ale4__')==False]
print(meta_df.shape)

(105, 26)
(104, 26)
(103, 26)


**Sum should equal the original shape!**

In [17]:
meta_df['Replicate #'].value_counts()

2    61
1    41
6     1
Name: Replicate #, dtype: int64

**Make sure that the columns line up when matching across these two dataframes** 

In [18]:
new_exp_df = new_exp_df[meta_df['Simple_sample_id']]

In [19]:
assert all(new_exp_df.columns == meta_df['Simple_sample_id'])

**And add the doubling time just for good measure**

Which is just a slight transformation of growth rate

In [20]:
meta_df['Doubling_time'] = np.log(2)/meta_df['Growth Rate (1/hr)']

# Construct a third `dataframe` containing gene expression data summary stats

Strictly speaking this isn't super necessary but might as well do it now to get it done and over with

In [21]:
exp_summary_df = new_exp_df.apply(pd.DataFrame.describe, axis=1)

**The % signs seem to cause some problems down the road so lets remove them**

In [22]:
col_listy = []
for col in exp_summary_df.columns:
    if '%' not in col:
        col_listy.append(col)
    else:
        col_listy.append(col.replace('%', '_percentile'))
print(col_listy)
exp_summary_df.columns = col_listy

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


**And add some other dispersion metrics**

In [23]:
exp_summary_df['cv'] = exp_summary_df['std']/exp_summary_df['mean']
exp_summary_df['noise'] = exp_summary_df['std'].pow(2)/exp_summary_df['mean'].pow(2)

**Finally, adding some of (what we think are) the cool new variables to consider**

In [24]:
slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    a, b, c, d, e = stats.linregress(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    slopes.append(a)
    pearsons.append(c)
    rho, p = stats.spearmanr(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    spearmans.append(rho)
    
exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

# Save some files

This was the whole point of all the code above. Should have some straightforward data tables now

In [None]:
print(new_exp_df.shape)
print(meta_df.shape)
print(exp_summary_df.shape)
new_exp_df.to_csv('../Data/processed_data/processed_expression_ecoli.tsv', sep='\t')
meta_df.to_csv('../Data/processed_data/processed_metadata_ecoli.tsv', sep='\t')
exp_summary_df.to_csv('../Data/processed_data/processed_expression_summary_ecoli.tsv', sep='\t')

# Get a thinned down dataset as a robustness check

Since conditions are correlated with one another, I'm constructing a more sparsely populated dataset where inter-condition correlations are minimized using a greedy algorithm. 

The algorithm works as follows:
1. Find which two conditions are the most highly correlated across the entire all-to-all correlation matrix
2. Randomly delete one of the two conditions in question
3. Iterate to (1)
4. Stop after reaching a pre-defined final dataset size

In [26]:
import random
random.seed(42)

In [27]:
cond_corr_mat = new_exp_df.corr(method='spearman')
cond_corr_mat.head()

Unnamed: 0,42c__42c_ale1,42c__42c_ale10,42c__42c_ale2,42c__42c_ale3,42c__42c_ale4,42c__42c_ale5,42c__42c_ale6,42c__42c_ale8,42c__42c_ale9,42c__wt_42c,...,ytf__delydcI_ph5,ytf__delydcI_ph8,ytf__delyddM,ytf__delyeiE,ytf__delyheO,ytf__delyiaJ,ytf__delyieP,ytf__wt_glc,ytf__wt_ph5,ytf__wt_ph8
42c__42c_ale1,1.0,0.92712,0.882641,0.862337,0.890492,0.844707,0.934393,0.910083,0.853765,0.836816,...,0.806675,0.798028,0.828742,0.840311,0.902051,0.864096,0.889329,0.841736,0.803002,0.802468
42c__42c_ale10,0.92712,1.0,0.931647,0.909807,0.963706,0.902406,0.919654,0.961923,0.890414,0.843208,...,0.836582,0.841167,0.846182,0.830256,0.86161,0.84132,0.89836,0.876208,0.832988,0.839818
42c__42c_ale2,0.882641,0.931647,1.0,0.964109,0.956908,0.968556,0.856027,0.918062,0.963966,0.902696,...,0.899551,0.90167,0.888363,0.876668,0.827253,0.818446,0.865943,0.904547,0.892907,0.902017
42c__42c_ale3,0.862337,0.909807,0.964109,1.0,0.938152,0.974434,0.858418,0.89659,0.965164,0.900097,...,0.897568,0.89612,0.888025,0.880415,0.811377,0.803547,0.85194,0.880488,0.888256,0.896066
42c__42c_ale4,0.890492,0.963706,0.956908,0.938152,1.0,0.930396,0.887814,0.955125,0.919252,0.87538,...,0.869244,0.874356,0.863983,0.849296,0.83569,0.823237,0.879365,0.89417,0.863277,0.872864


**The `final_data_size` is of course completely arbitrary. But the point is to get rid of some correlated data so it does the trick.**

In [28]:
final_data_size = 30
n_to_prune = cond_corr_mat.shape[1] - final_data_size

In [29]:
np.fill_diagonal(cond_corr_mat.values, np.nan)
to_prune = []
for i in range(n_to_prune):
    tempy = cond_corr_mat.loc[[i for i in cond_corr_mat.columns if i not in to_prune]][[i for i in cond_corr_mat.columns if i not in to_prune]].max()
    to_prune.append(random.choice([tempy.sort_values(ascending=False).index[0],\
                                   tempy.sort_values(ascending=False).index[1]]))
np.fill_diagonal(cond_corr_mat.values, 1.)

**Remove relevant rows from two of the dataframes**

In [30]:
to_keep = [i for i in cond_corr_mat.columns if i not in to_prune]
new_exp_sprs_df = new_exp_df[to_keep]
meta_sprs_df = meta_df[meta_df['Simple_sample_id'].isin(to_keep)]

**And regenerate the summary dataframe to only use this thinned-down set of samples**

In [31]:
exp_summary_sprs_df = new_exp_sprs_df.apply(pd.DataFrame.describe, axis=1)
col_listy = []
for col in exp_summary_sprs_df.columns:
    if '%' not in col:
        col_listy.append(col)
    else:
        col_listy.append(col.replace('%', '_percentile'))
print(col_listy)
exp_summary_sprs_df.columns = col_listy

exp_summary_sprs_df['cv'] = exp_summary_sprs_df['std']/exp_summary_sprs_df['mean']
exp_summary_sprs_df['noise'] = exp_summary_sprs_df['std'].pow(2)/exp_summary_sprs_df['mean'].pow(2)

slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_sprs_df.index:
    a, b, c, d, e = stats.linregress(new_exp_sprs_df.loc[gene], meta_sprs_df['Growth Rate (1/hr)'])
    slopes.append(a)
    pearsons.append(c)
    rho, p = stats.spearmanr(new_exp_sprs_df.loc[gene], meta_sprs_df['Growth Rate (1/hr)'])
    spearmans.append(rho)
    
exp_summary_sprs_df['lin_slope'] = slopes
exp_summary_sprs_df['lin_r'] = pearsons
exp_summary_sprs_df['spearmans_rho'] = spearmans

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


In [32]:
print(new_exp_sprs_df.shape)
print(meta_sprs_df.shape)
print(exp_summary_sprs_df.shape)

(3923, 30)
(30, 27)
(3923, 13)


In [None]:
new_exp_sprs_df.to_csv('../Data/processed_data/processed_expression_ecoli_SPARSE.tsv', sep='\t')
meta_sprs_df.to_csv('../Data/processed_data/processed_metadata_ecoli_SPARSE.tsv', sep='\t')
exp_summary_sprs_df.to_csv('../Data/processed_data/processed_expression_summary_ecoli_SPARSE.tsv', sep='\t')

## Create neutral data set for additional robustness check
Limit analysis to non-ALE (adaptive laboratory environment) strains to control for selection

In [34]:
meta_ntrl_df = meta_df[meta_df['Evolved Sample'].str.contains('No')==True]
print(meta_ntrl_df.shape)

(48, 27)


In [39]:
new_exp_ntrl_df = new_exp_df[meta_ntrl_df['Simple_sample_id']]
print(new_exp_ntrl_df.shape)

(3923, 48)


In [40]:
new_exp_ntrl_df.to_csv('../Data/processed_data/processed_expression_ecoli_NEUTRAL.tsv', sep='\t')
meta_ntrl_df.to_csv('../Data/processed_data/processed_metadata_ecoli_NEUTRAL.tsv', sep='\t')