In [12]:
import pandas as pd
import numpy as np
from glob import glob
from scipy import stats as sci_stats

### Building the edge-specific (mutation-specific) table: combining the modeling data with basic statistics computed here and with gene annotation data

In [14]:
# Parsing modeling data: reshape the long modeling table (one row per Edge x Cond x Model)
# into one wide row per Edge, with one column per (Cond_Model, statistic) pair.
modeling = pd.read_csv('../../output/VTn_modeling.csv')
modeling['Cmodel'] = modeling['Cond'] + '_' + modeling['Model']
vcols = ['R2', 'BIC', 'Params', 'Coeffs', 'Pvalues']

# One wide frame per statistic, each keyed by Edge with the Cmodel values as columns
dats = []
for v in vcols:
    wide = modeling.pivot(index='Edge', columns='Cmodel', values=v)
    dats.append(wide.reset_index())

# The first frame (R2) keeps its bare Cmodel column names during the merges; every later
# frame collides with those names and therefore receives a '_<statistic>' suffix.
edge_models = dats[0]
base_cols = [name for name in edge_models if name != 'Edge']
for v, wide in zip(vcols[1:], dats[1:]):
    edge_models = edge_models.merge(wide, on='Edge', how='outer', suffixes=('', '_' + v))

# Finally give the R2 columns their own suffix so that all columns are labelled uniformly
edge_models = edge_models.rename(columns={name: name + '_R2' for name in base_cols})
edge_models.head(3)

Cmodel,Edge,P1_SC_37C_FM_R2,P1_SC_37C_IM_R2,P1_SC_37C_XM_R2,P1_YPD_30C_FM_R2,P1_YPD_30C_IM_R2,P1_YPD_30C_XM_R2,P3_SC_37C_FM_R2,P3_SC_37C_IM_R2,P3_SC_37C_XM_R2,...,P1_SC_37C_FM_Pvalues,P1_SC_37C_IM_Pvalues,P1_SC_37C_XM_Pvalues,P1_YPD_30C_FM_Pvalues,P1_YPD_30C_IM_Pvalues,P1_YPD_30C_XM_Pvalues,P3_SC_37C_FM_Pvalues,P3_SC_37C_IM_Pvalues,P3_SC_37C_XM_Pvalues,P3_bad_SC_37C_IM_Pvalues
0,AAAACATTATCAAAG,0.583375,0.310877,0.189243,0.517727,0.488751,0.050878,0.014657,-0.018977,0.014657,...,0.00048180681223513764;0.05745915153137658;0.0...,0.0003133476864991813,0.0060386928482683,0.0001823605601557505;0.0007770018944939864;0....,0.004147813442994854;1.4809655134762085e-06;2....,8.41485021425589e-06,0.30379547807973617,,0.3037954780797361,0.004870223422826999
1,AAAAGAAAAATGTAT,0.751957,0.680262,-0.289779,0.64805,0.560962,0.527182,0.688285,0.745178,0.530203,...,0.0034781438463235455;5.813983728026561e-09;1....,0.0018046154087660157;2.4698525976725255e-11;2...,0.0003273172318743,0.001657579148805668;6.227894548052331e-16,0.00020334035897033055;2.3196619299709384e-07;...,3.4576000611980207e-14,0.0004025941098777246;1.8083727916507018e-16,8.785300904379002e-07;5.98118260467144e-11;8.5...,3.5316626309259863e-16,0.01870503868266918;6.989524934677474e-07;0.00...
2,AAAAGCGACATTTCT,0.197264,0.603517,-1.734324,0.615196,0.598032,0.513759,0.572575,0.74025,0.572575,...,0.0004258269032847119;4.452746736927259e-08;5....,0.0007877069027766392;3.8893830388020806e-08;1...,0.0333561831114598,0.00581355032679557;7.181825284917696e-15,0.00029638196382691966;1.0309953366158507e-07;...,3.5860965517204254e-15,4.393039537917841e-14,0.00016041987063632854;2.4495686738544583e-08;...,4.3930395379178414e-14,0.00042795015391711594;2.933269970712822e-06


In [23]:
# Gene annotation data etc.
edge_info = pd.read_csv('../accessory_files/TP_data_by_edge.csv')
# Lookup from Edge to its 'Gene.Use' entry (for a repeated Edge the last row wins)
e2g = dict(zip(edge_info['Edge'], edge_info['Gene.Use']))

In [24]:
def call_slope(row, cond, p_cutoff=0.05, slope_cutoff=0.05):
    """Classify the regression slope stored in `row` for condition `cond`.

    The function reads ``row[cond + '_slope']`` and ``row[cond + '_p']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``cond + '_slope'`` and ``cond + '_p'``.
    cond : str
        Prefix identifying which pair of slope / p-value entries to read.
    p_cutoff : float, optional
        The p-value must be strictly below this for a '+' or '-' call (default 0.05).
    slope_cutoff : float, optional
        The slope must be strictly above ``slope_cutoff`` for '+', or strictly below
        ``-slope_cutoff`` for '-' (default 0.05).

    Returns
    -------
    str or float
        ``np.nan`` when the slope is null; otherwise '+', '-' or 'NS'.
    """
    slope = row[cond + '_slope']
    if pd.isnull(slope):
        return np.nan
    # A null p-value compares False here, so such rows fall through to 'NS'.
    if row[cond + '_p'] < p_cutoff:
        if slope > slope_cutoff:
            return '+'
        if slope < -slope_cutoff:
            return '-'
    return 'NS'

# Per-edge summary statistics of s, for BYxRM and for every condition in `conditions`.
# NOTE: `byrm_s`, `vtn_s` and `conditions` are not defined in this cell; they must already
# exist in the kernel (defined by earlier cells) for this to run on a fresh restart.
min_cbcs = 5     # a row is used only when its num_cbcs is at least this
min_points = 20  # an edge needs at least this many usable rows to get statistics


def _usable_rows_by_edge(df, need_fitness, min_cbcs):
    """Return {Edge: rows} for rows of `df` with non-null s and num_cbcs >= min_cbcs.

    When `need_fitness` is true, rows with a null Fitness are dropped as well.
    """
    keep = pd.notnull(df['s']) & (df['num_cbcs'] >= min_cbcs)
    if need_fitness:
        keep = keep & pd.notnull(df['Fitness'])
    return {edge: rows for edge, rows in df[keep].groupby('Edge')}


# Data sources and their labels are the same for every edge, so build them once
things = ['BYxRM'] + conditions
dfs = [byrm_s] + [vtn_s[vtn_s.Cond == cond] for cond in conditions]
# Filter and split each source once instead of re-masking the whole frame per edge
s_rows = [_usable_rows_by_edge(df, False, min_cbcs) for df in dfs]
fit_rows = [_usable_rows_by_edge(df, True, min_cbcs) for df in dfs]

mat = []
# .unique() keeps first-appearance order, so the row order is the same on every run
# (iterating a set of strings is not reproducible across kernels)
for edge in vtn_s['Edge'].unique():
    record = [edge]
    for s_by_edge, fit_by_edge in zip(s_rows, fit_rows):
        # For each data source: mean and variance of s
        td = s_by_edge.get(edge)
        if td is not None and len(td) >= min_points:
            record += [np.mean(td['s']), np.var(td['s'])]
        else:
            record += [np.nan, np.nan]
        # Regression of s on Fitness
        td = fit_by_edge.get(edge)
        if td is not None and len(td) >= min_points:
            lr = sci_stats.linregress(td['Fitness'], td['s'])
            record += [lr.slope, lr.pvalue, lr.rvalue**2]  # slope, P, R^2
        else:
            record += [np.nan, np.nan, np.nan]
    mat.append(record)

cols = ['Edge']
for c in things:
    cols += [c+'_s_mean', c+'_s_var', c+'_slope', c+'_p', c+'_x_R2']
# turning it into a dataframe
edge_stats = pd.DataFrame(mat, columns=cols)
for cond in things:
    # '+', '-', 'NS' or NaN per edge, as decided by call_slope
    edge_stats[cond+'_call'] = edge_stats.apply(lambda row: call_slope(row, cond), axis=1)

edge_short = edge_info[['Edge', 'chromosome', 'Type', 'Gene.Use', 'briefDescription', 'insertion_edge', 'phenotypeSummary', 'phenotypeSummary.nearby']].rename(columns={'Gene.Use': 'Gene_Use'})
edge_stats = edge_stats.merge(edge_short, on='Edge', how='left') # adding Gene annotations etc.
edge_stats = edge_stats.merge(edge_models, on='Edge', how='outer') # adding modeling data
edge_stats.head(2)

Unnamed: 0,Edge,BYxRM_s_mean,BYxRM_s_var,BYxRM_slope,BYxRM_p,BYxRM_x_R2,P1_YPD_30C_s_mean,P1_YPD_30C_s_var,P1_YPD_30C_slope,P1_YPD_30C_p,...,P1_SC_37C_FM_Pvalues,P1_SC_37C_IM_Pvalues,P1_SC_37C_XM_Pvalues,P1_YPD_30C_FM_Pvalues,P1_YPD_30C_IM_Pvalues,P1_YPD_30C_XM_Pvalues,P3_SC_37C_FM_Pvalues,P3_SC_37C_IM_Pvalues,P3_SC_37C_XM_Pvalues,P3_bad_SC_37C_IM_Pvalues
0,GTGTGATTACTAAAT,-0.022098,0.002189,-0.783077,1.5522590000000002e-28,0.684124,-0.107254,0.000456,-0.122734,0.210653,...,0.027115506904014278;4.717426795782087e-09;1.0...,0.01148768209031891;4.3043829830687724e-11;0.0...,9.517749042502612e-05,0.00518846637687012;0.05706598874774437,0.0010271241175055605,0.0110554856807254,0.009736714840669632;1.7388863910459905e-13,0.0017560373576142518;6.039919700786389e-06;0....,1.7638101369985302e-12,0.013879040478284612;4.370404036455418e-06;1.0...
1,ATTATCAAGTGCCAG,-0.008074,8.8e-05,-0.066622,6.210889e-06,0.130114,-0.02573,6.6e-05,-0.057428,0.116787,...,4.633289464039908e-07;0.03476877728055573,1.6694756622713384e-08,0.0014263006007516,0.014587472877529192;0.7921454023368454;3.7273...,0.009489061055084473;1.9507104846570335e-10;0....,0.0761288653262598,0.0152158132266571;0.020089996033347376,,0.1691654338499615,0.005608510955189237;1.651531037898096e-05;0.0...


In [25]:
# Save the merged per-edge table to CSV in the current directory, omitting the DataFrame index
edge_stats.to_csv('data_by_mutation.csv', index=False)