In [1]:
import pandas as pd, numpy as np, glob, pysam, seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
pheno_path = '../DATA/PHENOTYPE/GROWTHCURVES/Tecan-deletion-plate-July252019_baselined_median51.csv.gz'
dp_df = pd.read_csv(pheno_path,index_col=0)

In [3]:
## Print the unique strains
dp_df.strain.unique()

array(['CNB02995', 'CNB03020', 'CNB03090', 'CNB03170', 'CNB03200',
       'CNB03215', 'CNB03230', 'CNB03240', 'CNB03270', 'CNB03300',
       'CNB03310', 'CNB03330', 'CNN01165', 'CNN01195', 'CNN01270',
       'CNN01310', 'CNN01320', 'CNN01340', 'CNN01345', 'CNN01360',
       'CNN01400', 'CNN01410', 'CNN01430', 'CNK00120', 'CNK00150',
       'CNK00190', 'CNK00210', 'CNK00220', 'CNK00230', 'CNK00250',
       'CNL05550', 'CNL05560', 'CNL05570', 'CNL05620', 'CNL05630',
       'CNL05650', 'CNL05700', 'CNL05710', 'CNL05750', 'SSK1 rescue',
       'CM018', 'KN99 alpha', '168', '169', '180', '181', '182', '183',
       '184', '185', '186', '187', '188', 'CF1705', 'CF1706', 'CF1707',
       'CF1730', 'CNK00140', 'H99', 'JEC21'], dtype=object)

In [4]:
## View the data where the geene CNN01270 has been ko
dp_df[(dp_df.strain=='CNN01270')]

Unnamed: 0,date,plate,rep,row,column,chr,type,strain,background,gene,...,251100s,252000s,252900s,253800s,254700s,255600s,256500s,257400s,258300s,259200s
29,Apr252019,2,2,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.6354,0.6394,0.6404,0.6404,0.6404,0.6454,0.6464,0.6514,0.6524,0.6544
125,Apr282019,2,2,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,1.0438,1.0438,1.0448,1.0458,1.0458,1.0458,1.0468,1.0468,1.0468,1.0468
234,Apr282019,2,2,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.6504,0.6514,0.6524,0.6524,0.6524,0.6554,0.6554,0.6554,0.6564,0.6584
317,Jun82019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.28896,0.28996,0.28996,0.29096,0.29096,0.29196,0.29196,0.29296,0.29396,0.29496
426,Jun82019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.093,0.094,0.095,0.098,0.099,0.1,0.101,0.102,0.104,0.106
509,Jul32019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.7244,0.7254,0.7264,0.7314,0.7314,0.7324,0.7344,0.7364,0.7384,0.7434
618,Jul32019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.45988,0.46088,0.46288,0.46388,0.46588,0.47088,0.47188,0.47188,0.47488,0.47688
701,Jul102019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.52172,0.52272,0.52372,0.53472,0.53572,0.53672,0.53872,0.54172,0.54772,0.54772
810,Jul102019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012
893,Jul142019,3,3,c,6,14.0,knockout,CNN01270,KN99 alpha,RIC8,...,0.34,0.342,0.344,0.347,0.349,0.35,0.355,0.355,0.361,0.364


In [5]:
## Take data of interested and edit labels
dp_df = dp_df[(dp_df.amphB.isin([0,0.075,0.125,0.175])) & 
              (dp_df.temp.isin([30,37,39]))]
dp_df['background'].replace(dict(zip(['KN99 alpha (?)'],['KN99 alpha'])),inplace=True)
dp_df['strain'].replace(dict(zip(['KN99 alpha (?)'],['KN99 alpha'])),inplace=True)
dp_df['background'].replace(dict(zip(['KN99 alpha'],['KN99'])),inplace=True)
dp_df['strain'].replace(dict(zip(['KN99 alpha'],['KN99'])),inplace=True)
assert dp_df[(dp_df.background=='KN99 alpha (?)')].shape[0] == 0

In [6]:
## Calculate the time vector and delta time
times = dp_df.columns.tolist()[dp_df.columns.tolist().index('0s'):]
timex = np.array([int(a[:-1])/(60**2) for a in times])
dt = np.unique(np.round(np.diff(timex),2))[0]

In [7]:
## What are the unique backgronds?
dp_df.background.unique()

array(['KN99', 'CM018', nan, 'H99', 'JEC21'], dtype=object)

In [8]:
## how much data do we have?
dp_df.shape

(954, 303)

In [9]:
## Gather unique strain by conditions
unique_strains_con = dp_df[(dp_df.chr.isin(dp_df.chr.dropna().unique()))
                          ].sort_values(['chr','strain','temp','amphB'])[
                    ['strain','chr','background','temp','amphB']].drop_duplicates()

In [10]:
## Gather strains by background
strainsxback = unique_strains_con[
    ['strain','chr','background']
].drop_duplicates().reset_index(drop=True)
ny = strainsxback.shape[0];
nx = unique_strains_con.groupby(
    ['strain','background']).count()['temp'].max()
print('%s X %s'%(ny,nx))
strainsxback.head()

45 X 5


Unnamed: 0,strain,chr,background
0,CNB02995,2.0,KN99
1,CNB03020,2.0,CM018
2,CNB03020,2.0,H99
3,CNB03090,2.0,KN99
4,CNB03170,2.0,KN99


In [11]:
## take the wild type strains
wildtype = dp_df[(dp_df.strain==dp_df.background)
                ].sort_values(['strain','temp','amphB'])

wt_mean = wildtype.groupby(['strain','temp','amphB']
                          ).agg(np.mean,axis=0).reset_index()

wt_median = wildtype.groupby(['strain','temp','amphB']
                            ).agg(np.median,axis=0).reset_index()

wt_median.head()

Unnamed: 0,strain,temp,amphB,plate,rep,column,chr,gene,saturatedOD,0s,...,251100s,252000s,252900s,253800s,254700s,255600s,256500s,257400s,258300s,259200s
0,CM018,30,0.0,2,2,3,,,,0.0,...,1.176,1.176,1.176,1.177,1.177,1.178,1.178,1.179,1.179,1.18
1,CM018,37,0.0,3,3,3,,,,0.0,...,0.6886,0.6886,0.6886,0.6886,0.6886,0.6886,0.6886,0.6886,0.6886,0.6886
2,CM018,37,0.125,3,3,3,,,,0.0,...,0.6706,0.6706,0.6706,0.6706,0.6706,0.6706,0.6711,0.6711,0.6711,0.6711
3,CM018,37,0.175,3,3,3,,,,0.0,...,0.6582,0.6592,0.6652,0.6682,0.6702,0.6712,0.6792,0.6842,0.6882,0.6922
4,CM018,39,0.0,3,3,3,,,,0.0,...,0.4753,0.4753,0.4753,0.4753,0.4753,0.4763,0.4763,0.4763,0.4763,0.4763


In [12]:
## Make a cycle tools for plotting colors
from itertools import cycle

mycolors = ['tab:blue','tab:brown','tab:purple',
            'tab:green','tab:orange','tab:red','skyblue']

mycolcy = cycle(mycolors)

In [13]:
## Make an experimental conditions matrix
conmat = dp_df[['temp','amphB']
              ].drop_duplicates().sort_values(
    ['temp','amphB']).reset_index(drop=True)
conmat

Unnamed: 0,temp,amphB
0,30,0.0
1,37,0.0
2,37,0.125
3,37,0.175
4,39,0.0


In [14]:
## Bring in gene name info across species
gene_names_path = '../DATA/GENOTYPE/JEC21_H99_gene_names.csv'
gene_names = pd.read_csv(gene_names_path,index_col=0)
gene_names.drop('background',axis=1,inplace=True)
gene_names.drop_duplicates(inplace=True)
gene_names['CM018'] = gene_names['KN99']
gene_names['JEC21'] = gene_names.index
gene_names = gene_names.T
gene_names.head().T.head()

Unnamed: 0_level_0,KN99,H99,CM018,JEC21
JEC21,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CNB02995,CKF44_03807,CNAG_03807,CKF44_03807,CNB02995
CNB03020,CKF44_03811,CNAG_03811,CKF44_03811,CNB03020
CNB03090,CKF44_03818,CNAG_03818,CKF44_03818,CNB03090
CNB03170,CKF44_03827,CNAG_03827,CKF44_03827,CNB03170
CNB03200,CKF44_03831,CNAG_03831,CKF44_03831,CNB03200


In [15]:
## How many samples are not on chromosomes of interest?
dp_df[~(dp_df.chr.isin(dp_df.chr.dropna().unique()))].shape

(262, 303)

In [16]:
## What are the unique chromosomes?
dp_df.chr.dropna().unique()

array([ 2., 14., 11., 12.])

In [17]:
## Set critical values
z = 1.96

In [18]:
## is CM018 in the strain backgrounds?
'CM018'in strainsxback.background.unique()

True

In [19]:
## Make supplementary figures 10 - 13
## Plot knockout growth curves per chromosome
sup_labels = np.array([10,11,12,13])
for C,ch in enumerate(strainsxback.groupby('chr').count().index.tolist()):
    ny = strainsxback[(strainsxback.chr==ch)].shape[0]
    fig,ax = plt.subplots(ny,nx,figsize=(6,11),
                          sharex=True,sharey=True)
    strain_tempdf = strainsxback[(strainsxback.chr==ch)].reset_index(drop=True)
    for i,r in strain_tempdf.iterrows():
        tempdf = unique_strains_con[(unique_strains_con.strain==r['strain']) & 
                              (unique_strains_con.background==r['background'])
                                   ].reset_index(drop=True)
        mc = next(mycolcy)
        for j,d in tempdf.iterrows():
            y = dp_df[(dp_df.strain==d.strain) & 
                  (dp_df.temp==d.temp) & 
                  (dp_df.amphB==d.amphB) & 
                  (dp_df.background==d.background)
                 ][times].median(axis=0).values
        
            ywt = wt_median[(wt_median.strain==d.background) & 
                        (wt_median.temp==d.temp) & 
                        (wt_median.amphB==d.amphB)][times].T.values
            plt.sca(ax[i,j])
            ax[i,j].spines#u'$\it{%s}$'
            sn = gene_names[d.strain].T[d.background].lower()
            if '_' in sn:
                sn = sn.split('_')
                mylabel = r'$\it{%s}$'%(sn[0])+'_'+r'$\it{%s}$'%(sn[1]
                                ) + u'$\u0394$\n' + r'( $\it{%s}$ )'%(d.strain)
            else:
                mylabel =  r'$\it{%s}$'%(sn)+ u'$\u0394$\n' + r'( $\it{%s}$ )'%(d.strain)
            
            if d.background == 'KN99':
                wtlabel = 'KN99' +r'$\mathrm{\alpha}$'+'\n(WT)'
            elif d.background == 'JEC21':
                wtlabel = 'JEC21'+r'$\mathrm{\alpha}$'+'\n(WT)'
            elif d.background == 'CM018':
                wtlabel = 'CM018\n(WT)'#+r'$\mathrm{\alpha}$'+'\n(WT)'
            elif d.background == 'H99':
                wtlabel = 'H99' +r'$\mathrm{\alpha}$'+'\n(WT)'
            else:
                print('Error! No WT strain.\n')
                print(d.background)
            plt.plot(timex,y,color=mc,alpha=0.7,label= mylabel if j == 0 else None)
            plt.plot(timex,ywt,'k--',alpha=0.5,label= wtlabel if j == 0 else None);
            plt.xticks([24,48,72]);
            ax[i,j].spines['right'].set_visible(False)
            ax[i,j].spines['top'].set_visible(False)
            plt.title('%s °C %s \u03BCg/ml'%(d.temp,d.amphB),fontsize=7)
            if j == 0:
                plt.legend(bbox_to_anchor=(-.8,1.1),
                      ncol=2,prop={'size':10})#prop={'style':'italic','size':10})
                plt.ylabel('OD')
        lo = np.arange(j,5)[1:]
        if len(lo) > 0:
            for l in lo:
                plt.sca(ax[i,l]);plt.axis('off');
    plt.yticks([0,0.5,1]);plt.sca(ax[-1,0]);
    plt.xlabel('Hours');plt.subplots_adjust(hspace=.7,wspace=0.5);
    plt.savefig('../FIGURES/SUPP/PDFs/S%s_Fig.pdf'%str(sup_labels[C]),
                    dpi=300,bbox_inches='tight');
    plt.close()