In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap
from scipy import stats
plt.style.use('ggplot')
%matplotlib inline

In [2]:
nor_counts=pd.read_table('normalized_counts.txt')
col_new=['geneID', 'geneLength', 'geneProduct','185-1','WT-1','185-2','185-3','WT-2','WT-3']
nor_counts.columns=col_new
nor_counts=nor_counts[['geneID','geneLength','geneProduct','WT-1','WT-2','WT-3','185-1','185-2','185-3']]
nor_counts['WT_mean']=np.round(np.apply_along_axis(np.mean,1,nor_counts[['WT-1','WT-2','WT-3']]),2)
nor_counts['185_mean']=np.round(np.apply_along_axis(np.mean,1,nor_counts[['185-1','185-2','185-3']]),2)

In [3]:
gene_id_1=pd.read_table('Gene_id_1_1000.txt')
gene_id_2=pd.read_table('Gene_id_1001_2000.txt')
gene_id_3=pd.read_table('Gene_id_2001_3000.txt')
gene_id_4=pd.read_table('Gene_id_3001_3031.txt')
gene_id_main=[gene_id_1,gene_id_2,gene_id_3,gene_id_4]
gene_id_main=pd.concat(gene_id_main)
gene_id_main=gene_id_main.drop(['Batch1','Genome Name','Unnamed: 7','Genome ID'],axis=1)

In [4]:
merged=nor_counts.merge(gene_id_main,how='left',left_on='geneID',right_on='Gene ID')
df=merged.drop(['Gene Product Name','Gene ID','geneLength'],axis=1)
col_order=['geneID','Locus Tag','Gene Symbol','geneProduct','WT-1', 'WT-2', 'WT-3', '185-1', '185-2',
       '185-3', 'WT_mean', '185_mean' ]
df=df[col_order]

In [5]:
df.head(5)

Unnamed: 0,geneID,Locus Tag,Gene Symbol,geneProduct,WT-1,WT-2,WT-3,185-1,185-2,185-3,WT_mean,185_mean
0,650468844,Clo1313_0001,dnaA,chromosomal replication initiator protein DnaA,95.68,68.16,33.52,12.22,13.38,14.04,65.79,13.21
1,650468845,Clo1313_0002,,"DNA polymerase III, beta subunit (EC 2.7.7.7)",91.89,65.97,34.76,7.39,13.24,7.93,64.21,9.52
2,650468846,Clo1313_0003,,S4 domain protein YaaA,31.74,23.39,12.33,6.55,7.83,12.05,22.49,8.81
3,650468847,Clo1313_0004,recF,DNA replication and repair protein RecF,21.31,13.09,9.2,1.22,0.0,1.12,14.53,0.78
4,650468848,Clo1313_0005,,hypothetical protein,221.06,242.11,267.31,103.35,82.3,130.65,243.49,105.43


### Add fold change columns to the data frame

In [6]:
df['log2(WT)']=np.round(np.log2(df['WT_mean']),2)
df['log2(185)']=np.round(np.log2(df['185_mean']),2)
df['log2(FC)']=np.round(df['log2(185)']-df['log2(WT)'],2)

  """Entry point for launching an IPython kernel.
  


In [7]:
# remove genes with detectable transcription, which leads to numeric instability in log2 fold change computation.
# this filtered data frame should be used in picking out the top 10 gene list
# df_filtered=df[np.logical_and(df['WT_mean']>2,df['185_mean']>2)]
# df_filtered=df_filtered[['geneID','Locus Tag','Gene Symbol','geneProduct','WT_mean','185_mean','log2(FC)']]

In [8]:
# remove rows with nan or inf output in log2(FC)
row_nan_inf=np.logical_or(np.isnan(df['log2(FC)']),np.isinf(df['log2(FC)']))
df=df[~row_nan_inf]

In [9]:
df.head(5)

Unnamed: 0,geneID,Locus Tag,Gene Symbol,geneProduct,WT-1,WT-2,WT-3,185-1,185-2,185-3,WT_mean,185_mean,log2(WT),log2(185),log2(FC)
0,650468844,Clo1313_0001,dnaA,chromosomal replication initiator protein DnaA,95.68,68.16,33.52,12.22,13.38,14.04,65.79,13.21,6.04,3.72,-2.32
1,650468845,Clo1313_0002,,"DNA polymerase III, beta subunit (EC 2.7.7.7)",91.89,65.97,34.76,7.39,13.24,7.93,64.21,9.52,6.0,3.25,-2.75
2,650468846,Clo1313_0003,,S4 domain protein YaaA,31.74,23.39,12.33,6.55,7.83,12.05,22.49,8.81,4.49,3.14,-1.35
3,650468847,Clo1313_0004,recF,DNA replication and repair protein RecF,21.31,13.09,9.2,1.22,0.0,1.12,14.53,0.78,3.86,-0.36,-4.22
4,650468848,Clo1313_0005,,hypothetical protein,221.06,242.11,267.31,103.35,82.3,130.65,243.49,105.43,7.93,6.72,-1.21


### Compute adjusted p-value (Welch-t-test, Benjamini-Hochberg correction)
Using unpooled error to calculate the unadjusted p-value
There are several ways to adjust p-values, Bonferroni correction will simply take the unadjusted p-value and times the number of hypothesis. Alternative, FDR-adjusted, aka Benjimini & Hochberg correction, can also be used. For multiple comparison with large m, Bonferroni tend to be too severe. Benjimini & Hochberg method is used herein.

In [10]:
def calculate_mean_and_variance(df):
    """
    Imput df as a pandas dataframe (m x n).
    Each row of df is a different gene of interest, each column of df is a repeat measurement.
    Output mean (m x 1) and variance (m x 1).
    """
    df_np=np.array(df)
    m=df.shape[0]
    n=df.shape[1]
    df_mean=np.apply_along_axis(np.mean,1,df_np).reshape((m,1))
    df_var=np.sum((df_np-df_mean)**2,axis=1)/n
    return df_mean,df_var

In [11]:
def calculate_t_stat(interest,control):
    """
    Take in two dataframes of same shape (m x n)
    Each row of df is a different gene of interest, each column of df is a repeat measurement.
    Calculate the t-stastic (m x 1) and degree of freedom (m x 1), assuming equal sample size, unequal variancies (Welch t-test)
    Output in the form of numpy array.
    """
    assert interest.shape==control.shape
    m=interest.shape[0]
    n=interest.shape[1]
    int_mean,int_var=calculate_mean_and_variance(interest)
    ctl_mean,ctl_var=calculate_mean_and_variance(control)
    variance=np.sqrt((int_var/n)+(ctl_var/n)).reshape((m,1))
    t_stat=(int_mean-ctl_mean)/variance
    denom=np.power(variance,4)
    term1=(int_var/n)**2/(n-1)
    term1=term1.reshape((m,1))
    term2=(ctl_var/n)**2/(n-1)
    term2=term2.reshape((m,1))
    deg_of_free=denom/(term1+term2)
    return t_stat,deg_of_free
    
    

In [12]:
def calculate_p_value(interest,control):
    """
    Take in two dataframes of same shape (m x n), compute unadjusted two-sided p_value (m x 1) using Welch-t-test,
    assuming unequal variance.
    """
    t_stat,deg_of_free=calculate_t_stat(interest,control)
    p_value=stats.t.sf(np.abs(t_stat),deg_of_free)*2
    return p_value

In [13]:
def calculate_adjusted_p_value(interest,control):
    """
    Take in two dataframes of same shape (m x n), compute unadjusted two-sided p_value (m x 1) using Welch-t-test,
    and adjusted p_value (m x 1) using Benjamini-Hochberg method.
    """
    p_value=calculate_p_value(interest,control)
    m=len(p_value)
    p_tmp=pd.DataFrame(p_value,columns=['p-value'])
    p_tmp=p_tmp.sort_values('p-value')
    p_rank=stats.rankdata(p_tmp['p-value'])
    p_test=p_tmp['p-value']*m/p_rank
    p_adj=np.zeros(m)
    for i in range(m):
        p_adj[i]=np.min([np.min(p_test[i:]),1])   #double min, the outer min ensures p value doesn't exceed 1.
    p_tmp['p-adjusted']=p_adj
    p_tmp=p_tmp.sort_index()
    return np.round(p_tmp['p-value'],2),np.round(p_tmp['p-adjusted'],2)
        

In [14]:
results=df[['geneID','Locus Tag','Gene Symbol','geneProduct','WT_mean','185_mean','log2(FC)']]
results=results.reset_index(drop=True)
results['p-value'],results['p-value adj.']=calculate_adjusted_p_value(df[['185-1','185-2','185-3']],df[['WT-1','WT-2','WT-3']])

In [15]:
results.head(5)

Unnamed: 0,geneID,Locus Tag,Gene Symbol,geneProduct,WT_mean,185_mean,log2(FC),p-value,p-value adj.
0,650468844,Clo1313_0001,dnaA,chromosomal replication initiator protein DnaA,65.79,13.21,-2.32,0.07,0.17
1,650468845,Clo1313_0002,,"DNA polymerase III, beta subunit (EC 2.7.7.7)",64.21,9.52,-2.75,0.05,0.14
2,650468846,Clo1313_0003,,S4 domain protein YaaA,22.49,8.81,-1.35,0.09,0.18
3,650468847,Clo1313_0004,recF,DNA replication and repair protein RecF,14.53,0.78,-4.22,0.04,0.13
4,650468848,Clo1313_0005,,hypothetical protein,243.49,105.43,-1.21,0.0,0.03


### Apply style to the data.frame

In [16]:
def color_red_green(val):
    if type(val)==float and val > 0:
        color='red'
    else:
        color='green'
    return 'color: %s'% color

In [17]:
def add_style(df):
    return df.style.applymap(color_red_green,pd.IndexSlice[:,['log2(FC)']])

In [18]:
add_style(results.head(10))

Unnamed: 0,geneID,Locus Tag,Gene Symbol,geneProduct,WT_mean,185_mean,log2(FC),p-value,p-value adj.
0,650468844,Clo1313_0001,dnaA,chromosomal replication initiator protein DnaA,65.79,13.21,-2.32,0.07,0.17
1,650468845,Clo1313_0002,,"DNA polymerase III, beta subunit (EC 2.7.7.7)",64.21,9.52,-2.75,0.05,0.14
2,650468846,Clo1313_0003,,S4 domain protein YaaA,22.49,8.81,-1.35,0.09,0.18
3,650468847,Clo1313_0004,recF,DNA replication and repair protein RecF,14.53,0.78,-4.22,0.04,0.13
4,650468848,Clo1313_0005,,hypothetical protein,243.49,105.43,-1.21,0.0,0.03
5,650468849,Clo1313_0006,gyrB,"DNA gyrase, B subunit",516.26,378.67,-0.45,0.0,0.03
6,650468850,Clo1313_0007,,chromosome segregation ATPase,168.57,88.55,-0.93,0.02,0.1
7,650468851,Clo1313_0008,,parB-like partition protein,196.9,213.86,0.12,0.4,0.52
8,650468852,Clo1313_0009,,hypothetical protein,125.62,68.16,-0.88,0.06,0.16
9,650468853,Clo1313_0010,,TPR repeat-containing protein,233.45,132.38,-0.82,0.09,0.19


### Check the fold change in key metabolic pathways

In [19]:
def check_pathway_transcription(file,df):
    pathway=pd.read_table(file)
    pathway=pathway[['Gene ID','Locus Tag','Gene Product Name']]
    out=pathway.merge(df,how='left',on='Locus Tag')
    out=out[['Locus Tag','Gene Product Name','log2(FC)','p-value adj.']]
    return add_style(out)

In [20]:
check_pathway_transcription('ketoacid_pathway(PWY-7111).txt',results)

Unnamed: 0,Locus Tag,Gene Product Name,log2(FC),p-value adj.
0,Clo1313_0099,"acetolactate synthase, large subunit (EC 2.2.1.6)",2.84,0.04
1,Clo1313_0100,"acetolactate synthase, small subunit (EC 2.2.1.6)",2.6,0.02
2,Clo1313_0101,ketol-acid reductoisomerase (EC 1.1.1.86),3.57,0.05
3,Clo1313_0304,dihydroxy-acid dehydratase,0.95,0.17
4,Clo1313_0305,"acetolactate synthase, large subunit (EC 2.2.1.6)",2.79,0.02
5,Clo1313_1798,acetaldehyde dehydrogenase (EC 1.2.1.10)/alcohol dehydrogenase AdhE (EC 1.1.1.1),0.11,0.52


In [21]:
check_pathway_transcription('mixedacid_pathway(FERMENTATION-PWY).txt',results)

Unnamed: 0,Locus Tag,Gene Product Name,log2(FC),p-value adj.
0,Clo1313_0640,"hydro-lyase, Fe-S type, tartrate/fumarate subfamily, alpha subunit",-0.01,0.98
1,Clo1313_0641,"hydro-lyase, Fe-S type, tartrate/fumarate subfamily, beta subunit",-0.33,0.51
2,Clo1313_1185,phosphate acetyltransferase,-1.12,0.1
3,Clo1313_1186,acetate kinase (EC 2.7.2.1),0.29,0.65
4,Clo1313_1717,formate acetyltransferase,0.36,0.5
5,Clo1313_1798,acetaldehyde dehydrogenase (EC 1.2.1.10)/alcohol dehydrogenase AdhE (EC 1.1.1.1),0.11,0.52
6,Clo1313_1944,isocitrate dehydrogenase (NADP) (EC 1.1.1.42),0.48,0.24


In [22]:
check_pathway_transcription("reductiveTCAcycle(reductive TCA cycle I).txt",results)

Unnamed: 0,Locus Tag,Gene Product Name,log2(FC),p-value adj.
0,Clo1313_0020,"pyruvate/ketoisovalerate oxidoreductase, gamma subunit",-1.92,0.08
1,Clo1313_0021,"pyruvate ferredoxin/flavodoxin oxidoreductase, delta subunit",-1.73,0.08
2,Clo1313_0022,pyruvate flavodoxin/ferredoxin oxidoreductase domain protein,-0.78,0.11
3,Clo1313_0023,thiamine pyrophosphate TPP-binding domain-containing protein,-0.4,0.34
4,Clo1313_0382,"pyruvate ferredoxin oxidoreductase, gamma subunit (EC 1.2.7.1)",-3.15,0.11
5,Clo1313_0383,"pyruvate ferredoxin/flavodoxin oxidoreductase, delta subunit",-3.27,0.1
6,Clo1313_0384,pyruvate flavodoxin/ferredoxin oxidoreductase domain protein,-2.78,0.04
7,Clo1313_0385,"pyruvate ferredoxin oxidoreductase, beta subunit (EC 1.2.7.1)",-0.62,0.12
8,Clo1313_0640,"hydro-lyase, Fe-S type, tartrate/fumarate subfamily, alpha subunit",-0.01,0.98
9,Clo1313_0641,"hydro-lyase, Fe-S type, tartrate/fumarate subfamily, beta subunit",-0.33,0.51


In [23]:
check_pathway_transcription('glycolysis(glycolysis I).txt',results)

Unnamed: 0,Locus Tag,Gene Product Name,log2(FC),p-value adj.
0,Clo1313_0080,Phosphoglycerate mutase,0.68,0.13
1,Clo1313_0813,Phosphoglycerate mutase,0.43,0.46
2,Clo1313_0966,phosphoglycerate mutase (EC 5.4.2.1),-0.43,0.04
3,Clo1313_0997,6-phosphofructokinase (EC 2.7.1.11),0.56,0.1
4,Clo1313_1271,Phosphoglycerate mutase,-1.33,0.05
5,Clo1313_1517,Phosphoglycerate mutase,0.23,0.59
6,Clo1313_1875,fructose-bisphosphate aldolase (EC 4.1.2.13),0.83,0.11
7,Clo1313_1876,phosphofructokinase,0.88,0.15
8,Clo1313_2015,Glucose-6-phosphate isomerase,0.86,0.11
9,Clo1313_2090,enolase,0.65,0.11


In [24]:
# check_pathway_transcription('valine_biosynthesis.txt',df)

### List of most differentially transcribed genes

In [25]:
up_reg=results.sort_values('log2(FC)',ascending=False)
up_reg=up_reg[up_reg['p-value adj.']<=0.05]
up_reg=up_reg.reset_index(drop=True)
add_style(up_reg[['geneProduct','Locus Tag','log2(FC)','p-value adj.']].head(10))

Unnamed: 0,geneProduct,Locus Tag,log2(FC),p-value adj.
0,ketol-acid reductoisomerase (EC 1.1.1.86),Clo1313_0101,3.57,0.05
1,Uncharacterized conserved protein UCP033563,Clo1313_0745,2.86,0.04
2,"acetolactate synthase, large subunit (EC 2.2.1.6)",Clo1313_0099,2.84,0.04
3,"acetolactate synthase, large subunit (EC 2.2.1.6)",Clo1313_0305,2.79,0.02
4,ribonucleoside-diphosphate reductase class II (EC 1.17.4.-),Clo1313_2179,2.72,0.05
5,ribosomal protein S8,Clo1313_0459,2.71,0.02
6,SSU ribosomal protein S17P,Clo1313_0454,2.62,0.05
7,LSU ribosomal protein L29P,Clo1313_0453,2.6,0.02
8,"acetolactate synthase, small subunit (EC 2.2.1.6)",Clo1313_0100,2.6,0.02
9,"RNA polymerase, sigma subunit, SigV",Clo1313_0811,2.56,0.04


In [26]:
down_reg=results.sort_values('log2(FC)')
down_reg=down_reg[down_reg['p-value adj.']<=0.05]
down_reg=down_reg.reset_index(drop=True)
add_style(down_reg[['geneProduct','Locus Tag','log2(FC)','p-value adj.']].head(10))

Unnamed: 0,geneProduct,Locus Tag,log2(FC),p-value adj.
0,response regulator receiver protein,Clo1313_1416,-7.69,0.05
1,"two component transcriptional regulator, LuxR family",Clo1313_0991,-6.75,0.04
2,"MCP methyltransferase, CheR-type",Clo1313_1413,-6.6,0.05
3,response regulator receiver modulated CheB methylesterase,Clo1313_1414,-5.83,0.03
4,S-layer domain-containing protein,Clo1313_0084,-5.5,0.02
5,"amino acid ABC transporter substrate-binding protein, PAAT family (TC 3.A.1.3.-)",Clo1313_0531,-5.49,0.05
6,pseudaminic acid biosynthesis-associated methylase,Clo1313_2904,-5.02,0.05
7,hypothetical protein,Clo1313_2298,-4.83,0.02
8,flagellar hook-basal body protein,Clo1313_0208,-4.79,0.02
9,flagellar hook-associated protein FlgK,Clo1313_2916,-4.74,0.02


### Making a heat map (creating a customary color map)
After some thought, I think heat map is not a good idea to present this data, as there is only one condition. Without showing repeat samples, it will simply be a heat map of one column (pretty much defeats the purpose of a heat map.)
I leave the code here just for my future reference.

In [27]:
test_plot=df.iloc[:10,[2,4,5,6,7,8,9]]

In [28]:
test_plot

Unnamed: 0,Gene Symbol,WT-1,WT-2,WT-3,185-1,185-2,185-3
0,dnaA,95.68,68.16,33.52,12.22,13.38,14.04
1,,91.89,65.97,34.76,7.39,13.24,7.93
2,,31.74,23.39,12.33,6.55,7.83,12.05
3,recF,21.31,13.09,9.2,1.22,0.0,1.12
4,,221.06,242.11,267.31,103.35,82.3,130.65
5,gyrB,504.81,540.53,503.43,397.22,354.17,384.61
6,,131.89,193.19,180.62,94.27,75.07,96.3
7,,173.67,207.32,209.7,233.88,176.93,230.78
8,,110.78,168.52,97.56,55.85,50.83,97.81
9,,168.44,225.44,306.47,129.97,122.45,144.73


In [None]:
gene_names=list(test_plot.iloc[:,0])

In [None]:
# some exercise for making a heat map in ipython notebook. In this data set, as there is only one condition, 
# it is better to simply present data as fold change instead of showing the heatmap

cdict1 = {'red':   ((0.0, 0.0, 0.0),
                   (0.5, 0.0, 0.0),
                   (1.0, 1.0, 1.0)),

         'green': ((0.0, 1.0, 1.0),
                   (0.5, 0.0, 0.0),
                   (1.0, 0.0, 0.0)),

         'blue':  ((0.0, 0.0, 0.0),
                   (0.5, 0.0, 0.0),
                   (1.0, 0.0, 0.0))
        }
green_red=LinearSegmentedColormap('green_red',cdict1)
fig=plt.figure(figsize=(10,5))
plt.imshow(test_plot.iloc[:,1:],cmap=green_red)
plt.colorbar()
plt.grid(False)
ax=fig.gca()
ax.set_yticklabels(gene_names)
ax.tick_params(labelsize=20)
ax.set_yticks(range(10))
ax.set_xticks(range(6))