# Python code for differential expression analysis by using rpy2 in Jupyter Notebook.
For this analysis I adapted code from Shiva Prasad Patil https://github.com/shivaprasad-patil/LIMMA-Python-implementation <br>
There are several good resources for DEG analysis in Python:<br>
&emsp;&emsp; https://github.com/shivaprasad-patil/LIMMA-Python-implementation<br>
&emsp;&emsp; https://github.com/wckdouglas/diffexpr<br>
&emsp;&emsp; https://github.com/jhonP-Li/DE_rpy2<br>
&emsp;&emsp; https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2

# Import Python and R libraries

In [9]:
# -*- coding: utf-8 -*-
"""
@author: Nitish K Mishra
email:nitishimtech@gmail.com
"""

import sys
import os
import click
import numpy as np
import pandas as pd
import rpy2.robjects as ro
#from rpy2.rinterface import RRuntimeError
from rpy2.robjects import pandas2ri, Formula
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
from statsmodels.stats.multitest import multipletests

# Load the R packages used below and bind each one to a Python handle.
base, stats, limma, writexl = map(importr, ('base', 'stats', 'limma', 'writexl'))

# Show the working directory; the relative data paths used later are resolved against it.
os.getcwd()

'/research/rgs01/home/clusterHome/nmishra/diffexpr-master'

<font size =4> In this analysis I use a design that I generate with pandas (`pd.DataFrame`). A sample sheet in CSV/XLSX format can also be used to build the design matrix (see the cell below). For this notebook I use the example data from https://github.com/wckdouglas/diffexpr, so I follow that project's code for the design matrix. </font> 

# Make design matrix

In [10]:
# Expression table: tab-separated file, indexed by the 'id' column.
# Replace the path (and the 'id' column name) with your own data; an Excel
# sheet can be loaded with pd.read_excel(...) instead of pd.read_csv(...).
data = pd.read_csv('./test/data/ercc.tsv', sep="\t").set_index('id')

# Sample sheet describing the design; replace with your own design file.
design = pd.read_excel('limma_design_file.xlsx')
design

Unnamed: 0,ID,Target
0,1,zero
1,2,zero
2,3,zero
3,4,one
4,5,zero
5,6,one
6,7,zero
7,8,zero
8,9,zero
9,10,one


# Make design matrix from pandas

In [11]:
# Derive per-sample metadata from the column names of `data`:
#   sample    -> the A/B letter in front of the underscore
#   replicate -> the digit (1-3) after the underscore
# The frame is indexed by sample name while keeping `samplename` as a column.
sample_df = (
    pd.DataFrame({'samplename': data.columns})
    .query('samplename != "id"')
    .assign(
        sample=lambda d: d['samplename'].str.extract('([AB])_', expand=False),
        replicate=lambda d: d['samplename'].str.extract('_([123])', expand=False),
    )
    .set_index('samplename', drop=False)
)
sample_df

Unnamed: 0_level_0,samplename,sample,replicate
samplename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A_1,A_1,A,1
A_2,A_2,A,2
A_3,A_3,A,3
B_1,B_1,B,1
B_2,B_2,B,2
B_3,B_3,B,3


# Convert data and design pandas dataframes to R dataframes

In [12]:
# Hand the pandas objects over to R: the expression table, the sample
# metadata, and the row labels of `data` as an R character vector.
with localconverter(ro.default_converter + pandas2ri.converter):
    r_data = ro.conversion.py2rpy(data)
    r_design = ro.conversion.py2rpy(sample_df)
    genes = ro.StrVector(list(map(str, data.index.tolist())))


# Create a model matrix using design

In [13]:
# Build the model matrix from the 'sample' column with the R formula "~0 + f",
# which yields one column per unique factor level (no intercept).
# NOTE(review): this rebinds `r_design` from the converted sample table to the
# model matrix, so re-running this cell presumably requires re-running the
# conversion cell first.
sample_labels = r_design.rx2('sample')
f = base.factor(sample_labels, levels=base.unique(sample_labels))

form = Formula('~0 + f')
form.environment['f'] = f  # make the factor visible to the formula

r_design = stats.model_matrix(form)
r_design.colnames = base.levels(f)  # name the columns after the factor levels


# Differential expression analysis and save result

In [14]:
# Differential expression with limma: linear fit -> contrast -> empirical Bayes -> ranked table.
# NOTE(review): limma's linear models are normally applied to log-scale
# expression values; confirm whether `data` needs a log transform first.

# Fit the data to the design (model matrix from the previous cell) using lmFit.
# The model is fitted exactly once here; rebuilding the model matrix and
# fitting a second time would only repeat the same computation.
fit = limma.lmFit(r_data, r_design)

# Make a contrasts matrix comparing the first design column with the last one.
contrast_matrix = limma.makeContrasts(f"{r_design.colnames[0]}-{r_design.colnames[-1]}", levels=r_design)

# Fit the contrasts matrix to the lmFit result and compute the moderated statistics.
fit2 = limma.contrasts_fit(fit, contrast_matrix)
fit2 = limma.eBayes(fit2)

# Rank all genes for the single contrast and attach the gene labels.
# `np.inf` (lower case) is used because the `np.Inf` alias was removed in NumPy 2.0.
r_output = limma.topTreat(fit2, coef=1, genelist=genes, number=np.inf)

# Save the full result table to an Excel file in the working directory.
writexl.write_xlsx(r_output, "limma_output.xlsx")

0
'/research/rgs01/home/clusterHome/nmishra/diffexp...


# Display head of R data.frame
Python code to display the head of the R data.frame. The result must first be passed through the pandas converter before it can be printed from Python.

In [15]:
# Preview the first rows of the limma result table.
# `base` is already bound in the import cell and is not needed here, so only
# the R 'utils' package (which provides head()) is loaded.
utils = importr('utils')
with localconverter(ro.default_converter + pandas2ri.converter):
    # With the pandas converter active, the result of head() is handed back
    # as a Python-side table that can be printed directly.
    df_head = utils.head(r_output)
print(df_head)

                    ID          logFC          t   P.Value  adj.P.Val  \
ERCC-00131  ERCC-00131     324.666667  17.731300  0.000033   0.003024   
ERCC-00092  ERCC-00092    1174.666667  13.576244  0.000103   0.003422   
ERCC-00116  ERCC-00116    4153.000000  12.960876  0.000125   0.003422   
ERCC-00136  ERCC-00136    6008.000000  12.274932  0.000157   0.003422   
ERCC-00019  ERCC-00019      77.333333  11.569855  0.000202   0.003422   
ERCC-00130  ERCC-00130  120099.000000  11.301389  0.000223   0.003422   

                   B  
ERCC-00131 -0.990088  
ERCC-00092 -1.098259  
ERCC-00116 -1.122760  
ERCC-00136 -1.154040  
ERCC-00019 -1.191437  
ERCC-00130 -1.207309  
