# reComBat Batch Correction

Either reComBat or a Z-score relative to control was used to compare data across biological repeats. This workflow demonstrates the reComBat algorithm applied across conditions and across biological repeats. This run assumes there is no ground-truth run, but that can be adjusted for in the reComBat source code. Batch correction (and Z-score comparison) works best on Gaussian distributions, so we log-transformed the biomarker expression values. This code walks through batch correction of p16 expression for the young control condition across two biorepeats. (Note: for the paper we performed a two-axis correction, as p21 was co-stained with one of the remaining four biomarkers.)

In [None]:
import numpy as np
import pandas as pd
import os
from glob import glob
from reComBat import reComBat
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import preprocessing

# Preprocessing 

In [None]:
# Load the per-cell measurement table (replace the placeholder with the real CSV path).
data = pd.read_csv('\\path to Batch correction csv\\')  # read in the dataframe

# Log-transform p16 so its distribution is closer to Gaussian before correction.
# NOTE(review): np.log10 maps 0 to -inf and negative values to NaN. NaN rows are
# dropped later via dropna, but -inf is not -- confirm 'P16_2' is strictly positive.
data['P16_Logged'] = np.log10(data['P16_2'])

# Placeholder column that re_pycombat_application fills in per cell.
# A scalar NaN broadcasts to every row; assigning an empty list raises
# ValueError on any non-empty DataFrame because the lengths do not match.
data['Batch_Corrected'] = np.nan

# filter(like=..., axis=0) selects rows whose *index label* contains the substring.
# NOTE(review): this only matches if the row index holds labels containing
# 'Bio1' / 'Bio2'; with a default integer index both frames are empty -- confirm
# how the CSV index is set up (e.g. via index_col in read_csv).
BATCH_1 = data.filter(like='Bio1', axis=0)  # cells from biorepeat 1
BATCH_2 = data.filter(like='Bio2', axis=0)  # cells from biorepeat 2

# Now let's perform the batch correction

In [7]:
def re_pycombat_application(data, BATCH_1, BATCH_2, biomarker, condition):
    """Batch-correct one biomarker across two biorepeats with reComBat.

    For each batch, the cells that have a value for ``biomarker`` and whose
    ``'label'`` column equals ``condition`` are selected. The selected values
    are stacked into a single (cells x 1) float table, a reComBat model is
    fitted with batch number 0 for ``BATCH_1`` and 1 for ``BATCH_2``, and the
    corrected values are written into ``data['Batch_Corrected']`` at the
    matching row labels.

    Args:
        data: Full DataFrame that receives the corrected values. It is
            modified in place.
        BATCH_1: Rows of the first biorepeat; needs the columns ``biomarker``
            and ``'label'``.
        BATCH_2: Rows of the second biorepeat; same columns as ``BATCH_1``.
        biomarker: Name of the column holding the expression values to correct.
        condition: Value of the ``'label'`` column identifying the cells to use.

    Returns:
        The same ``data`` object, with ``'Batch_Corrected'`` filled in for the
        selected cells.

    Raises:
        ValueError: If a batch has no cells matching ``condition`` with a
            non-missing ``biomarker`` value.
    """
    selections = []
    for number, batch_frame in enumerate((BATCH_1, BATCH_2)):
        # Keep only cells expressing the biomarker and matching the condition.
        cells = batch_frame.dropna(subset=[biomarker])
        cells = cells[cells['label'] == condition]
        if cells.empty:
            # Fail early with a readable message instead of an obscure error
            # from inside the reComBat fit.
            raise ValueError(
                f"Batch {number + 1} has no cells with label {condition!r} "
                f"and a non-missing {biomarker!r} value"
            )
        selections.append(cells[[biomarker]])

    # One row per cell, one column for the biomarker; cast to float for the math.
    df_expression = pd.concat(selections).astype('float64')

    # Batch number for every cell, in the same row order as df_expression.
    batch_numbers = []
    for number, cells in enumerate(selections):
        batch_numbers.extend([number] * len(cells))
    # The batch Series must share the expression table's index: a default
    # 0..n-1 index would not line up with the cell labels of the data.
    batch = pd.Series(batch_numbers, index=df_expression.index)

    combat = reComBat(verbose=False)
    combat.fit(df_expression, batch)  # fit the model on expression + batch numbers

    df_corrected = combat.transform(df_expression, batch)
    df_corrected.columns = ['Batch_Corrected']

    # Write each corrected value back to the original table by row label.
    for cell_label, value in df_corrected['Batch_Corrected'].items():
        data.loc[cell_label, 'Batch_Corrected'] = value

    return data
              
              
              

In [None]:
# Run the correction for log10 p16 expression of the 'Young Control' cells
# across the two biorepeats.
data_corrected = re_pycombat_application(
    data=data,
    BATCH_1=BATCH_1,
    BATCH_2=BATCH_2,
    biomarker='P16_Logged',
    condition='Young Control',
)