In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

## Create combined in vivo + ex vivo DGE tables for correlation analyses

In [2]:
## Filename prefixes of the per-cell-type DGE result tables.
## Despite the `_dir` suffix these are not directories: the cell type name and a
## file suffix are appended directly to each string when the tables are read.
invivo_dir = (
    '/home/niklas/projects/niche_environments_FIBROSIS/'
    'IPF_cell_atlas_reference_CPC/01_data/DGE_IPF_vs_healthy/'
    '210616_CPC_IPF_reference_DGE_IPF_vs_healthy_'
)
exvivo_dir = (
    '/home/niklas/projects/niche_environments_FIBROSIS/'
    'PCLS_human/01_data/ASK_joint/DGE_treatment_vs_CC/'
    '211206_PCLS_human_ASK_joint_'
)

In [3]:
## Cell type labels. Each label is inserted into the DGE file names when the
## tables are read and is used as the prefix of the renamed result columns.
cell_type_names = [
    'Alveolar_Epithelium',
    'Airway_Epithelium',
    'Fibroblasts',
    'SMC_Pericytes',
    'capillary_EC',
    'vascular_EC',
    'lymphatic_EC',
    'Macrophages',
    'Mast_cells',
    'B_cells',
    'Plasma_cells',
    'T_cells',
    'NK_cells',
]

In [4]:
## Start with three separate, empty master tables:
##   master                          - all genes
##   master_filtered_pct_expressed   - filtered by qval and by pct expressed
##   master_filtered_pval            - filtered by qval only
## The per-cell-type loop below produces their final content.
master, master_filtered_pct_expressed, master_filtered_pval = (
    pd.DataFrame() for _ in range(3)
)

In [5]:
## Build one combined DGE table per cell type plus the three MASTER tables.
##
## For every cell type the four DGE result files (in vivo, ex vivo FC, FC+CMP4,
## FC+Nintedanib) are read, reduced to log2fc / qval / two pct columns, capped,
## renamed to '<cell type>_<contrast>_<column>' and outer-joined on the index.
## The unfiltered per-cell-type table is written to CSV; all variants are then
## joined column-wise across cell types into the MASTER tables.

## thresholds used below
LOG2FC_CAP = 10              # log2FC values outside [-cap, +cap] are replaced by the bound
QVAL_CUTOFF = 0.05           # rows are kept when qval is strictly below this value
PCT_EXPRESSED_CUTOFF = 0.1   # rows are kept when the second pct column is strictly above this value

## one entry per contrast:
##   (label used in the column names, filename prefix, filename suffix,
##    (first pct column, its new name), (second pct column, its new name))
## the second pct column is the one the pct-expressed filter is applied to
dge_contrasts = [
    ('invivo', invivo_dir, '.csv',
     ('pct.healthy', 'pct_healthy'), ('pct.ILD', 'pct_ILD')),
    ('exvivo', exvivo_dir, '_FC_vs_CC_DGE_results.csv',
     ('pct.CCs', 'pct_CC'), ('pct.FCs', 'pct_FC')),
    ('CMP4', exvivo_dir, '_FC+CMP4_vs_CC_DGE_results.csv',
     ('pct.CCs', 'pct_CC'), ('pct.FC+CMP4s', 'pct_FC_CMP4')),
    ('Nintedanib', exvivo_dir, '_FC+Nintedanib_vs_CC_DGE_results.csv',
     ('pct.CCs', 'pct_CC'), ('pct.FC+Nintedanibs', 'pct_FC_Nintedanib')),
]


def _load_dge_variants(csv_path, prefix, first_pct, second_pct):
    """Load one DGE result table and return its three variants.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; its first column becomes the index.
    prefix : str
        Prefix of the renamed columns ('<cell type>_<contrast>').
    first_pct, second_pct : tuple of (str, str)
        Original pct column name and the name it gets after `prefix + '_'`.

    Returns
    -------
    tuple of three DataFrames
        (all rows, rows with qval < QVAL_CUTOFF,
         rows that additionally have second pct column > PCT_EXPRESSED_CUTOFF)
    """
    first_col, first_name = first_pct
    second_col, second_name = second_pct
    qval_name = prefix + '_pval_adj'
    second_pct_name = prefix + '_' + second_name

    table = pd.read_csv(csv_path, index_col=0)
    ## explicit copy: the column selection must not be modified as a view of the raw table
    table = table[['log2fc', 'qval', first_col, second_col]].copy()
    ## replace exorbitantly high/low logFC values by the cap (NaN stays NaN)
    table['log2fc'] = table['log2fc'].clip(lower=-LOG2FC_CAP, upper=LOG2FC_CAP)
    table = table.rename(columns={'log2fc': prefix + '_log2FC',
                                  'qval': qval_name,
                                  first_col: prefix + '_' + first_name,
                                  second_col: second_pct_name})

    filtered_pval = table[table[qval_name] < QVAL_CUTOFF]
    filtered_pct_expressed = filtered_pval[filtered_pval[second_pct_name] > PCT_EXPRESSED_CUTOFF]
    return table, filtered_pval, filtered_pct_expressed


def _concat_columns(frames):
    """Outer-join the frames column-wise; an empty list gives an empty DataFrame."""
    return pd.concat(frames, axis=1, join='outer') if frames else pd.DataFrame()


## collect the per-cell-type tables and join them once after the loop, so that
## re-running this cell cannot append the same columns a second time
comparisons = []
comparisons_filtered_pval = []
comparisons_filtered_pct_expressed = []

for ct in cell_type_names:
    ## read and prepare the four contrasts of this cell type
    variants = [_load_dge_variants(file_prefix + ct + file_suffix, ct + '_' + label, first_pct, second_pct)
                for label, file_prefix, file_suffix, first_pct, second_pct in dge_contrasts]
    all_tables, pval_tables, pct_tables = zip(*variants)

    ## merge tables
    comparison = _concat_columns(list(all_tables))
    comparison_filtered_pval = _concat_columns(list(pval_tables))
    comparison_filtered_pct_expressed = _concat_columns(list(pct_tables))

    ## save results table (unfiltered variant only)
    csv_dir = '/home/niklas/projects/niche_environments_FIBROSIS/PCLS_human/01_data/ASK_joint/DGE_correlation_tables/220114_dge_correlation_table_' + ct + '.csv'
    comparison.to_csv(csv_dir, index=True)

    comparisons.append(comparison)
    comparisons_filtered_pval.append(comparison_filtered_pval)
    comparisons_filtered_pct_expressed.append(comparison_filtered_pct_expressed)

## build the 'MASTER' tables: columns of all cell types side by side
master = _concat_columns(comparisons)
master_filtered_pct_expressed = _concat_columns(comparisons_filtered_pct_expressed)
master_filtered_pval = _concat_columns(comparisons_filtered_pval)

In [6]:
## preview the combined master table (first rows, at most 15)
master.head(15)

Unnamed: 0_level_0,Alveolar_Epithelium_invivo_log2FC,Alveolar_Epithelium_invivo_pval_adj,Alveolar_Epithelium_invivo_pct_healthy,Alveolar_Epithelium_invivo_pct_ILD,Alveolar_Epithelium_exvivo_log2FC,Alveolar_Epithelium_exvivo_pval_adj,Alveolar_Epithelium_exvivo_pct_CC,Alveolar_Epithelium_exvivo_pct_FC,Alveolar_Epithelium_CMP4_log2FC,Alveolar_Epithelium_CMP4_pval_adj,...,NK_cells_exvivo_pct_CC,NK_cells_exvivo_pct_FC,NK_cells_CMP4_log2FC,NK_cells_CMP4_pval_adj,NK_cells_CMP4_pct_CC,NK_cells_CMP4_pct_FC_CMP4,NK_cells_Nintedanib_log2FC,NK_cells_Nintedanib_pval_adj,NK_cells_Nintedanib_pct_CC,NK_cells_Nintedanib_pct_FC_Nintedanib
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GPX1,10.0,0.0,0.208328,0.233239,,,,,,,...,,,,,,,,,,
NAPRT,10.0,0.0,0.0569,0.035651,-0.775898,5.549926e-06,0.193662,0.151819,-1.067108,6.361869e-10,...,,,,,,,,,,
CRYBG3,10.0,0.0,0.045819,0.05055,-0.267367,0.06602945,0.268486,0.260979,-0.275646,0.03066237,...,,,,,,,,,,
TIFA,10.0,0.0,0.166655,0.089571,,,,,,,...,,,,,,,,,,
LUC7L2,10.0,0.0,0.13739,0.115999,,,,,-0.645198,3.971449e-06,...,,,,,,,1.510993,0.000156,0.016423,0.077441
FAM234A,10.0,0.0,0.083005,0.066513,-0.2676,0.08749059,0.21831,0.217064,-0.482588,0.0001845592,...,,,,,,,,,,
TMEM120A,10.0,0.0,0.097009,0.041149,-0.322019,0.05725964,0.198944,0.196989,-0.573691,0.0001181347,...,,,,,,,,,,
ACAT2,10.0,0.0,0.379844,0.081944,-0.500906,2.927326e-06,0.426937,0.350063,-0.54356,2.552094e-07,...,,,,,,,,,,
TMEM70,10.0,0.0,0.076071,0.018092,-0.876209,6.628555e-08,0.210387,0.145546,-1.020495,2.21303e-10,...,,,,,,,,,,
LRRC75A,10.0,0.0,0.061965,0.012061,0.314567,0.03658633,0.226232,0.31995,,,...,,,,,,,,,,


In [7]:
## (rows, columns) of the unfiltered master table
master.shape

(18050, 208)

In [8]:
## (rows, columns) of the master table filtered by qval and by pct expressed
master_filtered_pct_expressed.shape

(12859, 208)

In [9]:
## (rows, columns) of the master table filtered by qval only
master_filtered_pval.shape

(17801, 208)

In [10]:
## write the unfiltered master table to CSV (index column included)
csv_dir = '/home/niklas/projects/niche_environments_FIBROSIS/HUMAN_invivo_exvivo_comparison/01_data/ASK_joint/220212_ASK_joint_MASTER_dge_correlation_table.csv'
master.to_csv(csv_dir, index=True)

In [11]:
## same shape check as above, repeated for the table that the next cell writes to CSV
master_filtered_pct_expressed.shape

(12859, 208)

In [12]:
## write the master table filtered by qval and pct expressed to CSV (index column included)
csv_dir = '/home/niklas/projects/niche_environments_FIBROSIS/HUMAN_invivo_exvivo_comparison/01_data/ASK_joint/220212_ASK_joint_MASTER_dge_correlation_table_filtered_pct_expr.csv'
master_filtered_pct_expressed.to_csv(csv_dir, index=True)

In [13]:
## same shape check as above, repeated for the table that the next cell writes to CSV
master_filtered_pval.shape

(17801, 208)

In [14]:
## write the master table filtered by qval only to CSV (index column included)
csv_dir = '/home/niklas/projects/niche_environments_FIBROSIS/HUMAN_invivo_exvivo_comparison/01_data/ASK_joint/220212_ASK_joint_MASTER_dge_correlation_table_filtered_pval.csv'
master_filtered_pval.to_csv(csv_dir, index=True)