This notebook combines all of the tissue, spatial, and network features for each sample with their clinical/immunohistochemical features to create the data file used in the regression and prediction analysis.

Other versions of the data are also created for sensitivity tests.

Author: Lucy Van Kleunen

In [1]:
# Load the needed libraries
import numpy as np
import pickle
import pandas as pd
import viz_helper
import os

In [2]:
# Names of the three reference cell types; distance and contact features are
# computed relative to each of them.
TUMOR = "Tumor"
M1 = "M1 macrophages"
VASCULAR = "Vascular endothelial cells"

# Data versions to generate. They differ only in how features are recorded for
# a cell type that is missing from a sample.
versions = [
    "missing_na",  # missing cell type -> features listed as NA
    "main",        # main text: missing cell type -> features listed as 0 (except distance)
]

In [3]:
## Read in the clinical attributes and all of the feature objects and combine into a data file for analysis

def _load_pickle(name):
    """Return the object stored in analysis_objects/<name>.pickle."""
    # NOTE: unpickling can execute arbitrary code, so only load trusted files
    # (presumably these were written by earlier notebooks of this project).
    with open(os.path.join('analysis_objects', name + '.pickle'), 'rb') as handle:
        return pickle.load(handle)

# Cell types per sample
type_colors = viz_helper.get_cell_type_colors()
unique_types = list(type_colors.keys())
cell_types = _load_pickle('cell_types_trimmed')

# ID Map: stored as TMA id -> sample (fov) ids; invert it to sample id -> TMA id.
# If a sample id were listed under several TMAs, the last one iterated wins.
tma_fovint_map = _load_pickle('tma_fovint_map')
fovint_tma_map = {
    fovint: tma
    for tma in tma_fovint_map
    for fovint in tma_fovint_map[tma]
}

# Clinical/immunohistochemical Features (only rows flagged as included)
DATA_FILE = os.path.join('Data', 'tma_info.csv')
df = pd.read_csv(DATA_FILE)
df = df[df["Included"] == 1]

# Tissue Features
type_props = _load_pickle('type_props_trimmed')

# Spatial Features
median_distances_tumor = _load_pickle('median_distances_tumor')
median_distances_M1 = _load_pickle('median_distances_M1')
median_distances_vascular = _load_pickle('median_distances_vascular')

# Network Features
region_sizes_samples = _load_pickle('region_sizes_samples')
assorts_samples = _load_pickle('assorts_samples')
contact_enrichments_tumor = _load_pickle('contact_enrichments_tumor')
contact_enrichments_M1 = _load_pickle('contact_enrichments_M1')
contact_enrichments_vascular = _load_pickle('contact_enrichments_vascular')

In [4]:
# Clinical/immunohistochemical columns joined onto every sample row. They do not
# depend on the data version, so they are selected and renamed once up front.
other_df = df[['TMA ID', 'Overall Survival', 'Death (1=yes, 0=no)',
               'BRCA-mutation (Y=1, N=0, N/A=Not tested)', 'Age at Diagnosis',
               'Progression-free Survival', 'Recurrence 1=yes', 'H3K14Ace', 'ATF6',
               'DUSP1', 'CBX2', 'OS_high', 'PFS_high', 'Primary']].copy()
rename_dict = {'TMA ID': 'TMA_ID', 'Overall Survival': 'OS', 'Death (1=yes, 0=no)': 'Death',
               'BRCA-mutation (Y=1, N=0, N/A=Not tested)': 'BRCA_Mutation',
               'Age at Diagnosis': 'Age', 'Progression-free Survival': 'PFS',
               'Recurrence 1=yes': 'Recurrence'}
other_df.rename(columns=rename_dict, inplace=True)

# One entry per reference cell type:
#   (reference type, column tag, median distances, contact enrichments, guarded)
# "guarded" means the per-sample distance entry is checked for emptiness before
# use (the sample has no cells of the reference type). Tumor lookups are not
# guarded -- presumably every sample contains tumor cells; TODO confirm.
reference_features = [
    (TUMOR, 'tumor', median_distances_tumor, contact_enrichments_tumor, False),
    (M1, 'M1', median_distances_M1, contact_enrichments_M1, True),
    (VASCULAR, 'vascular', median_distances_vascular, contact_enrichments_vascular, True),
]

for v in versions:

    # Value recorded when a feature cannot be computed for a sample: NA in the
    # 'missing_na' version, 0 otherwise. Median distances are always NA.
    fill = np.nan if v == 'missing_na' else 0

    # Column name -> list of per-sample values. The insertion order here fixes
    # the column order of the output CSV.
    mibi_table = {'fov_id': [], 'TMA_ID': []}
    for key in unique_types:
        mibi_table[key + '_assort'] = []  # cell-cell assortativities
        mibi_table[key + '_prop'] = []  # cell type proportions
        mibi_table[key + '_region'] = []  # mean region size
        for ref_type, tag, _, _, _ in reference_features:
            if key != ref_type:
                mibi_table[key + '_' + tag + '_med_dist'] = []  # median distance
                mibi_table[key + '_' + tag + '_contact'] = []  # contact enrichment score

    # For each sample
    for fov_int in cell_types:
        sample_cells = cell_types[fov_int]
        present_types = set(sample_cells)  # built once per sample for fast membership tests

        mibi_table['fov_id'].append(fov_int)
        mibi_table['TMA_ID'].append(fovint_tma_map[fov_int])

        for key in unique_types:
            present = key in present_types

            if present:
                # A type with exactly one cell in the sample gets the fill
                # value instead of the stored assortativity.
                if sample_cells.count(key) != 1:
                    assort = assorts_samples[fov_int][key]
                else:
                    assort = fill
                prop = type_props[fov_int][key]
                region = region_sizes_samples[fov_int][key]
            else:
                assort = prop = region = fill

            mibi_table[key + '_assort'].append(assort)
            mibi_table[key + '_prop'].append(prop)
            mibi_table[key + '_region'].append(region)

            for ref_type, tag, med_dists, contacts, guarded in reference_features:
                if key == ref_type:
                    continue  # no distance/contact of a type relative to itself
                # Pairwise features exist only if this cell type is in the
                # sample and (for guarded references) the reference type is too.
                if present and (not guarded or med_dists[fov_int]):
                    med_dist = med_dists[fov_int][key]
                    contact = contacts[fov_int][key]
                else:
                    med_dist = np.nan
                    contact = fill
                mibi_table[key + '_' + tag + '_med_dist'].append(med_dist)
                mibi_table[key + '_' + tag + '_contact'].append(contact)

    mibi_df = pd.DataFrame(mibi_table)

    # Left join keeps every sample row even if its TMA_ID has no clinical record.
    mibi_df = pd.merge(mibi_df, other_df, on='TMA_ID', how='left')

    mibi_df.to_csv(os.path.join('Data', f'mibi_table_{v}.csv'), index=False)

    if v == "main":
        # primary - Trim to only primary tumors
        mibi_primary_only = mibi_df[mibi_df["Primary"] == 1]
        mibi_primary_only.to_csv(os.path.join('Data', 'mibi_table_primary.csv'), index=False)