# Mx_Coculture: Data preparation

The co-culture dataset of HeLa and NIH3T3 cells was not available in the directory and file structure that SpaceM normally produces. This notebook therefore converts the data into that structure so that the same analysis pipeline can be applied to it.

In [None]:
import os
import platform
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import statistics as st
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import statsmodels.formula.api as smf
import seaborn as sns
import re
from importlib import reload
import json
import sys
sys.path.append('/home/mklein/spacem')
sys.path.append('/home/mklein/FDA_project')

from src.correction import (add_normalization_factors, 
                            correct_quantile_inplace,
                            deconvolution_spacem,
                            get_overlap_data,
                            add_overlap_matrix_spacem
                           )
from src import const 
from SpaceM.lib.modules import (
    overlap_analysis,
    single_cell_analysis_normalization
)

%matplotlib inline
%config InlineBackend.figure_formats = ['retina']

In [None]:
# Parameters
# Root directory under which this notebook creates `dataset/analysis/...`
# and writes the converted AnnData (.h5ad) and region (.csv) files.
source_path = "/home/mklein/Raw Data/Coculture"
# Directory the input files of this notebook (.npy and .csv) are read from.
target_path = "/home/mklein/FDA_project/data/Mx_Co_Cultured"
# NOTE(review): none of the parameters below are referenced in the cells of
# this notebook that are visible here; presumably they are consumed by the
# downstream pipeline notebooks -- confirm before changing or removing them.
condition_name = "condition"
well_name = "rowcol"
deconv_default_min_overlap = 0.3
analysis_path = "/home/mklein/FDA_project/analysis/Mx_Coculture"
notebooks = [
    "pipeline_01_correction.ipynb",
    "pipeline_02_processing.ipynb",
    "pipeline_03_evaluation.ipynb",
]
project = "Mx_Coculture"


In [None]:
# Names of the entries held in the pickled array, listed by position
# (position 1 is "cell_marks", 5 "marks_cell_overlap", 6 "mark_area",
# 9 "cell_area"). Entries are looked up by name below instead of by a
# hard-coded number.
dict_headers = [
    "norm_MM",
    "cell_marks",
    "nucl_fluo",
    "cell_fluo",
    "marks_fluo",
    "marks_cell_overlap",
    "mark_area",
    "overlap_indices",
    "marks_fluo_overlap",
    "cell_area",
    "marks_cell_overlap_indexes",
    "marks_cellLabels",
    "marks_samplingArea",
    "pmi",
    "overLaps",
]

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects while
# loading -- only use this with a trusted .npy file.
dataset_path = os.path.join(target_path, 'marks_flitered_fluo.npy')
dataset_co_cult = np.load(dataset_path, allow_pickle=True)

# Per-cell area entry (iterated with .items() further down).
cell_area = dataset_co_cult[dict_headers.index("cell_area")]

# For every cell: the ablation marks that cover it.
cell_marks = dataset_co_cult[dict_headers.index("cell_marks")]

# Size of every ablation mark.
mark_area = dataset_co_cult[dict_headers.index("mark_area")]

# For every cell: overlap area with each of its marks, in the same order
# as the marks listed in `cell_marks`.
marks_cell_overlap = dataset_co_cult[dict_headers.index("marks_cell_overlap")]

In [None]:
# One row per ablation mark, indexed by 'am_id', with the mark size in 'area'.
am_ids = list(mark_area)
am_sizes = list(mark_area.values())
mark_regions = pd.DataFrame({'am_id': am_ids, 'area': am_sizes}).set_index('am_id')
mark_regions

In [None]:
# One record per (cell, ablation mark) pair. The overlap area is taken from
# `marks_cell_overlap` at the same position the mark has in the cell's mark
# list; cells without marks contribute no records.
overlap_regions_list = [
    {
        'cell_id': int(cell),
        'am_id': int(mark),
        'area': float(marks_cell_overlap[str(cell)][position]),
    }
    for cell, marks in cell_marks.items()
    for position, mark in enumerate(marks)
]

overlap_regions = pd.DataFrame(overlap_regions_list)
# Overlap ids are 1-based running numbers.
overlap_ids = overlap_regions.index + 1
overlap_regions['overlap_id'] = overlap_ids
overlap_regions = overlap_regions.set_index('overlap_id')
overlap_regions

In [None]:
# One row per cell, indexed by 'cell_id'. The area is the first element of
# the stored value; cells with an empty value get an area of 0.0.
cell_regions_list = [
    {'cell_id': int(cell), 'area': float(sizes[0]) if len(sizes) > 0 else 0.0}
    for cell, sizes in cell_area.items()
]

cell_regions = pd.DataFrame(cell_regions_list).set_index('cell_id')
cell_regions

In [None]:
# Load the annotation table and drop the non-ion bookkeeping columns so that
# only the per-ion columns remain.
detections_path = os.path.join(target_path, 'sm_annotation_detections.csv')
ion_intensities = pd.read_csv(detections_path).drop(['Num', 'X', 'Y', 'Z', 'R'], axis=1)

# Optional subsetting, intentionally left disabled:
# ion_intensities = ion_intensities[['C16H30O2', "C25H44NO7P", "C45H78NO8P"]]
# ion_intensities = ion_intensities.iloc[selected_pixels]

# Use string row labels.
ion_intensities.index = list(map(str, ion_intensities.index))
ion_intensities


In [None]:
# Wrap the ion intensity table (cast to float32) in an AnnData object.
ion_intensities_f32 = ion_intensities.astype(np.float32)
am_adata = ad.AnnData(X=ion_intensities_f32)

In [None]:
# Per-cell table; rows are labelled with the project cell prefix followed by
# the object number.
condition_metadata = pd.read_csv(os.path.join(target_path, 'MORPHnMOL.csv'))
condition_metadata.index = [
    const.CELL_PRE + str(object_number)
    for object_number in condition_metadata['ObjectNumber']
]

# Short aliases for the two fluorescence channels and their log ratio.
condition_metadata['GFP'] = condition_metadata['Intensity_MeanIntensity_GFP_quantif']
condition_metadata['MCherry'] = condition_metadata['Intensity_MeanIntensity_mCherry_quantif']
gfp_over_mcherry = condition_metadata['GFP'] / condition_metadata['MCherry']
condition_metadata['fluorescence_ratio'] = np.log(gfp_over_mcherry)

# Cells with a log ratio below 0.8 are labelled 'HeLa', all others 'NIH3T3'.
below_threshold = condition_metadata['fluorescence_ratio'] < 0.8
condition_metadata['celltype'] = np.where(below_threshold, 'HeLa', 'NIH3T3')

print(condition_metadata['celltype'].value_counts())

# Visual check of the assignment on log-log axes.
plot = sns.relplot(data=condition_metadata, x='GFP', y='MCherry', hue='celltype')
plot.set(
    xscale='log',
    yscale='log',
    title='Cell type attributation based on GFP / MCherry fluorescence ratio',
)

# Cell-level AnnData restricted to the ions present in both the ablation-mark
# data and the per-cell table.
shared_ions = am_adata.var.index.intersection(condition_metadata.columns)
obs_columns = ['ObjectNumber', 'celltype', 'GFP', 'MCherry', 'fluorescence_ratio']
raw_adata = ad.AnnData(
    X=np.array(condition_metadata[shared_ions]),
    obs=condition_metadata[obs_columns],
    var=am_adata.var.loc[shared_ions],
)

In [None]:
# Keep only the ions that also occur as columns of the per-cell table.
ions_in_both = am_adata.var.index.intersection(condition_metadata.columns)
am_adata = am_adata[:, ions_in_both].copy()

In [None]:
# Load the cell x ion matrix; its unnamed first CSV column holds the cell ids.
r_adata = pd.read_csv(os.path.join(target_path, 'cell_spatiomolecular_matrix.csv'))
cell_ids = r_adata['Unnamed: 0']
del r_adata['Unnamed: 0']
r_adata['cell_id'] = cell_ids

# Index by cell id and treat missing intensities as 0.
r_adata_indexed = r_adata.set_index('cell_id')
r_adata_df = r_adata_indexed.replace(np.nan, 0)

In [None]:
# Reuse obs/var of the fluorescence-annotated cell object and replace its
# matrix with the intensities read from the cell spatiomolecular matrix.
r_cell_adata = raw_adata.copy()

# Assigning to `.X` is positional: row and column labels of a DataFrame are
# not aligned with obs_names / var_names. If the matrix holds exactly the
# same ions as var_names, reorder its columns by name so that a different
# column order in the CSV cannot silently mislabel ions. In every other case
# (different names, duplicates, different count) keep the plain positional
# assignment, which fails loudly on a shape mismatch.
matrix_columns = r_adata_df.columns
columns_match_by_name = (
    matrix_columns.is_unique
    and len(matrix_columns) == r_cell_adata.n_vars
    and set(matrix_columns) == set(r_cell_adata.var_names)
)
if columns_match_by_name:
    cell_matrix = r_adata_df.loc[:, list(r_cell_adata.var_names)]
else:
    cell_matrix = r_adata_df

# NOTE(review): rows are still matched by position -- this assumes the CSV
# lists the cells in the same order as `raw_adata.obs`; confirm.
r_cell_adata.X = cell_matrix
r_cell_adata.to_df()

In [None]:
# Write the converted data below `source_path` using the directory layout
# 'dataset/analysis/<step>/...'.
analysis_root = os.path.join(source_path, 'dataset/analysis')
ablation_mark_dir = os.path.join(analysis_root, 'ablation_mark_analysis')
overlap_dir = os.path.join(analysis_root, 'overlap_analysis2')
single_cell_dir = os.path.join(analysis_root, 'single_cell_analysis')

# exist_ok=True replaces the check-then-create pattern: no race between the
# existence check and the creation, and re-running the cell is harmless.
for output_dir in (ablation_mark_dir, overlap_dir, single_cell_dir):
    os.makedirs(output_dir, exist_ok=True)

# Ablation-mark level and cell level matrices. The cell-level file holds
# `r_cell_adata` (intensities from the CSV matrix); `raw_adata` is
# deliberately not written.
am_adata.write(os.path.join(ablation_mark_dir, 'spatiomolecular_adata.h5ad'))
r_cell_adata.write(os.path.join(single_cell_dir, 'spatiomolecular_adata.h5ad'))

# Region tables: cells, ablation marks and their pairwise overlaps.
cell_regions.to_csv(os.path.join(overlap_dir, 'cell.regions.csv'))
mark_regions.to_csv(os.path.join(overlap_dir, 'ablation_mark.regions.csv'))
overlap_regions.to_csv(os.path.join(overlap_dir, 'overlap.regions.csv'))