# Preamble

This notebook contains the code used to generate the preprocessed single-cell data. Running it requires installing `flowCore`, which is not pre-installed in the Docker container, and this setup was not tested.
Nevertheless, this file gives a general idea of how the data was preprocessed.

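**Note:** The result of this notebook, `immuneclock_singlecell_unstim.h5`, is provided (see `README.md`). That file name corresponds to running with `stimulations = "unstim"`; with the value set below (`"all"`) the notebook writes `immuneclock_singlecell_all.h5` instead.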

# General parameters

In [None]:
# Which stimulation conditions to load: "unstim" keeps only the unstimulated
# files, "all" keeps every stimulation; any other value raises a ValueError
# in the file-selection cell.
stimulations = "all"
# NOTE(review): not referenced in the visible cells; the thread limit for
# external libraries in the import cell is hard-coded to 16 — confirm intent.
n_threads = 64
# Passed to `load_fcs` as its `n_jobs` argument when reading the FCS files.
n_jobs_fcs = 64

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

# Configure threading of BLAS and other external libraries before numpy & co.
# are imported further down.
# NOTE(review): the call passes n_threads=16, so this presumably caps rather
# than disables parallelization, and it does not use the notebook-level
# `n_threads` parameter — confirm against the corals helper.
from corals.threads import set_threads_for_external_libraries
set_threads_for_external_libraries(n_threads=16)

# general
import re
import collections
import pickle
import warnings 
import joblib
import pathlib

# data
import numpy as np
import pandas as pd
import h5py

# ml / stats
import sklearn
import scipy.stats

# plotting
import matplotlib.pyplot as plt

# nalab
from coralsarticle.data.process.fcs.load import map_categories
from coralsarticle.data.process.fcs.load import load_fcs

In [None]:
# Run rpy2's `situation` module to report the R/rpy2 setup.
# NOTE: the report may state R version 3.6.3 (the one installed on the server)
# although 4.0.3 is the version actually used; check with
# `import rpy2.robjects as robjects; robjects.r["version"]`.
%run -m rpy2.situation

# Load Cytof data

In [None]:
# Root of the data checkout, given relative to the notebook's working directory.
data_dir = pathlib.Path("../../nalab-data/data")
# Directory that is globbed for the FCS files in the next cell
# (per its path: gated immune-clock training files including stimulations).
fcs_dir = data_dir / "large/fcs/immune_clock/Training_gated_with-stimulations/fcs_files"

In [None]:
%%time
# Choose the file-name pattern for the requested stimulation selector.
# Both patterns only accept names of the form "Gates_PTLG", digits, "_",
# then timepoint token 2 or 3; "unstim" additionally requires the "Unstim" tag.
if stimulations == "unstim":
    filename_pattern = re.compile("Gates_PTLG[0-9]*?_[23]_Unstim.*")
elif stimulations == "all":
    filename_pattern = re.compile("Gates_PTLG[0-9]*?_[23]_.*")
else:
    # Format the value into the message: passing it as a second positional
    # argument would make the exception render as a tuple.
    raise ValueError(f"Unknown stimulation selector: {stimulations}")

# `match` anchors at the start of the bare file name (directories excluded).
cytof_filenames = [f for f in fcs_dir.glob("*") if filename_pattern.match(f.name)]

print("Number of FCS files:", len(cytof_filenames))

In [None]:
%%time
# Read all selected FCS files through the project helper `load_fcs`, handing
# it `n_jobs_fcs` as its `n_jobs` argument (presumably the number of parallel
# workers — confirm against the helper). The following cells treat the result
# as a DataFrame with a categorical "file_name" column.
cytof = load_fcs(cytof_filenames, n_jobs=n_jobs_fcs)

In [None]:
%%time
# Derive an integer patient id for every row from its source file name.
# The id is the text after "PTLG" (leading zeros skipped) up to the next
# underscore, converted with int().
patient_id_regex = re.compile(".*PTLG0*(.*?)_")
map_code_to_patientid = dict(enumerate(
    int(patient_id_regex.search(name).group(1))
    for name in cytof["file_name"].cat.categories))

# Translate each row's category code into the matching patient id.
file_codes = cytof["file_name"].cat.codes
patient_id_column = file_codes.apply(lambda code: map_code_to_patientid[code])

In [None]:
%%time
# extract timepoints
def fix_timepoint(t):
    """Translate a raw timepoint token from a file name into its label.

    The recognized tokens "BL", "1", "2" and "3" become "T1", "T2", "T3"
    and "PP" respectively.

    Raises:
        ValueError: if `t` is not one of the recognized tokens.
    """
    known_tokens = (("BL", "T1"), ("1", "T2"), ("2", "T3"), ("3", "PP"))
    for token, label in known_tokens:
        if t == token:
            return label
    raise ValueError(f"Unknown timepoint: {t}")
    
# Map every source file name to its normalized timepoint label.
# The raw token is the text between the first and the second underscore that
# follow the "PTLG" patient tag. The pattern deliberately does not require a
# "0" right after "PTLG", so ids without zero padding are parsed too; this
# matches the patient-id extraction, which uses the optional form "PTLG0*".
timepoint_regex = re.compile(r".*PTLG.*?_(.*?)_.*")


def _raw_timepoint(file_name):
    """Return the raw timepoint token embedded in `file_name`.

    Raises:
        ValueError: if the file name does not contain a "PTLG" tag followed
            by two underscore-delimited fields.
    """
    found = timepoint_regex.search(file_name)
    if found is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(f"Cannot extract timepoint from file name: {file_name}")
    return found.group(1)


map_code_to_timepoint = {
    c: fix_timepoint(_raw_timepoint(c))
    for c in cytof["file_name"].cat.categories}

timepoint_column = map_categories(cytof["file_name"], map_code_to_timepoint)

In [None]:
%%time
# Map every source file name to its cell-type label: the text captured
# between the stimulation tag (plus underscore) and the ".fcs" extension.
# The pattern is a raw string so that "\." reaches the regex engine as
# written instead of relying on Python keeping an invalid escape sequence
# (which triggers a warning on recent Python versions).
celltype_regex = re.compile(r".*(Unstim|LPS_100|IL_100|IFNa_100)_(.*?)\.fcs")
map_code_to_celltype = {
    c: celltype_regex.search(c).group(2)
    for c in cytof["file_name"].cat.categories}
cell_type_column = map_categories(cytof["file_name"], map_code_to_celltype)

In [None]:
%%time
# Map every source file name to its stimulation tag, one of
# "Unstim", "LPS_100", "IL_100" or "IFNa_100".
# The pattern is a raw string so that "\." reaches the regex engine as
# written instead of relying on Python keeping an invalid escape sequence
# (which triggers a warning on recent Python versions).
stimulation_regex = re.compile(r".*(Unstim|LPS_100|IL_100|IFNa_100)_.*?\.fcs")
map_code_to_stimulation = {
    c: stimulation_regex.search(c).group(1)
    for c in cytof["file_name"].cat.categories}
stimulation_column = map_categories(cytof["file_name"], map_code_to_stimulation)
# Bare expression: shows the resulting column as the cell output.
stimulation_column

In [None]:
# Put the derived metadata columns (patient, timepoint, cell type and
# stimulation) at positions 0-3 of the frame, in this order.
leading_columns = (
    ("patient_id", patient_id_column),
    ("timepoint", timepoint_column),
    ("cell_type", cell_type_column),
    ("stimulation", stimulation_column),
)
for position, (column_name, column_values) in enumerate(leading_columns):
    cytof.insert(position, column_name, column_values)

In [None]:
# Remove the raw file-name column in place; the patient, timepoint, cell-type
# and stimulation columns were all derived from it in the cells above.
cytof.drop(labels="file_name", axis="columns", inplace=True)

In [None]:
# Write the annotated single-cell table to an HDF5 file whose name carries the
# stimulation selector, creating the output directory if it is missing.
out_file = pathlib.Path("../data/processed").joinpath(
    f"immuneclock_singlecell_{stimulations}.h5")
out_file.parent.mkdir(exist_ok=True, parents=True)

hdf_options = {"key": "immuneclock_singlecell", "format": "table"}
cytof.to_hdf(out_file, **hdf_options)