# Preamble

This notebook contains the code used to generate the preprocessed single-cell data. Some of the libraries it depends on are not included.
Nevertheless, this file gives a general idea of how the data was preprocessed. The resulting files are provided.

# General parameters

In [1]:
# Which stimulation conditions to include: "unstim" or "all" (checked when the FCS files are selected).
stimulations = "all"
# Thread count handed to set_threads_for_external_libraries.
n_threads = 64
# Value passed as n_jobs to the FCS loader.
n_jobs_fcs = 64

# Imports

In [2]:
%load_ext autoreload
%autoreload 2

# Set the thread count for BLAS and co. to n_threads.
# NOTE(review): this comment used to say "disable parallelization", but n_threads is 64 — confirm the intended value.
# NOTE(review): presumably this has to run before numpy and friends are imported — confirm.
from nalabtools.utils.parallelization import set_threads_for_external_libraries
set_threads_for_external_libraries(n_threads=n_threads)

# general
import re
import collections
import pickle
import warnings 
import joblib
import pathlib

# data
import numpy as np
import pandas as pd
import h5py

# ml / stats
import sklearn
import scipy.stats

# plotting
import matplotlib.pyplot as plt

# nalab
import nalabtools
from nalabtools.pandas.utils import map_categories
import nalabdata

In [3]:
# NOTE: the report below says R version 3.6.3 (the one installed on the server),
# but rpy2 still uses R 4.0.3.
# Check with `import rpy2.robjects as robjects; robjects.r["version"]`.
%run -m rpy2.situation

[1mrpy2 version:[0m
3.4.5
[1mPython version:[0m
3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 16:07:37) 
[GCC 9.3.0]
[1mLooking for R's HOME:[0m
    Environment variable R_HOME: /home/mgbckr/miniconda3/envs/analysis/lib/R
    Calling `R RHOME`: /home/mgbckr/miniconda3/envs/analysis3.7/lib/R
    Environment variable R_LIBS_USER: None
[1mR's additions to LD_LIBRARY_PATH:[0m
/home/mgbckr/miniconda3/envs/analysis/lib/R/lib
[1mR version:[0m
    In the PATH: R version 3.6.3 (2020-02-29) -- "Holding the Windsock"
    Loading R library from rpy2: OK
[1mAdditional directories to load R packages from:[0m
None
[1mC extension compilation:[0m
  include:
  ['/home/mgbckr/miniconda3/envs/analysis/lib/R/include']
  libraries:
  ['R', 'pcre2-8', 'lzma', 'bz2', 'z', 'rt', 'dl', 'm', 'iconv', 'icuuc', 'icui18n']
  library_dirs:
  ['/home/mgbckr/miniconda3/envs/analysis/lib', '/home/mgbckr/miniconda3/envs/analysis/lib/R/lib', '/home/mgbckr/miniconda3/envs/analysis/lib']
  extra_c

# Load Cytof data

In [4]:
import nalabdata.fcs.load
# Root of the data checkout, and the folder below it that holds the gated FCS files.
data_dir = pathlib.Path("../../nalab-data/data")
fcs_dir = data_dir.joinpath("large/fcs/immune_clock/Training_gated_with-stimulations/fcs_files")

In [5]:
%%time
if stimulations == "unstim":
    cytof_filenames = [f for f in list(fcs_dir.glob("*")) if re.match("Gates_PTLG[0-9]*?_[23]_Unstim.*", f.name)]
elif stimulations == "all":
    cytof_filenames = [f for f in list(fcs_dir.glob("*")) if re.match("Gates_PTLG[0-9]*?_[23]_.*", f.name)]
else:
    raise ValueError("Unknown stimulation selector:", stimulations)

print("Number of FCS files:", len(cytof_filenames))

Number of FCS files: 3328
CPU times: user 35.4 ms, sys: 2.81 ms, total: 38.2 ms
Wall time: 94.1 ms


In [6]:
%%time
# Load the selected FCS files into `cytof`, passing n_jobs_fcs as n_jobs.
# NOTE(review): downstream cells treat `cytof` as a DataFrame with a categorical
# "file_name" column — presumably one row per cell; confirm against load_fcs.
cytof = nalabdata.fcs.load.load_fcs(cytof_filenames, n_jobs=n_jobs_fcs)

R[write to console]: Loading required package: foreach

R[write to console]: Loading required package: iterators

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following object is masked from ‘package:flowCore’:

    filter


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




CPU times: user 11min 29s, sys: 3min 4s, total: 14min 33s
Wall time: 14min 45s


In [8]:
%%time
# extract patient ids
map_code_to_patientid = {
    i:int(re.search(".*PTLG0*(.*?)_", c).group(1)) 
    for i,c in enumerate(cytof["file_name"].cat.categories)}
patient_id_column = cytof["file_name"].cat.codes.apply(lambda x: map_code_to_patientid[x])

CPU times: user 41.8 s, sys: 3.3 s, total: 45.1 s
Wall time: 45.1 s


In [9]:
%%time
# extract timepoints
def fix_timepoint(t):
    """Translate a raw timepoint token from a file name into its label.

    "BL" -> "T1", "1" -> "T2", "2" -> "T3", "3" -> "PP".

    Raises:
        ValueError: if ``t`` is none of the known tokens.
    """
    known_timepoints = (
        ("BL", "T1"),
        ("1", "T2"),
        ("2", "T3"),
        ("3", "PP"),
    )
    for raw_token, label in known_timepoints:
        if t == raw_token:
            return label
    raise ValueError(f"Unknown timepoint: {t}")
    
# Despite the name, the keys here are the "file_name" category values (the file
# names themselves), not integer codes. The timepoint token is the field right
# after the patient id; the patient id is matched as "PTLG" followed by digits,
# so ids without a leading zero are handled too.
map_code_to_timepoint = {
    name: fix_timepoint(re.search(r"PTLG[0-9]+_(.*?)_", name).group(1))
    for name in cytof["file_name"].cat.categories}

# NOTE(review): map_categories presumably relabels the categories of the column via the dict — confirm.
timepoint_column = map_categories(cytof["file_name"], map_code_to_timepoint)

CPU times: user 21.4 s, sys: 753 ms, total: 22.2 s
Wall time: 22.1 s


In [13]:
%%time
# extract cell types
map_code_to_celltype = {
    c:re.search(".*(Unstim|LPS_100|IL_100|IFNa_100)_(.*?)\.fcs", c).group(2) 
    for c in cytof["file_name"].cat.categories}
cell_type_column = map_categories(cytof["file_name"], map_code_to_celltype)

CPU times: user 19.3 s, sys: 777 ms, total: 20.1 s
Wall time: 20.1 s


In [16]:
%%time
# extract stimulation
map_code_to_stimulation = {
    c:re.search(".*(Unstim|LPS_100|IL_100|IFNa_100)_.*?\.fcs", c).group(1) 
    for c in cytof["file_name"].cat.categories}
stimulation_column = map_categories(cytof["file_name"], map_code_to_stimulation)
stimulation_column

CPU times: user 19.8 s, sys: 740 ms, total: 20.5 s
Wall time: 20.5 s


['IFNa_100', 'IFNa_100', 'IFNa_100', 'IFNa_100', 'IFNa_100', ..., 'Unstim', 'Unstim', 'Unstim', 'Unstim', 'Unstim']
Length: 94167627
Categories (4, object): ['IFNa_100', 'IL_100', 'LPS_100', 'Unstim']

In [17]:
# Put the four derived annotation columns at the front of the frame, in this order.
# DataFrame.insert works in place and raises if the column already exists,
# so this cell cannot be run twice on the same `cytof`.
leading_columns = (
    ("patient_id", patient_id_column),
    ("timepoint", timepoint_column),
    ("cell_type", cell_type_column),
    ("stimulation", stimulation_column),
)
for position, (column_name, column_values) in enumerate(leading_columns):
    cytof.insert(position, column_name, column_values)

In [18]:
# The file name has been split into the annotation columns, so remove it (in place).
cytof.drop(columns=["file_name"], inplace=True)

In [None]:
# Write the annotated table to an HDF5 file named after the stimulation selector,
# creating the output directory first if needed.
out_dir = pathlib.Path("../data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
out_file = out_dir / f"immuneclock_singlecell_{stimulations}.h5"

cytof.to_hdf(out_file, format="table", key="immuneclock_singlecell")