## Dataset
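The data directory is read from Google Drive when the notebook runs on Colab; otherwise it is expected locally at `../data`, relative to the notebook's working directory.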

In [None]:
import sys
from pathlib import Path


# Choose where the data lives depending on the runtime hosting this notebook.
if "google.colab" not in sys.modules:
    # Local Jupyter server: the data folder is a sibling of the working directory.
    print(
        "I guess you're using a local jupyter server. "
        "Please run the present notebook side by side the data files."
    )
    data_dir = Path.cwd().parent/"data"
else:
    # Colab: the data sits in the user's Google Drive, which must be mounted first.
    from google.colab import drive
    drive.mount("/content/drive")
    data_dir = Path("/content/drive/MyDrive/ahead")

# Cell result: whether the chosen directory actually exists.
data_dir.is_dir()

Mounted at /content/drive


True

In [None]:
# Show every entry found directly inside the data directory.
[entry for entry in data_dir.iterdir()]

[PosixPath('/content/drive/MyDrive/ahead/raw_fcs'),
 PosixPath('/content/drive/MyDrive/ahead/EU_label.xlsx'),
 PosixPath('/content/drive/MyDrive/ahead/raw_fcs.zip'),
 PosixPath('/content/drive/MyDrive/ahead/EU_marker_channel_mapping.xlsx'),
 PosixPath('/content/drive/MyDrive/ahead/summary.csv'),
 PosixPath('/content/drive/MyDrive/ahead/train'),
 PosixPath('/content/drive/MyDrive/ahead/val'),
 PosixPath('/content/drive/MyDrive/ahead/500_fsc_ssc_area_plots')]

## Import Packages

In [None]:
# !pip install -qqq FlowCal==1.3.0

In [None]:
# Install the project package straight from GitHub. `%pip` (rather than `!pip`)
# makes sure the package lands in the environment of the running kernel.
%pip install -qqq git+https://github.com/phunc20/ahead.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.3/82.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for ahead (pyproject.toml) ... [?25l[?25hdone
  Building wheel for FlowCal (setup.py) ... [?25l[?25hdone


In [None]:
#import ahead
import FlowCal
import numpy as np
import pandas as pd

## The Excel Files

In [None]:
# Load the three metadata tables stored in the data directory.
df_summary = pd.read_csv(data_dir / "summary.csv")
df_label = pd.read_excel(data_dir / "EU_label.xlsx")
df_marker_channel = pd.read_excel(data_dir / "EU_marker_channel_mapping.xlsx")

# Report (rows, columns) of each table, with the "=" signs aligned.
print(f'df_marker_channel.shape   = {df_marker_channel.shape}')
print(f'df_label.shape            = {df_label.shape}')
print(f'df_summary.shape          = {df_summary.shape}')

df_marker_channel.shape   = (35, 4)
df_label.shape            = (40, 2)
df_summary.shape          = (40, 3)


In [None]:
# allowed_channels = df_marker_channel.loc[
#     df_marker_channel["use"]==1, "PxN(channel)"].to_list()
# allowed_channels

In [None]:
# Names of the per-sample folders under raw_fcs, in sorted order.
# Only directories are kept: a stray file (archive, OS metadata, ...) would
# otherwise be treated as a sample and break the later `glob("*.fcs")` lookup.
sub_dirs = sorted(
    entry.name
    for entry in (data_dir/"raw_fcs").iterdir()
    if entry.is_dir()
)
sub_dirs

['flowrepo_covid_EU_002_flow_001',
 'flowrepo_covid_EU_003_flow_001',
 'flowrepo_covid_EU_004_flow_001',
 'flowrepo_covid_EU_005_flow_001',
 'flowrepo_covid_EU_006_flow_001',
 'flowrepo_covid_EU_007_flow_001',
 'flowrepo_covid_EU_008_flow_001',
 'flowrepo_covid_EU_009_flow_001',
 'flowrepo_covid_EU_010_flow_001',
 'flowrepo_covid_EU_011_flow_001',
 'flowrepo_covid_EU_012_flow_001',
 'flowrepo_covid_EU_013_flow_001',
 'flowrepo_covid_EU_014_flow_001',
 'flowrepo_covid_EU_015_flow_001',
 'flowrepo_covid_EU_016_flow_001',
 'flowrepo_covid_EU_017_flow_001',
 'flowrepo_covid_EU_018_flow_001',
 'flowrepo_covid_EU_019_flow_001',
 'flowrepo_covid_EU_020_flow_001',
 'flowrepo_covid_EU_021_flow_001',
 'flowrepo_covid_EU_022_flow_001',
 'flowrepo_covid_EU_023_flow_001',
 'flowrepo_covid_EU_030_flow_001',
 'flowrepo_covid_EU_031_flow_001',
 'flowrepo_covid_EU_032_flow_001',
 'flowrepo_covid_EU_033_flow_001',
 'flowrepo_covid_EU_034_flow_001',
 'flowrepo_covid_EU_035_flow_001',
 'flowrepo_covid_EU_

In [None]:
# Samples held out for validation; every other sample folder is used for training.
val_file_flow_ids = {
    "flowrepo_covid_EU_034_flow_001",
    "flowrepo_covid_EU_048_flow_001",
    "flowrepo_covid_EU_013_flow_001",
    "flowrepo_covid_EU_004_flow_001",
}
train_file_flow_ids = [
    flow_id for flow_id in sub_dirs if flow_id not in val_file_flow_ids
]
# Cell result: number of training samples.
len(train_file_flow_ids)

36

In [None]:
# Check that the `ahead` package shows up among pip's installed packages.
!pip list | grep ahead

ahead                            0.0.1


In [None]:
# Show which `pip` executable is found first on PATH.
!which pip

/usr/local/bin/pip


In [None]:
# Show which `pip3` executable is found first on PATH.
!which pip3

/usr/local/bin/pip3


In [None]:
# Inspect permissions, size and timestamp of the resolved `pip` executable.
!ls -l $(which pip)

-r-xr-xr-x 1 root root 155 Jan  1  2000 /usr/local/bin/pip


In [None]:
# Inspect permissions, size and timestamp of the resolved `pip3` executable.
!ls -l $(which pip3)

-rwxr-xr-x 1 root root 221 Jul 26 13:27 /usr/local/bin/pip3


In [None]:
# Show which `python` executable is found first on PATH.
!which python

/usr/local/bin/python


In [None]:
from __future__ import annotations
from pathlib import Path

import FlowCal


def get_fsc_ssc_chunks(
    fcs_file: str | Path,
    *,
    chunk_size: int = 500,
    typ: str = "A",
    gate_fraction: float = 0.0,
):
    """Yield consecutive slices of the gated FSC/SSC channels of an FCS file.

    The file is loaded with ``FlowCal.io.FCSData``, passed through
    ``FlowCal.transform.to_rfi`` and gated with ``FlowCal.gate.high_low`` on the
    two scatter channels. When ``gate_fraction`` is truthy, a
    ``FlowCal.gate.density2d`` gate with that fraction is applied on top.

    This is a generator: nothing (including argument validation and file
    loading) happens until the first item is requested.

    Parameters
    ----------
    fcs_file : str or Path
        Path of the FCS file to read.
    chunk_size : int, keyword-only
        Maximum number of events per yielded chunk. Must be >= 1.
    typ : str, keyword-only
        Suffix selecting the scatter channels, i.e. ``FSC-{typ}`` and
        ``SSC-{typ}``.
    gate_fraction : float, keyword-only
        Forwarded to ``FlowCal.gate.density2d``; ``0.0`` (or any falsy value)
        skips the density gate entirely.

    Yields
    ------
    The gated data restricted to the two scatter channels, ``chunk_size``
    events at a time. The last chunk holds the remainder and may be shorter.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty (silently yielding nothing) and a
    # zero step would only fail after the costly load/gate work; reject both early.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    channels = [f'FSC-{typ}', f'SSC-{typ}']
    s = FlowCal.io.FCSData(str(fcs_file))
    s = FlowCal.transform.to_rfi(s)
    s_gated = FlowCal.gate.high_low(
        s,
        channels=channels,
    )
    if gate_fraction:
        s_gated = FlowCal.gate.density2d(
            s_gated,
            channels=channels,
            gate_fraction=gate_fraction,
        )
    n_events = s_gated.shape[0]
    for start in range(0, n_events, chunk_size):
        yield s_gated[start:start+chunk_size, channels]

In [None]:
from tqdm.auto import tqdm
#from ahead.util import get_fsc_ssc_chunks
#import ahead

In [None]:
# Display the table read from summary.csv; its `file_flow_id` and `wuhan`
# columns are what the plotting loops use to label each sample.
df_summary

Unnamed: 0,file_flow_id,n_events,wuhan
0,flowrepo_covid_EU_007_flow_001,1860,False
1,flowrepo_covid_EU_002_flow_001,363314,False
2,flowrepo_covid_EU_004_flow_001,183001,False
3,flowrepo_covid_EU_005_flow_001,298047,False
4,flowrepo_covid_EU_006_flow_001,248917,False
5,flowrepo_covid_EU_003_flow_001,311492,False
6,flowrepo_covid_EU_008_flow_001,10959,False
7,flowrepo_covid_EU_011_flow_001,29363,False
8,flowrepo_covid_EU_012_flow_001,18618,False
9,flowrepo_covid_EU_010_flow_001,17412,False


In [None]:
# Parameters shared by the plot-generation cells.
typ = "A"             # scatter-channel suffix: selects FSC-A / SSC-A
gate_fraction = 0.75  # forwarded to FlowCal.gate.density2d
chunk_size = 10_000   # maximum number of events drawn per plot

In [None]:
# Render one FSC/SSC density scatter plot per chunk of every training sample.
# Files are written flat into plot_dir as `<label>_<file_flow_id>_chunk_<k>.png`,
# where <label> is int(wuhan), i.e. 0 or 1.
plot_dir = data_dir/f'{chunk_size}_fsc_ssc_area_plots'
plot_dir.mkdir(exist_ok=True, parents=True)

for file_flow_id in tqdm(train_file_flow_ids):
    fcs_file = next((data_dir/f'raw_fcs/{file_flow_id}').glob("*.fcs"))
    # `.item()` extracts the single matching flag as a scalar and raises
    # ValueError when the sample is missing from, or duplicated in, df_summary.
    # Calling int() on the one-element Series directly is deprecated in pandas.
    wuhan = int(
        df_summary.loc[df_summary["file_flow_id"] == file_flow_id, "wuhan"].item()
    )
    chunks = get_fsc_ssc_chunks(
        fcs_file,
        chunk_size=chunk_size,
        typ=typ,  # use the configured channel type instead of a hard-coded "A"
        gate_fraction=gate_fraction,
    )
    for k, chunk in enumerate(chunks):
        plot_file = plot_dir/f'{wuhan}_{file_flow_id}_chunk_{k}.png'
        FlowCal.plot.density2d(
            chunk,
            mode="scatter",
            savefig=plot_file,
        )

  0%|          | 0/36 [00:00<?, ?it/s]

In [None]:
# Count the entries currently in the plot directory.
!ls {plot_dir} | wc -l

357


In [None]:
# Count the plots whose filename starts with `0_` (samples with wuhan == False).
!ls {plot_dir} | grep "^0_" | wc -l

104


In [None]:
# Count the plots whose filename starts with `1_` (samples with wuhan == True).
!ls {plot_dir} | grep "^1_" | wc -l

253


In [None]:
# Collect the `0_`-prefixed plots in a `healthy/` sub-folder.
!mkdir -p {plot_dir}/healthy
!mv {plot_dir}/0_* {plot_dir}/healthy

In [None]:
# Collect the `1_`-prefixed plots in a `sick/` sub-folder.
!mkdir -p {plot_dir}/sick
!mv {plot_dir}/1_* {plot_dir}/sick

In [None]:
# Print the resolved plot directory path.
!echo {plot_dir}

/content/drive/MyDrive/ahead/10000_fsc_ssc_area_plots


In [None]:
# Group both class folders under a `train/` split directory.
# (The `sick` move used to be duplicated, so `healthy` was never moved.)
!mkdir -p {plot_dir}/train
!mv {plot_dir}/sick {plot_dir}/train/
!mv {plot_dir}/healthy {plot_dir}/train/

In [None]:
# Render the validation plots straight into val/healthy and val/sick.
# Filenames follow `<label>_<file_flow_id>_chunk_<k>.png`, with <label> being
# int(wuhan), i.e. 0 for healthy and 1 for sick.
val_plot_dir = data_dir/f'{chunk_size}_fsc_ssc_area_plots/val'
(val_plot_dir/"healthy").mkdir(exist_ok=True, parents=True)
(val_plot_dir/"sick").mkdir(exist_ok=True, parents=True)

for file_flow_id in tqdm(val_file_flow_ids):
    fcs_file = next((data_dir/f'raw_fcs/{file_flow_id}').glob("*.fcs"))
    # `.item()` extracts the single matching flag as a scalar and raises
    # ValueError when the sample is missing from, or duplicated in, df_summary.
    # Calling int() on the one-element Series directly is deprecated in pandas.
    wuhan = int(
        df_summary.loc[df_summary["file_flow_id"] == file_flow_id, "wuhan"].item()
    )
    # The label is constant per sample, so the target folder is picked once.
    class_dir = val_plot_dir/("sick" if wuhan else "healthy")
    chunks = get_fsc_ssc_chunks(
        fcs_file,
        chunk_size=chunk_size,
        typ=typ,
        gate_fraction=gate_fraction,
    )
    for k, chunk in enumerate(chunks):
        plot_file = class_dir/f'{wuhan}_{file_flow_id}_chunk_{k}.png'
        FlowCal.plot.density2d(
            chunk,
            mode="scatter",
            savefig=plot_file,
        )

  0%|          | 0/4 [00:00<?, ?it/s]