In [157]:
import pandas as pd
import numpy as np
from typing import List, Tuple
from pathlib import Path

# Root directory whose immediate subdirectories are scanned for FASTQ files.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
fastq_directory = Path("/mnt/scratch/monocytes_vil10/data/fastqs")

In [149]:
from itertools import takewhile
  
def find_prefix(strlist: List[str]) -> str:
    """Return the longest leading substring shared by every string in ``strlist``.

    Characters are compared position by position; the scan stops at the first
    position where the strings disagree or where the shortest string ends.
    An empty ``strlist`` yields ``''``.
    """
    shared = []
    # zip(*strlist) walks the strings column-wise and stops at the shortest one.
    for column in zip(*strlist):
        head = column[0]
        if not all(head == ch for ch in column):
            break
        shared.append(head)
    return ''.join(shared)

In [72]:
def find_suffix(strlist: List[str]) -> str:
    """Return the longest trailing substring shared by every string in ``strlist``.

    The strings are compared from their last character backwards; the scan
    stops at the first disagreement or when the shortest string is exhausted.
    An empty ``strlist`` yields ``''``.
    """
    tail_reversed = []
    # Walk the reversed strings column-wise, i.e. last characters first.
    for column in zip(*[text[::-1] for text in strlist]):
        last = column[0]
        if not all(last == ch for ch in column):
            break
        tail_reversed.append(last)
    # Characters were collected back-to-front; flip them into reading order.
    return ''.join(tail_reversed)[::-1]

In [96]:
# Immediate child directories of the FASTQ root.
# Path.iterdir() yields entries in arbitrary order, so the listing is sorted to
# make positional access such as subdirs[0] reproducible between runs.
subdirs = sorted(entry for entry in fastq_directory.iterdir() if entry.is_dir())

In [76]:
import re

In [89]:
# Matches the leading run of non-whitespace characters that is immediately
# followed by "_S<digits>_R<digits>_<digits>.fastq.gz". The suffix sits in a
# lookahead, so it is required but excluded from the match: match[0] is only
# the part of the file name that precedes it.
illumina_pattern = re.compile(r"\S+(?=_S\d+_R\d+_\d+\.fastq\.gz)")

In [94]:
# Sanity check of illumina_pattern: show the matched prefix for each entry of `b`.
# NOTE(review): `b` is not defined in any cell visible in this notebook —
# presumably a list of FASTQ file names from an earlier session; define it
# explicitly (or drop this cell) so the notebook survives Restart & Run All.
[illumina_pattern.match(_)[0] for _ in b]

['mono_vil10_adt_set1_RPI1',
 'mono_vil10_adt_set1_RPI1',
 'mono_vil10_gex_set1_SI-GA-D10_1',
 'mono_vil10_gex_set1_SI-GA-D10_1',
 'mono_vil10_gex_set1_SI-GA-D10_2',
 'mono_vil10_gex_set1_SI-GA-D10_2',
 'mono_vil10_gex_set1_SI-GA-D10_3',
 'mono_vil10_gex_set1_SI-GA-D10_3',
 'mono_vil10_gex_set1_SI-GA-D10_4',
 'mono_vil10_gex_set1_SI-GA-D10_4',
 'mono_vil10_hto_set1_D701',
 'mono_vil10_hto_set1_D701']

In [99]:
# Select the first listed subdirectory; the cells below operate on this one only.
subdir = subdirs[0]

In [209]:
# Display the final path component of the selected subdirectory.
subdir.name

'set1'

In [205]:
# Three parallel, initially empty columns that later cells fill in:
# FASTQ directory, sample name and library type.
fastq_files, sample_names, library_types = [], [], []

# Candidate per-library-type folders inside the selected subdirectory.
adt_subdir = subdir / "adt"
gex_subdir = subdir / "gex"
hto_subdir = subdir / "hto"

In [202]:
def extract_subdir_files(
    subdir: Path,
    libtype: str,
) -> Tuple[List[str], np.ndarray, List[str]]:
    """Describe the FASTQ samples found directly inside one library directory.

    Parameters
    ----------
    subdir
        Directory searched (non-recursively) for ``*.fastq.gz`` files.
    libtype
        One of ``"adt"``, ``"hto"`` (both reported as ``"Antibody Capture"``)
        or ``"gex"`` (reported as ``"Gene Expression"``).

    Returns
    -------
    fq_files, names, lib_descript
        Three equally long sequences with one entry per unique sample name:
        the resolved directory path (repeated), the sorted unique sample names
        extracted with ``illumina_pattern`` (a NumPy array), and the library
        description (repeated).

    Raises
    ------
    ValueError
        If ``libtype`` is not one of the supported values.

    Notes
    -----
    Files whose names do not match ``illumina_pattern`` are ignored instead of
    raising. The number of rows is taken from the unique sample names rather
    than from half the file count, so all three sequences stay aligned even
    when a sample does not have exactly two files.
    """
    descriptions = {
        "adt": "Antibody Capture",
        "hto": "Antibody Capture",
        "gex": "Gene Expression",
    }
    if libtype not in descriptions:
        raise ValueError(
            f"Unknown libtype {libtype!r}; expected one of {sorted(descriptions)}"
        )

    # Keep only files the pattern recognises; match() returns None otherwise.
    matches = (illumina_pattern.match(fq.name) for fq in subdir.glob("*.fastq.gz"))
    names = np.unique([found[0] for found in matches if found is not None])

    n_samples = len(names)
    fq_files = [str(subdir.resolve())] * n_samples
    lib_descript = [descriptions[libtype]] * n_samples

    return fq_files, names, lib_descript

In [199]:
# Try the helper on the "gex" folder of the selected subdirectory and display the result.
extract_subdir_files(subdir = subdir.joinpath("gex"), libtype="gex")

(['/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex',
  '/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex',
  '/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex',
  '/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex'],
 array(['mono_vil10_gex_set1_SI-GA-D10_1',
        'mono_vil10_gex_set1_SI-GA-D10_2',
        'mono_vil10_gex_set1_SI-GA-D10_3',
        'mono_vil10_gex_set1_SI-GA-D10_4'], dtype='<U31'),
 ['Gene Expression', 'Gene Expression', 'Gene Expression', 'Gene Expression'])

In [206]:
# Collect the FASTQ directory, sample name and library type for every library
# folder that exists under the selected subdirectory.
#
# The three columns are rebuilt from scratch each time this cell runs, so
# re-executing it cannot append the same rows twice. They are kept as plain
# Python lists, and one loop replaces three copies of the same stanza.
fastq_files, sample_names, library_types = [], [], []

for libtype, libdir in (("adt", adt_subdir), ("gex", gex_subdir), ("hto", hto_subdir)):
    if not libdir.exists():
        continue  # this library type was not sequenced for the selected set
    dirs, names, descriptions = extract_subdir_files(subdir=libdir, libtype=libtype)
    fastq_files.extend(dirs)
    # names may be a NumPy array; store plain Python strings.
    sample_names.extend(str(name) for name in names)
    library_types.extend(descriptions)

In [207]:
# Assemble the collected columns into a table with one row per sample and
# display it (the frame is not assigned to a name).
# NOTE(review): the column names and library_type values look like the
# Cell Ranger "libraries" CSV layout — confirm before relying on that.
pd.DataFrame(
    data={
        # materialised with list() before being handed to pandas
        "fastqs": list(fastq_files),
        "sample": sample_names,
        "library_type": library_types,
    }
)

Unnamed: 0,fastqs,sample,library_type
0,/mnt/scratch/monocytes_vil10/data/fastqs/set1/adt,mono_vil10_adt_set1_RPI1,Antibody Capture
1,/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex,mono_vil10_gex_set1_SI-GA-D10_1,Gene Expression
2,/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex,mono_vil10_gex_set1_SI-GA-D10_2,Gene Expression
3,/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex,mono_vil10_gex_set1_SI-GA-D10_3,Gene Expression
4,/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex,mono_vil10_gex_set1_SI-GA-D10_4,Gene Expression
5,/mnt/scratch/monocytes_vil10/data/fastqs/set1/hto,mono_vil10_hto_set1_D701,Antibody Capture


In [184]:
# Inspect the accumulated FASTQ directory column.
# NOTE(review): this cell's execution count (184) is lower than that of the
# cells that reset and refill the accumulators (205, 206), so the output
# recorded under it may not reflect their current contents.
list(fastq_files)

['/mnt/scratch/monocytes_vil10/data/fastqs/set1/adt',
 '/mnt/scratch/monocytes_vil10/data/fastqs/set1/adt',
 '/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex',
 '/mnt/scratch/monocytes_vil10/data/fastqs/set1/adt',
 '/mnt/scratch/monocytes_vil10/data/fastqs/set1/gex',
 '/mnt/scratch/monocytes_vil10/data/fastqs/set1/hto']

In [185]:
# Inspect the accumulated sample-name column.
# NOTE(review): this cell's execution count (185) is lower than that of the
# cells that reset and refill the accumulators (205, 206), so the output
# recorded under it may not reflect their current contents.
list(sample_names)

['mono_vil10_adt_set1_RPI1',
 'mono_vil10_adt_set1_RPI1',
 'mono_vil10_gex_set1_SI-GA-D10_1',
 'mono_vil10_gex_set1_SI-GA-D10_2',
 'mono_vil10_gex_set1_SI-GA-D10_3',
 'mono_vil10_gex_set1_SI-GA-D10_4',
 'mono_vil10_adt_set1_RPI1',
 'mono_vil10_gex_set1_SI-GA-D10_1',
 'mono_vil10_gex_set1_SI-GA-D10_2',
 'mono_vil10_gex_set1_SI-GA-D10_3',
 'mono_vil10_gex_set1_SI-GA-D10_4',
 'mono_vil10_hto_set1_D701']

In [62]:
# Write one libraries CSV per set (sets 1 through 8). Each file has three rows,
# one per library type, in the order given by `libtypes`; the library_type
# column is positional and must stay aligned with that order.
libtypes = ["gex", "adt", "hto"]
for set_number in range(1, 9):
    fastq_dirs = []
    sample_ids = []
    for libtype in libtypes:
        fastq_dirs.append(f"/s/guth-aci/monocytes_vil10/data/fastqs/set{set_number}/{libtype}")
        sample_ids.append(f"mono_vil10_{libtype}_set{set_number}")

    libraries = pd.DataFrame(
        {
            "fastqs": fastq_dirs,
            "sample": sample_ids,
            "library_type": ["Gene Expression", "Antibody Capture", "Antibody Capture"],
        }
    )
    libraries.to_csv(
        f"/mnt/scratch/monocytes_vil10/metadata/libraries_set{set_number}.csv",
        index=False,
    )