In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
import gzip, re
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Collect paths to fastq files for each sample 

In [2]:
def num_reads(file):
    """Count the reads remaining in an open, line-iterable file.

    The handle is consumed to its end. One is added to the number of lines
    seen before dividing by four, which yields a whole count when exactly one
    line of a four-line record has already been read from the handle.
    """
    remaining_lines = 0
    for _ in file:
        remaining_lines += 1
    return (remaining_lines + 1) // 4


def file_info(filename):
    """Return ``(filename, first_line)`` for a gzip-compressed text file.

    Only the first line is read; trailing whitespace (including the newline)
    is stripped from it. An empty file yields an empty string.
    """
    with gzip.open(filename, mode="rt") as handle:
        header = handle.readline().rstrip()
    return filename, header


# helpers that derive per-file metadata from the file name
def _us_donor_fields(tissue_code, donor_digits):
    """Return ``(individual, tissue, tissue_id)`` for a ``US[DH]<digits>`` name."""
    code = tissue_code.upper()
    # Drop zero padding from the donor number; an all-zero number stays "0"
    # instead of collapsing to an empty string.
    individual = donor_digits.lstrip("0") or "0"
    tissue = "HIPPO" if code == "H" else "DLPFC"
    # tissue_id carries the donor number zero-padded to at least two digits.
    tissue_id = "US" + code + individual.zfill(2)
    return individual, tissue, tissue_id


def fields(filename):
    """Parse a fastq file name into sample metadata.

    The basename is the last path component without the ``.fastq.gz`` suffix
    and without an optional ``_001`` before it. Four naming schemes are
    recognised, tested in this order:

    * ``plate<N>_<well>_S<N>_R[12]``      -> CommonBrain single-cell (mda)
    * ``US[DH]<donor>[_]<well>_S<N>_R[12]`` -> donor single-cell (mda)
    * ``gDNA_US[DH]<donor>_R[12]``        -> donor bulk
    * any basename containing ``Bulk``   -> CommonBrain bulk

    Parameters
    ----------
    filename : str
        Path or bare file name ending in ``.fastq.gz``.

    Returns
    -------
    dict or None
        Keys ``filename``, ``basename``, ``pair_id``, ``individual``,
        ``sample_id1``, ``sample_id2``, ``read``, ``tissue``, ``tissue_id``,
        ``sample_type`` and ``dna_type``; ``None`` when the basename matches
        none of the schemes above.

    Raises
    ------
    AssertionError
        If ``filename`` does not end in ``.fastq.gz``, or if its prefix selects
        a scheme whose full pattern it then fails to match.
    """
    # Dots are escaped so only a literal ".fastq.gz" suffix is accepted, and
    # the basename may start the string so bare file names work too.
    m = re.search(r"(?:^|/)([^/]+?)(_001)?\.fastq\.gz$", filename)
    assert m is not None, f"not a .fastq.gz file name: {filename}"
    basename = m.group(1)

    r = dict()
    r["filename"] = filename
    r["basename"] = basename
    # R1 and R2 of the same library share a pair_id.
    r["pair_id"] = re.sub("_(R[12])$", "", basename)

    if basename.startswith("plate"):
        m2 = re.search(r"^(plate\d+)_([A-H]\d+)_(S\d+)_(R[12])$", basename)
        assert m2 is not None, f"unexpected plate file name: {filename}"
        r["individual"] = "CommonBrain"
        r["sample_id1"] = m2.group(1).lower() + m2.group(2).upper()
        r["sample_id2"] = m2.group(3).upper()
        r["read"] = m2.group(4).upper()
        r["tissue"] = "brain"
        r["tissue_id"] = "CommonBrain"
        r["sample_type"] = "single_cell"
        r["dna_type"] = "mda"
    elif basename.upper().startswith("US"):
        m2 = re.search(
            r"^(US([DH])(\d+))_?([A-H]\d+)_(S\d+)_(R[12])$",
            basename,
            flags=re.IGNORECASE,
        )
        assert m2 is not None, f"unexpected US single-cell file name: {filename}"
        individual, tissue, tissue_id = _us_donor_fields(m2.group(2), m2.group(3))
        r["individual"] = individual
        r["sample_id1"] = m2.group(4).upper()
        r["sample_id2"] = m2.group(5).upper()
        r["read"] = m2.group(6).upper()
        r["tissue"] = tissue
        r["tissue_id"] = tissue_id
        r["sample_type"] = "single_cell"
        r["dna_type"] = "mda"
    elif basename.startswith("gDNA"):
        m2 = re.search(r"^gDNA_(US([DH])(\d+))_(R[12])$", basename, flags=re.IGNORECASE)
        assert m2 is not None, f"unexpected gDNA file name: {filename}"
        individual, tissue, tissue_id = _us_donor_fields(m2.group(2), m2.group(3))
        r["individual"] = individual
        r["sample_id1"] = "bulk"
        r["sample_id2"] = "Sbulk"
        r["read"] = m2.group(4).upper()
        r["tissue"] = tissue
        r["tissue_id"] = tissue_id
        r["sample_type"] = "bulk"
        r["dna_type"] = "bulk"
    elif "Bulk" in basename:
        r["individual"] = "CommonBrain"
        r["sample_id1"] = "bulk"
        r["sample_id2"] = "Sbulk"
        r["read"] = "R1" if "R1" in basename else "R2"
        r["tissue"] = "DLPFC" if "cor" in basename else "fibroblast"
        r["tissue_id"] = "CommonBrain"
        r["sample_type"] = "bulk"
        r["dna_type"] = "bulk"
    else:
        # Unknown naming scheme: signal it to the caller instead of guessing.
        r = None

    return r


def find_logg_files():
    """
    Find fastq files on filepaths and pair them into one row per sample.

    Steps:
      1. recursively glob three fixed directories for ``*fastq.gz`` files;
      2. read the first line of every file and keep one file per distinct
         first line (the first one encountered);
      3. parse sample metadata from each remaining file name with ``fields``,
         skipping (and reporting) names ``fields`` does not recognise;
      4. pivot so that R1 and R2 of a sample sit on the same row, and drop
         samples missing either read.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample_id``, ``donor_id``, ``dna_type``, ``tissue_id``,
        ``R1`` and ``R2``; ``R1``/``R2`` hold file paths.

    Raises
    ------
    ValueError
        If no file with a recognised name is found.
    """

    # glob for files; rglob is already recursive, so no "**/" prefix is needed
    files1 = Path("/iblm/logglun01/fqiu/AWS/for-ndar/SLAV-Seq/").rglob("*fastq.gz")
    files2 = Path("/iblm/logglun02/BSMN/SLAV-Seq/").rglob("*fastq.gz")
    files3 = Path("/iblm/netapp/data3/mcuoco/sz_slavseq/data/").rglob("*Bulk*fastq.gz")

    # add generators together
    files = list(files1) + list(files2) + list(files3)
    print("found {} files".format(len(files)))

    # read the first line of each file serially; announce it before starting
    # so the message precedes the progress bar
    print("extracting first read from each file")
    results = [file_info(str(f)) for f in tqdm(files)]

    # convert to dataframe
    logg_files = pd.DataFrame.from_records(results, columns=["filename", "first_read"])

    print(
        f"""
		{logg_files.shape[0]} files
		{len(set(logg_files['first_read']))} unique files based on first read
		"""
    )

    # remove duplicates
    logg_files = logg_files.drop_duplicates(subset=["first_read"]).reset_index(
        drop=True
    )

    # extract sample metadata from filenames; fields() returns None for names
    # it does not recognise, which must be skipped rather than passed to dict()
    parsed = [(path, fields(path)) for path in logg_files["filename"]]
    unrecognized = [path for path, record in parsed if record is None]
    if unrecognized:
        print(
            f"skipping {len(unrecognized)} files with unrecognized names, "
            f"e.g. {unrecognized[0]}"
        )
    records = [record for _, record in parsed if record is not None]
    if not records:
        raise ValueError("no fastq files with a recognized name were found")

    # the inner merge keeps only files whose metadata was parsed
    logg_files = pd.DataFrame.from_records(records).merge(logg_files, on="filename")

    # unstack read1 and read2
    logg_files = (
        logg_files.pivot(
            columns="read",
            values="filename",
            index=["pair_id", "individual", "dna_type", "tissue_id"],
        )
        .reset_index()
        .rename(columns={"pair_id": "sample_id", "individual": "donor_id"})
        .dropna(subset=["R1", "R2"])
    )

    print(f"Found {len(logg_files)} unique samples with both R1 and R2 reads")

    return logg_files

## Create the samplesheet

In [3]:
# Metadata table: drop the two *_PERFORMED columns, lower-case the remaining
# column names and index the rows by tissue_id.
meta = (
    pd.read_csv("slavseq_metadata.tsv", sep="\t")
    .drop(columns=["MDA_PERFORMED", "BULK_PERFORMED"])
    .rename(columns=str.lower)
    .set_index("tissue_id")
)
# Label-based assignment: sets libd_id on the CommonBrain row, adding the row
# if the metadata file does not contain it.
meta.loc["CommonBrain", "libd_id"] = "CommonBrain"

# Collect the paired fastq files, then attach the metadata by tissue_id;
# the left join keeps samples that have no metadata row.
sample_pairs = find_logg_files()
logg = sample_pairs.join(meta, on="tissue_id", how="left")

  0%|          | 0/14098 [00:00<?, ?it/s]

found 14098 files
extracting first read from each file

		14098 files
		8886 unique files based on first read
		
Found 4438 unique samples with both R1 and R2 reads


## Create donor sheet

In [4]:
# Donor sheet: one row per distinct combination of the donor-level columns.
donor_columns = ["donor_id", "brain_id", "sex", "age", "libd_id", "race", "diagnosis"]
donors = logg.loc[:, donor_columns].drop_duplicates()

# Rows for the CommonBrain donor get libd_id "CommonBrain".
is_common_brain = donors["donor_id"].eq("CommonBrain")
donors.loc[is_common_brain, "libd_id"] = "CommonBrain"
# switch

In [None]:
# Root of the WGS call sets; files are looked up under <root>/<depth>/.
wgs_calls = Path("../resources/chm13v2.0.XY/wgs_calls")


def _calls_by_dir(depth, pattern):
    """Map the parent directory name of each matching file to its resolved path."""
    return {f.parts[-2]: str(f.resolve()) for f in (wgs_calls / depth).rglob(pattern)}


# find all the L1 breakpoint files (30x); the directory name is matched to libd_id
donors["breakpoints"] = donors["libd_id"].map(
    _calls_by_dir("30x", "*breakpoint_pairs_pooled_all.txt.gz")
)

# find the 30x megane calls, matched to libd_id the same way
donors["megane_30x"] = donors["libd_id"].map(
    _calls_by_dir("30x", "*MEI_final_gaussian_genotyped.bed")
)

# find the 90x megane calls; their directory names are WGS sample IDs that
# U01_LIBD_wgs.tsv maps (per tissue) to the donor "ID" stored in libd_id
meta_90x = pd.read_csv("../config/U01_LIBD_wgs.tsv", sep="\t")
tissues_90x = ("DLPFC", "HIPPO")

# Create both columns up front so the donor sheet always has them, in a fixed
# order, regardless of which files exist or the order the filesystem lists them.
for tissue in tissues_90x:
    donors[f"megane_90x_{tissue}"] = None

for bed in (wgs_calls / "90x").rglob("*MEI_final_gaussian_genotyped.bed"):
    wgs_id = bed.parts[-2]
    for tissue in tissues_90x:
        libd_ids = meta_90x.loc[meta_90x[f"WGS_90x_{tissue}"] == wgs_id, "ID"]
        if libd_ids.empty:
            continue
        # use the first matching metadata row
        donors.loc[
            donors["libd_id"] == libd_ids.iloc[0], f"megane_90x_{tissue}"
        ] = str(bed.resolve())
        # a DLPFC match takes precedence over a HIPPO match for the same ID
        break

## Write

In [None]:
# write

# everyone
logg.to_csv("all_samples.tsv", sep="\t", index=False)
donors.to_csv("all_donors.tsv", sep="\t", index=False)

# write donor sheets to thirds
# Split the row positions rather than the DataFrame itself: np.array_split on
# a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# The chunk sizes are the same as splitting the frame directly.
df1, df2, df3 = (
    donors.iloc[rows] for rows in np.array_split(np.arange(len(donors)), 3)
)


# Write each part to a separate TSV file
df1.to_csv("onethird_donors1.tsv", index=False, sep="\t")
df2.to_csv("onethird_donors2.tsv", index=False, sep="\t")
df3.to_csv("onethird_donors3.tsv", index=False, sep="\t")