# Collect paths to fastq files for each sample 

In [21]:
import pandas as pd
import glob
import gzip, re
from joblib import Parallel, delayed
from tqdm import tqdm

In [22]:
def num_reads(file):
    """Count the reads left in an open file, at four lines per read.

    Iterates *file* to exhaustion, adds one to the number of lines seen,
    divides by four and truncates the result to an int.

    NOTE(review): the extra line presumably stands for a header line the
    caller has already consumed before passing the file in -- confirm.
    """
    line_total = 1
    for _ in file:
        line_total += 1
    return int(line_total / 4)


def file_info(filename):
    """Return ``(filename, first_line)`` for a gzip-compressed text file.

    The file is opened in text mode and only its first line is read;
    trailing whitespace (including the newline) is stripped from it.  An
    empty file yields an empty string.
    """
    # NOTE: counting reads with num_reads is disabled; only the first line
    # of the file is read here.
    with gzip.open(filename, mode="rt") as handle:
        header = handle.readline().rstrip()
    return (filename, header)


# define functions that generate file metadata from the file name
def _match_or_raise(pattern, basename, flags=0):
    """Search *basename* for *pattern*; raise ValueError when it does not fit."""
    found = re.search(pattern, basename, flags=flags)
    if found is None:
        raise ValueError(f"unexpected fastq file name: {basename!r}")
    return found


def _donor_tissue(tissue_code, donor_digits):
    """Return (individual, tissue, tissue_id) for a ``US[DH]<digits>`` code."""
    # strip leading zeros, but keep a single "0" for an all-zero donor number
    individual = donor_digits.lstrip("0") or "0"
    code = tissue_code.upper()
    tissue = "HIPPO" if code == "H" else "DLPFC"
    # tissue ids carry the donor number padded to at least two digits
    return individual, tissue, "US" + code + individual.zfill(2)


def fields(filename):
    """Parse a fastq path into a dict of sample metadata.

    The basename is the last path component with an optional ``_001`` suffix
    and the ``.fastq.gz`` extension removed.  Three naming schemes are
    recognised from the start of the basename:

    * ``plate<N>_<well>_<S#>_<R1|R2>``: individual ``CommonBrain``,
      tissue ``brain``, single-cell ``mda``.
    * ``US<D|H><donor>[_]<well>_<S#>_<R1|R2>`` (case-insensitive):
      single-cell ``mda``; ``H`` maps to tissue ``HIPPO``, anything else to
      ``DLPFC``.
    * ``gDNA_US<D|H><donor>_<R1|R2>``: bulk sample.

    Args:
        filename: path or bare file name ending in ``.fastq.gz``.

    Returns:
        A dict with the keys ``filename``, ``basename``, ``pair_id``,
        ``individual``, ``sample_id1``, ``sample_id2``, ``read``, ``tissue``,
        ``tissue_id``, ``sample_type`` and ``dna_type`` (inserted in that
        order), or ``None`` when the basename starts with none of the known
        prefixes.

    Raises:
        ValueError: if *filename* does not end in ``.fastq.gz``, or if the
            basename starts with a known prefix but does not follow that
            scheme's layout.
    """
    # raw strings keep "\d" and "\." as regex escapes rather than (invalid)
    # string escapes; "(?:^|/)" also accepts a name with no directory part
    m = re.search(r"(?:^|/)([^/]+?)(_001)?\.fastq\.gz$", filename)
    if m is None:
        raise ValueError(f"not a .fastq.gz file name: {filename!r}")
    basename = m.group(1)

    r = {
        "filename": filename,
        "basename": basename,
        # both reads of a pair share the basename minus the trailing _R1/_R2
        "pair_id": re.sub(r"_(R[12])$", "", basename),
    }

    if basename.startswith("plate"):
        m2 = _match_or_raise(r"^(plate\d+)_([A-H]\d+)_(S\d+)_(R[12])$", basename)
        r["individual"] = "CommonBrain"
        r["sample_id1"] = m2.group(1).lower() + m2.group(2).upper()
        r["sample_id2"] = m2.group(3).upper()
        r["read"] = m2.group(4).upper()
        r["tissue"] = "brain"
        r["tissue_id"] = "CommonBrain"
        r["sample_type"] = "single_cell"
        r["dna_type"] = "mda"
    elif basename.upper().startswith("US"):
        m2 = _match_or_raise(
            r"^(US([DH])(\d+))_?([A-H]\d+)_(S\d+)_(R[12])$",
            basename,
            flags=re.IGNORECASE,
        )
        individual, tissue, tissue_id = _donor_tissue(m2.group(2), m2.group(3))
        r["individual"] = individual
        r["sample_id1"] = m2.group(4).upper()
        r["sample_id2"] = m2.group(5).upper()
        r["read"] = m2.group(6).upper()
        r["tissue"] = tissue
        r["tissue_id"] = tissue_id
        r["sample_type"] = "single_cell"
        r["dna_type"] = "mda"
    elif basename.startswith("gDNA"):
        m2 = _match_or_raise(
            r"^gDNA_(US([DH])(\d+))_(R[12])$", basename, flags=re.IGNORECASE
        )
        individual, tissue, tissue_id = _donor_tissue(m2.group(2), m2.group(3))
        r["individual"] = individual
        r["sample_id1"] = "bulk"
        r["sample_id2"] = "Sbulk"
        r["read"] = m2.group(4).upper()
        r["tissue"] = tissue
        r["tissue_id"] = tissue_id
        r["sample_type"] = "bulk"
        r["dna_type"] = "bulk"
    else:
        # unknown naming scheme: callers must be prepared to skip this file
        return None

    return r


def print_unique(df):
    """Print the row count of *df* and its number of distinct ``first_read`` values."""
    n_files = df.shape[0]
    n_distinct = len(set(df["first_read"]))
    print(
        f"\n\t\t{n_files} files\n\t\t{n_distinct} unique files based on first read\n\t\t"
    )

## find the files

In [19]:
# recursively collect every *fastq.gz path under the two SLAV-Seq roots,
# keeping the results of the first root ahead of the second
fastq_patterns = [
    "/raidixshare_logg01/fqiu/AWS/for-ndar/SLAV-Seq/**/*fastq.gz",
    "/raidixshare_log-g/BSMN/SLAV-Seq/**/*fastq.gz",
]
files = []
for pattern in fastq_patterns:
    files.extend(glob.glob(pattern, recursive=True))

In [23]:
# use threads because file_info is i/o bound; joblib defaults to worker
# processes, so the thread backend has to be requested explicitly
results = Parallel(n_jobs=4, prefer="threads")(
    delayed(file_info)(f) for f in tqdm(files)
)
print(f"found {len(results)} files")

100%|██████████| 14094/14094 [03:55<00:00, 59.92it/s] 


found 14094 files


In [25]:
# build a two-column table (path, first line) from the file_info tuples
print("extracting first read from each file")
column_names = ["filename", "first_read"]
logg_files = pd.DataFrame.from_records(results, columns=column_names)

# report how many files there are and how many distinct first reads
print_unique(logg_files)

extracting first read from each file

		14094 files
		8882 unique files based on first read
		


In [26]:
# keep only the first file seen for each distinct first read; the frame is
# modified in place
logg_files.drop_duplicates(subset="first_read", keep="first", inplace=True)

removing duplicate fastqs


## Create the samplesheet and donorsheet

In [49]:
# extract sample metadata from filenames; fields() returns None for names it
# does not recognise, so those files are left out instead of crashing dict()
parsed = [fields(path) for path in logg_files["filename"]]
df = pd.DataFrame.from_records(
    [record for record in parsed if record is not None]
).merge(logg_files, on="filename")

In [50]:
# load the tab-separated metadata table, drop the two *_PERFORMED columns,
# lower-case the remaining column names and index the table by tissue_id
meta = pd.read_csv("slavseq_metadata.tsv", sep="\t")
meta = meta.drop(columns=["MDA_PERFORMED", "BULK_PERFORMED"])
meta.columns = meta.columns.str.lower()
meta = meta.set_index("tissue_id")

In [51]:
# cleanup: reshape to one row per read pair and attach the metadata table

kept_columns = [
    "filename",
    "read",
    "individual",
    "pair_id",
    "dna_type",
    "tissue_id",
]
pair_keys = ["pair_id", "individual", "dna_type", "tissue_id"]

# keep only relevant columns
trimmed = df.filter(items=kept_columns, axis=1)
# pivot to get R1 and R2 in same row
paired = trimmed.pivot(index=pair_keys, columns="read", values="filename")
paired = paired.reset_index()
# join with metadata (left join: rows without a metadata entry are kept)
annotated = paired.join(meta, on="tissue_id", how="left")
df = annotated.rename(columns={"pair_id": "sample_id", "individual": "donor_id"})

In [56]:
# write samplesheet: rows with dna_type "mda" and donor_id "CommonBrain"
samplesheet_columns = ["sample_id", "donor_id", "dna_type", "tissue_id", "R1", "R2"]
is_commonbrain_mda = (df["dna_type"] == "mda") & (df["donor_id"] == "CommonBrain")
samplesheet = df.filter(items=samplesheet_columns, axis=1).loc[is_commonbrain_mda]
# dna_type was only needed for the row selection above
samplesheet = samplesheet.drop(columns=["dna_type"])
samplesheet.to_csv("commonbrain_samples.tsv", sep="\t", index=False)

In [59]:
# write donorsheet: one row per distinct donor record for donor_id "CommonBrain"
donor_columns = ["donor_id", "brain_id", "sex", "age", "libd_id", "race", "diagnosis"]
donors = df.filter(items=donor_columns, axis=1)
donors = donors.loc[df["donor_id"] == "CommonBrain"].drop_duplicates()
donors.to_csv("commonbrain_donors.tsv", sep="\t", index=False)