In [1]:

import pathlib as pl
import pandas as pd

# Destination of the assembled sample table.
out_table = pl.Path('/home/local/work/code/github/project-male-assembly/data/samples.tsv')

# Data root as reachable from this machine; the collect_* functions scan it.
local_path = pl.Path('/home/ebertp/mounts/hhu/project/projects/medbioinf/data')
# Data root that the collect_* functions write into the table in place of
# the local one.
remote_path = pl.Path('/gpfs/project/projects/medbioinf/data')

# Load the tab-separated sample list (first row is the header) and number
# the rows starting at 1.
samples = pd.read_csv(pl.Path('male_samples.tsv'), sep='\t', header=0)
samples = samples.assign(sample_id=samples.index + 1)

def _collect_read_folders(
    samples,
    subfolders,
    suffix,
    path_column,
    count_column,
    local_root=None,
    remote_root=None,
):
    """Add a data-folder column and a file-count column to ``samples``.

    Every directory directly below ``local_root / subfolder`` is treated as
    one sample folder. A leading ``GM`` in the folder name is rewritten to
    ``NA`` before it is compared with the ``sample`` column; folders whose
    name is not a known sample are skipped.

    Args:
        samples: DataFrame with a ``sample`` column.
        subfolders: Folders (relative to ``local_root``) to scan.
        suffix: File name ending that identifies a data file; files are
            searched recursively below each sample folder.
        path_column: Name of the output column holding the sample folder,
            expressed relative to ``remote_root``.
        count_column: Name of the output column holding the number of
            matching files (0 for samples without a folder).
        local_root: Root that is scanned; defaults to the module-level
            ``local_path``.
        remote_root: Root written into ``path_column``; defaults to the
            module-level ``remote_path``.

    Returns:
        A new DataFrame: ``samples`` outer-merged with the collected data.
        A sample found in more than one scanned folder yields one row per
        folder. NOTE(review): pandas documents outer merges as sorting the
        join keys, so row order may differ from the input.

    Raises:
        FileNotFoundError: If one of the scanned folders does not exist.
    """
    # Only touch the module-level defaults when no override was supplied.
    local_root = pl.Path(local_path if local_root is None else local_root)
    remote_root = pl.Path(remote_path if remote_root is None else remote_root)

    known_samples = set(samples['sample'])
    collected_data = []
    for subfolder in subfolders:
        for subdir in (local_root / subfolder).iterdir():
            if not subdir.is_dir():
                continue
            sample = subdir.name
            if sample.startswith('GM'):
                sample = 'NA' + sample[2:]
            if sample not in known_samples:
                continue
            # Count lazily; the matching paths themselves are not needed.
            num_files = sum(1 for _ in subdir.glob('**/*' + suffix))
            # Rebase onto the remote root; unlike str.replace this can only
            # ever swap the leading root, never a later path component.
            remote_dir = remote_root / subdir.relative_to(local_root)
            collected_data.append((sample, str(remote_dir), num_files))

    df = pd.DataFrame.from_records(
        collected_data,
        columns=['sample', path_column, count_column]
    )
    samples = samples.merge(df, on='sample', how='outer')
    # Samples without a data folder have no count after the merge.
    samples[count_column] = samples[count_column].fillna(0).astype(int)

    return samples


def collect_ontul(samples, local_root=None, remote_root=None):
    """Add the ``ont`` / ``ont_num`` columns to ``samples``.

    Scans the ``hprc_ont`` and ``hgsvc_ontul`` folders for files ending in
    ``guppy-5.0.11-sup-prom_fastq_pass.fastq.gz``.

    Args:
        samples: DataFrame with a ``sample`` column.
        local_root: Optional override for the scanned data root.
        remote_root: Optional override for the root written to ``ont``.

    Returns:
        A new DataFrame with the two columns added.
    """
    return _collect_read_folders(
        samples,
        subfolders=[pl.Path('hprc_ont'), pl.Path('hgsvc_ontul')],
        suffix='guppy-5.0.11-sup-prom_fastq_pass.fastq.gz',
        path_column='ont',
        count_column='ont_num',
        local_root=local_root,
        remote_root=remote_root,
    )


def collect_hifi(samples, local_root=None, remote_root=None):
    """Add the ``hifi`` / ``hifi_num`` columns to ``samples``.

    Scans the ``hprc_hifi`` and ``hgsvc_hifi`` folders for files ending in
    ``.fastq.gz``.

    Args:
        samples: DataFrame with a ``sample`` column.
        local_root: Optional override for the scanned data root.
        remote_root: Optional override for the root written to ``hifi``.

    Returns:
        A new DataFrame with the two columns added.
    """
    return _collect_read_folders(
        samples,
        subfolders=[pl.Path('hprc_hifi'), pl.Path('hgsvc_hifi')],
        suffix='.fastq.gz',
        path_column='hifi',
        count_column='hifi_num',
        local_root=local_root,
        remote_root=remote_root,
    )



# Attach the ONT columns first, then the HiFi columns.
samples = collect_hifi(collect_ontul(samples))
# Cells still empty after the merges (e.g. no data folder for a sample) are
# written out as the literal 'n/a'.
samples = samples.fillna('n/a')

# Keep only the output columns, in their final order.
samples = samples.loc[
    :, ['sample_id', 'sample', 'hifi', 'hifi_num', 'ont', 'ont_num']
]

# Write the table tab-separated, with a header row and without the index.
samples.to_csv(out_table, sep='\t', header=True, index=False)