In [1]:
import os
import glob

import pandas as pd

from multiprocessing import Pool

In [5]:
# Number of worker processes for the multiprocessing pool. Pinned to a fixed
# value here; the host-derived alternative would be `os.cpu_count()`.
num_workers = 12
# Glob pattern matching the gzipped VCF files one directory level below the
# cgpPindel output directory.
files = '/scratch/t.cri.awoodard/indel-filtering/cgpPindel/*/*vcf.gz'

In [3]:
def get_pseudo_pon(paths):
    """Get a pseudo panel of normals (PON).

    The PON is constructed by aggregating all unique non-PASS variants found
    across all samples.

    Parameters
    ----------
    paths : list of str
        Paths to sample VCF files.

    Returns
    -------
    pandas.DataFrame
        One row per unique (CHROM, POS, REF, ALT) combination whose FILTER
        value is not 'PASS' in at least one of the given files. The first
        occurrence of each combination (in `paths` order) is the one kept,
        with its original index label. When `paths` is empty, an empty frame
        with those four columns is returned.

    """
    # Kept as a function-level import, as in the original, so the name is
    # resolved inside whichever process runs this function.
    from utils import read_vcf

    columns = ['CHROM', 'POS', 'REF', 'ALT']

    per_file = []
    for path in paths:
        df = read_vcf(path)
        non_pass = df.loc[df.FILTER != 'PASS', columns]
        # De-duplicate per file so the list stays small; the final
        # drop_duplicates() below keeps the same first occurrences either way.
        per_file.append(non_pass.drop_duplicates())

    if not per_file:
        # pd.concat() raises on an empty list; give callers a well-formed,
        # empty frame instead of None.
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of re-copying the running frame per file.
    return pd.concat(per_file).drop_duplicates()

In [6]:
# glob makes no ordering guarantee (it depends on the file system), so sort to
# make the chunking, and therefore the row order of the result, reproducible.
paths = sorted(glob.glob(files))
if not paths:
    # Fail here with a clear message rather than later inside pd.concat().
    raise FileNotFoundError('no VCF files matched {!r}'.format(files))

# Despite its name, `num_chunks` is the number of paths per chunk: twice the
# per-worker share. Clamp it to at least 1, because a step of 0 (fewer paths
# than workers) makes range() raise ValueError.
# NOTE(review): with chunks this large only about half of the workers receive
# a chunk; confirm whether len(paths) // (num_workers * 2) was intended.
num_chunks = max(1, len(paths) // num_workers * 2)
chunks = [paths[i:i + num_chunks] for i in range(0, len(paths), num_chunks)]

In [7]:
# Build one partial PON per chunk in parallel. pool.map() blocks until every
# chunk is done and returns results in chunk order. The context manager then
# terminates the workers, so they are not leaked if the cell is re-run or a
# worker raises.
with Pool(processes=num_workers) as pool:
    pons = pool.map(get_pseudo_pon, chunks)

In [8]:
# Stack the per-chunk partial PONs, then collapse variants that were reported
# by more than one chunk (the first occurrence is kept).
pseudo_pon = pd.concat(pons)
pseudo_pon = pseudo_pon.drop_duplicates()

In [9]:
# Persist the combined pseudo PON next to the input data as a parquet file.
pon_path = '/scratch/t.cri.awoodard/indel-filtering/pseudo_pon.parquet'
pseudo_pon.to_parquet(pon_path)