# Introduction
I originally started doing this on my MacBook with notebook 20160203_release5_npy_hdf5. I then discovered the
bug I had introduced into the release5 build by not using --ignore_all_filters in ApplyRecalibration.

After rebuilding release5 on malsrv2/farm3 (see 20160204_create_release_5_farm.ipynb), I decided to create the various
VCF, npy and HDF5 files on malsrv2 as well.
# Plan
- Create new WG vcf
- Create npy and hdf5 files for release5
    - Create mixtures-specific and crosses-specific vcfs
    - Create biallelic versions of the above
    - Create WG npy variants and 2d calldata files for these 4 vcfs
    - Create sites only vcf
    - Create npy WG variants files
    - Create chrom npy variants and calldata files
    - Create HDF5 file

# Setup

In [2]:
%run imports.ipynb
%run _shared_setup.ipynb

In [3]:
# Directory of the release 5 build files: the per-chromosome VCFs are read from
# here and the whole-genome VCF and its derived files are written here.
release5_final_files_dir = '/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0'

In [4]:
# Path of the bcftools executable interpolated into the `!` shell commands of this notebook.
bcftools="/nfs/team112_internal/production/tools/bin/bcftools_git"

In [5]:
# Sample IDs of the mixtures: PG0389-C through PG0415-C (27 IDs).
mixtures_ids = np.array(["PG%04d-C" % n for n in range(389, 416)])
# One tab-separated sample file per cross (read with etl.fromtsv below).
crosses_fns = ['../../meta/crosses_samples_%s.txt' % x for x in ['3D7xHB3', 'HB3xDd2', '7G8xGB4']]
# Concatenate the three per-cross tables, relabel PG0083-C as 7G8 and PG0084-C
# as GB4 in the 'sample' column, then sort the rows by 'sample'.
crosses_samples = (etl
    .fromtsv(crosses_fns[0])
    .cat(etl.fromtsv(crosses_fns[1]))
    .cat(etl.fromtsv(crosses_fns[2]))
    .convert('sample', 'replace', 'PG0083-C', '7G8')
    .convert('sample', 'replace', 'PG0084-C', 'GB4')
    .sort('sample')
# Alternative form of the same relabelling, left disabled:
#     .convert('sample', '7G8', where=lambda rec: rec['sample'] == 'PG0083-C')
#     .convert('sample', 'GB4', where=lambda rec: rec['sample'] == 'PG0084-C')
)
# Values of the 'sample' column as an array; these are the IDs passed to bcftools for the crosses.
crosses_ids = crosses_samples.values('sample').array()

In [6]:
crosses_ids

array(['7G8', 'GB4', 'PG0004-CW', 'PG0008-CW', 'PG0015-C', 'PG0016-C',
       'PG0017-C', 'PG0018-C', 'PG0019-C', 'PG0020-C', 'PG0021-C',
       'PG0022-Cx', 'PG0023-C', 'PG0024-C', 'PG0025-C', 'PG0026-C',
       'PG0027-C', 'PG0028-C', 'PG0029-Cx', 'PG0030-C', 'PG0031-C',
       'PG0032-Cx', 'PG0033-Cx', 'PG0034-C', 'PG0035-Cx', 'PG0036-C',
       'PG0037-C', 'PG0038-C', 'PG0039-C', 'PG0040-Cx', 'PG0041-C',
       'PG0042-C', 'PG0043-C', 'PG0044-C', 'PG0045-C', 'PG0046-Cx',
       'PG0047-C', 'PG0048-C', 'PG0051-C', 'PG0052-C', 'PG0053-C',
       'PG0054-C', 'PG0055-C', 'PG0056-C', 'PG0057-C', 'PG0058-C',
       'PG0060-C', 'PG0061-C', 'PG0062-C', 'PG0063-C', 'PG0064-C',
       'PG0065-C', 'PG0066-C', 'PG0067-C', 'PG0068-C', 'PG0069-C',
       'PG0070-C', 'PG0071-C', 'PG0072-C', 'PG0074-C', 'PG0077-CW',
       'PG0078-C', 'PG0078-CW', 'PG0079-C', 'PG0079-CW', 'PG0080-C',
       'PG0081-CW', 'PG0082-C', 'PG0085-C', 'PG0086-C', 'PG0086-CW',
       'PG0087-C', 'PG0088-C', 'PG0091-C', 'PG

# Create files

In [7]:
# Create WG VCF file
# Space-separated list of the per-chromosome VCFs, one per key of CHROM_VCF_FNS['interim5'].
# NOTE(review): the files are concatenated in the iteration order of that mapping —
# confirm it yields the intended chromosome order.
input_files = ' '.join(
    ["%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release5_final_files_dir, chrom) for chrom in CHROM_VCF_FNS['interim5'].keys()]
)
output_vcf_fn = "%s/SNP_INDEL_WG.combined.filtered.vcf.gz" % (release5_final_files_dir)
# Only build when the output file is missing, so re-running the cell does not redo the work.
if not os.path.exists(output_vcf_fn):
    # Concatenate into a compressed VCF (--output-type z), then create its tabix (.tbi) index.
    !{bcftools} concat --output-type z --output {output_vcf_fn} {input_files}
    !{bcftools} index --tbi {output_vcf_fn}


In [8]:
# Path of the WG VCF (same value as output_vcf_fn in the cell that creates the file);
# all subset and sites-only VCF names below are derived from it.
wg_vcf_fn = "%s/SNP_INDEL_WG.combined.filtered.vcf.gz" % (release5_final_files_dir)

In [9]:
# Create mixtures and crosses vcfs
# Comma-separated sample lists in the form expected by `bcftools view -s`.
mixtures_samples = ','.join(mixtures_ids)
# NOTE: this rebinds crosses_samples, which the setup cell defined as the table of
# cross samples, to a plain comma-separated string.
crosses_samples = ','.join(crosses_ids)

# Output names are derived from the WG VCF name by extending the 'filtered' component.
mixtures_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.mixtures')
mixtures_biallelic_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.mixtures_biallelic')
crosses_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.crosses')
crosses_biallelic_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.crosses_biallelic')

# Each file is only built when it does not exist yet.
# Mixtures samples only, written as a compressed VCF and tabix-indexed.
if not os.path.exists(mixtures_vcf_fn):
    !{bcftools} view -Oz -o {mixtures_vcf_fn} -s {mixtures_samples} {wg_vcf_fn}
    !{bcftools} index --tbi {mixtures_vcf_fn}

# Mixtures samples piped (uncompressed BCF, -Ou) through `norm -m -any`, which splits
# multiallelic sites into biallelic records.
if not os.path.exists(mixtures_biallelic_vcf_fn):
    !{bcftools} view -Ou -s {mixtures_samples} {wg_vcf_fn} | {bcftools} norm -m -any -Oz -o {mixtures_biallelic_vcf_fn}
    !{bcftools} index --tbi {mixtures_biallelic_vcf_fn}

# Crosses samples only, written as a compressed VCF and tabix-indexed.
if not os.path.exists(crosses_vcf_fn):
    !{bcftools} view -Oz -o {crosses_vcf_fn} -s {crosses_samples} {wg_vcf_fn}
    !{bcftools} index --tbi {crosses_vcf_fn}

# Crosses samples with multiallelic sites split into biallelic records, as for the mixtures.
if not os.path.exists(crosses_biallelic_vcf_fn):
    !{bcftools} view -Ou -s {crosses_samples} {wg_vcf_fn} | {bcftools} norm -m -any -Oz -o {crosses_biallelic_vcf_fn}
    !{bcftools} index --tbi {crosses_biallelic_vcf_fn}


In [10]:
def create_variants_npy(vcf_fn=mixtures_vcf_fn, max_alleles=7, rewrite=False):
    """Run vcfnp.variants on a VCF unless its cached array is already present.

    The cache directory is ``<vcf_fn>.vcfnp_cache``. Its path is printed and
    the directory is created when missing. ``vcfnp.variants`` is then called
    with ``cache=True`` and ``cachedir`` pointing at that directory, unless
    ``variants.npy`` already exists there and ``rewrite`` is false.

    Parameters
    ----------
    vcf_fn
        Path of the VCF to read; defaults to ``mixtures_vcf_fn``.
    max_alleles
        Drives the arities passed to vcfnp: ``max_alleles - 1`` for ALT, AF,
        AC, MLEAF and MLEAC, and ``max_alleles`` for RPA.
    rewrite
        When true, vcfnp is called even if ``variants.npy`` already exists.

    Returns
    -------
    None. The value returned by ``vcfnp.variants`` is discarded.
    """
    cache_dir = '%s.vcfnp_cache' % vcf_fn
    print(cache_dir)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Nothing to do when the cached array is present and no rewrite was requested.
    # NOTE(review): `rewrite` only bypasses this existence check; whether vcfnp
    # regenerates an already-cached array is decided inside vcfnp — confirm.
    if not rewrite and os.path.exists("%s/variants.npy" % cache_dir):
        return

    # Fixed-width string dtypes for the text-valued fields.
    string_dtypes = {
        'REF': 'a10',
        'ALT': 'a10',
        'RegionType': 'a25',
        'VariantType': 'a40',
        'RU': 'a40',
        'set': 'a40',
        'SNPEFF_AMINO_ACID_CHANGE': 'a20',
        'SNPEFF_CODON_CHANGE': 'a20',
        'SNPEFF_EFFECT': 'a33',
        'SNPEFF_EXON_ID': 'a2',
        'SNPEFF_FUNCTIONAL_CLASS': 'a8',
        'SNPEFF_GENE_BIOTYPE': 'a14',
        'SNPEFF_GENE_NAME': 'a20',
        'SNPEFF_IMPACT': 'a8',
        'SNPEFF_TRANSCRIPT_ID': 'a20',
        'culprit': 'a14',
    }

    # Fields sized to max_alleles - 1, plus RPA sized to max_alleles and a single ANN value.
    n_alt = max_alleles - 1
    field_arities = {
        'ALT': n_alt,
        'AF': n_alt,
        'AC': n_alt,
        'MLEAF': n_alt,
        'MLEAC': n_alt,
        'RPA': max_alleles,
        'ANN': 1,
    }

    # These fields are filled with NaN.
    nan_filled_fields = [
        'VQSLOD', 'QD', 'MQ', 'MQRankSum', 'ReadPosRankSum', 'FS', 'SOR', 'DP',
    ]
    fill_values = dict.fromkeys(nan_filled_fields, np.nan)

    vcfnp.variants(
        vcf_fn,
        dtypes=string_dtypes,
        arities=field_arities,
        fills=fill_values,
        flatten_filter=True,
        verbose=False,
        cache=True,
        cachedir=cache_dir,
    )

def create_calldata_npy(vcf_fn=mixtures_vcf_fn, max_alleles=7, rewrite=False):
    """Run vcfnp.calldata_2d (AD field only) unless its cached array is present.

    The cache directory is ``<vcf_fn>.vcfnp_cache``. Its path is printed and
    the directory is created when missing. ``vcfnp.calldata_2d`` is then
    called with ``cache=True`` and ``cachedir`` pointing at that directory,
    unless ``calldata_2d.npy`` already exists there and ``rewrite`` is false.

    Parameters
    ----------
    vcf_fn
        Path of the VCF to read; defaults to ``mixtures_vcf_fn``.
    max_alleles
        Arity passed to vcfnp for the AD field.
    rewrite
        When true, vcfnp is called even if ``calldata_2d.npy`` already exists.

    Returns
    -------
    None. The value returned by ``vcfnp.calldata_2d`` is discarded.
    """
    cache_dir = '%s.vcfnp_cache' % vcf_fn
    print(cache_dir)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Nothing to do when the cached array is present and no rewrite was requested.
    # NOTE(review): `rewrite` only bypasses this existence check; whether vcfnp
    # regenerates an already-cached array is decided inside vcfnp — confirm.
    if not rewrite and os.path.exists("%s/calldata_2d.npy" % cache_dir):
        return

    # Only AD is extracted, as unsigned 16-bit values with max_alleles entries.
    # No `fills` or `flatten_filter` argument is passed for calldata.
    wanted_fields = ['AD']
    field_dtypes = {'AD': 'u2'}
    field_arities = {'AD': max_alleles}

    vcfnp.calldata_2d(
        vcf_fn,
        fields=wanted_fields,
        dtypes=field_dtypes,
        arities=field_arities,
        verbose=False,
        cache=True,
        cachedir=cache_dir,
    )


In [None]:
# Create mixtures and crosses npy files
# The second positional argument is max_alleles: the default (7) is used for the
# full VCFs and 2 for the biallelic ones.
create_variants_npy(mixtures_vcf_fn)
create_calldata_npy(mixtures_vcf_fn)
create_variants_npy(mixtures_biallelic_vcf_fn, 2)
create_calldata_npy(mixtures_biallelic_vcf_fn, 2)
create_variants_npy(crosses_vcf_fn)
# Calldata for the full crosses VCF is not generated here; the call is left disabled.
# create_calldata_npy(crosses_vcf_fn)
create_variants_npy(crosses_biallelic_vcf_fn, 2)
create_calldata_npy(crosses_biallelic_vcf_fn, 2)


/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.mixtures.vcf.gz.vcfnp_cache
/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.mixtures.vcf.gz.vcfnp_cache
/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.mixtures_biallelic.vcf.gz.vcfnp_cache
/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.mixtures_biallelic.vcf.gz.vcfnp_cache
/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.crosses.vcf.gz.vcfnp_cache
/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.crosses_biallelic.vcf.gz.vcfnp_cache
/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_WG.combined.filtered.crosses_biallelic.vcf.gz.vcfnp_cache

In [None]:
# Create sites only vcf and npy
wg_sites_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.sites')
!{bcftools} view --drop-genotypes --output-type z --output-file {wg_sites_vcf_fn} {wg_vcf_fn}
!{bcftools} index --tbi {wg_sites_vcf_fn}
