# Plan
- Create new whole-genome (WG) VCF with the AD `Number=R` header fix
- Create npy and hdf5 files for release5
    - Create mixtures and crosses specific vcfs
    - Create biallelic versions of the above
    - Create WG npy variants and 2d calldata files for these 4 vcfs
    - Create sites only vcf
    - Create npy WG variants files
    - Create chrom npy variants and calldata files
    - Create HDF5 file

# Setup

In [1]:
# Execute the shared imports and project setup notebooks in this kernel's namespace.
# NOTE(review): the names used later without a visible import (np, etl, vcfnp, os)
# are presumably defined by these two notebooks — confirm.
%run imports.ipynb
%run _shared_setup.ipynb

docker image: podpearson/biipy_rdp:latest


In [2]:
# Directory holding the final release files; the WG VCF path used by the
# following cells is built from it.
release5_final_files_dir = '/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0'

In [None]:
# Mixture sample IDs PG0389-C .. PG0415-C (the range end, 416, is exclusive).
mixtures_ids = np.array(["PG%04d-C" % n for n in range(389, 416)])

# One sample-metadata TSV per cross.
crosses_fns = ['../../meta/crosses_samples_%s.txt' % x for x in ['3D7xHB3', 'HB3xDd2', '7G8xGB4']]

# Concatenate the per-cross tables, in list order, into a single table.
crosses_samples = etl.fromtsv(crosses_fns[0])
for crosses_fn in crosses_fns[1:]:
    crosses_samples = crosses_samples.cat(etl.fromtsv(crosses_fn))

# The IDs are read from the 'sample' column of the combined table.
crosses_ids = crosses_samples.values('sample').array()

# Create files

In [None]:
# Need to recreate WG file due to AD Number=R bug
wg_vcf_fn = "%s/SNP_INDEL_WG.combined.filtered.vcf.gz" % (release5_final_files_dir)
!mv {wg_vcf_fn} {wg_vcf_fn + '.old'}
!zcat {wg_vcf_fn + '.old'} \
| sed 's/##FORMAT=<ID=AD,Number=./##FORMAT=<ID=AD,Number=R/' \
| bgzip -c > {wg_vcf_fn}
!bcftools index --tbi {wg_vcf_fn}


In [None]:
# Delete the '.old' copy of the WG VCF that was kept while the file was rewritten.
# This is irreversible, so it lives in its own cell.
!rm {wg_vcf_fn + '.old'}

In [None]:
# Create mixtures and crosses vcfs
mixtures_samples = ','.join(mixtures_ids)
crosses_samples = ','.join(crosses_ids)

mixtures_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.mixtures')
mixtures_biallelic_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.mixtures_biallelic')
crosses_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.crosses')
crosses_biallelic_vcf_fn = wg_vcf_fn.replace('filtered', 'filtered.crosses_biallelic')

!bcftools view -Oz -o {mixtures_vcf_fn} -s mixtures_samples wg_vcf_fn
!bcftools view -Ou -s mixtures_samples wg_vcf_fn | bcftools norm -m -any -Oz -o {mixtures_biallelic_vcf_fn}
!bcftools view -Oz -o {crosses_vcf_fn} -s crosses_samples wg_vcf_fn
!bcftools view -Ou -s crosses_samples wg_vcf_fn | bcftools norm -m -any -Oz -o {crosses_biallelic_vcf_fn}

!bcftools index --tbi {mixtures_vcf_fn}
!bcftools index --tbi {mixtures_biallelic_vcf_fn}
!bcftools index --tbi {crosses_vcf_fn}
!bcftools index --tbi {crosses_biallelic_vcf_fn}


In [None]:
def create_variants_npy(vcf_fn=mixtures_vcf_fn):
    """Build the vcfnp variants array for a VCF, caching it next to the file.

    The cache directory is ``<vcf_fn>.vcfnp_cache`` and is created if it does
    not exist yet. Nothing is returned; ``vcfnp.variants`` is called with
    ``cache=True`` for its caching side effect.

    This call replaces an earlier ``vcf2npy`` command-line invocation that
    used the same arity and dtype settings.

    Parameters
    ----------
    vcf_fn : str
        Path of the VCF to read. Defaults to the mixtures VCF.
    """
    cache_dir = '%s.vcfnp_cache' % vcf_fn
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Fixed-width string dtypes for the text-valued fields.
    string_dtypes = {
        'REF': 'a10',
        'ALT': 'a10',
        'RegionType': 'a25',
        'VariantType': 'a40',
        'RU': 'a40',
        'set': 'a40',
        'SNPEFF_AMINO_ACID_CHANGE': 'a20',
        'SNPEFF_CODON_CHANGE': 'a20',
        'SNPEFF_EFFECT': 'a33',
        'SNPEFF_EXON_ID': 'a2',
        'SNPEFF_FUNCTIONAL_CLASS': 'a8',
        'SNPEFF_GENE_BIOTYPE': 'a14',
        'SNPEFF_GENE_NAME': 'a20',
        'SNPEFF_IMPACT': 'a8',
        'SNPEFF_TRANSCRIPT_ID': 'a20',
        'culprit': 'a14',
    }

    # Arity overrides handed to vcfnp for the multi-valued fields.
    field_arities = {
        'ALT': 6,
        'AF': 6,
        'AC': 6,
        'MLEAF': 6,
        'MLEAC': 6,
        'RPA': 7,
        'ANN': 1,
    }

    # These fields all use NaN as their fill value.
    nan_filled_fields = [
        'VQSLOD', 'QD', 'MQ', 'MQRankSum', 'ReadPosRankSum', 'FS', 'SOR', 'DP',
    ]
    field_fills = dict.fromkeys(nan_filled_fields, np.nan)

    vcfnp.variants(
        vcf_fn,
        dtypes=string_dtypes,
        arities=field_arities,
        fills=field_fills,
        flatten_filter=True,
        verbose=False,
        cache=True,
        cachedir=cache_dir
    )

def create_calldata_npy(vcf_fn=mixtures_vcf_fn, max_alleles=7):
    """Build the vcfnp 2D calldata array (AD only) for a VCF, caching it next to the file.

    The cache directory is ``<vcf_fn>.vcfnp_cache`` and is created if it does
    not exist yet. Nothing is returned; ``vcfnp.calldata_2d`` is called with
    ``cache=True`` for its caching side effect.

    Parameters
    ----------
    vcf_fn : str
        Path of the VCF to read. Defaults to the mixtures VCF.
    max_alleles : int
        Arity passed to vcfnp for the AD field (default 7).
    """
    cache_dir = '%s.vcfnp_cache' % vcf_fn
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    ad_field = 'AD'
    # No fills override is given for AD, so vcfnp's default fill applies.
    # NOTE(review): flatten_filter is forwarded here exactly as in the variants
    # call — confirm that calldata_2d accepts this keyword.
    vcfnp.calldata_2d(
        vcf_fn,
        fields=[ad_field],
        dtypes={ad_field: 'u1'},
        arities={ad_field: max_alleles},
        flatten_filter=True,
        verbose=False,
        cache=True,
        cachedir=cache_dir
    )


In [None]:
# Create mixtures and crosses npy files
create_variants_npy(mixtures_vcf_fn)
create_calldata_npy(mixtures_vcf_fn)
create_variants_npy(mixtures_biallelic_vcf_fn)
create_calldata_npy(mixtures_biallelic_vcf_fn, 2)
create_variants_npy(crosses_vcf_fn)
create_calldata_npy(crosses_vcf_fn)
create_variants_npy(crosses_biallelic_vcf_fn)
create_calldata_npy(crosses_biallelic_vcf_fn, 2)
