In [1]:
%run _standard_imports.ipynb

python 3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
numpy 1.11.2
scipy 0.18.1
pandas 0.19.0
numexpr 2.6.1
pysam 0.8.4
petl 1.1.0
petlx 1.0.3
vcf 0.6.8
vcfnp 2.2.0
h5py 2.6.0
tables 3.3.0


In [195]:
output_dir = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build'
vrpipe_fileinfo_fn = "%s/pf_60_genotype_gvcfs_200kb.txt" % output_dir
vcf_fofn = "%s/pf_60_genotype_gvcfs_200kb.fofn" % output_dir
vcf_stem = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_{chrom}.final.vcf.gz'

GENOME_FN = "/lustre/scratch116/malaria/pfalciparum/resources/Pfalciparum.genome.fasta"
genome_fn = "%s/Pfalciparum.genome.fasta" % output_dir

!cp {GENOME_FN} {genome_fn}

!mkdir -p {output_dir}/hdf5
!mkdir -p {output_dir}/vcf
!mkdir -p {output_dir}/npy
!mkdir -p {output_dir}/scripts
!mkdir -p {output_dir}/log


In [193]:
genome = pyfasta.Fasta(genome_fn)
genome

<pyfasta.fasta.Fasta at 0x7fb9c7df7748>

In [196]:
fo = open("%s/scripts/vcfnp_variants.sh" % output_dir, 'w')
print('''#!/bin/bash

#set changes bash options
#x prints commands & args as they are executed
set -x
#-e  Exit immediately if a command exits with a non-zero status
set -e
#reports the last program to return a non-0 exit code rather than the exit code of the last problem
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

# Reasons for some of string lengths
# REF and ALT: found an SV with length -192 in one small test chunk, so decided on 300bp as this is a typical insert size
# SNPEFF_AMINO_ACID_CHANGE: maximum 300 bases, therefore 100 alt AA + 1 ref AA + 4 integers for position 
# SNPEFF_CODON_CHANGE: # maximum 300 alt bases + 3 ref bases + "/"
# SNPEFF_EXON_ID: integer for exon number, but kept as string in case any else in there 

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --output-dir %s/npy \
    --array-type variants \
    --task-size 1000 \
    --task-index LSB_JOBINDEX \
    --progress 100 \
    --chromosome $chrom \
    --arity ALT:6 \
    --arity AF:6 \
    --arity AC:6 \
    --arity svlen:6 \
    --dtype REF:a300 \
    --dtype ALT:a300 \
    --dtype MULTIALLELIC:a2 \
    --dtype RegionType:a25 \
    --dtype SNPEFF_AMINO_ACID_CHANGE:a105 \
    --dtype SNPEFF_CODON_CHANGE:a304 \
    --dtype SNPEFF_EFFECT:a33 \
    --dtype SNPEFF_EXON_ID:a2 \
    --dtype SNPEFF_FUNCTIONAL_CLASS:a8 \
    --dtype SNPEFF_GENE_NAME:a20 \
    --dtype SNPEFF_IMPACT:a8 \
    --dtype SNPEFF_TRANSCRIPT_ID:a20 \
    --dtype VARIANT_TYPE:a5 \
    --dtype VariantType:a40 \
    --exclude-field ID''' % (
        genome_fn,
        output_dir,
        )
        , file=fo)
fo.close()


In [197]:
fo = open("%s/scripts/vcfnp_calldata.sh" % output_dir, 'w')
print('''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

# Reasons for some of string lengths
# PL: number of PLs is n(n+1)/2 where n is number of alleles (7)
# PID: maximum 300 alt bases + 1 ref base + 2 underscores + 7 integers for position 

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --output-dir %s/npy \
    --array-type calldata_2d \
    --task-size 1000 \
    --task-index LSB_JOBINDEX \
    --progress 100 \
    --chromosome $chrom \
    --arity AD:7 \
    --arity PL:28 \
    --dtype PGT:a3 \
    --dtype PID:a310 \
    --exclude-field MIN_DP \
    --exclude-field RGQ \
    --exclude-field SB''' % (
        genome_fn,
        output_dir,
        )
        , file=fo)
fo.close()


In [104]:
2**23

8388608

In [222]:
fo = open("%s/scripts/vcfnp_concat.sh" % output_dir, 'w')
print('''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
chrom=$2
outbase=$3
# inputs=${vcf}.vcfnp_cache
inputs=%s/npy
output=${outbase}.${chrom}.h5

log=${output}.log

if [ -f ${output}.md5 ]
then
    echo $(date) skipping $chrom >> $log
else
    echo $(date) building $chrom > $log
    vcfnpy2hdf5 \
        --vcf $vcf \
        --input-dir $inputs \
        --input-filename-template {array_type}.${chrom}*.npy \
        --output $output \
        --group $chrom \
        --chunk-size 8388608 \
        --chunk-width 200 \
        --compression gzip \
        --compression-opts 1 \
        &>> $log
        
    md5sum $output > ${output}.md5 
fi''' % (
        output_dir,
        )
      , file=fo)
fo.close()

#     nv=$(ls -1 ${inputs}/v* | wc -l)
#     nc=$(ls -1 ${inputs}/c* | wc -l)
#     echo variants files $nv >> $log
#     echo calldata files $nc >> $log
#     if [ "$nv" -ne "$nc" ]
#     then
#         echo missing npy files
#         exit 1
#     fi


In [225]:
fo = open("%s/scripts/vcfnp_concat.sh" % output_dir, 'w')
print('''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
outbase=$2
# inputs=${vcf}.vcfnp_cache
inputs=%s/npy
output=${outbase}.h5

log=${output}.log

if [ -f ${output}.md5 ]
then
    echo $(date) skipping $chrom >> $log
else
    echo $(date) building $chrom > $log
    vcfnpy2hdf5 \
        --vcf $vcf \
        --input-dir $inputs \
        --output $output \
        --chunk-size 8388608 \
        --chunk-width 200 \
        --compression gzip \
        --compression-opts 1 \
        &>> $log
        
    md5sum $output > ${output}.md5 
fi''' % (
        output_dir,
        )
      , file=fo)
fo.close()

#     nv=$(ls -1 ${inputs}/v* | wc -l)
#     nc=$(ls -1 ${inputs}/c* | wc -l)
#     echo variants files $nv >> $log
#     echo calldata files $nc >> $log
#     if [ "$nv" -ne "$nc" ]
#     then
#         echo missing npy files
#         exit 1
#     fi


In [200]:
!chmod +x {output_dir}/scripts/vcfnp_variants.sh
!chmod +x {output_dir}/scripts/vcfnp_calldata.sh
!chmod +x {output_dir}/scripts/vcfnp_concat.sh

# Test the three stages

In [198]:
vcf_fn = vcf_stem.format(chrom='Pf3D7_01_v3')
# vcf_fn = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz'

In [201]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_variants.sh {vcf_stem.format(chrom='Pf3D7_01_v3')} Pf3D7_01_v3


+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
+ chrom=Pf3D7_01_v3
+ fasta=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta
+ vcf2npy --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz --fasta /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta --output-dir /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy --array-type variants --task-size 1000 --task-index LSB_JOBINDEX --progress 100 --chromosome Pf3D7_01_v3 --arity ALT:6 --arity AF:6 --arity AC:6 --arity svlen:6 --dtype REF:a300 --dtype ALT:a300 --dtype MULTIALLELIC:a2 --dtype RegionType:a25 --dtype SNPEFF_AMINO_ACID_CHANGE:a105 --dtype SNPEFF_CODON_CHANGE:a304 --dtype SNPEFF_EFFECT:a33 --dtype SNPEFF_EXON_ID:a

In [202]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_variants.sh {vcf_stem.format(chrom='Pf3D7_02_v3')} Pf3D7_02_v3


+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_02_v3.final.vcf.gz
+ chrom=Pf3D7_02_v3
+ fasta=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta
+ vcf2npy --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_02_v3.final.vcf.gz --fasta /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta --output-dir /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy --array-type variants --task-size 1000 --task-index LSB_JOBINDEX --progress 100 --chromosome Pf3D7_02_v3 --arity ALT:6 --arity AF:6 --arity AC:6 --arity svlen:6 --dtype REF:a300 --dtype ALT:a300 --dtype MULTIALLELIC:a2 --dtype RegionType:a25 --dtype SNPEFF_AMINO_ACID_CHANGE:a105 --dtype SNPEFF_CODON_CHANGE:a304 --dtype SNPEFF_EFFECT:a33 --dtype SNPEFF_EXON_ID:a

In [204]:
# npy_fn = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz.vcfnp_cache/variants.Pf3D7_01_v3_101001_102000.npy'
npy_fn = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy/variants.Pf3D7_01_v3_101001_102000.npy'
# npy_fn = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy/variants.Pf3D7_01_v3_100001_110000.npy'
variants = np.load(npy_fn)
print("max %d, min %d" % (np.max(variants['svlen']), np.min(variants['svlen'])))
print(np.unique(variants['num_alleles'], return_counts=True))
variants['svlen']

max 159, min -192
(array([2, 3, 4, 5, 6, 7], dtype=uint8), array([102,  83,  25,  16,  19,  13]))


array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0],
       ..., 
       [-9, -9,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0]], dtype=int32)

In [205]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_calldata.sh {vcf_stem.format(chrom='Pf3D7_01_v3')} Pf3D7_01_v3


+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
+ chrom=Pf3D7_01_v3
+ fasta=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta
+ vcf2npy --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz --fasta /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta --output-dir /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy --array-type calldata_2d --task-size 1000 --task-index LSB_JOBINDEX --progress 100 --chromosome Pf3D7_01_v3 --arity AD:7 --arity PL:28 --dtype PGT:a3 --dtype PID:a310 --exclude-field MIN_DP --exclude-field RGQ --exclude-field SB
[vcf2npy] 2016-11-25 19:04:31.657404 :: loading calldata_2d from /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122

In [206]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_calldata.sh {vcf_stem.format(chrom='Pf3D7_02_v3')} Pf3D7_02_v3


+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_02_v3.final.vcf.gz
+ chrom=Pf3D7_02_v3
+ fasta=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta
+ vcf2npy --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_02_v3.final.vcf.gz --fasta /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta --output-dir /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy --array-type calldata_2d --task-size 1000 --task-index LSB_JOBINDEX --progress 100 --chromosome Pf3D7_02_v3 --arity AD:7 --arity PL:28 --dtype PGT:a3 --dtype PID:a310 --exclude-field MIN_DP --exclude-field RGQ --exclude-field SB
[vcf2npy] 2016-11-25 19:05:42.583082 :: loading calldata_2d from /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122

In [207]:
!{output_dir}/scripts/vcfnp_concat.sh {vcf_fn} {output_dir}/hdf5/_build

+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
+ outbase=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build
+ inputs=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy
+ output=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build.h5
+ log=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build.h5.log
+ '[' -f /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build.h5.md5 ']'
++ date
+ echo Fri Nov 25 19:06:17 GMT 2016 building
++ wc -l
++ ls -1 /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy/variants.Pf3D7_01_v3_101001_102000.npy /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy/variants.Pf3D7_02_v3_101001_102000.npy
+ nv=2

# How much does having dtype PID:a310 effect file size?

In [208]:
!ls -altr {output_dir}/hdf5

total 37713
drwxrwxr-x 8 rp7 team112     4096 Nov 25 17:35 ..
-rw-rw-r-- 1 rp7 team112      130 Nov 25 19:07 _build.h5.md5
-rw-rw-r-- 1 rp7 team112    13978 Nov 25 19:07 _build.h5.log
-rw-rw-r-- 1 rp7 team112 38586889 Nov 25 19:07 _build.h5
drwxrwxr-x 2 rp7 team112     4096 Nov 25 19:07 .


In [199]:
# !rm {vcf_fn}.vcfnp_cache/*
!rm {output_dir}/npy/*
!rm {output_dir}/hdf5/*

rm: cannot remove `/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/*': No such file or directory


In [139]:
# rerun with much smaller PID
fo = open("%s/scripts/vcfnp_calldata.sh" % output_dir, 'w')
print('''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --array-type calldata_2d \
    --task-size 1000 \
    --task-index LSB_JOBINDEX \
    --progress 100 \
    --chromosome $chrom \
    --arity AD:7 \
    --arity PL:28 \
    --dtype PGT:a3 \
    --dtype PID:a12 \
    --exclude-field MIN_DP \
    --exclude-field RGQ \
    --exclude-field SB''' % (
        genome_fn,
        )
        , file=fo)
fo.close()


In [None]:
!export LSB_JOBINDEX=11 && {output_dir}/scripts/vcfnp_calldata.sh {vcf_fn} Pf3D7_01_v3
!export LSB_JOBINDEX=11 && {output_dir}/scripts/vcfnp_variants.sh {vcf_fn.replace('Pf3D7_01_v3', 'Pf3D7_02_v3')} Pf3D7_02_v3
!export LSB_JOBINDEX=11 && {output_dir}/scripts/vcfnp_calldata.sh {vcf_fn} Pf3D7_01_v3
!export LSB_JOBINDEX=11 && {output_dir}/scripts/vcfnp_calldata.sh {vcf_fn.replace('Pf3D7_01_v3', 'Pf3D7_02_v3')} Pf3D7_02_v3
!{output_dir}/scripts/vcfnp_concat.sh {vcf_fn} {output_dir}/hdf5/_build

In [140]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_variants.sh {vcf_fn} Pf3D7_01_v3

+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
+ chrom=Pf3D7_01_v3
+ fasta=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta
+ vcf2npy --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz --fasta /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta --array-type variants --task-size 1000 --task-index LSB_JOBINDEX --progress 100 --chromosome Pf3D7_01_v3 --arity ALT:6 --arity AF:6 --arity AC:6 --arity svlen:6 --dtype REF:a300 --dtype ALT:a300 --dtype MULTIALLELIC:a2 --dtype RegionType:a25 --dtype SNPEFF_AMINO_ACID_CHANGE:a105 --dtype SNPEFF_CODON_CHANGE:a304 --dtype SNPEFF_EFFECT:a33 --dtype SNPEFF_EXON_ID:a2 --dtype SNPEFF_FUNCTIONAL_CLASS:a8 --dtype SNPEFF_GENE_NAME:a20 --dtype SNPEFF_IMPACT:a8 --dtype

In [141]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_calldata.sh {vcf_fn} Pf3D7_01_v3


+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
+ chrom=Pf3D7_01_v3
+ fasta=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta
+ vcf2npy --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz --fasta /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/Pfalciparum.genome.fasta --array-type calldata_2d --task-size 1000 --task-index LSB_JOBINDEX --progress 100 --chromosome Pf3D7_01_v3 --arity AD:7 --arity PL:28 --dtype PGT:a3 --dtype PID:a12 --exclude-field MIN_DP --exclude-field RGQ --exclude-field SB
[vcf2npy] 2016-11-25 17:24:22.221293 :: loading calldata_2d from /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
[vcf2npy] 2016-11-25 17:24:22.221484 :: extract

In [142]:
!export LSB_JOBINDEX=102 && {output_dir}/scripts/vcfnp_concat.sh {vcf_fn} Pf3D7_01_v3 {output_dir}/hdf5/_build

+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz
+ chrom=Pf3D7_01_v3
+ outbase=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build
+ inputs=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz.vcfnp_cache
+ output=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build.Pf3D7_01_v3.h5
+ log=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build.Pf3D7_01_v3.h5.log
+ '[' -f /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/_build.Pf3D7_01_v3.h5.md5 ']'
++ date
+ echo Fri Nov 25 17:25:31 GMT 2016 building Pf3D7_01_v3
++ wc -l
++ ls -1 /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf3D7_01_v3.final.vcf.gz.vcfnp_cache/varia

In [143]:
!ls -altr {output_dir}/hdf5

total 24353
drwxrwxr-x 7 rp7 team112     4096 Nov 25 16:02 ..
-rw-rw-r-- 1 rp7 team112      142 Nov 25 17:25 _build.Pf3D7_01_v3.h5.md5
-rw-rw-r-- 1 rp7 team112    13874 Nov 25 17:25 _build.Pf3D7_01_v3.h5.log
-rw-rw-r-- 1 rp7 team112 24927201 Nov 25 17:25 _build.Pf3D7_01_v3.h5
drwxrwxr-x 2 rp7 team112     4096 Nov 25 17:25 .


In [152]:
32504643/24927201

1.303982865946321

Based on the above, it seems like having PID large enough to hold all possible might add ~30% to file sizes. Decided that this cost is probably worth it in case we really do get interested in this phasing information

In [181]:
# Stick with the original version
fo = open("%s/scripts/vcfnp_calldata.sh" % output_dir, 'w')
print('''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

# Reasons for some of string lengths
# PL: number of PLs is n(n+1)/2 where n is number of alleles (7)
# PID: maximum 300 alt bases + 1 ref base + 2 underscores + 7 integers for position 

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --output-dir %s/npy \
    --array-type calldata_2d \
    --task-size 10000 \
    --task-index LSB_JOBINDEX \
    --progress 100 \
    --chromosome $chrom \
    --arity AD:7 \
    --arity PL:28 \
    --dtype PGT:a3 \
    --dtype PID:a310 \
    --exclude-field MIN_DP \
    --exclude-field RGQ \
    --exclude-field SB''' % (
        genome_fn,
        output_dir,
        )
        , file=fo)
fo.close()


# How does the file look?

In [209]:
y = h5py.File('%s/hdf5/_build.h5' % output_dir, 'r')
y

<HDF5 file "_build.h5" (mode r)>

In [210]:
samples = y['samples']
samples.shape

(7182,)

In [211]:
for field in y['calldata'].keys():
    print(field, y['calldata'][field].shape)

AD (403, 7182, 7)
DP (403, 7182)
GQ (403, 7182)
GT (403, 7182)
PGT (403, 7182)
PID (403, 7182)
PL (403, 7182, 28)
genotype (403, 7182, 2)
is_called (403, 7182)
is_phased (403, 7182)


In [213]:
for field in ['GQ', 'GT', 'PGT', 'genotype', 'is_called', 'is_phased', 'PID', 'PL']:
    print(field, np.unique(y['calldata'][field], return_counts=True))

GQ (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=uint8), array([ 386678,    2664,    2122,   93952,    2277,    1795,   71659,
          2386,    2091,   60121,    2409,    1842,   53870,    2555,
          1930,   48264,    2306,    1932,   44030,    2582,    1889,
         40766,    2344,    1884,   38445,    2363,    1725,   35651,
          2365,    1731,   33229,    2033,    1690,   31631,    2080,
          1589,   30140,    1968,    1639,   28291,    1962,    1496,
         27051,    1795,    1474,   25820,    1808,    1582,   24624,
          1674,    1500,   23394,    1426,    156

In [214]:
for field in y['variants'].keys():
    print(field, np.unique(y['variants'][field], return_counts=True))

AC (array([    0,     1,     2,     3,     4,     5,     6,     7,     8,
           9,    10,    11,    12,    13,    14,    15,    16,    17,
          18,    19,    20,    22,    23,    24,    25,    26,    27,
          28,    29,    30,    31,    32,    33,    34,    35,    36,
          37,    38,    39,    40,    41,    42,    43,    44,    48,
          49,    50,    51,    52,    53,    54,    55,    56,    57,
          58,    59,    60,    61,    63,    65,    69,    71,    74,
          75,    76,    77,    78,    80,    84,    85,    86,    88,
          90,    92,    94,    95,    96,    98,   108,   112,   114,
         116,   118,   123,   129,   130,   131,   133,   134,   137,
         139,   141,   143,   148,   152,   153,   156,   158,   160,
         161,   162,   163,   164,   166,   173,   176,   177,   180,
         181,   184,   185,   187,   189,   192,   195,   198,   200,
         203,   208,   209,   211,   214,   218,   222,   225,   227,
         228,   

In [215]:
y['variants']['AC'].shape

(403, 6)

# Full build

In [194]:
for chrom in sorted(genome.keys()):
    print(chrom)

Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3
Pf3D7_API_v3
Pf_M76611


In [218]:
task_size = 1000
for chrom in sorted(genome.keys()):
    vcf_fn = vcf_stem.format(chrom=chrom)
    n_tasks = '1-%s' % ((len(genome[chrom]) // task_size) + 1)
    print(chrom, n_tasks)

    
    #the -t option is because we are sending an array to qsub
    task = "%s/scripts/vcfnp_variants.sh" % output_dir
    !bsub -q normal -G malaria-dk -J "ma[{n_tasks}]" -R"select[mem>8000] rusage[mem=8000] span[hosts=1]" -M 8000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 

    task = "%s/scripts/vcfnp_calldata.sh" % output_dir
    !bsub -q normal -G malaria-dk -J "ma[{n_tasks}]" -R"select[mem>8000] rusage[mem=8000] span[hosts=1]" -M 8000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 

#     !qsub -S /bin/bash -l h_vmem=2G -N vcfnp_variants_$chrom -j y -o {logdir} -t {n_tasks} {task} {vcf_fn} {chrom}

#     task = "%s/scripts/vcfnp_calldata.sh" % output_dir
#     !qsub -S /bin/bash -l h_vmem=6G -N vcfnp_calldata_$chrom -j y -o {logdir} -t {n_tasks} {task} {vcf_fn} {chrom}

Pf3D7_01_v3 1-641
Job <8850191> is submitted to queue <normal>.
Job <8850192> is submitted to queue <normal>.
Pf3D7_02_v3 1-948
Job <8850193> is submitted to queue <normal>.
Job <8850194> is submitted to queue <normal>.
Pf3D7_03_v3 1-1068
Job <8850195> is submitted to queue <normal>.
Job <8850196> is submitted to queue <normal>.
Pf3D7_04_v3 1-1201
Job <8850197> is submitted to queue <normal>.
Job <8850198> is submitted to queue <normal>.
Pf3D7_05_v3 1-1344
Job <8850199> is submitted to queue <normal>.
Job <8850200> is submitted to queue <normal>.
Pf3D7_06_v3 1-1419
Job <8850201> is submitted to queue <normal>.
Job <8850202> is submitted to queue <normal>.
Pf3D7_07_v3 1-1446
Job <8850203> is submitted to queue <normal>.
Job <8850204> is submitted to queue <normal>.
Pf3D7_08_v3 1-1473
Job <8850205> is submitted to queue <normal>.
Job <8850206> is submitted to queue <normal>.
Pf3D7_09_v3 1-1542
Job <8850207> is submitted to queue <normal>.
User <rp7>: Pending job threshold reached. Retryi

In [None]:
!{output_dir}/scripts/vcfnp_concat.sh {vcf_fn} {output_dir}/hdf5/Pf_60

+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz
+ outbase=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/Pf_60
+ inputs=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy
+ output=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/Pf_60.h5
+ log=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/Pf_60.h5.log
+ '[' -f /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/hdf5/Pf_60.h5.md5 ']'
++ date
+ echo Fri Nov 25 21:59:00 GMT 2016 building
+ vcfnpy2hdf5 --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161122_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz --input-dir /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy --output /lustre/scratch109/mala