In [1]:
%run _standard_imports.ipynb

python 3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
numpy 1.11.2
scipy 0.18.1
pandas 0.19.0
numexpr 2.6.1
pysam 0.8.4
petl 1.1.0
petlx 1.0.3
vcf 0.6.8
vcfnp 2.2.0
h5py 2.6.0
tables 3.3.0


In [2]:
output_dir = '/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161201_Pv_30_HDF5_build'
vrpipe_fileinfo_fn = "%s/pv_30_genotype_gvcfs_200kb.txt" % output_dir
vcf_fofn = "%s/pv_30_genotype_gvcfs_20kb.fofn" % output_dir
vcf_stem = '/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pv3.0/20161130_Pv30_final_vcfs/vcf/Pv_30_{chrom}.final.vcf.gz'

nfs_release_dir = '/nfs/team112_internal/production/release_build/Pv/3_0_release_packages'
nfs_final_hdf5_dir = '%s/hdf5' % nfs_release_dir
!mkdir -p {nfs_final_hdf5_dir}

GENOME_FN = "/lustre/scratch109/malaria/pvivax/resources/gatk/PvivaxP01.genome.fasta"
genome_fn = "%s/PvivaxP01.genome.fasta" % output_dir

!mkdir -p {output_dir}/hdf5
!mkdir -p {output_dir}/vcf
!mkdir -p {output_dir}/npy
!mkdir -p {output_dir}/scripts
!mkdir -p {output_dir}/log

!cp {GENOME_FN} {genome_fn}

In [3]:
# Open the copy of the reference FASTA that the setup cell placed in output_dir.
genome = pyfasta.Fasta(genome_fn)
# Echo the object so the cell output shows that it was created.
genome

<pyfasta.fasta.Fasta at 0x7fd162238d68>

In [11]:
# Combined length of every contig whose name starts with 'Transfer'.
transfer_length = sum(
    len(genome[name])
    for name in genome.keys()
    if name.startswith('Transfer')
)
transfer_length

4802351

In [4]:
# Generate scripts/vcfnp_variants.sh: a bash wrapper around `vcf2npy` that
# extracts the "variants" arrays for one chromosome.
#   $1 = VCF path, $2 = chromosome name; the task index comes from
#   $LSB_JOBINDEX, so the script is meant to run as an element of a job array.
# The two %s placeholders are filled with genome_fn (fasta) and output_dir
# (arrays are written to <output_dir>/npy).
#
# Changes from the previous version of this cell:
#   * the file is written inside `with`, so it is closed even if the
#     formatting or the write raises;
#   * the template is a raw string, so each backslash-newline reaches the
#     script as a real bash line continuation. A non-raw literal drops them
#     and collapses the whole vcf2npy call onto a single line.
with open("%s/scripts/vcfnp_variants.sh" % output_dir, 'w') as fo:
    print(r'''#!/bin/bash

#set changes bash options
#x prints commands & args as they are executed
set -x
#-e  Exit immediately if a command exits with a non-zero status
set -e
#reports the last program to return a non-0 exit code rather than the exit code of the last problem
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --output-dir %s/npy \
    --array-type variants \
    --task-size 20000 \
    --task-index $LSB_JOBINDEX \
    --progress 1000 \
    --chromosome $chrom \
    --arity ALT:6 \
    --arity AF:6 \
    --arity AC:6 \
    --arity svlen:6 \
    --dtype REF:a400 \
    --dtype ALT:a600 \
    --dtype MULTIALLELIC:a2 \
    --dtype RegionType:a25 \
    --dtype SNPEFF_AMINO_ACID_CHANGE:a105 \
    --dtype SNPEFF_CODON_CHANGE:a304 \
    --dtype SNPEFF_EFFECT:a33 \
    --dtype SNPEFF_EXON_ID:a2 \
    --dtype SNPEFF_FUNCTIONAL_CLASS:a8 \
    --dtype SNPEFF_GENE_NAME:a20 \
    --dtype SNPEFF_IMPACT:a8 \
    --dtype SNPEFF_TRANSCRIPT_ID:a20 \
    --dtype VARIANT_TYPE:a5 \
    --dtype VariantType:a40 \
    --exclude-field ID''' % (genome_fn, output_dir), file=fo)


In [5]:
# Generate scripts/vcfnp_calldata.sh: a bash wrapper around `vcf2npy` that
# extracts the "calldata_2d" arrays for one chromosome.
#   $1 = VCF path, $2 = chromosome name; the task index comes from
#   $LSB_JOBINDEX, so the script is meant to run as an element of a job array.
# The two %s placeholders are filled with genome_fn (fasta) and output_dir
# (arrays are written to <output_dir>/npy).
#
# The file is written inside `with` so it is always closed, and the template
# is a raw string so the backslash-newlines survive as bash line
# continuations instead of being dropped by Python.
with open("%s/scripts/vcfnp_calldata.sh" % output_dir, 'w') as fo:
    print(r'''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --output-dir %s/npy \
    --array-type calldata_2d \
    --task-size 20000 \
    --task-index $LSB_JOBINDEX \
    --progress 1000 \
    --chromosome $chrom \
    --arity AD:7 \
    --arity PL:28 \
    --dtype PGT:a3 \
    --dtype PID:a12 \
    --exclude-field MIN_DP \
    --exclude-field RGQ \
    --exclude-field SB''' % (genome_fn, output_dir), file=fo)



In [6]:
# Generate scripts/vcfnp_concat.sh: a bash wrapper around `vcfnpy2hdf5` that
# combines the .npy arrays into a single HDF5 file.
#   $1 = VCF path, $2 = output base name (".h5" is appended),
#   $3 = directory holding the .npy inputs.
# The script skips the build when <output>.md5 already exists and writes that
# checksum only after vcfnpy2hdf5 succeeds (`set -e` aborts on failure).
#
# Changes from the previous version of this cell:
#   * the log lines used $chrom, which the script never defines, so they
#     printed an empty value; they now log $output instead;
#   * the file is written inside `with` so it is always closed;
#   * the template is a raw string so the backslash-newlines survive as bash
#     line continuations instead of being dropped by Python.
with open("%s/scripts/vcfnp_concat.sh" % output_dir, 'w') as fo:
    print(r'''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
outbase=$2
inputs=$3
output=${outbase}.h5

log=${output}.log

if [ -f ${output}.md5 ]
then
    echo $(date) skipping $output >> $log
else
    echo $(date) building $output > $log
    vcfnpy2hdf5 \
        --vcf $vcf \
        --input-dir $inputs \
        --output $output \
        --chunk-size 8388608 \
        --chunk-width 200 \
        --compression gzip \
        --compression-opts 1 \
        &>> $log

    md5sum $output > ${output}.md5
fi''', file=fo)


In [14]:
task_size = 20000
for chrom in sorted(genome.keys()):
    vcf_fn = vcf_stem.format(chrom=chrom)
    n_tasks = '1-%s' % ((len(genome[chrom]) // task_size) + 1)
    print(chrom, n_tasks)

    task = "%s/scripts/vcfnp_variants.sh" % output_dir
    !bsub -q normal -G malaria-dk -J "v_{chrom[6:8]}[{n_tasks}]" -n2 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 

    task = "%s/scripts/vcfnp_calldata.sh" % output_dir
    !bsub -q normal -G malaria-dk -J "c_{chrom[6:8]}[{n_tasks}]" -n2 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 


PvP01_01_v1 1-52
Job <9184414> is submitted to queue <normal>.
Job <9184415> is submitted to queue <normal>.
PvP01_02_v1 1-48
Job <9184416> is submitted to queue <normal>.
Job <9184417> is submitted to queue <normal>.
PvP01_03_v1 1-45
Job <9184418> is submitted to queue <normal>.
Job <9184419> is submitted to queue <normal>.
PvP01_04_v1 1-51
Job <9184420> is submitted to queue <normal>.
Job <9184421> is submitted to queue <normal>.
PvP01_05_v1 1-77
Job <9184422> is submitted to queue <normal>.
Job <9184423> is submitted to queue <normal>.
PvP01_06_v1 1-53
Job <9184424> is submitted to queue <normal>.
Job <9184426> is submitted to queue <normal>.
PvP01_07_v1 1-83
Job <9184427> is submitted to queue <normal>.
Job <9184428> is submitted to queue <normal>.
PvP01_08_v1 1-89
Job <9184429> is submitted to queue <normal>.
Job <9184430> is submitted to queue <normal>.
PvP01_09_v1 1-112
Job <9184431> is submitted to queue <normal>.
Job <9184432> is submitted to queue <normal>.
PvP01_10_v1 1-78
J

In [15]:
# Generate scripts/vcfnp_variants_temp.sh: the same `vcf2npy` "variants"
# extraction as vcfnp_variants.sh, except that the arrays are written to
# <output_dir>/npy_temp instead of <output_dir>/npy.
#   $1 = VCF path, $2 = chromosome name; the task index comes from
#   $LSB_JOBINDEX, so the script is meant to run as an element of a job array.
# The two %s placeholders are filled with genome_fn (fasta) and output_dir.
#
# The file is written inside `with` so it is always closed, and the template
# is a raw string so the backslash-newlines survive as bash line
# continuations instead of being dropped by Python.
with open("%s/scripts/vcfnp_variants_temp.sh" % output_dir, 'w') as fo:
    print(r'''#!/bin/bash

#set changes bash options
#x prints commands & args as they are executed
set -x
#-e  Exit immediately if a command exits with a non-zero status
set -e
#reports the last program to return a non-0 exit code rather than the exit code of the last problem
set -o pipefail

vcf=$1
chrom=$2

fasta=%s

vcf2npy \
    --vcf $vcf \
    --fasta $fasta \
    --output-dir %s/npy_temp \
    --array-type variants \
    --task-size 20000 \
    --task-index $LSB_JOBINDEX \
    --progress 1000 \
    --chromosome $chrom \
    --arity ALT:6 \
    --arity AF:6 \
    --arity AC:6 \
    --arity svlen:6 \
    --dtype REF:a400 \
    --dtype ALT:a600 \
    --dtype MULTIALLELIC:a2 \
    --dtype RegionType:a25 \
    --dtype SNPEFF_AMINO_ACID_CHANGE:a105 \
    --dtype SNPEFF_CODON_CHANGE:a304 \
    --dtype SNPEFF_EFFECT:a33 \
    --dtype SNPEFF_EXON_ID:a2 \
    --dtype SNPEFF_FUNCTIONAL_CLASS:a8 \
    --dtype SNPEFF_GENE_NAME:a20 \
    --dtype SNPEFF_IMPACT:a8 \
    --dtype SNPEFF_TRANSCRIPT_ID:a20 \
    --dtype VARIANT_TYPE:a5 \
    --dtype VariantType:a40 \
    --exclude-field ID''' % (genome_fn, output_dir), file=fo)


In [16]:
# Three variants jobs from the above didn't complete. Killed them, then ran the following
!mkdir -p {output_dir}/npy_temp

task_size = 20000
for chrom in sorted(genome.keys()):
    vcf_fn = vcf_stem.format(chrom=chrom)
    n_tasks = '1-%s' % ((len(genome[chrom]) // task_size) + 1)
    print(chrom, n_tasks)

    task = "%s/scripts/vcfnp_variants_temp.sh" % output_dir
    !bsub -q normal -G malaria-dk -J "v_{chrom[6:8]}[{n_tasks}]" -n2 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 

#     task = "%s/scripts/vcfnp_calldata.sh" % output_dir
#     !bsub -q normal -G malaria-dk -J "c_{chrom[6:8]}[{n_tasks}]" -n2 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 


PvP01_01_v1 1-52
Job <9186484> is submitted to queue <normal>.
PvP01_02_v1 1-48
Job <9186485> is submitted to queue <normal>.
PvP01_03_v1 1-45
Job <9186486> is submitted to queue <normal>.
PvP01_04_v1 1-51
Job <9186488> is submitted to queue <normal>.
PvP01_05_v1 1-77
Job <9186489> is submitted to queue <normal>.
PvP01_06_v1 1-53
Job <9186490> is submitted to queue <normal>.
PvP01_07_v1 1-83
Job <9186492> is submitted to queue <normal>.
PvP01_08_v1 1-89
Job <9186493> is submitted to queue <normal>.
PvP01_09_v1 1-112
Job <9186495> is submitted to queue <normal>.
PvP01_10_v1 1-78
Job <9186497> is submitted to queue <normal>.
PvP01_11_v1 1-107
Job <9186498> is submitted to queue <normal>.
PvP01_12_v1 1-160
Job <9186499> is submitted to queue <normal>.
PvP01_13_v1 1-105
Job <9186501> is submitted to queue <normal>.
PvP01_14_v1 1-158
Job <9186502> is submitted to queue <normal>.
PvP01_API_v1 1-2
Job <9186503> is submitted to queue <normal>.
PvP01_MIT_v1 1-1
Job <9186505> is submitted to que

In [17]:
# Move the files matching v*.npy out of npy_temp into the main npy directory,
# which is the directory later passed to the concat script.
!mv {output_dir}/npy_temp/v*.npy {output_dir}/npy/

In [18]:
# Submit the concat script as a single job on the "long" queue.
# Script arguments: the PvP01_01_v1 VCF, the output base <output_dir>/hdf5/Pv_30
# (the script appends ".h5"), and the input directory <output_dir>/npy.
task = "%s/scripts/vcfnp_concat.sh" % output_dir
!bsub -q long -G malaria-dk -J "hdf" -n8 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 \
-o {output_dir}/log/output_%J.log bash {task} {vcf_stem.format(chrom='PvP01_01_v1')} \
{output_dir}/hdf5/Pv_30 {output_dir}/npy


Job <9187327> is submitted to queue <long>.


In [35]:
# Echo the build directory currently bound in the kernel.
# NOTE(review): the recorded output is a ".../20161128_HDF5_build" path, not the
# 20161201 Pv path assigned in the setup cell, so this cell was evidently run
# against different kernel state — confirm output_dir before re-running anything below.
output_dir

'/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161128_HDF5_build'

In [31]:
# Submit the concat script as a single job on the "long" queue, writing
# <output_dir>/hdf5/Pf_60.h5 from an npy directory given by absolute path.
# NOTE(review): the arguments are Pf-specific (Pf3D7_01_v3, Pf_60, a 20161124
# Pf6.0 npy directory) while the setup cell defines Pv paths — this looks like
# a leftover from the Pf 6.0 build; confirm vcf_stem/output_dir before re-running.
task = "%s/scripts/vcfnp_concat.sh" % output_dir
!bsub -q long -G malaria-dk -J "full" -R"select[mem>16000] rusage[mem=16000] span[hosts=1]" -M 16000 \
    -o {output_dir}/log/output_%J.log bash {task} {vcf_stem.format(chrom='Pf3D7_01_v3')} \
    {output_dir}/hdf5/Pf_60 \
    /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161124_HDF5_build/npy


Job <8976857> is submitted to queue <long>.


In [66]:
# Open the Pf_60_npy_no_PID_PGT HDF5 file read-only; later cells read y['variants'][...].
y = h5py.File('%s/hdf5/Pf_60_npy_no_PID_PGT.h5' % output_dir, 'r')


In [67]:
# Tabulate how many variants carry each SNPEFF_EFFECT value, most frequent first.
effect_counts = np.unique(y['variants']['SNPEFF_EFFECT'], return_counts=True)
effect_table = etl.wrap(effect_counts).transpose()
effect_table = effect_table.pushheader('SNPEFF_EFFECT', 'number')
effect_table = effect_table.sort('number', reverse=True)
effect_table.displayall()

SNPEFF_EFFECT,number
b'INTERGENIC',2795496
b'NON_SYNONYMOUS_CODING',1430363
b'SYNONYMOUS_CODING',641017
b'INTRON',449418
b'FRAME_SHIFT',444677
b'INTRAGENIC',89670
b'',70597
b'CODON_INSERTION',47470
b'CODON_DELETION',27491
b'CODON_CHANGE_PLUS_CODON_DELETION',17675


In [13]:
task_size = 20000
for chrom in ['PvP01_00'] + sorted(genome.keys()):
    if chrom.startswith('Pv'):
        vcf_fn = vcf_stem.format(chrom=chrom)
        if chrom == 'PvP01_00':
            chrom_length = transfer_length
        else:
            chrom_length = len(genome[chrom])
        n_tasks = '1-%s' % ((chrom_length // task_size) + 1)
        print(chrom, n_tasks)

        task = "%s/scripts/vcfnp_variants.sh" % output_dir
        !bsub -q normal -G malaria-dk -J "v_{chrom[6:8]}[{n_tasks}]" -n2 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 

        task = "%s/scripts/vcfnp_calldata.sh" % output_dir
        !bsub -q normal -G malaria-dk -J "c_{chrom[6:8]}[{n_tasks}]" -n2 -R"select[mem>32000] rusage[mem=32000] span[hosts=1]" -M 32000 -o {output_dir}/log/output_%J-%I.log bash {task} {vcf_stem.format(chrom=chrom)} {chrom} 


PvP01_00 1-241
Job <9184155> is submitted to queue <normal>.
Job <9184156> is submitted to queue <normal>.
PvP01_01_v1 1-52
Job <9184158> is submitted to queue <normal>.
Job <9184159> is submitted to queue <normal>.
PvP01_02_v1 1-48
Job <9184160> is submitted to queue <normal>.
Job <9184161> is submitted to queue <normal>.
PvP01_03_v1 1-45
Job <9184162> is submitted to queue <normal>.
Job <9184163> is submitted to queue <normal>.
PvP01_04_v1 1-51
Job <9184164> is submitted to queue <normal>.
Job <9184165> is submitted to queue <normal>.
PvP01_05_v1 1-77
Job <9184166> is submitted to queue <normal>.
Job <9184167> is submitted to queue <normal>.
PvP01_06_v1 1-53
Job <9184168> is submitted to queue <normal>.
Job <9184169> is submitted to queue <normal>.
PvP01_07_v1 1-83
Job <9184170> is submitted to queue <normal>.
Job <9184171> is submitted to queue <normal>.
PvP01_08_v1 1-89
Job <9184172> is submitted to queue <normal>.
Job <9184174> is submitted to queue <normal>.
PvP01_09_v1 1-112
Job

In [68]:
# Tabulate how many variants have each value of the CDS flag, most frequent first.
cds_counts = np.unique(y['variants']['CDS'], return_counts=True)
cds_table = etl.wrap(cds_counts).transpose()
cds_table = cds_table.pushheader('CDS', 'number')
cds_table = cds_table.sort('number', reverse=True)
cds_table.displayall()

CDS,number
False,3383388
True,2668308


In [88]:
# Pull the per-variant columns used below into memory as numpy arrays.
CDS = y['variants']['CDS'][:]
SNPEFF_EFFECT = y['variants']['SNPEFF_EFFECT'][:]
# Read VARIANT_TYPE from the HDF5 file once and derive both masks from that
# single in-memory copy, instead of reading the whole dataset from disk twice.
VARIANT_TYPE = y['variants']['VARIANT_TYPE'][:]
SNP = (VARIANT_TYPE == b'SNP')      # boolean mask: variant is a SNP
INDEL = (VARIANT_TYPE == b'INDEL')  # boolean mask: variant is an indel


In [None]:
# Distinct CDS values and their counts, restricted to variants where SNP is True.
np.unique(CDS[SNP], return_counts=True)

In [None]:
# Throwaway arithmetic; presumably a check that the kernel is responsive — TODO remove.
2+2

In [85]:
# Display the dataset object itself (not its data) to see its shape and dtype.
y['variants']['VARIANT_TYPE']

<HDF5 dataset "VARIANT_TYPE": shape (6051696,), type "|S5">

In [89]:
# Number of variants for which the INDEL mask is True versus False.
pd.value_counts(INDEL)

False    3846585
True     2205111
dtype: int64

In [90]:
# Cross-tabulate SNPEFF_EFFECT against CDS for SNPs only.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# no result is shown for it.
pd.crosstab(SNPEFF_EFFECT[SNP], CDS[SNP])

KeyboardInterrupt: 

In [None]:
# Throwaway arithmetic; presumably a check that the kernel is responsive — TODO remove.
2+2

In [70]:
# Put the CDS and SNPEFF_EFFECT arrays side by side in a DataFrame.
# NOTE(review): no visible cell reads `df` afterwards — confirm it is still needed.
df = pd.DataFrame({'CDS': CDS, 'SNPEFF_EFFECT':SNPEFF_EFFECT})

In [75]:
# Export the SNPEFF_EFFECT x CDS cross-tabulation (all variants) to a spreadsheet.
# The writer is used as a context manager so the workbook is saved and the
# file handle released on exit, including when to_excel raises; this also
# avoids calling writer.save() explicitly.
with pd.ExcelWriter("/nfs/users/nfs_r/rp7/SNPEFF_for_Rob.xlsx") as writer:
    pd.crosstab(SNPEFF_EFFECT, CDS).to_excel(writer)


In [73]:
# Cross-tabulate SNPEFF_EFFECT (rows) against the CHROM column (columns) for all variants.
pd.crosstab(SNPEFF_EFFECT, y['variants']['CHROM'])

col_0,b'Pf3D7_01_v3',b'Pf3D7_02_v3',b'Pf3D7_03_v3',b'Pf3D7_04_v3',b'Pf3D7_05_v3',b'Pf3D7_06_v3',b'Pf3D7_07_v3',b'Pf3D7_08_v3',b'Pf3D7_09_v3',b'Pf3D7_10_v3',b'Pf3D7_11_v3',b'Pf3D7_12_v3',b'Pf3D7_13_v3',b'Pf3D7_14_v3',b'Pf3D7_API_v3',b'Pf_M76611'
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
b'',5596,4486,4532,4303,2779,4566,7188,9494,4928,4304,4485,5923,5576,1779,236,422
b'CODON_CHANGE_PLUS_CODON_DELETION',538,836,844,1321,852,1183,1483,1282,888,1473,1430,1685,1846,1980,34,0
b'CODON_CHANGE_PLUS_CODON_INSERTION',509,648,733,1223,805,986,1306,1143,900,1120,1272,1650,1653,1668,21,1
b'CODON_DELETION',877,1094,1357,1669,1853,1567,1948,1822,1415,2762,2435,2559,2951,3139,41,2
b'CODON_INSERTION',1438,1863,2329,3186,2808,3105,3562,3342,2661,3463,3961,4658,5180,5838,73,3
b'EXON_DELETED',1,0,4,9,3,5,6,1,7,16,10,12,9,10,0,0
b'FRAME_SHIFT',19511,19580,18310,53868,12085,33194,51496,44289,23786,29154,28897,53743,32969,23176,560,59
b'INTERGENIC',112985,133602,147961,155345,126775,164228,170845,188646,216718,224002,235037,259684,316637,342596,353,82
b'INTRAGENIC',5967,7158,2931,11747,10181,5114,6986,2906,1290,3017,3837,10426,6371,11739,0,0
b'INTRON',11908,18307,20796,21499,23556,25586,26280,29842,32779,32630,41676,45010,54601,64948,0,0


In [78]:
# Distinct svlen values and how often each occurs. The output is long because
# the whole array of distinct values is echoed.
np.unique(y['variants']['svlen'], return_counts=True)

(array([-396, -378, -375, -370, -363, -359, -358, -354, -353, -351, -350,
        -349, -348, -347, -346, -345, -343, -342, -341, -340, -339, -338,
        -337, -336, -335, -334, -333, -332, -331, -330, -329, -328, -327,
        -326, -325, -324, -323, -322, -321, -320, -319, -318, -317, -316,
        -315, -314, -313, -312, -311, -310, -309, -308, -307, -306, -305,
        -304, -303, -302, -301, -300, -299, -298, -297, -296, -295, -294,
        -293, -292, -291, -290, -289, -288, -287, -286, -285, -284, -283,
        -282, -281, -280, -279, -278, -277, -276, -275, -274, -273, -272,
        -271, -270, -269, -268, -267, -266, -265, -264, -263, -262, -261,
        -260, -259, -258, -257, -256, -255, -254, -253, -252, -251, -250,
        -249, -248, -247, -246, -245, -244, -243, -242, -241, -240, -239,
        -238, -237, -236, -235, -234, -233, -232, -231, -230, -229, -228,
        -227, -226, -225, -224, -223, -222, -221, -220, -219, -218, -217,
        -216, -215, -214, -213, -212, 

In [32]:
# Open the 10pc HDF5 file read-only and echo the handle.
# The doubled ".h5.h5" extension matches the file name shown in the recorded
# output, so it is the name of the file on disk rather than a typo in this cell.
y = h5py.File('%s/hdf5/Pf_60_npy_no_PID_PGT_10pc.h5.h5' % output_dir, 'r')
y

<HDF5 file "Pf_60_npy_no_PID_PGT_10pc.h5.h5" (mode r)>

In [33]:
# for field in y['variants'].keys():
for field in ['svlen']:
    print(field, np.unique(y['variants'][field], return_counts=True))

svlen (array([-375, -358, -354, -353, -348, -345, -340, -339, -337, -336, -335,
       -334, -332, -331, -330, -327, -324, -319, -317, -316, -315, -314,
       -313, -312, -311, -309, -308, -306, -305, -303, -302, -301, -300,
       -299, -298, -297, -296, -295, -294, -293, -292, -291, -290, -289,
       -288, -287, -286, -285, -284, -283, -282, -281, -280, -279, -278,
       -277, -276, -275, -274, -273, -272, -271, -270, -269, -268, -267,
       -266, -265, -264, -263, -262, -261, -260, -259, -258, -257, -256,
       -255, -254, -253, -252, -251, -250, -249, -248, -247, -246, -245,
       -244, -243, -242, -241, -240, -239, -238, -237, -236, -235, -234,
       -233, -232, -231, -230, -229, -228, -227, -226, -225, -224, -223,
       -222, -221, -220, -219, -218, -217, -216, -215, -214, -213, -212,
       -211, -210, -209, -208, -207, -206, -205, -204, -203, -202, -201,
       -200, -199, -198, -197, -196, -195, -194, -193, -192, -191, -190,
       -189, -188, -187, -186, -185, -184, -

In [11]:
!vcfnpy2hdf5 \
    --vcf {vcf_fn} \
    --input-dir {output_dir}/npy_no_PID_PGT_10pc \
    --output {output_dir}/hdf5/Pf_60_no_PID_PGT_10pc.h5 \
    --chunk-size 8388608 \
    --chunk-width 200 \
    --compression gzip \
    --compression-opts 1 \
    &>> {output_dir}/hdf5/Pf_60_no_PID_PGT_10pc.h5.log

!md5sum {output_dir}/hdf5/Pf_60_no_PID_PGT_10pc.h5 > {output_dir}/hdf5/Pf_60_no_PID_PGT_10pc.h5.md5 


/bin/sh: line 1: 14116 Segmentation fault      vcfnpy2hdf5 --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161125_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz --input-dir /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/npy_subset --output /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60_subset_10pc.h5 --chunk-size 8388608 --chunk-width 200 --compression gzip --compression-opts 1 &>>/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60_subset_10pc.h5.log


In [10]:
!vcfnpy2hdf5 \
    --vcf {vcf_fn} \
    --input-dir {output_dir}/npy_subset_1pc \
    --output {output_dir}/hdf5/Pf_60_subset_1pc.h5 \
    --chunk-size 8388608 \
    --chunk-width 200 \
    --compression gzip \
    --compression-opts 1 \
    &>> {output_dir}/hdf5/Pf_60_subset_1pc.h5.log

!md5sum {output_dir}/hdf5/Pf_60_subset_1pc.h5 > {output_dir}/hdf5/Pf_60_subset_1pc.h5.md5 


In [11]:
!vcfnpy2hdf5 \
    --vcf {vcf_fn} \
    --input-dir {output_dir}/npy_subset \
    --output {output_dir}/hdf5/Pf_60_subset_10pc.h5 \
    --chunk-size 8388608 \
    --chunk-width 200 \
    --compression gzip \
    --compression-opts 1 \
    &>> {output_dir}/hdf5/Pf_60_subset_10pc.h5.log

!md5sum {output_dir}/hdf5/Pf_60_subset_10pc.h5 > {output_dir}/hdf5/Pf_60_subset_10pc.h5.md5 


/bin/sh: line 1: 14116 Segmentation fault      vcfnpy2hdf5 --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161125_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz --input-dir /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/npy_subset --output /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60_subset_10pc.h5 --chunk-size 8388608 --chunk-width 200 --compression gzip --compression-opts 1 &>>/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60_subset_10pc.h5.log


In [11]:
!vcfnpy2hdf5 \
    --vcf {vcf_fn} \
    --input-dir {output_dir}/npy_subset \
    --output {output_dir}/hdf5/Pf_60_subset_10pc.h5 \
    --chunk-size 8388608 \
    --chunk-width 200 \
    --compression gzip \
    --compression-opts 1 \
    &>> {output_dir}/hdf5/Pf_60_subset_10pc.h5.log

!md5sum {output_dir}/hdf5/Pf_60_subset_10pc.h5 > {output_dir}/hdf5/Pf_60_subset_10pc.h5.md5 


/bin/sh: line 1: 14116 Segmentation fault      vcfnpy2hdf5 --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161125_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz --input-dir /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/npy_subset --output /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60_subset_10pc.h5 --chunk-size 8388608 --chunk-width 200 --compression gzip --compression-opts 1 &>>/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60_subset_10pc.h5.log


In [9]:
# Run the concat script in the notebook's shell (not through bsub) with two
# arguments: the VCF and the output base name.
# NOTE(review): only two arguments are passed, and the recorded trace shows
# inputs=<output_dir>/npy, so this run used the script variant that hard-codes
# the inputs directory rather than the one that reads it from $3.
!{output_dir}/scripts/vcfnp_concat.sh {vcf_fn} {output_dir}/hdf5/Pf_60

+ set -e
+ set -o pipefail
+ vcf=/lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161125_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz
+ outbase=/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60
+ inputs=/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/npy
+ output=/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60.h5
+ log=/lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60.h5.log
+ '[' -f /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/hdf5/Pf_60.h5.md5 ']'
++ date
+ echo Sun Nov 27 10:53:45 GMT 2016 building
+ vcfnpy2hdf5 --vcf /lustre/scratch109/malaria/rp7/data/methods-dev/builds/Pf6.0/20161125_Pf60_final_vcfs/vcf/Pf_60_Pf_M76611.final.vcf.gz --input-dir /lustre/scratch111/malaria/rp7/data/methods-dev/builds/Pf6.0/20161127_HDF5_build/npy --output /lustre/scratch111/malaria/rp7/da

In [6]:
# Generate scripts/vcfnp_concat.sh: a bash wrapper around `vcfnpy2hdf5` that
# combines the .npy arrays into a single HDF5 file. In this variant the input
# directory is baked in as <output_dir>/npy via the %s placeholder.
#   $1 = VCF path, $2 = output base name (".h5" is appended).
# The script skips the build when <output>.md5 already exists and writes that
# checksum only after vcfnpy2hdf5 succeeds (`set -e` aborts on failure).
#
# Changes from the previous version of this cell:
#   * the log lines used $chrom, which the script never defines, so they
#     printed an empty value; they now log $output instead;
#   * the file is written inside `with` so it is always closed;
#   * the template is a raw string so the backslash-newlines survive as bash
#     line continuations instead of being dropped by Python.
with open("%s/scripts/vcfnp_concat.sh" % output_dir, 'w') as fo:
    print(r'''#!/bin/bash

set -x
set -e
set -o pipefail

vcf=$1
outbase=$2
# inputs=${vcf}.vcfnp_cache
inputs=%s/npy
output=${outbase}.h5

log=${output}.log

if [ -f ${output}.md5 ]
then
    echo $(date) skipping $output >> $log
else
    echo $(date) building $output > $log
    vcfnpy2hdf5 \
        --vcf $vcf \
        --input-dir $inputs \
        --output $output \
        --chunk-size 8388608 \
        --chunk-width 200 \
        --compression gzip \
        --compression-opts 1 \
        &>> $log

    md5sum $output > ${output}.md5
fi''' % (output_dir,), file=fo)

#     nv=$(ls -1 ${inputs}/v* | wc -l)
#     nc=$(ls -1 ${inputs}/c* | wc -l)
#     echo variants files $nv >> $log
#     echo calldata files $nc >> $log
#     if [ "$nv" -ne "$nc" ]
#     then
#         echo missing npy files
#         exit 1
#     fi


# Copy files to /nfs

In [3]:
# Copy the finished HDF5 file and its .md5 checksum into the NFS release
# directory (nfs_final_hdf5_dir, created by the setup cell).
!cp {output_dir}/hdf5/Pv_30.h5 {nfs_final_hdf5_dir}/
!cp {output_dir}/hdf5/Pv_30.h5.md5 {nfs_final_hdf5_dir}/
