# Introduction
This notebook is used to create the Pf3k pilot 6.0 release call set. This call set is exactly the same as the 5.0 call set, except that a new definition of the core genome (from Thomas) is used, and hence the filters need to be changed accordingly.

Following emails on 16/02/2017, it was decided not to use this new call set for analyses.

In [2]:
%run _standard_imports.ipynb

python 3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
numpy 1.11.2
scipy 0.18.1
pandas 0.19.0
numexpr 2.6.1
pysam 0.8.4
petl 1.1.0
petlx 1.0.3
vcf 0.6.8
vcfnp 2.2.0
h5py 2.6.0
tables 3.3.0


In [36]:
# Working directory for this notebook; the generated job script and the LSF logs are written beneath it.
temp_dir = '/lustre/scratch118/malaria/team112/personal/rp7/data/methods-dev/pf3k_techbm/20170216_pilot_6_0_callset'
# Directory holding the existing pilot 5.0 per-chromosome VCFs (used as input).
release5_final_files_dir = '/nfs/team112_internal/production/release_build/Pf3K/pilot_5_0'
# Directory the re-filtered 6.0 VCFs are written to.
release6_final_files_dir = '/nfs/team112_internal/production/release_build/Pf3K/interim_pilot_6_0'
# Region annotation file passed to `bcftools annotate -a`; note it is read from the release 5 directory.
alt_core_bed_fn = '%s/regions-20170213.onebased.txt.gz' % release5_final_files_dir

# Source reference genome, and the path of the copy made under temp_dir.
GENOME_FN = "/lustre/scratch118/malaria/team112/pipelines/resources/pf3k_methods/resources/Pfalciparum.genome.fasta"
genome_fn = "%s/Pfalciparum.genome.fasta" % temp_dir


In [32]:
# Create the working and output directories (-p: no error if they already exist).
!mkdir -p {temp_dir}
!mkdir -p {release6_final_files_dir}

# Sub-directories for the generated job script and the LSF log files.
!mkdir -p {temp_dir}/scripts
!mkdir -p {temp_dir}/log


In [38]:
# Copy the reference genome into the working directory and open the copy with pyfasta.
# genome.keys() is used further down to enumerate the sequences to process.
!cp {GENOME_FN} {genome_fn}
genome = pyfasta.Fasta(genome_fn)
genome

<pyfasta.fasta.Fasta at 0x2adbfae17d30>

# Function to filter one chromosome

In [12]:
# Write the extra VCF header line declaring the NewRegionType INFO field.
# The resulting file is passed to `bcftools annotate -h`.
annotations_header_fn = "%s/annotations.hdr" % (temp_dir)
# The context manager closes (and flushes) the file even if the write raises.
with open(annotations_header_fn, 'w') as fo:
    print('##INFO=<ID=NewRegionType,Number=1,Type=String,Description="The type of genome region within which the variant is found. SubtelomericRepeat: repetitive regions at the ends of the chromosomes. SubtelomericHypervariable: subtelomeric region of poor conservation between the 3D7 reference genome and other samples. InternalHypervariable: chromosome-internal region of poor conservation between the 3D7 reference genome and other samples. Centromere: start and end coordinates of the centromere genome annotation. Core_both: core region defined both in crosses paper and by Thomas. Core_Thomas: core region defined only by Thomas (i.e. not core in crosses paper).">', file=fo)

In [22]:
def filter_vcf(
    input_vcf_fn = "%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release5_final_files_dir, 'Pf3D7_01_v3'),
    output_vcf_fn = "%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release6_final_files_dir, 'Pf3D7_01_v3')
):
    """Re-annotate and re-filter one VCF through a three-stage bcftools pipeline.

    Stages:
      1. ``bcftools annotate`` adds the NewRegionType INFO field from
         ``alt_core_bed_fn`` (header line taken from ``annotations_header_fn``)
         and removes the existing FILTER values.
      2. ``bcftools filter`` soft-filters as ``Non_core`` every record whose
         NewRegionType is neither ``Core_both`` nor ``Core_Thomas``.
      3. ``bcftools filter`` soft-filters as ``Low_VQSLOD`` every record that does
         not satisfy ``VQSLOD>0.0``; ``--mode +`` appends to the FILTER set by
         stage 2 rather than replacing it. Output is written compressed
         (``--output-type z``) to ``output_vcf_fn``.

    Parameters:
        input_vcf_fn: path of the VCF to read. The default is the release 5
            Pf3D7_01_v3 file (computed once, when this cell is executed).
        output_vcf_fn: path of the VCF to write. The default is the release 6
            Pf3D7_01_v3 file (computed once, when this cell is executed).

    Relies on the notebook globals ``alt_core_bed_fn`` and
    ``annotations_header_fn``. The output is not indexed here.
    """
    !bcftools annotate -a {alt_core_bed_fn} -c CHROM,FROM,TO,NewRegionType --remove 'FILTER' -h {annotations_header_fn} {input_vcf_fn} | \
    bcftools filter --soft-filter 'Non_core' --include 'NewRegionType="Core_both" || NewRegionType="Core_Thomas"' | \
    bcftools filter --soft-filter 'Low_VQSLOD' --include 'VQSLOD>0.0' --mode + --output {output_vcf_fn} --output-type z

In [23]:
# Trial run using the default arguments, i.e. Pf3D7_01_v3 only.
filter_vcf()

In [24]:
# Build a tabix (.tbi) index for the Pf3D7_01_v3 VCF in the release 6 directory.
!bcftools index --tbi {"%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release6_final_files_dir, 'Pf3D7_01_v3')}

# Sanity check results

In [25]:
def create_variants_npy(vcf_fn):
    """Extract selected variant fields from a VCF into a vcfnp cache on disk.

    The cache directory is ``<vcf_fn>.vcfnp_cache``; it is created if it does
    not already exist. Only the NewRegionType, RegionType and FILTER fields are
    requested, with the two region fields stored as 25-byte strings and FILTER
    flattened (``flatten_filter=True``).

    Parameters:
        vcf_fn: path of the VCF file to read.

    Returns:
        None. The value returned by ``vcfnp.variants`` is discarded; the call is
        made with ``cache=True`` so that the result is saved under the cache
        directory.
    """
    output_dir = '%s.vcfnp_cache' % vcf_fn
    # exist_ok=True replaces the check-then-create pattern: no race between the
    # existence test and the mkdir, and re-running the cell is harmless.
    os.makedirs(output_dir, exist_ok=True)
    vcfnp.variants(
        vcf_fn,
        fields=['NewRegionType', 'RegionType', 'FILTER'],
        dtypes={
            'NewRegionType':            'a25',
            'RegionType':               'a25',
        },
        flatten_filter=True,
        progress=100000,
        verbose=True,
        cache=True,
        cachedir=output_dir
    )


In [26]:
# Build the variants cache for the release 6 Pf3D7_01_v3 VCF.
create_variants_npy("%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release6_final_files_dir, 'Pf3D7_01_v3'))

[vcfnp] 2017-02-16 09:43:13.732328 :: caching is enabled
[vcfnp] 2017-02-16 09:43:13.734344 :: no cache file found
[vcfnp] 2017-02-16 09:43:13.735887 :: building array
[vcfnp] 2017-02-16 09:47:08.435299 :: 100000 rows in 234.69s; batch in 234.69s (426 rows/s)
[vcfnp] 2017-02-16 09:49:32.785427 :: 159946 rows in 379.04s (421 rows/s)
[vcfnp] 2017-02-16 09:49:32.786914 :: saving to cache file /nfs/team112_internal/production/release_build/Pf3K/pilot_6_0/SNP_INDEL_Pf3D7_01_v3.combined.filtered.vcf.gz.vcfnp_cache/variants.npy


In [27]:
# Build the variants cache for the release 5 Pf3D7_01_v3 VCF, for comparison.
create_variants_npy("%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release5_final_files_dir, 'Pf3D7_01_v3'))

[vcfnp] 2017-02-16 09:50:07.101182 :: caching is enabled
[vcfnp] 2017-02-16 09:50:07.105952 :: no cache file found
[vcfnp] 2017-02-16 09:50:07.107077 :: building array
[vcfnp] 2017-02-16 09:54:02.343486 :: 100000 rows in 235.22s; batch in 235.22s (425 rows/s)
[vcfnp] 2017-02-16 09:56:27.664961 :: 159946 rows in 380.54s (420 rows/s)
[vcfnp] 2017-02-16 09:56:27.666740 :: saving to cache file /nfs/team112_internal/production/release_build/Pf3K/pilot_5_0/SNP_INDEL_Pf3D7_01_v3.combined.filtered.vcf.gz.vcfnp_cache/variants.npy


In [28]:
# Load the cached variant arrays (variants.npy inside each VCF's .vcfnp_cache
# directory) for Pf3D7_01_v3 in both releases. The VCF path is formatted first
# and then substituted into the cache-file template.
release_5_variants = np.load(
    "%s.vcfnp_cache/variants.npy"
    % ("%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release5_final_files_dir, 'Pf3D7_01_v3'))
)
release_6_variants = np.load(
    "%s.vcfnp_cache/variants.npy"
    % ("%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release6_final_files_dir, 'Pf3D7_01_v3'))
)


In [29]:
# Tabulate each distinct record (region fields plus FILTER flags) in release 5 and how often it occurs.
np.unique(release_5_variants, return_counts=True)

(array([ (b'', b'Centromere', True, False, False, False, False, False, False, False, False, False, False, False),
        (b'', b'Centromere', True, False, False, True, False, False, False, False, False, False, False, False),
        (b'', b'Core', False, False, False, False, False, False, False, False, False, False, False, True),
        (b'', b'Core', False, False, False, True, False, False, False, False, False, False, False, False),
        (b'', b'SubtelomericHypervariable', False, False, False, False, False, False, False, False, False, False, False, True),
        (b'', b'SubtelomericHypervariable', False, False, False, False, True, False, False, False, False, False, False, False),
        (b'', b'SubtelomericHypervariable', False, False, False, True, False, False, False, False, False, False, False, False),
        (b'', b'SubtelomericHypervariable', False, False, False, True, True, False, False, False, False, False, False, False),
        (b'', b'SubtelomericRepeat', False, False

In [30]:
# Same tabulation for release 6: each distinct record and its count.
np.unique(release_6_variants, return_counts=True)

(array([(b'Centromere', b'Centromere', False, True, False),
        (b'Centromere', b'Centromere', True, True, False),
        (b'Core_Thomas', b'SubtelomericHypervariable', False, False, True),
        (b'Core_Thomas', b'SubtelomericHypervariable', True, False, False),
        (b'Core_both', b'Core', False, False, True),
        (b'Core_both', b'Core', True, False, False),
        (b'SubtelomericHypervariable', b'SubtelomericHypervariable', False, True, False),
        (b'SubtelomericHypervariable', b'SubtelomericHypervariable', True, True, False),
        (b'SubtelomericRepeat', b'SubtelomericRepeat', False, True, False),
        (b'SubtelomericRepeat', b'SubtelomericRepeat', True, True, False)], 
       dtype=[('NewRegionType', 'S25'), ('RegionType', 'S25'), ('FILTER_Low_VQSLOD', '?'), ('FILTER_Non_core', '?'), ('FILTER_PASS', '?')]),
 array([   80,    24,  1523,  8944, 45115, 25405, 11930, 26493, 12849, 27583]))

# Filter all chromosomes

In [34]:
# Write the bash script that is submitted once per chromosome. It runs the same
# annotate -> Non_core filter -> Low_VQSLOD filter pipeline as filter_vcf() and
# then builds a tabix index for the output.
# The context manager closes (and flushes) the script even if the write raises.
with open("%s/scripts/filter_vcf.sh" % temp_dir, 'w') as fo:
    print('''#!/bin/bash

#set changes bash options
#x prints commands & args as they are executed
set -x
#-e  Exit immediately if a command exits with a non-zero status
set -e
#reports the last program to return a non-0 exit code rather than the exit code of the last problem
set -o pipefail

input_vcf_fn=$1
output_vcf_fn=$2
alt_core_bed_fn=$3
annotations_header_fn=$4

bcftools annotate -a $alt_core_bed_fn -c CHROM,FROM,TO,NewRegionType --remove 'FILTER' -h $annotations_header_fn $input_vcf_fn | \
    bcftools filter --soft-filter 'Non_core' --include 'NewRegionType="Core_both" || NewRegionType="Core_Thomas"' | \
    bcftools filter --soft-filter 'Low_VQSLOD' --include 'VQSLOD>0.0' --mode + --output $output_vcf_fn --output-type z

bcftools index --tbi $output_vcf_fn

''', file=fo)


In [46]:
# Submit one LSF job per sequence in the reference genome. Each job runs
# filter_vcf.sh on that sequence's release 5 VCF and writes the release 6 VCF.
for chrom in sorted(genome.keys()):
    input_vcf_fn = "%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release5_final_files_dir, chrom)
    output_vcf_fn = "%s/SNP_INDEL_%s.combined.filtered.vcf.gz" % (release6_final_files_dir, chrom)
    
    print(chrom)

    # Script passed to bash by the bsub command below (same path on every iteration).
    task = "%s/scripts/filter_vcf.sh" % temp_dir
    # Job name is "v_" plus characters 6-7 of the sequence name (e.g. "01" for Pf3D7_01_v3);
    # job output goes to {temp_dir}/log/output_<job id>.log.
    !bsub -q normal -G malaria-dk -J "v_{chrom[6:8]}" -n2 -R"select[mem>4000] rusage[mem=4000] span[hosts=1]" -M 4000 -o {temp_dir}/log/output_%J.log bash {task} {input_vcf_fn} {output_vcf_fn} {alt_core_bed_fn} {annotations_header_fn} 


Pf3D7_01_v3
Job <2936484> is submitted to queue <normal>.
Pf3D7_02_v3
Job <2936485> is submitted to queue <normal>.
Pf3D7_03_v3
Job <2936486> is submitted to queue <normal>.
Pf3D7_04_v3
Job <2936487> is submitted to queue <normal>.
Pf3D7_05_v3
Job <2936488> is submitted to queue <normal>.
Pf3D7_06_v3
Job <2936489> is submitted to queue <normal>.
Pf3D7_07_v3
Job <2936490> is submitted to queue <normal>.
Pf3D7_08_v3
Job <2936492> is submitted to queue <normal>.
Pf3D7_09_v3
Job <2936494> is submitted to queue <normal>.
Pf3D7_10_v3
Job <2936497> is submitted to queue <normal>.
Pf3D7_11_v3
Job <2936499> is submitted to queue <normal>.
Pf3D7_12_v3
Job <2936501> is submitted to queue <normal>.
Pf3D7_13_v3
Job <2936504> is submitted to queue <normal>.
Pf3D7_14_v3
Job <2936505> is submitted to queue <normal>.
Pf3D7_API_v3
Job <2936506> is submitted to queue <normal>.
Pf_M76611
Job <2936508> is submitted to queue <normal>.


# Cleanup

In [None]:
# Remove the pilot_6_0 output directory. The path is hard-coded and differs from
# release6_final_files_dir (interim_pilot_6_0).
# NOTE(review): recursive delete under the production release area - confirm the path before running.
!rm -R /nfs/team112_internal/production/release_build/Pf3K/pilot_6_0
# !rm -R {temp_dir}

In [None]:
# Scratch cell; the result is not used anywhere in the visible notebook.
2+2