# Setup

In [79]:
# Execute the shared setup notebook in this kernel. The cells shown below use
# np, collections, etl, SeqIO and os without importing them, so presumably
# those names are provided here — verify against _shared_setup.ipynb.
%run _shared_setup.ipynb

python 3.4.3 |Anaconda 2.2.0 (64-bit)| (default, Mar  6 2015, 12:03:53) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
numpy 1.9.2
scipy 0.15.1
pandas 0.15.2
numexpr 2.3.1
pysam 0.8.3
petl 1.0.11
petlx 1.0.3
vcf 0.6.7
h5py 2.4.0
tables 3.1.1
vcfplt 0.8
tbl_pgv_metadata length = 5729
tbl_pgv_locations length = 102
tbl_pf3k_metadata length = 2512
tbl_pf_solaris length = 10879
tbl_assembled_samples length = 11


In [2]:
# vcfnp cache directory per sample for the release4 candidate call set.
# Insertion order is kept (OrderedDict). Note that the KH02, KE01 and GN01
# caches are named after ERS accessions rather than the sample label.
release4_vcfnp_dir = collections.OrderedDict([
    ('7G8', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/WG.7G8.vcf.gz.vcfnp_cache'),
    ('GB4', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/WG.GB4.vcf.gz.vcfnp_cache'),
    ('KH02', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/WG.ERS740936.vcf.gz.vcfnp_cache'),
    ('KE01', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/WG.ERS740937.vcf.gz.vcfnp_cache'),
    ('GN01', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/WG.ERS740940.vcf.gz.vcfnp_cache'),
])

# vcfnp cache directory per sample for the "validation_" call set, same keys
# and order as above.
validation5_vcfnp_dir = collections.OrderedDict([
    ('7G8', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/validation_WG.7G8.vcf.gz.vcfnp_cache'),
    ('GB4', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/validation_WG.GB4.vcf.gz.vcfnp_cache'),
    ('KH02', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/validation_WG.KH02.vcf.gz.vcfnp_cache'),
    ('KE01', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/validation_WG.KE01.vcf.gz.vcfnp_cache'),
    ('GN01', '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/vcfnp/validation_WG.GN01.vcf.gz.vcfnp_cache'),
])


# Analysis

In [71]:
def create_variants_array(sample='7G8'):
    """Load a sample's cached variant arrays, choose one alt allele per record, and filter.

    Reads ``variants.npy`` and ``calldata_2d.npy`` from
    ``release4_vcfnp_dir[sample]``. Only column 0 of the call data is used.

    For each record an allele index is taken from the first character of the
    ``GT`` entry; where the record is flagged by ``is_het`` the index is
    replaced by the argmax of the ``AD`` values. The ALT entry matching that
    index is appended to the variants array as a new ``alt`` field.

    Two tallies are printed along the way: records whose first ALT is not
    ``b'*'``, and records whose chosen allele index is greater than zero.

    Returns the records that satisfy both conditions, including the ``alt`` field.
    """
    cache_dir = release4_vcfnp_dir[sample]
    variants_array = np.load(cache_dir + '/variants.npy')
    calls_array = np.load(cache_dir + '/calldata_2d.npy')

    # Call data for the first (index 0) column only.
    genotypes = calls_array['GT'][:, 0]
    allele_depths = calls_array['AD'][:, 0, :]

    # Keep records whose first ALT entry is not the '*' allele.
    flt_spanning_del = np.array([alt_row[0] != b'*' for alt_row in variants_array['ALT']])
    print(np.unique(flt_spanning_del, return_counts=True))

    # Allele index from the first character of GT.
    allele_idx = np.array([int(gt.astype(str)[0]) for gt in genotypes])
    # NOTE(review): if GT entries are byte strings such as b'0/1', gt[1] is the
    # separator rather than the second allele, which would flag every record
    # here — confirm the GT dtype in the cache.
    is_het = np.array([gt[0] != gt[1] for gt in genotypes])
    # For flagged records, use the allele with the largest AD value instead.
    best_supported = np.argmax(allele_depths, axis=1)
    allele_idx[is_het] = best_supported[is_het]

    # Keep records whose chosen allele is not index 0.
    flt_minor_het = (allele_idx > 0)
    print(np.unique(flt_minor_het, return_counts=True))

    # Convert the allele index to a column of ALT (index 1 -> column 0).
    # Index 0 would become -1; it is pinned to column 0 so the lookup stays
    # valid, and those records are dropped by flt_minor_het anyway.
    shifted = allele_idx - 1
    alt_column = np.where(shifted == -1, 0, shifted)
    row_numbers = np.arange(len(alt_column))
    chosen_alts = variants_array['ALT'][row_numbers, alt_column]

    # append_fields returns a masked array by default; .data takes the
    # underlying array.
    with_alt = np.lib.recfunctions.append_fields(variants_array, 'alt', chosen_alts).data

    keep = flt_spanning_del & flt_minor_het
    return with_alt[keep]


In [72]:
# Build the filtered variants array for the default sample ('7G8'); the two
# tuples printed below are the per-filter False/True tallies.
variants_array = create_variants_array()

(array([False,  True], dtype=bool), array([ 79484, 109747]))
(array([False,  True], dtype=bool), array([ 45885, 143346]))


In [74]:
# Number of records left after both filters in create_variants_array.
variants_array.shape

(74726,)

In [81]:
# Tally of records per distinct SNPEFF_EFFECT value.
np.unique(variants_array['SNPEFF_EFFECT'], return_counts=True)

(array([b'', b'CODON_CHANGE_PLUS_CODON_DELETION',
        b'CODON_CHANGE_PLUS_CODON_INSERTION', b'CODON_DELETION',
        b'CODON_INSERTION', b'FRAME_SHIFT', b'INTERGENIC', b'INTRAGENIC',
        b'INTRON', b'NON_SYNONYMOUS_CODING', b'SPLICE_SITE_ACCEPTOR',
        b'SPLICE_SITE_DONOR', b'START_LOST', b'STOP_GAINED', b'STOP_LOST',
        b'SYNONYMOUS_CODING', b'SYNONYMOUS_STOP'], 
       dtype='|S33'),
 array([  736,   102,   206,   367,  1936,  9486, 39416,  1371,  7103,
        10101,     8,    11,     1,   255,     3,  3619,     5]))

In [78]:
# Records with a positive VQSLOD value.
flt_pass = variants_array['VQSLOD'] > 0
print(np.unique(flt_pass, return_counts=True))
# Records whose RegionType is exactly b'Core' (the field holds byte strings).
flt_core = variants_array['RegionType'] == b'Core'
print(np.unique(flt_core, return_counts=True))
# Records satisfying both conditions.
flt_core_pass = (flt_pass & flt_core)
print(np.unique(flt_core_pass, return_counts=True))


(array([False,  True], dtype=bool), array([33783, 40943]))
(array([False,  True], dtype=bool), array([31958, 42768]))
(array([False,  True], dtype=bool), array([36254, 38472]))


In [76]:
# Inspect the raw RegionType column.
variants_array['RegionType']

array([b'SubtelomericRepeat', b'SubtelomericRepeat', b'SubtelomericRepeat',
       ..., b'', b'', b''], 
      dtype='|S25')

In [88]:
# Tally of records per distinct (is_snp, SNPEFF_EFFECT) combination.
np.unique((variants_array[['is_snp', 'SNPEFF_EFFECT']]), return_counts=True)
# , np.in1d(variants_array['SNPEFF_EFFECT'], [b'INTERGENIC', b'INTRAGENIC'])))

(array([(False, b''), (False, b'CODON_CHANGE_PLUS_CODON_DELETION'),
        (False, b'CODON_CHANGE_PLUS_CODON_INSERTION'),
        (False, b'CODON_DELETION'), (False, b'CODON_INSERTION'),
        (False, b'FRAME_SHIFT'), (False, b'INTERGENIC'),
        (False, b'INTRAGENIC'), (False, b'INTRON'),
        (False, b'NON_SYNONYMOUS_CODING'), (False, b'SPLICE_SITE_ACCEPTOR'),
        (False, b'SPLICE_SITE_DONOR'), (False, b'STOP_GAINED'), (True, b''),
        (True, b'CODON_CHANGE_PLUS_CODON_DELETION'),
        (True, b'CODON_CHANGE_PLUS_CODON_INSERTION'),
        (True, b'CODON_DELETION'), (True, b'CODON_INSERTION'),
        (True, b'FRAME_SHIFT'), (True, b'INTERGENIC'),
        (True, b'INTRAGENIC'), (True, b'INTRON'),
        (True, b'NON_SYNONYMOUS_CODING'), (True, b'SPLICE_SITE_ACCEPTOR'),
        (True, b'SPLICE_SITE_DONOR'), (True, b'START_LOST'),
        (True, b'STOP_GAINED'), (True, b'STOP_LOST'),
        (True, b'SYNONYMOUS_CODING'), (True, b'SYNONYMOUS_STOP')], 
       dtype=[('

In [86]:
# Two-field projection of the array (is_snp and SNPEFF_EFFECT only).
variants_array[['is_snp', 'SNPEFF_EFFECT']]

array([(True, b'INTERGENIC'), (False, b'INTERGENIC'),
       (False, b'INTERGENIC'), ..., (False, b'CODON_INSERTION'),
       (False, b'CODON_INSERTION'), (True, b'NON_SYNONYMOUS_CODING')], 
      dtype=[('is_snp', '?'), ('SNPEFF_EFFECT', 'S33')])

In [90]:
# Show the array as a table. index_header=True prefixes every column name with
# its positional index, which makes the column order of the full array visible.
etl.fromarray(variants_array).display(index_header=True)

0|CHROM,1|POS,2|ID,3|REF,4|ALT,5|QUAL,6|FILTER,7|num_alleles,8|is_snp,9|svlen,10|AC,11|AF,12|AN,13|BaseQRankSum,14|ClippingRankSum,15|DP,16|DS,17|END,18|FS,19|GC,20|HaplotypeScore,21|InbreedingCoeff,22|MLEAC,23|MLEAF,24|MQ,25|MQRankSum,26|NEGATIVE_TRAIN_SITE,27|POSITIVE_TRAIN_SITE,28|QD,29|RPA,30|RU,31|ReadPosRankSum,32|RegionType,33|SNPEFF_AMINO_ACID_CHANGE,34|SNPEFF_CODON_CHANGE,35|SNPEFF_EFFECT,36|SNPEFF_EXON_ID,37|SNPEFF_FUNCTIONAL_CLASS,38|SNPEFF_GENE_BIOTYPE,39|SNPEFF_GENE_NAME,40|SNPEFF_IMPACT,41|SNPEFF_TRANSCRIPT_ID,42|SOR,43|STR,44|VQSLOD,45|VariantType,46|culprit,47|set,48|alt
b'Pf3D7_01_v3',522,b'.',b'C',[b'T' b''],89147.3,"(False, False, False, True, False, True, False, False, False, False, False, False)",2,True,0,[1 0],[ 0.5 0. ],2,1.2695,-0.056,28,False,0,0.62109,28.57,0.0,0.0,[0 0],[ 0. 0.],36.031,0.60986,False,False,7.5312,[0 0 0],b'',0.36108,b'SubtelomericRepeat',b'',b'',b'INTERGENIC',b'',b'NONE',b'',b'',b'MODIFIER',b'',0.618,False,-19.58,b'MULTIALLELIC_MIXED',b'MQ',b'FilteredInAll',b'T'
b'Pf3D7_01_v3',1259,b'.',b'TAG',[b'T' b''],74697.1,"(False, False, False, True, False, True, False, False, False, False, False, False)",2,False,-2,[1 0],[ 0.5 0. ],2,0.72705,0.21802,5,False,0,3.1738,28.57,0.0,0.0,[0 0],[ 0. 0.],36.125,-0.091003,False,False,8.5938,[0 0 0],b'',-1.999,b'SubtelomericRepeat',b'',b'',b'INTERGENIC',b'',b'NONE',b'',b'',b'MODIFIER',b'',0.463,False,-17.8,b'MULTIALLELIC_MIXED',b'MQ',b'FilteredInAll',b'T'
b'Pf3D7_01_v3',1263,b'.',b'A',[b'AGT' b''],1035950.0,"(False, False, False, True, False, True, False, False, False, False, False, False)",2,False,2,[1 0],[ 0.5 0. ],2,-0.098999,0.13501,4,False,0,1.3027,28.57,0.0,0.0,[0 0],[ 0. 0.],35.719,-0.97705,False,False,9.4062,[0 0 0],b'',-1.6191,b'SubtelomericRepeat',b'',b'',b'INTERGENIC',b'',b'NONE',b'',b'',b'MODIFIER',b'',0.616,False,-18.09,b'MULTIALLELIC_MIXED',b'MQ',b'FilteredInAll',b'AGT'
b'Pf3D7_01_v3',1264,b'.',b'C',[b'T' b''],117900.0,"(False, False, False, True, False, True, False, False, False, False, False, False)",2,True,0,[1 0],[ 0.5 0. ],2,3.5996,0.21204,2,False,0,0.625,28.57,0.0,0.0,[0 0],[ 0. 0.],36.094,0.41406,False,False,7.7305,[0 0 0],b'',0.20398,b'SubtelomericRepeat',b'',b'',b'INTERGENIC',b'',b'NONE',b'',b'',b'MODIFIER',b'',0.752,False,-18.78,b'MULTIALLELIC_MIXED',b'MQ',b'FilteredInAll',b'T'
b'Pf3D7_01_v3',1297,b'.',b'G',[b'T' b''],4328910.0,"(False, False, False, True, False, True, False, False, False, False, False, False)",2,True,0,[1 0],[ 0.5 0. ],2,4.7383,0.061005,36,False,0,0.54492,42.86,0.0,0.0,[0 0],[ 0. 0.],34.781,-0.44189,False,False,21.703,[0 0 0],b'',-0.60986,b'SubtelomericRepeat',b'',b'',b'INTERGENIC',b'',b'NONE',b'',b'',b'MODIFIER',b'',0.78,False,-9.049,b'SNP',b'MQ',b'FilteredInAll',b'T'


In [91]:
# Same display restricted to CHROM, POS, REF and alt; in this projection REF
# and alt sit at positions 2 and 3.
etl.fromarray(variants_array[['CHROM', 'POS', 'REF', 'alt']]).display(index_header=True)

0|CHROM,1|POS,2|REF,3|alt
b'Pf3D7_01_v3',522,b'C',b'T'
b'Pf3D7_01_v3',1259,b'TAG',b'T'
b'Pf3D7_01_v3',1263,b'A',b'AGT'
b'Pf3D7_01_v3',1264,b'C',b'T'
b'Pf3D7_01_v3',1297,b'G',b'T'


In [122]:
import copy
def create_consensus(variants_array,
                     sample='7G8',
                     ref_genome_fn='/lustre/scratch110/malaria/rp7/Pf3k/GATKbuild/Pfalciparum_GeneDB_Aug2015/Pfalciparum.genome.fasta',
                     pos_output_format = '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/consensus_alignment/pos/%s.pos',
                     fasta_output_format = '/lustre/scratch110/malaria/rp7/Pf3k/release4_candidate/consensus_alignment/fasta/%s.fasta'):
    """Apply variants to the reference genome and write a consensus FASTA plus a position map.

    Parameters
    ----------
    variants_array : structured array
        Must contain the fields ``CHROM``, ``POS``, ``REF`` and ``alt``. The
        fields are read by name, so extra fields and field order do not matter.
        The coordinate arithmetic applies a running per-chromosome offset, so
        records are assumed to be grouped by chromosome, sorted by position and
        non-overlapping — TODO confirm for the inputs used.
    sample : str
        Substituted into both output path templates.
    ref_genome_fn : str
        Reference genome in FASTA format.
    pos_output_format, fasta_output_format : str
        ``%s`` templates for the two output paths; missing parent directories
        are created.

    Output
    ------
    Position file, tab-separated, one line per variant:
    chromosome, reference position, position in the consensus, consensus
    position of the previous variant (1 at the start of a chromosome), and the
    cumulative length offset before this variant is applied.
    A marker line ``<chrom> 0 9999999 <previous pos> <offset>`` is written each
    time the chromosome changes (for the chromosome just finished) and once
    after the last record. The very first marker therefore carries an empty
    chromosome name; this is kept as in the original output format.

    FASTA file: all reference records, in reference-file order, with the
    variants applied.

    Returns None.
    """
    def _text(value):
        # String fields of the structured array come back as bytes; decode them
        # so they can be used as dictionary keys, formatted, and assigned into
        # the mutable sequence as text.
        return value.decode("utf-8") if isinstance(value, bytes) else value

    pos_fn = pos_output_format % sample
    fasta_fn = fasta_output_format % sample
    for out_fn in (pos_fn, fasta_fn):
        out_dir = os.path.dirname(out_fn)
        if out_dir:
            # exist_ok avoids the check-then-create race.
            os.makedirs(out_dir, exist_ok=True)

    # Keep the records in a list so the FASTA is written in reference order;
    # the dict (same record objects) is only used for lookup by chromosome.
    with open(ref_genome_fn) as ref_fo:
        ref_records = list(SeqIO.parse(ref_fo, "fasta"))
    ref_dict = SeqIO.to_dict(ref_records)
    for record in ref_records:
        record.seq = record.seq.tomutable()

    pos_line = "%s\t%d\t%d\t%d\t%d"
    previous_new_pos = 1
    current_offset = 0
    current_chrom = ''
    with open(pos_fn, 'w') as pos_fo:
        # Select the needed columns by name: positional access breaks when the
        # array carries more than the four consensus fields.
        columns = zip(variants_array['CHROM'], variants_array['POS'],
                      variants_array['REF'], variants_array['alt'])
        for raw_chrom, raw_pos, raw_ref, raw_alt in columns:
            chrom = _text(raw_chrom)
            if current_chrom != chrom:
                # Close off the previous chromosome and reset the running state.
                print(pos_line % (current_chrom, 0, 9999999, previous_new_pos, current_offset), file=pos_fo)
                current_chrom = chrom
                current_offset = 0
                previous_new_pos = 1
            ref = _text(raw_ref)
            alt = _text(raw_alt)
            pos = int(raw_pos)
            new_pos = pos + current_offset
            print(pos_line % (chrom, pos, new_pos, previous_new_pos, current_offset), file=pos_fo)
            previous_new_pos = new_pos
            # POS is 1-based; replace the ref-length slice starting there.
            startpos = new_pos - 1
            endpos = startpos + len(ref)
            ref_dict[chrom].seq[startpos:endpos] = alt
            current_offset += len(alt) - len(ref)

        print(pos_line % (current_chrom, 0, 9999999, previous_new_pos, current_offset), file=pos_fo)

    with open(fasta_fn, 'w') as fasta_fo:
        SeqIO.write(ref_records, fasta_fo, "fasta")
    

In [125]:
def run_fdr_pipeline(sample='7G8'):
    """Build the filtered variants for ``sample`` and write its consensus files.

    Loads the variants with ``create_variants_array``, keeps the records with
    ``VQSLOD > 0`` whose ``RegionType`` is ``b'Core'``, and passes them to
    ``create_consensus``.

    Parameters
    ----------
    sample : str
        Sample key, forwarded to both steps.

    Returns None.
    """
    print("Creating array %s" % sample)
    # Forward the requested sample instead of a fixed '7G8'.
    variants_array = create_variants_array(sample=sample)
    flt_pass = variants_array['VQSLOD'] > 0
    flt_core = variants_array['RegionType'] == b'Core'
    flt_core_pass = (flt_pass & flt_core)

    print("Creating consensus %s" % sample)
    # Hand over only the four columns the consensus step needs, in
    # CHROM, POS, REF, alt order, so the result is the same whether the
    # columns are read by position or by name.
    consensus_fields = ['CHROM', 'POS', 'REF', 'alt']
    create_consensus(variants_array[flt_core_pass][consensus_fields], sample=sample)
    

In [124]:
# Run the pipeline for the default sample ('7G8').
run_fdr_pipeline()

Creating array 7G8
(array([False,  True], dtype=bool), array([77353, 95081]))
(array([False,  True], dtype=bool), array([ 34176, 138258]))
Creating consensus 7G8
