# Introduction

The aim of this notebook is to determine a sensible threshold for creating a DUST mask. This came about after conversations with Roberto about how to remove the many variants in AT repeats (or near AT repeats) for the PPQ GWAS.

The earlier version of the notbeook (20151027_dustmasker) used petl intervals to find overlaps between regions, but I then realised using numpy boolean arrays would be much more efficient (important when trying out lots of different thresholds)

In [2]:
%run _shared_setup.ipynb

python 3.4.1 |Anaconda 2.1.0 (x86_64)| (default, Sep 10 2014, 17:24:09) 
[GCC 4.2.1 (Apple Inc. build 5577)]
numpy 1.9.2
scipy 0.14.0
pandas 0.14.1
numexpr 2.3.1
pysam 0.8.3
petl 1.0.10
petlx 1.0.3
vcf 0.6.7
h5py 2.3.1
tables 3.1.1
vcfplt 0.8
tbl_pgv_metadata length = 5729
tbl_pgv_locations length = 102
tbl_pf3k_metadata length = 2512
tbl_pf_solaris length = 10879
tbl_assembled_samples length = 11


In [3]:
install_dir = '../opt_4'
REF_GENOME="/lustre/scratch110/malaria/rp7/Pf3k/GATKbuild/Pfalciparum_GeneDB_Aug2015/Pfalciparum.genome.fasta"
regions_fn = '/nfs/users/nfs_r/rp7/src/github/malariagen/pf-crosses/meta/regions-20130225.bed.gz'
regions_fn = '/Users/rpearson/src/github/malariagen/pf-crosses/meta/regions-20130225.bed.gz'
ref_gff = "%s/snpeff/snpEff/data/Pfalciparum_GeneDB_Aug2015/genes.gff" % install_dir
ref_cds_gff = REF_GENOME.replace('.fasta', '.CDS.gff')

In [3]:
# !head -n -34 {ref_gff} | grep -P '\tCDS\t' > {ref_cds_gff}

head: illegal line count -- -34
usage: grep [-abcDEFGHhIiJLlmnOoqRSsUVvwxZ] [-A num] [-B num] [-C[num]]
	[-e pattern] [-f file] [--binary-files=value] [--color=when]
	[--context[=num]] [--directories=action] [--label] [--line-buffered]
	[--null] [pattern] [file ...]


# Download software

In [4]:
# !wget ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/dustmasker/dustmasker -O {install_dir}/dustmasker
# !chmod a+x {install_dir}/dustmasker

--2015-10-28 10:05:23--  ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/dustmasker/dustmasker
           => `../opt_4/dustmasker'
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 2607:f220:41e:250::7
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/agarwala/dustmasker ... done.
==> SIZE dustmasker ... 11688414
==> PASV ... done.    ==> RETR dustmasker ... done.
Length: 11688414 (11M) (unauthoritative)


2015-10-28 10:05:31 (1.85 MB/s) - `../opt_4/dustmasker' saved [11688414]



# Run algorithms on ref genome

In [20]:
ref_dict=SeqIO.to_dict(SeqIO.parse(open(REF_GENOME), "fasta"))
chromosome_lengths = [len(ref_dict[chrom]) for chrom in ref_dict]
tbl_chromosomes=(etl.wrap(zip(ref_dict.keys(), chromosome_lengths))
    .pushheader(['chrom', 'stop'])
    .addfield('start', 0)
    .cut(['chrom', 'start', 'stop'])
    .sort('chrom')
)
tbl_chromosomes

chrom,start,stop
Pf3D7_01_v3,0,640851
Pf3D7_02_v3,0,947102
Pf3D7_03_v3,0,1067971
Pf3D7_04_v3,0,1200490
Pf3D7_05_v3,0,1343557


In [17]:
tbl_regions = (etl
    .fromtsv(regions_fn)
    .pushheader(['chrom', 'start', 'stop', 'region'])
    .convertnumbers()
)
tbl_regions.display(10)

chrom,start,stop,region
Pf3D7_01_v3,0,27336,SubtelomericRepeat
Pf3D7_01_v3,27336,92900,SubtelomericHypervariable
Pf3D7_01_v3,92900,457931,Core
Pf3D7_01_v3,457931,460311,Centromere
Pf3D7_01_v3,460311,575900,Core
Pf3D7_01_v3,575900,616691,SubtelomericHypervariable
Pf3D7_01_v3,616691,640851,SubtelomericRepeat
Pf3D7_02_v3,0,23100,SubtelomericRepeat
Pf3D7_02_v3,23100,105800,SubtelomericHypervariable
Pf3D7_02_v3,105800,447300,Core


In [26]:
iscore_array = collections.OrderedDict()
for chromosomes_row in tbl_chromosomes.data():
    chrom=chromosomes_row[0]
    iscore_array[chrom] = np.zeros(chromosomes_row[2], dtype=bool)
    for regions_row in tbl_regions.selecteq('chrom', chrom).selecteq('region', 'Core').data():
        iscore_array[chrom][regions_row[1]:regions_row[2]] = True

In [39]:
tbl_ref_cds_gff = (
    etl.fromgff3(ref_cds_gff)
    .select(lambda rec: rec['end'] > rec['start'])
    .unpackdict('attributes')
    .select(lambda rec: rec['Parent'].endswith('1')) # Think there are alternate splicings for some genes, here just using first
    .distinct(['seqid', 'start'])
)

In [42]:
tbl_coding_regions = (tbl_ref_cds_gff
    .cut(['seqid', 'start', 'end'])
    .rename('end', 'stop')
    .rename('seqid', 'chrom')
    .convert('start', lambda val: val-1)
)
tbl_coding_regions                   

chrom,start,stop
Pf3D7_01_v3,29509,34762
Pf3D7_01_v3,35887,37126
Pf3D7_01_v3,38981,39923
Pf3D7_01_v3,40153,40207
Pf3D7_01_v3,42366,43617


In [43]:
iscoding_array = collections.OrderedDict()
for chromosomes_row in tbl_chromosomes.data():
    chrom=chromosomes_row[0]
    iscoding_array[chrom] = np.zeros(chromosomes_row[2], dtype=bool)
    for coding_regions_row in tbl_coding_regions.selecteq('chrom', chrom).data():
        iscoding_array[chrom][coding_regions_row[1]:coding_regions_row[2]] = True

In [61]:
def evaluate_dust_threshold(
    dust_level=20,
    verbose=False
):
    masked_genome_fn = "%s.dustmasker.%d.fasta" % (REF_GENOME.replace('.fasta', ''), dust_level)
    
    if verbose:
        print("Running dustmasker %d" % dust_level)
    !{install_dir}/dustmasker \
    -in {REF_GENOME} \
    -outfmt fasta \
    -out {masked_genome_fn} \
    -level {dust_level}

    if verbose:
        print("Reading in fasta %d" % dust_level)
    masked_ref_dict=SeqIO.to_dict(SeqIO.parse(open(masked_genome_fn), "fasta"))

    if verbose:
        print("Creating mask array %d" % dust_level)
    ismasked_array = collections.OrderedDict()
    classification_array = collections.OrderedDict()
    
    genome_length = sum([len(ref_dict[chrom]) for chrom in ref_dict])
    for region_type in [
        'Core coding unmasked',
        'Core coding masked',
        'Core noncoding unmasked',
        'Core noncoding masked',
        'Noncore coding unmasked',
        'Noncore coding masked',
        'Noncore noncoding unmasked',
        'Noncore noncoding masked',
    ]:
        classification_array[region_type] = np.zeros(genome_length, dtype=bool)
        
    offset=0
    for chromosomes_row in tbl_chromosomes.data():
        chrom=chromosomes_row[0]
        if verbose:
            print(chrom)
        chrom_length=chromosomes_row[2]
        ismasked_array[chrom] = which_lower(masked_ref_dict[chrom].seq)
        classification_array['Core coding unmasked'][offset:(offset+chrom_length)] = (
            iscore_array[chrom] & iscoding_array[chrom] & np.logical_not(ismasked_array[chrom])
        )
        classification_array['Core coding masked'][offset:(offset+chrom_length)] = (
            iscore_array[chrom] & iscoding_array[chrom] & ismasked_array[chrom]
        )
        classification_array['Core noncoding unmasked'][offset:(offset+chrom_length)] = (
            iscore_array[chrom] & np.logical_not(iscoding_array[chrom]) & np.logical_not(ismasked_array[chrom])
        )
        classification_array['Core noncoding masked'][offset:(offset+chrom_length)] = (
            iscore_array[chrom] & np.logical_not(iscoding_array[chrom]) & ismasked_array[chrom]
        )
        classification_array['Noncore coding unmasked'][offset:(offset+chrom_length)] = (
            np.logical_not(iscore_array[chrom]) & iscoding_array[chrom] & np.logical_not(ismasked_array[chrom])
        )
        classification_array['Noncore coding masked'][offset:(offset+chrom_length)] = (
            np.logical_not(iscore_array[chrom]) & iscoding_array[chrom] & ismasked_array[chrom]
        )
        classification_array['Noncore noncoding unmasked'][offset:(offset+chrom_length)] = (
            np.logical_not(iscore_array[chrom]) & np.logical_not(iscoding_array[chrom]) & np.logical_not(ismasked_array[chrom])
        )
        classification_array['Noncore noncoding masked'][offset:(offset+chrom_length)] = (
            np.logical_not(iscore_array[chrom]) & np.logical_not(iscoding_array[chrom]) & ismasked_array[chrom]
        )
        offset = offset + chrom_length

    return(classification_array, masked_ref_dict, ismasked_array)


In [68]:
classification_arrays = collections.OrderedDict()
for dust_level in [20, 30, 40, 50, 60, 70]:
    classification_arrays[str(dust_level)] = evaluate_dust_threshold(dust_level, verbose=True)

Running dustmasker 20
Reading in fasta 20
Creating mask array 20
Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3
Pf3D7_API_v3
Pf_M76611
Running dustmasker 30
Reading in fasta 30
Creating mask array 30
Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3
Pf3D7_API_v3
Pf_M76611
Running dustmasker 40
Reading in fasta 40
Creating mask array 40
Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3
Pf3D7_API_v3
Pf_M76611
Running dustmasker 50
Reading in fasta 50
Creating mask array 50
Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3

In [77]:
np.count_nonzero(classification_arrays['20'][0]['Core coding unmasked'])

8195639

In [82]:
for dust_level in [20, 30, 40, 50, 60, 70, 80, 90, 100]:
#     classification_arrays[str(dust_level)] = evaluate_dust_threshold(dust_level, verbose=True)
    number_core_coding_masked = np.count_nonzero(classification_arrays[str(dust_level)][0]['Core coding masked'])
    number_core_coding_unmasked = np.count_nonzero(classification_arrays[str(dust_level)][0]['Core coding unmasked'])
    number_core_noncoding_masked = np.count_nonzero(classification_arrays[str(dust_level)][0]['Core noncoding masked'])
    number_core_noncoding_unmasked = np.count_nonzero(classification_arrays[str(dust_level)][0]['Core noncoding unmasked'])
    proportion_core_coding_masked = number_core_coding_masked / (number_core_coding_masked + number_core_coding_unmasked)
    proportion_core_noncoding_masked = number_core_noncoding_masked / (number_core_noncoding_masked + number_core_noncoding_unmasked)
    print("Dust level %d: %4.1f%% coding and %4.1f%% non-coding masked" % (
            dust_level,
            proportion_core_coding_masked*100,
            proportion_core_noncoding_masked*100
        )
    )
    

Dust level 20: 29.0% coding and 70.9% non-coding masked
Dust level 30: 10.1% coding and 49.1% non-coding masked
Dust level 40:  4.5% coding and 35.7% non-coding masked
Dust level 50:  2.3% coding and 26.7% non-coding masked
Dust level 60:  1.2% coding and 20.0% non-coding masked
Dust level 70: 29.0% coding and 70.9% non-coding masked
Dust level 80: 29.0% coding and 70.9% non-coding masked
Dust level 90: 29.0% coding and 70.9% non-coding masked
Dust level 100: 29.0% coding and 70.9% non-coding masked


In [62]:
classification_array, masked_ref_dict_20, ismasked_array_20 = evaluate_dust_threshold(verbose=True)

Running dustmasker 20
Reading in fasta 20
Creating mask array 20
Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3
Pf3D7_API_v3
Pf_M76611


In [64]:
for region_type in classification_array:
    print(region_type, np.unique(classification_array[region_type], return_counts=True))

Core coding unmasked (array([False,  True], dtype=bool), array([15137200,  8195639]))
Core coding masked (array([False,  True], dtype=bool), array([19977303,  3355536]))
Core noncoding unmasked (array([False,  True], dtype=bool), array([20648686,  2684153]))
Core noncoding masked (array([False,  True], dtype=bool), array([16786060,  6546779]))
Noncore coding unmasked (array([False,  True], dtype=bool), array([22673473,   659366]))
Noncore coding masked (array([False,  True], dtype=bool), array([23226833,   106006]))
Noncore noncoding unmasked (array([False,  True], dtype=bool), array([22469858,   862981]))
Noncore noncoding masked (array([False,  True], dtype=bool), array([22410460,   922379]))


In [65]:
classification_array_50, masked_ref_dict_50, ismasked_array_50 = evaluate_dust_threshold(50, verbose=True)
for region_type in classification_array_50:
    print(region_type, np.unique(classification_array_50[region_type], return_counts=True))

Running dustmasker 50
Reading in fasta 50
Creating mask array 50
Pf3D7_01_v3
Pf3D7_02_v3
Pf3D7_03_v3
Pf3D7_04_v3
Pf3D7_05_v3
Pf3D7_06_v3
Pf3D7_07_v3
Pf3D7_08_v3
Pf3D7_09_v3
Pf3D7_10_v3
Pf3D7_11_v3
Pf3D7_12_v3
Pf3D7_13_v3
Pf3D7_14_v3
Pf3D7_API_v3
Pf_M76611
Core coding unmasked (array([False,  True], dtype=bool), array([12047873, 11284966]))
Core coding masked (array([False,  True], dtype=bool), array([23066630,   266209]))
Core noncoding unmasked (array([False,  True], dtype=bool), array([16567484,  6765355]))
Core noncoding masked (array([False,  True], dtype=bool), array([20867262,  2465577]))
Noncore coding unmasked (array([False,  True], dtype=bool), array([22570815,   762024]))
Noncore coding masked (array([False,  True], dtype=bool), array([23329491,     3348]))
Noncore noncoding unmasked (array([False,  True], dtype=bool), array([21688087,  1644752]))
Noncore noncoding masked (array([False,  True], dtype=bool), array([23192231,   140608]))


In [53]:
ismasked_array_20['Pf3D7_01_v3']

array([False, False,  True, ...,  True,  True,  True], dtype=bool)

In [54]:
np.unique(ismasked_array_20['Pf3D7_01_v3'], return_counts=True)

(array([False,  True], dtype=bool), array([352742, 288109]))

In [16]:
ref_dict_20['Pf3D7_01_v3'].seq

AttributeError: 'Seq' object has no attribute 'Seq'

In [9]:
def !{install_dir}/dustmasker \
-in {REF_GENOME} \
-outfmt interval \
-out {REF_GENOME.replace('.fasta', '.dustmasker.interval')}


In [5]:
!{install_dir}/dustmasker \
-in {REF_GENOME} \
-outfmt fasta \
-out {REF_GENOME.replace('.fasta', '.dustmasker.fasta')}


In [6]:
!{install_dir}/dustmasker \
-in {REF_GENOME} \
-outfmt fasta \
-out {REF_GENOME.replace('.fasta', '.dustmasker.fasta')} \
-level 50



In [8]:
!{install_dir}/windowmasker -mk_counts \
-in {REF_GENOME} \
-checkdup true \
-out {REF_GENOME.replace('.fasta', '.windowmasker.ustat')}

!{install_dir}/windowmasker \
-ustat {REF_GENOME.replace('.fasta', '.windowmasker.ustat')} \
-in {REF_GENOME} \
-out {REF_GENOME.replace('.fasta', '.windowmasker.interval')} \
-dust true \


computing the genome length
pass 1
pass 2


#Convert files to bed

In [10]:
def convert_interval_to_bed(interval_fn=REF_GENOME.replace('.fasta', '.dustmasker.interval'), output_fn=None,
                            annotation=None):
    if output_fn is None:
        output_fn = interval_fn.replace('.interval', '.bed')
    if annotation is None:
        annotation = interval_fn.split('.')[-2]
    print(interval_fn, output_fn, annotation)
    fi = open(interval_fn, 'r')
    fo = open(output_fn, 'w')
    for line in fi.readlines():
        if line.startswith('>'):
            chrom_line = line.split('|')
            if len(chrom_line) == 2:
                chrom = chrom_line[1].replace('\n', '').replace(' ', '')
            else:
                chrom = chrom_line[0].replace('\n', '').replace('>', '')
        else:
            start, end = line.split(' - ')
            print("%s\t%d\t%d\t%s" % (chrom, int(start), int(end), annotation), file=fo)
    fi.close()
    fo.close()
    
    !bgzip -f {output_fn}
    !tabix -p bed -f {output_fn}.gz


In [11]:
convert_interval_to_bed()

/lustre/scratch110/malaria/rp7/Pf3k/GATKbuild/Pfalciparum_GeneDB_Aug2015/Pfalciparum.genome.dustmasker.interval /lustre/scratch110/malaria/rp7/Pf3k/GATKbuild/Pfalciparum_GeneDB_Aug2015/Pfalciparum.genome.dustmasker.bed dustmasker


In [12]:
# convert_interval_to_bed(REF_GENOME.replace('.fasta', '.windowmasker.interval'))

#Find overlaps

In [28]:
np.unique(iscore_array['Pf3D7_01_v3'], return_counts=True)

(array([False,  True], dtype=bool), array([160231, 480620]))

In [50]:
def which_lower(string):
    return np.array([str.islower(x) for x in string])
which_lower('abCDeF') 
# np.array([str.islower(x) for x in 'abCDeF'])

array([ True,  True, False, False,  True, False], dtype=bool)

In [17]:
tbl_core = tbl_regions.selecteq('region', 'Core').cut(['chrom', 'start', 'stop'])
tbl_core

chrom,start,stop
Pf3D7_01_v3,92900,457931
Pf3D7_01_v3,460311,575900
Pf3D7_02_v3,105800,447300
Pf3D7_02_v3,450450,862500
Pf3D7_03_v3,70630,597816


In [18]:
tbl_noncore = petl.transform.intervals.intervalsubtract(
    tbl_chromosomes,
    tbl_core,
    lkey='chrom',
    rkey='chrom'
)
tbl_noncore

chrom,start,stop
Pf3D7_01_v3,0,92900
Pf3D7_01_v3,457931,460311
Pf3D7_01_v3,575900,640851
Pf3D7_02_v3,0,105800
Pf3D7_02_v3,447300,450450


In [19]:
# tbl_noncore = etl.collapsedintervals(tbl_regions.selectne('region', 'Core'), key='chrom')
# tbl_noncore

In [20]:
# etl.wrap(
#     tbl_regions.selectne('region', 'Core')
#     .cut(['chrom', 'start', 'stop'])
#     .collapsedintervals(key='chrom')
# ).pushheader(['chrom', 'start', 'stop']).sort(['chrom', 'start']).displayall()


In [40]:
len(tbl_ref_cds_gff)

13891

In [23]:
# print(len(tbl_ref_cds_gff.unpackdict('attributes')))
# print(len(tbl_ref_cds_gff.unpackdict('attributes').select(lambda rec: not rec['Parent'].endswith('2'))))
# print(len(tbl_ref_cds_gff.unpackdict('attributes').select(lambda rec: rec['Parent'].endswith('1'))))


ValueError: 'attributes' is not in list

In [41]:
tbl_ref_cds_gff.duplicates(['seqid', 'start']).displayall()

seqid,source,type,start,end,score,strand,phase,ID,Parent


In [26]:
tbl_noncoding_regions = petl.transform.intervals.intervalsubtract(
    tbl_chromosomes,
    tbl_coding_regions,
    lkey='chrom',
    rkey='chrom',
    include_stop=False
)
tbl_noncoding_regions

chrom,start,stop
Pf3D7_01_v3,0,29510
Pf3D7_01_v3,34762,35888
Pf3D7_01_v3,37126,38982
Pf3D7_01_v3,39923,40154
Pf3D7_01_v3,40207,42367


In [27]:
tbl_dustmasked = (etl
    .fromtsv(REF_GENOME.replace('.fasta', '.dustmasker.bed.gz'))
    .pushheader(['chrom', 'start', 'stop', 'annotation'])
    .convertnumbers()
    .convert('stop', lambda rec: rec+1)
)
tbl_dustmasked


chrom,start,stop,annotation
Pf3D7_01_v3,2,1114,dustmasker
Pf3D7_01_v3,1117,1211,dustmasker
Pf3D7_01_v3,1269,1441,dustmasker
Pf3D7_01_v3,1591,1716,dustmasker
Pf3D7_01_v3,1774,1838,dustmasker


In [28]:
tbl_not_dustmasked = petl.transform.intervals.intervalsubtract(
    tbl_chromosomes,
    tbl_dustmasked,
    lkey='chrom',
    rkey='chrom'
)
tbl_not_dustmasked


chrom,start,stop
Pf3D7_01_v3,0,2
Pf3D7_01_v3,1114,1117
Pf3D7_01_v3,1211,1269
Pf3D7_01_v3,1441,1591
Pf3D7_01_v3,1716,1774


In [61]:
from petl.transform.intervals import intervalsubtract
tbl_interim_classifcation = (
    intervalsubtract(tbl_coding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom')
    .addfield('annotation', 'coding not dustmasked')
    .cat(
        intervalsubtract(tbl_coding_regions, tbl_not_dustmasked, lkey='chrom', rkey='chrom')
        .addfield('annotation', 'coding dustmasked')
    )
    .cat(
        intervalsubtract(tbl_noncoding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom')
        .addfield('annotation', 'noncoding not dustmasked')
    )
    .cat(
        intervalsubtract(tbl_noncoding_regions, tbl_not_dustmasked, lkey='chrom', rkey='chrom')
        .addfield('annotation', 'noncoding dustmasked')
    )
    .sort(['chrom', 'start'])
)

    

In [186]:
tbl_interim_classifcation

chrom,start,stop,annotation
Pf3D7_01_v3,1,2,noncoding not dustmasked
Pf3D7_01_v3,2,1114,noncoding dustmasked
Pf3D7_01_v3,1114,1117,noncoding not dustmasked
Pf3D7_01_v3,1117,1211,noncoding dustmasked
Pf3D7_01_v3,1211,1269,noncoding not dustmasked


In [None]:
from petl.transform.intervals import intervalsubtract
tbl_region_classifcation = (
    (
        intervalsubtract(
            intervalsubtract(tbl_coding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_noncore, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'core coding notdustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_coding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_core, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'noncore coding notdustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_coding_regions, tbl_not_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_noncore, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'core coding dustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_coding_regions, tbl_not_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_core, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'noncore coding dustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_noncoding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_noncore, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'core noncoding notdustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_noncoding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_core, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'noncore noncoding notdustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_noncoding_regions, tbl_not_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_noncore, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'core noncoding dustmasked')
    )
    .cat(
        intervalsubtract(
            intervalsubtract(tbl_noncoding_regions, tbl_not_dustmasked, lkey='chrom', rkey='chrom'),
            tbl_core, lkey='chrom', rkey='chrom'
        )
        .addfield('annotation', 'noncore noncoding dustmasked')
    )
    .sort(['chrom', 'start'])
)

    

In [30]:
tbl_region_classifcation.display(1000)

chrom,start,stop,annotation
Pf3D7_01_v3,0,2,noncore noncoding notdustmasked
Pf3D7_01_v3,2,1114,noncore noncoding dustmasked
Pf3D7_01_v3,1114,1117,noncore noncoding notdustmasked
Pf3D7_01_v3,1117,1211,noncore noncoding dustmasked
Pf3D7_01_v3,1211,1269,noncore noncoding notdustmasked
Pf3D7_01_v3,1269,1441,noncore noncoding dustmasked
Pf3D7_01_v3,1441,1591,noncore noncoding notdustmasked
Pf3D7_01_v3,1591,1716,noncore noncoding dustmasked
Pf3D7_01_v3,1716,1774,noncore noncoding notdustmasked
Pf3D7_01_v3,1774,1838,noncore noncoding dustmasked


In [38]:
tbl_region_classifcation_Pf3D7_01_v3 = tbl_region_classifcation.selecteq('chrom', 'Pf3D7_01_v3')
def upstream(prv, cur, nxt):
    if prv is None:
        return None
    else:
        return cur.start - prv.stop

tbl_region_classifcation_Pf3D7_01_v3.addfieldusingcontext('diff', upstream).selectne('diff', 0)

chrom,start,stop,annotation,diff
Pf3D7_01_v3,0,2,noncore noncoding notdustmasked,
Pf3D7_01_v3,226681,226892,core coding notdustmasked,-190.0
Pf3D7_01_v3,227075,227121,core coding notdustmasked,-46.0
Pf3D7_01_v3,227121,227128,core coding dustmasked,-7.0
Pf3D7_01_v3,227128,227141,core coding notdustmasked,-13.0


In [40]:
tbl_region_classifcation_Pf3D7_01_v3.selectgt('start', 226500).display(30)

chrom,start,stop,annotation
Pf3D7_01_v3,226520,226611,core noncoding dustmasked
Pf3D7_01_v3,226611,226681,core noncoding notdustmasked
Pf3D7_01_v3,226681,226871,core coding notdustmasked
Pf3D7_01_v3,226681,226892,core coding notdustmasked
Pf3D7_01_v3,226892,226931,core noncoding notdustmasked
Pf3D7_01_v3,226931,226998,core noncoding dustmasked
Pf3D7_01_v3,226998,227005,core noncoding notdustmasked
Pf3D7_01_v3,227005,227017,core noncoding dustmasked
Pf3D7_01_v3,227017,227021,core noncoding notdustmasked
Pf3D7_01_v3,227021,227035,core noncoding dustmasked


In [63]:
(tbl_region_classifcation
.addfield('length', lambda rec: rec['stop'] - rec['start'])
.aggregate('annotation', sum, 'length')
).displayall()

annotation,value
core coding dustmasked,3352526
core coding notdustmasked,8185540
core noncoding dustmasked,6549806
core noncoding notdustmasked,2694263
noncore coding dustmasked,105803
noncore coding notdustmasked,658952
noncore noncoding dustmasked,922666
noncore noncoding notdustmasked,863432


In [None]:
(tbl_region_classifcation
.addfield('length', lambda rec: rec['stop'] - rec['start'])
.aggregate('annotation', sum, 'length')
).displayall()

In [65]:
print(sum([len(ref_dict[chrom]) for chrom in ref_dict]))
print(3352526 + 8185540 + 6549806 + 2694263 + 105803 + 658952 + 922666 + 863432)

23332839
23332988


In [35]:
(tbl_region_classifcation
.selecteq('chrom', 'Pf3D7_01_v3')
.addfield('length', lambda rec: rec['stop'] - rec['start'])
.aggregate('annotation', sum, 'length')
).displayall()

annotation,value
core coding dustmasked,66014
core coding notdustmasked,175870
core noncoding dustmasked,163556
core noncoding notdustmasked,76100
noncore coding dustmasked,5310
noncore coding notdustmasked,34610
noncore noncoding dustmasked,53448
noncore noncoding notdustmasked,66863


In [37]:
print(len(ref_dict['Pf3D7_01_v3']))
print(66014 + 175870 + 163556 + 76100 + 5310 + 34610 + 53448 + 66863)

640851
641771


In [33]:
print(sum([len(ref_dict[chrom]) for chrom in ref_dict]))
print(3385484 + 8281336 + 6549506 + 2693721 + 106182 + 666016 + 922441 + 857969)

23332839
23462655


In [34]:
ref_dict

{'Pf3D7_01_v3': SeqRecord(seq=Seq('tgaaccctaaaacctaaaccctaaaccctaaaccctgaaccctaaaccctgaac...agg', SingleLetterAlphabet()), id='Pf3D7_01_v3', name='Pf3D7_01_v3', description='Pf3D7_01_v3', dbxrefs=[]),
 'Pf3D7_02_v3': SeqRecord(seq=Seq('aaccctaaaccctaaaccctaaaccctaaaccctaaaccctaaacctaaaccct...tca', SingleLetterAlphabet()), id='Pf3D7_02_v3', name='Pf3D7_02_v3', description='Pf3D7_02_v3', dbxrefs=[]),
 'Pf3D7_03_v3': SeqRecord(seq=Seq('taaaccctaaatctctaaaccctaaagctatacctaaaccctgaaggttatacc...tca', SingleLetterAlphabet()), id='Pf3D7_03_v3', name='Pf3D7_03_v3', description='Pf3D7_03_v3', dbxrefs=[]),
 'Pf3D7_04_v3': SeqRecord(seq=Seq('aaccctaaaccctgaaccctaaaccctaaaccctgaaccctgaaccctaaaccc...tta', SingleLetterAlphabet()), id='Pf3D7_04_v3', name='Pf3D7_04_v3', description='Pf3D7_04_v3', dbxrefs=[]),
 'Pf3D7_05_v3': SeqRecord(seq=Seq('ctaaaccctgaaccctaaaccctgaaccctaaaccctaaaccctgaaccctaaa...ggt', SingleLetterAlphabet()), id='Pf3D7_05_v3', name='Pf3D7_05_v3', description='Pf3D7_05_v3', dbxrefs=

In [151]:
print(ref_dict['Pf3D7_01_v3'].seq[29510:29839])
print(ref_dict['Pf3D7_01_v3'].seq[29839:29845])
print(ref_dict['Pf3D7_01_v3'].seq[29834:29850])
print(ref_dict['Pf3D7_01_v3'].seq[29845:29855])
print()
print(ref_dict['Pf3D7_01_v3'].seq[30698:30752])
print(ref_dict['Pf3D7_01_v3'].seq[30693:30757])
print()
print(ref_dict['Pf3D7_01_v3'].seq[30772:30836])
print(ref_dict['Pf3D7_01_v3'].seq[30767:30841])


tggtgacgcaaagtagtggtgggggtgctgctggtagtagtggtgaggaagatgccaaacatgtattggatgaatttgggcaacaagtgtacaatgaaaaagtggaaaagtatgctaattctaaaatatataaagaggcgttgaaaggagatttgtcacaagcatcaattttgagcgaattagctggcacctataaaccatgtgcccttgaatatgaatattataagcatactaatggcggtggtaagggtaaaaggtatccgtgtacagagttaggtgaaaaagtagaaccacgtttttcggatacacttggtggtcagtgtactaac
aaaaaa
ctaacaaaaaaataga
atagaaggta

aaaaactagaatttgaaaaacaaaaagaaaaatatacaaaagaaattaaaaaaa
taaccaaaaactagaatttgaaaaacaaaaagaaaaatatacaaaagaaattaaaaaaaagcat

ataaaaactgcaaatcgaaaaacaactattaataacttatatgtaaaagaattttataaaaaac
taataataaaaactgcaaatcgaaaaacaactattaataacttatatgtaaaagaattttataaaaaacttcaa


In [140]:
tbl_regions.selectgt('start', 29000).display(30)

chrom,start,stop,annotation
Pf3D7_01_v3,29055,29073,noncoding not dustmasked
Pf3D7_01_v3,29073,29435,noncoding dustmasked
Pf3D7_01_v3,29435,29510,noncoding not dustmasked
Pf3D7_01_v3,29510,29839,coding not dustmasked
Pf3D7_01_v3,29839,29845,coding dustmasked
Pf3D7_01_v3,29845,30698,coding not dustmasked
Pf3D7_01_v3,30698,30752,coding dustmasked
Pf3D7_01_v3,30752,30772,coding not dustmasked
Pf3D7_01_v3,30772,30836,coding dustmasked
Pf3D7_01_v3,30836,31134,coding not dustmasked


In [138]:
tbl_coding_notdustmasked = intervalsubtract(tbl_coding_regions, tbl_dustmasked, lkey='chrom', rkey='chrom')
tbl_coding_notdustmasked


chrom,start,stop
Pf3D7_01_v3,29510,29839
Pf3D7_01_v3,29845,30698
Pf3D7_01_v3,30752,30772
Pf3D7_01_v3,30836,31134
Pf3D7_01_v3,31140,31311


In [133]:
tbl_coding_dustmasked = petl.transform.intervals.intervalsubtract(
    tbl_coding_regions,
    tbl_not_dustmasked,
    lkey='chrom',
    rkey='chrom'
)
tbl_coding_dustmasked


chrom,start,stop
Pf3D7_01_v3,29839,29845
Pf3D7_01_v3,30698,30752
Pf3D7_01_v3,30772,30836
Pf3D7_01_v3,31134,31140
Pf3D7_01_v3,31311,31318


In [135]:
tbl_coding_dustmasked.display(30)

chrom,start,stop
Pf3D7_01_v3,29839,29845
Pf3D7_01_v3,30698,30752
Pf3D7_01_v3,30772,30836
Pf3D7_01_v3,31134,31140
Pf3D7_01_v3,31311,31318
Pf3D7_01_v3,31457,31469
Pf3D7_01_v3,31901,32025
Pf3D7_01_v3,32485,32524
Pf3D7_01_v3,32589,32595
Pf3D7_01_v3,32703,32709


In [134]:
tbl_dustmasked.selectgt('start', 29000).display(30)

chrom,start,stop,annotation
Pf3D7_01_v3,29073,29435,dustmasker
Pf3D7_01_v3,29839,29845,dustmasker
Pf3D7_01_v3,30698,30752,dustmasker
Pf3D7_01_v3,30772,30836,dustmasker
Pf3D7_01_v3,31134,31140,dustmasker
Pf3D7_01_v3,31311,31318,dustmasker
Pf3D7_01_v3,31457,31469,dustmasker
Pf3D7_01_v3,31901,32025,dustmasker
Pf3D7_01_v3,32485,32524,dustmasker
Pf3D7_01_v3,32589,32595,dustmasker


In [None]:
tbl_subtract = petl.transform.intervals.intervalsubtract(
    tbl_ref_cds_gff,
    tbl_dustmasker,
    lstart='start',
    lstop='end',
    rstart='start',
    rstop='end',
    lkey='seqid',
    rkey='chrom'
)

In [100]:
tbl_dustmasker

chrom,start,end,annotation
Pf3D7_01_v3,2,1113,dustmasker
Pf3D7_01_v3,1117,1210,dustmasker
Pf3D7_01_v3,1269,1440,dustmasker
Pf3D7_01_v3,1591,1715,dustmasker
Pf3D7_01_v3,1774,1837,dustmasker


In [101]:
tbl_joined = petl.transform.intervals.intervaljoin(
    tbl_ref_cds_gff,
    tbl_dustmasker,
    lstart='start',
    lstop='end',
    rstart='start',
    rstop='end',
    lkey='seqid',
    rkey='chrom'
)

In [102]:
tbl_joined

seqid,source,type,start,end,score,strand,phase,attributes,chrom,start.1,end.1,annotation
Pf3D7_01_v3,chado,CDS,29510,34762,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}",Pf3D7_01_v3,29839,29845,dustmasker
Pf3D7_01_v3,chado,CDS,29510,34762,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}",Pf3D7_01_v3,30698,30752,dustmasker
Pf3D7_01_v3,chado,CDS,29510,34762,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}",Pf3D7_01_v3,30772,30836,dustmasker
Pf3D7_01_v3,chado,CDS,29510,34762,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}",Pf3D7_01_v3,31134,31140,dustmasker
Pf3D7_01_v3,chado,CDS,29510,34762,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}",Pf3D7_01_v3,31311,31318,dustmasker


In [104]:
tbl_subtract = petl.transform.intervals.intervalsubtract(
    tbl_ref_cds_gff,
    tbl_dustmasker,
    lstart='start',
    lstop='end',
    rstart='start',
    rstop='end',
    lkey='seqid',
    rkey='chrom'
)

In [105]:
tbl_subtract

seqid,source,type,start,end,score,strand,phase,attributes
Pf3D7_01_v3,chado,CDS,29510,29839,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}"
Pf3D7_01_v3,chado,CDS,29845,30698,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}"
Pf3D7_01_v3,chado,CDS,30752,30772,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}"
Pf3D7_01_v3,chado,CDS,30836,31134,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}"
Pf3D7_01_v3,chado,CDS,31140,31311,.,+,0,"{'ID': 'PF3D7_0100100.1:exon:1', 'Parent': 'PF3D7_0100100.1'}"


In [110]:
tbl_subtract_r = petl.transform.intervals.intervalsubtract(
    tbl_dustmasker,
    tbl_ref_cds_gff,
    rstart='start',
    rstop='end',
    lstart='start',
    lstop='end',
    rkey='seqid',
    lkey='chrom'
)

In [111]:
tbl_subtract_r

chrom,start,end,annotation
Pf3D7_01_v3,2,1113,dustmasker
Pf3D7_01_v3,1117,1210,dustmasker
Pf3D7_01_v3,1269,1440,dustmasker
Pf3D7_01_v3,1591,1715,dustmasker
Pf3D7_01_v3,1774,1837,dustmasker
