In [50]:
import numpy as np
import pysam
import pysamstats

## Input params

In [22]:
# Path of the BAM file that the cells below open with pysam.AlignmentFile.
bam_file_path = '/home/diplomski-rad/blade/pb/escherichia-coli-NCTC86/reads-to-ref-sorted.bam'
# Reference FASTA handed to pysamstats.stat_variation through its `fafile` argument.
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/escherichia/coli/escherichia_coli_reference.fasta'
# NOTE(review): not read by any cell visible in this notebook section -- confirm whether it is still needed.
include_indels = True

## Investigate deletions

In [31]:
# Collect the pileup records that report at least one deletion, visiting the
# contigs in the order given by `bam_file.references`. Collection stops as soon
# as `max_deletion_pileups` records have been gathered.
max_deletion_pileups = 1000000
deletion_pileups = list()

# Open the BAM file in a `with` block so the handle is closed when the scan
# finishes, stops early, or raises (the original never closed it).
with pysam.AlignmentFile(bam_file_path) as bam_file:
    for contig_name in bam_file.references:
        done = False
        for record in pysamstats.stat_variation(bam_file, chrom=contig_name, fafile=reference_fasta_path):
            if record['deletions'] > 0:
                deletion_pileups.append(record)
                if len(deletion_pileups) == max_deletion_pileups:
                    # Cap reached: leave the record loop and then the contig loop.
                    done = True
                    break
        if done:
            break

In [32]:
# Print every collected pileup whose deletion count is strictly greater than
# each of the A/C/G/T counts and the insertion count, then print how many such
# pileups were found.
num_max_D = 0
deletion_line = 'pos: {:4}, ref: {:4}, A: {:4}, C: {:4}, G: {:4}, T: {:4}, I: {:4}, D: {:4}, max_D: {}\n'
for pileup in deletion_pileups:
    # Counts the deletion count is compared against, in print order: A, C, G, T, I.
    other_counts = [pileup[key] for key in ('A', 'C', 'G', 'T', 'insertions')]
    deletion_count = pileup['deletions']
    if deletion_count > np.max(other_counts):
        num_max_D += 1
        fields = (pileup['pos'], pileup['ref']) + tuple(other_counts) + (deletion_count, True)
        print(deletion_line.format(*fields))
print(num_max_D)

pos:  301, ref: T   , A:    1, C:    7, G:    1, T:   54, I:    1, D:   84, max_D: True

pos: 15383, ref: C   , A:    0, C:   54, G:    0, T:   10, I:    4, D:   87, max_D: True

pos: 15384, ref: A   , A:   47, C:    6, G:    0, T:   35, I:    0, D:   63, max_D: True

pos: 15406, ref: G   , A:    1, C:    0, G:   90, T:    0, I:    4, D:  101, max_D: True

pos: 15413, ref: G   , A:    1, C:    0, G:   90, T:    2, I:    9, D:   94, max_D: True

pos: 15441, ref: C   , A:    0, C:   92, G:    0, T:    0, I:    5, D:  100, max_D: True

pos: 15481, ref: C   , A:    0, C:   91, G:    1, T:    0, I:   11, D:   99, max_D: True

pos: 15489, ref: G   , A:    1, C:    0, G:   93, T:    3, I:    1, D:   97, max_D: True

pos: 15495, ref: C   , A:    0, C:   84, G:    6, T:    2, I:    4, D:  113, max_D: True

pos: 15515, ref: G   , A:    0, C:    0, G:   92, T:    0, I:    4, D:  101, max_D: True

pos: 15526, ref: C   , A:    1, C:   94, G:    3, T:    0, I:    7, D:   97, max_D: True

pos: 15528,


pos: 32484, ref: A   , A:   39, C:    2, G:    2, T:    8, I:    8, D:   97, max_D: True

pos: 32485, ref: C   , A:    1, C:   44, G:    0, T:    7, I:    6, D:   96, max_D: True

pos: 32486, ref: C   , A:    0, C:   17, G:    9, T:   39, I:    1, D:   83, max_D: True

pos: 34111, ref: T   , A:    2, C:    1, G:    5, T:   58, I:    2, D:   78, max_D: True

pos: 39183, ref: C   , A:    1, C:   38, G:    3, T:    0, I:    0, D:  116, max_D: True

pos: 39184, ref: A   , A:   18, C:    1, G:    2, T:    0, I:    0, D:  137, max_D: True

pos: 39185, ref: T   , A:    1, C:    1, G:    2, T:    4, I:    1, D:  150, max_D: True

pos: 39186, ref: T   , A:    2, C:    3, G:    1, T:    5, I:    0, D:  147, max_D: True

pos: 39187, ref: C   , A:    3, C:   56, G:    1, T:    1, I:    0, D:   97, max_D: True

pos: 39188, ref: G   , A:    1, C:    0, G:   57, T:    1, I:    0, D:   99, max_D: True

pos: 39189, ref: T   , A:    3, C:    6, G:    0, T:   25, I:    0, D:  124, max_D: True

pos: 3919


pos: 270734, ref: G   , A:    0, C:    0, G:   11, T:    0, I:    2, D:   34, max_D: True

pos: 270737, ref: G   , A:    0, C:    0, G:   12, T:    0, I:    1, D:   25, max_D: True

pos: 270742, ref: A   , A:   10, C:    0, G:    0, T:    1, I:    2, D:   14, max_D: True

pos: 270746, ref: C   , A:    1, C:    8, G:    0, T:    0, I:    3, D:   33, max_D: True

pos: 270747, ref: C   , A:    0, C:   10, G:    0, T:    0, I:    5, D:   11, max_D: True

pos: 270758, ref: A   , A:   11, C:    0, G:    0, T:    1, I:    2, D:   14, max_D: True

pos: 270762, ref: G   , A:    0, C:    0, G:   12, T:    0, I:    3, D:   33, max_D: True

pos: 270775, ref: C   , A:    0, C:   11, G:    0, T:    0, I:    9, D:   21, max_D: True

pos: 270796, ref: G   , A:    0, C:    0, G:   13, T:    0, I:    3, D:   19, max_D: True

pos: 270811, ref: T   , A:    0, C:    0, G:    0, T:   13, I:    3, D:   20, max_D: True

pos: 270821, ref: A   , A:   12, C:    0, G:    0, T:    0, I:    5, D:   19, max_D: True


pos: 303625, ref: G   , A:    0, C:    0, G:    6, T:    1, I:    0, D:  139, max_D: True

pos: 303626, ref: C   , A:    0, C:    4, G:    1, T:    0, I:    0, D:  141, max_D: True

pos: 303627, ref: G   , A:    0, C:    0, G:    5, T:    0, I:    0, D:  141, max_D: True

pos: 303628, ref: C   , A:    1, C:    6, G:    0, T:    0, I:    1, D:  139, max_D: True

pos: 303629, ref: A   , A:    4, C:    0, G:    1, T:    1, I:    0, D:  140, max_D: True

pos: 303630, ref: A   , A:    5, C:    0, G:    1, T:    0, I:    0, D:  140, max_D: True

pos: 303631, ref: A   , A:    8, C:    0, G:    0, T:    0, I:    0, D:  138, max_D: True

pos: 303632, ref: T   , A:    0, C:    0, G:    0, T:    7, I:    3, D:  139, max_D: True

pos: 303633, ref: A   , A:    7, C:    0, G:    0, T:    0, I:    0, D:  139, max_D: True

pos: 303634, ref: C   , A:    2, C:    5, G:    0, T:    0, I:    2, D:  139, max_D: True

pos: 303635, ref: A   , A:    5, C:    0, G:    0, T:    0, I:    0, D:  141, max_D: True


pos: 391740, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   91, max_D: True

pos: 391741, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 391742, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 391743, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 391744, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 391745, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    1, D:   91, max_D: True

pos: 391746, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    1, D:   90, max_D: True

pos: 391747, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    1, D:   90, max_D: True

pos: 391748, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 391749, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 391750, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 392598, ref: A   , A:   26, C:    1, G:    0, T:    0, I:    1, D:   92, max_D: True

pos: 392599, ref: C   , A:    2, C:   24, G:    0, T:    0, I:   13, D:   92, max_D: True

pos: 392600, ref: T   , A:    0, C:    1, G:    0, T:   25, I:    0, D:   94, max_D: True

pos: 392601, ref: T   , A:    0, C:    1, G:    0, T:   26, I:   11, D:   91, max_D: True

pos: 392602, ref: C   , A:    0, C:   27, G:    0, T:    0, I:    6, D:   95, max_D: True

pos: 392603, ref: A   , A:   27, C:    0, G:    0, T:    0, I:    8, D:   91, max_D: True

pos: 392604, ref: T   , A:    1, C:    0, G:    0, T:   24, I:    0, D:   97, max_D: True

pos: 392605, ref: T   , A:    0, C:    0, G:    0, T:   26, I:    1, D:   92, max_D: True

pos: 392606, ref: T   , A:    0, C:    2, G:    0, T:   25, I:    1, D:   90, max_D: True

pos: 392607, ref: C   , A:    1, C:   25, G:    0, T:    1, I:    4, D:   91, max_D: True

pos: 392608, ref: A   , A:   27, C:    0, G:    0, T:    0, I:    6, D:   90, max_D: True


pos: 527367, ref: T   , A:    8, C:    1, G:    0, T:   61, I:    0, D:   72, max_D: True

pos: 528166, ref: G   , A:    5, C:    2, G:   45, T:    0, I:    4, D:  123, max_D: True

pos: 551380, ref: T   , A:    0, C:    0, G:    0, T:   73, I:    3, D:   81, max_D: True

pos: 551381, ref: C   , A:    3, C:   65, G:    0, T:    1, I:    3, D:   85, max_D: True

pos: 551382, ref: A   , A:   64, C:    0, G:    0, T:    1, I:    4, D:   89, max_D: True

pos: 551383, ref: G   , A:    0, C:    1, G:   55, T:    0, I:    1, D:   98, max_D: True

pos: 551384, ref: G   , A:    1, C:    1, G:   55, T:    0, I:    1, D:   97, max_D: True

pos: 551385, ref: C   , A:    2, C:   44, G:    0, T:    1, I:    0, D:  107, max_D: True

pos: 551386, ref: C   , A:    0, C:   45, G:    0, T:    0, I:    0, D:  109, max_D: True

pos: 551387, ref: T   , A:    0, C:    0, G:    0, T:   40, I:    0, D:  114, max_D: True

pos: 551388, ref: A   , A:   28, C:    1, G:    1, T:    0, I:    0, D:  124, max_D: True


pos: 567785, ref: A   , A:    3, C:    0, G:    0, T:    0, I:    1, D:   87, max_D: True

pos: 567786, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   89, max_D: True

pos: 567787, ref: T   , A:    1, C:    0, G:    0, T:    0, I:    0, D:   89, max_D: True

pos: 567788, ref: A   , A:    1, C:    0, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 567789, ref: T   , A:    0, C:    0, G:    0, T:    1, I:    1, D:   90, max_D: True

pos: 567790, ref: G   , A:    0, C:    1, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 567791, ref: A   , A:    1, C:    0, G:    0, T:    0, I:    1, D:   89, max_D: True

pos: 567792, ref: G   , A:    0, C:    0, G:    1, T:    0, I:    0, D:   90, max_D: True

pos: 567793, ref: C   , A:    0, C:    1, G:    0, T:    0, I:    0, D:   90, max_D: True

pos: 567794, ref: G   , A:    0, C:    0, G:    1, T:    0, I:    0, D:   89, max_D: True

pos: 567795, ref: C   , A:    0, C:    1, G:    0, T:    0, I:    0, D:   90, max_D: True



pos: 579550, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579551, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579552, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579553, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579554, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579555, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579556, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579557, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579558, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579559, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True

pos: 579560, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   58, max_D: True


pos: 687900, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687901, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687902, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687903, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687904, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687905, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687906, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687907, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687908, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687909, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 687910, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True


pos: 688879, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688880, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688881, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688882, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688883, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688884, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688885, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688886, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688887, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688888, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 688889, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:   84, max_D: True

pos: 719868, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719869, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719870, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719871, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719872, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719873, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719874, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719875, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719876, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719877, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True

pos: 719878, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    0, D:  117, max_D: True


pos: 781550, ref: T   , A:   12, C:    0, G:    0, T:    3, I:    1, D:  152, max_D: True

pos: 781551, ref: G   , A:    0, C:    0, G:   17, T:    0, I:    1, D:  150, max_D: True

pos: 781552, ref: A   , A:   18, C:    0, G:    0, T:    0, I:    1, D:  149, max_D: True

pos: 781553, ref: T   , A:    0, C:    0, G:    1, T:   17, I:    1, D:  149, max_D: True

pos: 781554, ref: T   , A:    0, C:    0, G:    0, T:   17, I:    1, D:  150, max_D: True

pos: 781555, ref: A   , A:    3, C:    0, G:    0, T:    0, I:    0, D:  164, max_D: True

pos: 781556, ref: A   , A:    3, C:    0, G:    1, T:    0, I:    1, D:  163, max_D: True

pos: 781557, ref: G   , A:    0, C:    0, G:   15, T:    1, I:    1, D:  151, max_D: True

pos: 781558, ref: G   , A:    0, C:    0, G:   14, T:    2, I:    1, D:  151, max_D: True

pos: 781559, ref: C   , A:    0, C:   16, G:    1, T:    0, I:   11, D:  150, max_D: True

pos: 781560, ref: A   , A:   15, C:    2, G:    0, T:    0, I:    0, D:  150, max_D: True


**Observations**: <br>
1. There are positions where the number of deletions is only slightly greater than the other counts. <br>
1. There are positions where all other counts are 0 (or really small), but the number of deletions is high (or sometimes low). <br>
1. There are positions where the other counts are not so small, but the number of deletions is still high. <br>

**Question**: <br>
What should be the threshold for deciding that some pileup truly is a deletion?


## Investigate insertions

In [48]:
# Collect the pileup records that report at least one insertion, visiting the
# contigs in the order given by `bam_file.references`. Collection stops as soon
# as `max_insertion_pileups` records have been gathered.
max_insertion_pileups = 1000000
insertion_pileups = list()

# Open the BAM file in a `with` block so the handle is closed when the scan
# finishes, stops early, or raises (the original never closed it).
with pysam.AlignmentFile(bam_file_path) as bam_file:
    for contig_name in bam_file.references:
        done = False
        for record in pysamstats.stat_variation(bam_file, chrom=contig_name, fafile=reference_fasta_path):
            if record['insertions'] > 0:
                insertion_pileups.append(record)
                if len(insertion_pileups) == max_insertion_pileups:
                    # Cap reached: leave the record loop and then the contig loop.
                    done = True
                    break
        if done:
            break

In [49]:
# Print every collected pileup whose insertion count is strictly greater than
# each of the A/C/G/T counts and the deletion count, then print how many such
# pileups were found.
num_max_I = 0
insertion_line = 'pos: {:4}, ref: {:4}, A: {:4}, C: {:4}, G: {:4}, T: {:4}, I: {:4}, D: {:4}, max_I: {}\n'
for pileup in insertion_pileups:
    base_counts = [pileup[base] for base in ('A', 'C', 'G', 'T')]
    insertion_count = pileup['insertions']
    deletion_count = pileup['deletions']
    if insertion_count > np.max(base_counts + [deletion_count]):
        num_max_I += 1
        # Field order matches the template: pos, ref, A, C, G, T, I, D, flag.
        fields = (pileup['pos'], pileup['ref']) + tuple(base_counts) + (insertion_count, deletion_count, True)
        print(insertion_line.format(*fields))
print(num_max_I)

pos: 224787, ref: G   , A:    7, C:    0, G:  138, T:    2, I:  227, D:    4, max_I: True

pos: 227492, ref: G   , A:   78, C:    0, G:   53, T:    3, I:   81, D:   20, max_I: True

pos: 270554, ref: G   , A:    0, C:    0, G:   11, T:    1, I:   14, D:    0, max_I: True

pos: 270556, ref: C   , A:    0, C:   12, G:    0, T:    0, I:   16, D:    3, max_I: True

pos: 270558, ref: A   , A:   12, C:    0, G:    0, T:    0, I:   18, D:    1, max_I: True

pos: 270560, ref: G   , A:    0, C:    0, G:   12, T:    0, I:   13, D:    2, max_I: True

pos: 270561, ref: C   , A:    0, C:   12, G:    0, T:    0, I:   23, D:    2, max_I: True

pos: 270566, ref: G   , A:    0, C:    0, G:   11, T:    0, I:   13, D:   10, max_I: True

pos: 270571, ref: G   , A:    0, C:    0, G:   12, T:    0, I:   18, D:    4, max_I: True

pos: 270573, ref: G   , A:    0, C:    0, G:   10, T:    1, I:   24, D:    2, max_I: True

pos: 270584, ref: G   , A:    0, C:    0, G:   12, T:    0, I:   20, D:    4, max_I: True



pos: 290942, ref: A   , A:    1, C:    0, G:    0, T:    0, I:    3, D:    2, max_I: True

pos: 290943, ref: A   , A:    1, C:    0, G:    0, T:    0, I:    4, D:    1, max_I: True

pos: 290945, ref: C   , A:    0, C:    0, G:    1, T:    0, I:    6, D:    0, max_I: True

pos: 290946, ref: G   , A:    0, C:    0, G:    1, T:    0, I:    7, D:    1, max_I: True

pos: 290947, ref: T   , A:    0, C:    0, G:    0, T:    1, I:    4, D:    2, max_I: True

pos: 290948, ref: C   , A:    0, C:    1, G:    0, T:    0, I:    5, D:    1, max_I: True

pos: 290950, ref: T   , A:    0, C:    0, G:    0, T:    1, I:    5, D:    1, max_I: True

pos: 290952, ref: C   , A:    0, C:    1, G:    0, T:    0, I:    3, D:    0, max_I: True

pos: 290954, ref: G   , A:    0, C:    0, G:    1, T:    0, I:    5, D:    0, max_I: True

pos: 290955, ref: A   , A:    1, C:    0, G:    0, T:    0, I:    7, D:    1, max_I: True

pos: 290956, ref: G   , A:    0, C:    0, G:    1, T:    0, I:    5, D:    0, max_I: True

pos: 567139, ref: G   , A:    1, C:    0, G:   15, T:    0, I:   16, D:    5, max_I: True

pos: 567155, ref: G   , A:    0, C:    0, G:   19, T:    0, I:   20, D:    2, max_I: True

pos: 567181, ref: C   , A:    0, C:   19, G:    1, T:    0, I:   21, D:    3, max_I: True

pos: 567381, ref: T   , A:    0, C:    0, G:    0, T:   20, I:   25, D:    1, max_I: True

pos: 567464, ref: C   , A:    0, C:   20, G:    0, T:    0, I:   21, D:    2, max_I: True

pos: 567647, ref: C   , A:    0, C:   21, G:    0, T:    0, I:   24, D:    1, max_I: True

pos: 567900, ref: C   , A:    0, C:    0, G:    0, T:    0, I:    1, D:    0, max_I: True

pos: 567907, ref: G   , A:    0, C:    0, G:    0, T:    0, I:    1, D:    0, max_I: True

pos: 567912, ref: A   , A:    0, C:    0, G:    0, T:    0, I:    1, D:    0, max_I: True

pos: 567914, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    1, D:    0, max_I: True

pos: 567916, ref: T   , A:    0, C:    0, G:    0, T:    0, I:    1, D:    0, max_I: True


**Observations**: <br>
1. There are positions where the number of insertions is only slightly greater than the other counts. <br>
1. There are positions where all other counts are really small, but the number of insertions is high (or sometimes low). 
<br>
1. There are positions where the other counts are not so small and the number of insertions is around twice as large as the others. <br>

**Question**: <br>
What should be the threshold for deciding that some pileup truly is an insertion?
