In [61]:
import progressbar
import pysam
import pysamstats
import numpy as np

def _generate_pileups(bam_file_path, reference_fasta_path, include_indels=True):
    """
    Generate pileups from reads alignment to reference.

    :param contig: name of contig in .fasta file, needed for generating pileups
    :type contig: str
    :param bam_file_path: path to .bam file containing alignments
    :type bam_file_path: str
    :param reference_fasta_path: path to .fasta file
    :type reference_fasta_path: str
    :param include_indels: flag which indicates whether to include indels in
        pileups
    :type include_indels: bool
    :return: pileups (X)
    :rtype: np.ndarray
    """
    bamfile = pysam.AlignmentFile(bam_file_path)

    if include_indels:
        info_of_interest = ['A', 'C', 'G', 'T', 'insertions', 'deletions']
    else:
        info_of_interest = ['A', 'C', 'G', 'T']

    pileups = [np.zeros((bamfile.get_reference_length(contig_name), len(info_of_interest)))
               for contig_name in bamfile.references]
    
    total_length = np.sum([bamfile.get_reference_length(contig_name) for contig_name in bamfile.references])
    with progressbar.ProgressBar(max_value=total_length) as progress_bar:
        for contig_id, contig_name in enumerate(bamfile.references):
            for record in pysamstats.stat_variation(bamfile, chrom=contig_name,
                                                    fafile=reference_fasta_path):
                progress_bar.update(record['pos'])
                for i, info in enumerate(info_of_interest):
                    pileups[contig_id][record['pos']][i] += record[info]
    return np.concatenate(pileups, axis=0), bamfile.references

In [3]:
bam_file_path = '/home/diplomski-rad/blade/pb/klebsiela-pneumoniae-NCTC204-BROKEN/reads-to-ref-sorted.bam'
bamfile = pysam.AlignmentFile(bam_file_path)

In [16]:
bamfile.get_reference_name(2)

'NC_016846.1'

In [17]:
bamfile.references

('NC_016845.1',
 'NC_016838.1',
 'NC_016846.1',
 'NC_016839.1',
 'NC_016840.1',
 'NC_016847.1',
 'NC_016841.1')

In [19]:
for ref in bamfile.references:
    print(bamfile.get_reference_length(ref))

5333942
122799
111195
105974
3751
3353
1308


In [28]:
bam_file_path = '/home/diplomski-rad/blade/pb/klebsiela-pneumoniae-NCTC204-BROKEN/reads-to-ref-sorted.bam'
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
include_indels = False
pileups = _generate_pileups(bam_file_path, reference_fasta_path, include_indels=include_indels)

100% (5682322 of 5682322) |##############| Elapsed Time: 0:04:50 Time:  0:04:504


In [29]:
len(pileups)

7

In [31]:
for pileup in pileups:
    print(pileup.shape)

(5333942, 4)
(122799, 4)
(111195, 4)
(105974, 4)
(3751, 4)
(3353, 4)
(1308, 4)


In [42]:
for pileup in pileups:
    print(np.max(pileup))

457.0
39.0
190.0
0.0
0.0
0.0
0.0


In [47]:
import sys

print(sys.getsizeof(pileups))
for pileup in pileups:
    print(sys.getsizeof(pileup)/(1 << 20))

128
162.7790985107422
3.7476348876953125
3.3935089111328125
3.2341766357421875
0.1145782470703125
0.1024322509765625
0.0400238037109375


In [54]:
bam_file_path = '/home/diplomski-rad/blade/pb/klebsiela-pneumoniae-NCTC204-BROKEN/reads-to-ref-sorted.bam'
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
include_indels = False
pileups = _generate_pileups(bam_file_path, reference_fasta_path, include_indels=include_indels)

100% (5682322 of 5682322) |##############| Elapsed Time: 0:07:25 Time:  0:07:256


In [55]:
print(sys.getsizeof(pileups))
for pileup in pileups:
    print(sys.getsizeof(pileup)/(1 << 20))

128
40.694854736328125
0.9369888305664062
0.8484573364257812
0.808624267578125
0.02872467041015625
0.02568817138671875
0.0100860595703125


In [62]:
bam_file_path = '/home/diplomski-rad/blade/pb/klebsiela-pneumoniae-NCTC204-BROKEN/reads-to-ref-sorted.bam'
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
include_indels = False
pileups = _generate_pileups(bam_file_path, reference_fasta_path, include_indels=include_indels)

100% (5682322 of 5682322) |##############| Elapsed Time: 0:04:52 Time:  0:04:523


In [64]:
print(sys.getsizeof(pileups)/(1 << 20))

173.4108123779297


In [98]:
from Bio import SeqIO

def _generate_ground_truth(reference_fasta_path, ordered_contigs):
    """
    Generates ground truth - nucleus bases from reference.
    
    It parses all contigs.
    
    :param reference_fasta_path: path to .fasta file
    :type reference_fasta_path: str
    :return: nucleus bases from reference (y)
    :rtype: np.ndarray
    """
    record_dict = SeqIO.to_dict(SeqIO.parse(reference_fasta_path, 'fasta'))
    total_options = 5
    y_oh = [np.zeros((len(record_dict[contig_name]), total_options)) for contig_name in ordered_contigs]
    # Last number in shape - 5 - is for letters other than A, C, G and T.
    mapping = {'A': 0, 'a': 0, 'C': 1, 'c': 1, 'G': 2, 'g': 2, 'T': 3, 't': 3}
    
    total_length = np.sum(len(record_dict[contig_name]) for contig_name in ordered_contigs)
    with progressbar.ProgressBar(max_value=total_length) as progress_bar:
        for contig_id, contig_name in enumerate(ordered_contigs):
            contig = record_dict[contig_name]
            print(contig_name, len(contig))
            for position, base in enumerate(contig.seq):
                progress_bar.update(position)
                y_oh[contig_id][position][mapping.get(base, -1)] = 1
    return np.concatenate(y_oh, axis=0)

In [66]:
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
record_dict = SeqIO.to_dict(SeqIO.parse(reference_fasta_path, 'fasta'))

In [69]:
print(record_dict.keys())

dict_keys(['NC_016840.1', 'NC_016839.1', 'NC_016847.1', 'NC_016841.1', 'NC_016846.1', 'NC_016845.1', 'NC_016838.1'])


In [81]:
for key in record_dict:
    print(record_dict[key])
    data = record_dict[key]
    print()
    print(type(data))
    print(len(data))
    print(len(data.seq))
    break

ID: NC_016840.1
Name: NC_016840.1
Description: NC_016840.1 Klebsiella pneumoniae subsp. pneumoniae HS11286 plasmid pKPHS4, complete sequence
Number of features: 0
Seq('TTTTTGAGCAGCGGGCTTTCCGGCGGTTTTCTCCTCTCAGCCCAGCAATGGTGC...GTC', SingleLetterAlphabet())

<class 'Bio.SeqRecord.SeqRecord'>
3751
3751


In [82]:
for key in record_dict:
    data = record_dict[key]
    print(len(data))

3751
105974
3353
1308
111195
5333942
122799


In [95]:
for ref, key in zip(bamfile.references, record_dict):
    print(bamfile.get_reference_length(ref), len(record_dict[key]))

5333942 3751
122799 105974
111195 3353
105974 1308
3751 111195
3353 5333942
1308 122799


In [86]:
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
y = _generate_ground_truth(reference_fasta_path)

100% (5682322 of 5682322) |##############| Elapsed Time: 0:00:32 Time:  0:00:328


In [87]:
print(len(y))

7


In [88]:
for yi in y:
    print(yi.shape)

(3751, 5)
(105974, 5)
(3353, 5)
(1308, 5)
(111195, 5)
(5333942, 5)
(122799, 5)


In [91]:
for yi in y:
    print(sys.getsizeof(yi)/(1 << 20))

0.14319610595703125
4.042694091796875
0.12801361083984375
0.0500030517578125
4.241859436035156
203.47384643554688
4.684516906738281


In [93]:
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
y = _generate_ground_truth(reference_fasta_path)

100% (5682322 of 5682322) |##############| Elapsed Time: 0:00:31 Time:  0:00:31


In [94]:
print(y.shape)

(5682322, 5)


In [99]:
reference_fasta_path = '/home/data/pacific_biosciences/bacteria/klebsiela/pneumoniae/klebsiella_pneumoniae_reference.fasta'
ordered_contigs = bamfile.references
y = _generate_ground_truth(reference_fasta_path, ordered_contigs)

  0% (29226 of 5682322) |                | Elapsed Time: 0:00:00 ETA:   0:00:38

NC_016845.1 5333942


  0% (25853 of 5682322) || Elapsed Time: 0:00:30 ETA:  156608313 days, 10:25:49

NC_016838.1 122799


  0% (31216 of 5682322) | | Elapsed Time: 0:00:31 ETA:  202318686 days, 6:40:54

NC_016846.1 111195


  0% (29446 of 5682322) || Elapsed Time: 0:00:31 ETA:  241677132 days, 20:03:00

NC_016839.1 105974


100% (5682322 of 5682322) |##############| Elapsed Time: 0:00:32 Time:  0:00:32


NC_016840.1 3751
NC_016847.1 3353
NC_016841.1 1308
