In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import shutil
import random
import pprint
import itertools
import functools
import collections

import pysam
import pyranges as pr
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import sys

# Directory added to the import path; presumably the development checkout that
# provides the `handygenome` package imported below (TODO confirm).
DEV_PACKAGE_DIR = '/home/users/pjh/scripts/python_genome_package_dev/'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if DEV_PACKAGE_DIR not in sys.path:
    sys.path.append(DEV_PACKAGE_DIR)

In [3]:
from handygenome import common
from handygenome.common import ChromDict
from handygenome.variant.vcfspec import Vcfspec
from handygenome.sv.breakends import Breakends
from handygenome.variant.variantplus import VariantPlus, VariantPlusList
from handygenome.igvhandle import IGVHandle

[W::hts_idx_load3] The index file is older than the data file: /home/users/pjh/scripts/python_genome_packages/data/popfreq/dbSNP_b155_GRCh37.p13.vcf.gz.csi


In [3]:
# Reference genomes. For each assembly keep the FASTA path, an open pysam
# handle on that FASTA, and a ChromDict built from the same path.

# --- hg19 (human_g1k_v37) ---
FASTA_PATH_HG19 = "/home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta"
FASTA_HG19 = pysam.FastaFile(FASTA_PATH_HG19)
CHROMDICT_HG19 = ChromDict(fasta_path=FASTA_PATH_HG19)

# --- hg38 (Homo_sapiens_assembly38) ---
FASTA_PATH_HG38 = "/home/users/data/01_reference/human_g1k_v38/Homo_sapiens_assembly38.fasta"
FASTA_HG38 = pysam.FastaFile(FASTA_PATH_HG38)
CHROMDICT_HG38 = ChromDict(fasta_path=FASTA_PATH_HG38)

# Fetch cache performance check (FetchedRefseq and FetchedReads)

In [21]:
from handygenome.read import fetchcache
# Wrap the hg19 FASTA handle in a FetchedRefseq object; the cells below compare
# fetching through it against calling FASTA_HG19.fetch directly.
fetchedrefseq = fetchcache.FetchedRefseq(FASTA_HG19)

In [22]:
# First request through the cache object. The "fasta fetch" line in the
# recorded output presumably signals that the underlying FASTA was actually read.
fetchedrefseq.fetch('1', 12345678, 12345690)

fasta fetch


'ATCATTTCACTC'

In [23]:
# Inspect the cache state: the recorded repr shows one cluster spanning
# 12345478-12345890, i.e. the requested range extended by 200 on each side.
fetchedrefseq

<FetchedRefseq object, which consists of:
	<FetchedRefseqSinglechrom object (chrom=1, clusters=[<FetchedRefseqCluster object (chrom:1, start0:12345478, end0:12345890)>])>
>

In [24]:
# Second request, lying inside the cluster created by the first fetch. No
# "fasta fetch" message appears in the recorded output, so it is presumably
# answered from the cached sequence.
fetchedrefseq.fetch('1', 12345685, 12345699)

'CACTCTACATACGT'

In [25]:
# Re-inspect the cache: the recorded repr still shows the same single cluster,
# so the second fetch did not add or extend a cluster.
fetchedrefseq

<FetchedRefseq object, which consists of:
	<FetchedRefseqSinglechrom object (chrom=1, clusters=[<FetchedRefseqCluster object (chrom:1, start0:12345478, end0:12345890)>])>
>

In [26]:
# Sanity check: the same range fetched straight from the pysam handle should
# give the same sequence as the cached fetch above.
FASTA_HG19.fetch('1', 12345685, 12345699)

'CACTCTACATACGT'

In [27]:
%%timeit
# Baseline timing: direct pysam FASTA fetch (about 868 ns per call in the recorded run).
FASTA_HG19.fetch('1', 12345685, 12345699)

868 ns ± 1.13 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [28]:
%%timeit
# Same range through FetchedRefseq. In the recorded run this took about 10.4 ms
# per call, several orders of magnitude slower than the direct fetch above.
fetchedrefseq.fetch('1', 12345685, 12345699)

10.4 ms ± 58.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
# Open the LU-14 tumor BAM used for the read-fetching benchmarks below.
bam = pysam.AlignmentFile('/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/02_BAM/14/LU-14.tumor.bam')

In [78]:
# Benchmark window: 100 positions on chromosome 1, beginning at start0.
chrom, start0 = '1', 12335685
end0 = start0 + 100

# Build a fresh FetchedReads cache on the BAM and materialise the reads it
# returns for the window.
fetchedreads = fetchcache.FetchedReads(bam)
reads = [*fetchedreads.fetch(chrom, start0, end0)]

In [79]:
# Inspect the cache state: the recorded repr shows one cluster fetched over
# 12,335,485-12,335,985, i.e. the requested window extended by 200 on each side.
fetchedreads

<FetchedReads object, which consists of:
	<FetchedReadsSinglechrom object (chrom=1, clusters=[<FetchedReadsCluster object (chrom=1, fetch_start0=12,335,485, fetch_end0=12,335,985)>])>
>

In [84]:
%%timeit -n 1 -r 10
# Baseline timing: direct pysam BAM fetch over the window (about 2.82 ms in the recorded run).
reads = list(bam.fetch(chrom, start0, end0))

2.82 ms ± 70.5 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [85]:
%%timeit -n 1 -r 10
# Same window through the already-populated FetchedReads cache (about 10.8 ms
# in the recorded run, slower than the direct fetch above).
reads = list(fetchedreads.fetch(chrom, start0, end0))

10.8 ms ± 573 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [86]:
%%timeit -n 1 -r 10
# Cost including cache construction: build a new FetchedReads each time and
# fetch through it (about 22 ms in the recorded run).
fetchedreads = fetchcache.FetchedReads(bam)
reads = list(fetchedreads.fetch(chrom, start0, end0))

22 ms ± 1.32 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [93]:
# Preview only the first five entries (ReadUID -> AlignedSegment) of the first
# chromosome-1 cluster. Displaying the whole mapping floods the output area;
# the previously recorded output was cut off mid-entry.
collections.OrderedDict(
    itertools.islice(fetchedreads.subdata['1'].clusters[0].dict.items(), 5)
)

OrderedDict([(ReadUID(qname='ST-E00130:582:HC7KJALXX:7:2105:18629:38544', flag=83, chrom='1', pos0=12335336),
              <pysam.libcalignedsegment.AlignedSegment at 0x2b3f6a14f1c0>),
             (ReadUID(qname='ST-E00130:582:HC7KJALXX:5:1101:27306:54278', flag=99, chrom='1', pos0=12335338),
              <pysam.libcalignedsegment.AlignedSegment at 0x2b3f6a14fd00>),
             (ReadUID(qname='ST-E00130:582:HC7KJALXX:2:1207:20963:47404', flag=83, chrom='1', pos0=12335339),
              <pysam.libcalignedsegment.AlignedSegment at 0x2b3f6a14fee0>),
             (ReadUID(qname='ST-E00130:582:HC7KJALXX:8:2105:23277:22018', flag=99, chrom='1', pos0=12335340),
              <pysam.libcalignedsegment.AlignedSegment at 0x2b3f6a14f5e0>),
             (ReadUID(qname='ST-E00130:582:HC7KJALXX:5:2103:21115:65353', flag=83, chrom='1', pos0=12335346),
              <pysam.libcalignedsegment.AlignedSegment at 0x2b3f6972f820>),
             (ReadUID(qname='ST-E00130:582:HC7KJALXX:2:1108:17969:3944

In [95]:
from handygenome.read.readhandler import ReadUID

# Rebuild by hand the ReadUID of the first read listed in the cluster dict
# shown earlier, so it can be used as a lookup key in the timing cells below.
readuid = ReadUID(
    qname='ST-E00130:582:HC7KJALXX:7:2105:18629:38544',
    flag=83,
    chrom='1',
    pos0=12335336,
)

In [97]:
%%timeit
# Look up a single read by its ReadUID via the cache (about 1.23 µs in the recorded run).
fetchedreads.get_read(readuid)

1.23 µs ± 16.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [98]:
%%timeit
# Cache-free equivalent of the lookup above: fetch the reads at the read's start
# position straight from the BAM and scan until query name and flag both match
# (about 2.79 ms in the recorded run).
for read in bam.fetch('1', 12335336, 12335336 + 1):
    if read.query_name == 'ST-E00130:582:HC7KJALXX:7:2105:18629:38544' and read.flag == 83:
        break

2.79 ms ± 7.19 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
from handygenome.read import pileup as libpileup

In [7]:
# Inputs for the pileup checks below: the LU-14 tumor BAM and a window of
# 100 positions on chromosome 1.
bam = pysam.AlignmentFile(
    '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/02_BAM/14/LU-14.tumor.bam'
)
chrom = '1'
start0 = 12_335_685
end0 = 100 + start0

In [22]:
# Build a pileup object for the window from the BAM and the hg19 FASTA handle.
pileup = libpileup.get_pileup(chrom, start0, end0, bam, FASTA_HG19)

In [24]:
# Reference sequence as reported by the pileup; compared against a direct FASTA fetch below.
pileup.get_ref_seq()

'GTGTTTTGATTTCCAAATTTATAGGTAAATTCACACTTAGAAAATGGTAAACAAAGATATTATGCTTAAGAGTTTTCCTCAGTTTTCTTAAACAAACTGG'

In [15]:
# Peek at the pileup's private reference-sequence cache. In the recorded output
# it is a dict keyed by a (start0, end0) tuple holding the sequence string.
pileup._ref_seq_cache

{(12335685,
  12335785): 'GTGTTTTGATTTCCAAATTTATAGGTAAATTCACACTTAGAAAATGGTAAACAAAGATATTATGCTTAAGAGTTTTCCTCAGTTTTCTTAAACAAACTGG'}

In [18]:
# Sanity check: a direct FASTA fetch of the same window should match pileup.get_ref_seq().
FASTA_HG19.fetch(chrom, start0, end0)

'GTGTTTTGATTTCCAAATTTATAGGTAAATTCACACTTAGAAAATGGTAAACAAAGATATTATGCTTAAGAGTTTTCCTCAGTTTTCTTAAACAAACTGG'

In [25]:
%%timeit
# Timing of the pileup's reference-sequence accessor (about 3.48 µs in the recorded run).
pileup.get_ref_seq()

3.48 µs ± 48.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [29]:
%%timeit
# Same fetch through the pileup's own attributes, bypassing get_ref_seq
# (about 3.31 µs in the recorded run).
pileup.fasta.fetch(pileup.chrom, pileup.start0, pileup.end0)

3.31 µs ± 48.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [26]:
%%timeit
# Baseline: direct fetch from the module-level FASTA handle using plain local
# variables (about 1.45 µs in the recorded run).
FASTA_HG19.fetch(chrom, start0, end0)

1.45 µs ± 10.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
