In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

In [None]:
%matplotlib inline
# Notebook-wide matplotlib defaults: figure size (width, height in inches)
# and base font size, applied in a single update.
plt.rcParams.update({
    'figure.figsize': [8, 4],
    'font.size': 12,
})

In [None]:
# Root directory of the workflow output ('~' is expanded by get_counts_path).
WF_OUT = '~/analysis/1000kg_cnv/workflow-out'

# Sample ID -> list of bin-size labels (presumably the bin sizes the workflow
# was run with for that sample -- confirm against the workflow output tree).
SAMPLES = dict(
    HG00261=['1_000', '10_000', '50_000', '25_000'],
    HG00096=['1_000', '50_000', '25_000', '10_000'],
)

In [None]:
# Sample ID -> absolute path of that sample's BAM file. Both paths share the
# same directory layout and differ only in the sample ID and the trailing
# date stamp of the file name, so they are built from one template.
sample_bam_map = {
    sample_id: (
        '/usr/home/lkirk/data/ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/'
        f'{sample_id}/alignment/'
        f'{sample_id}.mapped.ILLUMINA.bwa.GBR.low_coverage.{date_stamp}.bam'
    )
    for sample_id, date_stamp in (
        ('HG00261', '20130415'),
        ('HG00096', '20120522'),
    )
}

In [None]:
def get_counts_path(workflow_dir, sample_id, bin_size, must_exist=True):
    """Locate the ``readcounts.npz`` file for one sample / bin-size pair.

    The path checked is ``<workflow_dir>/<sample_id>/<bin_size>/readcounts.npz``;
    a leading ``~`` in ``workflow_dir`` is expanded to the user's home directory.

    Parameters
    ----------
    workflow_dir : str or Path
        Root directory of the workflow output.
    sample_id : str
        Name of the per-sample sub-directory.
    bin_size : str
        Name of the per-bin-size sub-directory (e.g. ``'10_000'``).
    must_exist : bool, default True
        Whether a missing file is an error.

    Returns
    -------
    pathlib.Path or None
        The path when the file exists. ``None`` when the file is missing and
        ``must_exist`` is false.

    Raises
    ------
    FileNotFoundError
        If the file is missing and ``must_exist`` is true.
    """
    counts_path = Path(workflow_dir).expanduser() / sample_id / bin_size / 'readcounts.npz'
    if counts_path.exists():
        return counts_path
    if must_exist:
        # FileNotFoundError is an Exception subclass, so callers that caught
        # the previous bare Exception keep working; the message is unchanged.
        raise FileNotFoundError(f'{counts_path} does not exist')
    return None


def read_counts(wf_dir, sample_id, bin_size):
    """Load the ``readcounts.npz`` archive for one sample / bin-size pair.

    The file is located with ``get_counts_path`` using its default
    ``must_exist=True``, so a missing file raises there. The archive is
    opened with ``np.load`` with pickling disabled, and the object returned
    by ``np.load`` is handed back unchanged.
    """
    return np.load(
        get_counts_path(wf_dir, sample_id, bin_size),
        allow_pickle=False,
    )

In [None]:
# plt.bar(list(bins.keys()), np.array([len(bins[contig]) for contig in bins]))
# plt.xticks(rotation=90)
# plt.show()

[xarray documentation: indexing and selecting data](http://xarray.pydata.org/en/stable/indexing.html)

[Sequence Alignment/Map (SAM) format specification, SAMv1](https://samtools.github.io/hts-specs/SAMv1.pdf)

[xarray GitHub issue #1603](https://github.com/pydata/xarray/issues/1603)

In [None]:
# plt.hist(normed_counts, bins=100)
# plt.axvline(normed_counts.mean(), c='r')
# plt.axvline(np.median(normed_counts), c='y')
# plt.show()

In [None]:
# Load the read-count archive for sample HG00096 from the '10_000' directory.
readcounts = read_counts(wf_dir=WF_OUT, sample_id='HG00096', bin_size='10_000')

In [None]:
# Contig names to plot: '1' through '22' followed by 'X' and 'Y'.
CHROMOSOMES = [str(number) for number in range(1, 23)] + ['X', 'Y']

In [None]:
# Number of complete rows when the contigs are laid out three per row.
len(CHROMOSOMES) // 3

In [None]:
# One panel per contig: counts normalized by the contig total (C0 dots) with
# the same values shifted by their mean overlaid (red line).
n_cols = 3
# Ceiling division: every contig gets a panel even when len(CHROMOSOMES) is
# not a multiple of n_cols. Truncating division would allocate too few axes
# and the loop would index past the end of `axes`.
n_rows = (len(CHROMOSOMES) + n_cols - 1) // n_cols
# squeeze=False always yields a 2-D array of axes, so flatten() works for
# any grid shape, including a single row or a single panel.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()
for ax, contig in zip(axes, CHROMOSOMES):
    counts = readcounts[contig]
    normed_counts = counts / counts.sum()
    baseline = normed_counts.mean()
    ax.plot(normed_counts, 'C0.', alpha=.4)
    ax.plot(normed_counts - baseline, 'r', alpha=.4)
    ax.set_title(contig)
# Hide panels left over when the grid has more cells than contigs.
for unused_ax in axes[len(CHROMOSOMES):]:
    unused_ax.set_axis_off()
fig.tight_layout()
plt.show()

In [None]:
# from bio.countreads import generate_bins
# bins = {contig: dict(zip(('start', 'stop'), np.array(coord).T))
#         for contig, coord in generate_bins(in_bam, 50_000)}
# def get_region_mask(contig, start, stop):
#     contig_bins = bins[contig]
#     return (contig_bins['start'] > start) & (contig_bins['stop'] <= stop)