Skip to content

Commit

Permalink
Working version of kmer plotting script as well as fix to correct_raw…
Browse files Browse the repository at this point in the history
… profiling code.
  • Loading branch information
marcus1487 committed Sep 12, 2016
1 parent d70d766 commit fc31d1e
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 36 deletions.
28 changes: 20 additions & 8 deletions correct_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,24 @@ def correct_raw_data(
def get_aligned_seq_worker(
filenames_q, failed_reads_q, genome_filename, graphmap_path,
basecall_group, corrected_group, timeout, num_cpts_limit):
if DO_PROFILE:
import cProfile
cProfile.runctx(
"""while not filenames_q.empty():
try:
fn = filenames_q.get(block=False)
except Queue.Empty:
break
try:
correct_raw_data(
fn, genome_filename, graphmap_path, basecall_group,
corrected_group, timeout=timeout,
num_cpts_limit=num_cpts_limit)
except Exception as e:
failed_reads_q.put((str(e), fn))""",
globals(), locals(), 'profile.correct_reads.prof')
sys.exit()
while not filenames_q.empty():
try:
fn = filenames_q.get(block=False)
Expand Down Expand Up @@ -680,16 +698,10 @@ def main():
timeout, num_cpts_limit, basecall_group, corrected_group,
write_failed) = parse_arguments()

if DO_PROFILE: num_p = 1

files = [os.path.join(filebase, fn) for fn in os.listdir(filebase)]

if DO_PROFILE:
import cProfile
cProfile.runctx(
"""failed_reads = get_all_reads_data(
files, genome_filename, graphmap_path, basecall_group,
corrected_group, num_p, timeout, num_cpts_limit)""",
globals(), locals(), 'profile.correct_reads.prof')
sys.exit()
failed_reads = get_all_reads_data(
files, genome_filename, graphmap_path, basecall_group,
corrected_group, num_p, timeout, num_cpts_limit)
Expand Down
125 changes: 97 additions & 28 deletions plot_kmer_quantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,74 +4,143 @@

import numpy as np

from itertools import combinations
from itertools import repeat
from collections import defaultdict

def plot_kmer_dist(files, corrected_group, kmer_val, kmer_thresh, fn_base):
try:
import rpy2.robjects as r
from rpy2.robjects.packages import importr
ggplot = importr("ggplot2")
r.r('''
plotKmerDist <- function(dat){
print(ggplot(dat) + geom_boxplot(aes(x=Trimer, y=Signal, color=Base)) +
theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1, size=8)) +
scale_color_manual(values=c('#00CC00', '#0000CC', '#FFB300', '#CC0000')))
}
''')
plotKmerDist = r.globalenv['plotKmerDist']
r.r('''
plotKmerDistWReadPath <- function(dat){
print(ggplot(dat) + geom_boxplot(aes(x=Trimer, y=Signal, color=Base)) +
theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1, size=8)) +
scale_color_manual(values=c('#00CC00', '#0000CC', '#FFB300', '#CC0000')))
print(ggplot(dat) + geom_path(aes(x=Trimer, y=Signal, group=Read), alpha=0.1) +
theme_bw() + theme(axis.text.x=element_text(angle=60, hjust=1, size=8)) +
scale_color_manual(values=c('#00CC00', '#0000CC', '#FFB300', '#CC0000')))
}
''')
plotKmerDistWReadPath = r.globalenv['plotKmerDistWReadPath']
except:
sys.stderr.write(
'*' * 60 + '\nERROR: Must have rpy2, R and ' +
'R package ggplot2 installed in order to plot.\n' +
'*' * 60 + '\n\n')
raise


def plot_kmer_dist(files, corrected_group, read_mean, kmer_len,
kmer_thresh, fn_base):
all_raw_data = []
for fn in files:
read_data = h5py.File(fn)
if 'Analyses/RawGenomeCorrected' not in read_data:
if 'Analyses/' + corrected_group not in read_data:
continue
seq = ''.join(read_data['Analyses/' + corrected_group +
'/template/Events']['base'])
means = np.array(read_data['Analyses/' + corrected_group +
'/template/Events']['mean'])
'/template/Events']['norm_mean'])
all_raw_data.append((seq, means))

def get_mean_sd(norm_quantiles):
all_trimers = defaultdict(list)
for seq, means in all_raw_data:
read_robust_med = np.percentile(means, norm_quantiles).mean()
#read_mad = np.median(np.abs(means - read_robust_med))
#norm_means = (means - read_robust_med) / read_mad
norm_means = means - read_robust_med

read_trimers = defaultdict(list)
for trimer, event_mean in zip(
[''.join(bs) for bs in zip(
seq[:-2], seq[1:-1], seq[2:])],
norm_means[2:]):
read_trimers[trimer].append(event_mean)
if min(len(x) for x in read_trimers.values()) > kmer_thresh:
for trimer, event_means in read_trimers.items():
all_trimers[trimer].append(np.mean(event_mean))
all_trimers = defaultdict(list)
for read_i, (seq, means) in enumerate(all_raw_data):
read_trimers = defaultdict(list)
for trimer, event_mean in zip(
[''.join(bs) for bs in zip(*[
seq[i:] for i in range(kmer_len)])],
means[kmer_len - 1:]):
read_trimers[trimer].append(event_mean)
if min(len(x) for x in read_trimers.values()) > kmer_thresh:
for trimer, trimer_means in read_trimers.items():
if read_mean:
all_trimers[trimer].append((np.mean(trimer_means), read_i))
else:
all_trimers[trimer].extend(zip(trimer_means, repeat(read_i)))

kmer_levels = [kmer for means, kmer in sorted([
(np.mean(zip(*means)[0]), kmer)
for kmer, means in all_trimers.items()])]

plot_data = [
(kmer, kmer[-1], sig_mean, read_i)
for kmer in kmer_levels
for sig_mean, read_i in all_trimers[kmer]]

trimerDat = r.DataFrame({
'Trimer':r.FactorVector(
r.StrVector(zip(*plot_data)[0]),
ordered=True, levels=r.StrVector(kmer_levels)),
'Base':r.StrVector(zip(*plot_data)[1]),
'Signal':r.FloatVector(zip(*plot_data)[2]),
'Read':r.StrVector(zip(*plot_data)[3])})
# code to plot kmers as tile of colors but adds gridExtra dependency
if False:
kmer_plot_data = [
(kmer_i, pos_i, base) for kmer_i, kmer in enumerate(kmer_leves)
for pos_i, base in enumerate(kmer)]
kmerDat = r.DataFrame({
'Kmer':r.IntVector(zip(*kmer_plot_data)[0]),
'Position':r.IntVector(zip(*kmer_plot_data)[1]),
'Base':r.StrVector(zip(*kmer_plot_data)[2])})

if read_mean:
r.r('pdf("' + fn_base + '.read_mean.pdf", height=7, width=10)')
plotKmerDistWReadPath(trimerDat)
r.r('dev.off()')
else:
r.r('pdf("' + fn_base + '.pdf", height=7, width=10)')
plotKmerDist(trimerDat)
r.r('dev.off()')

return

def parse_arguments():
import argparse
parser = argparse.ArgumentParser(
description='Plot raw signal corrected with correct_raw.' )
description='Plot distribution of signal across kmers.' )
parser.add_argument('fast5_basedir',
help='Directory containing fast5 files.')

parser.add_argument('--corrected-group', default='RawGenomeCorrected_000',
help='FAST5 group to plot created by correct_raw ' +
'script. Default: %(default)s')

parser.add_argument('--kmer-value', default=3, type=int, options=(2,3,4),
parser.add_argument('--kmer-length', default=3, type=int, choices=(2,3,4),
help='Value of K to analyze. Should be one of ' +
'{2,3,4}. Default: %(default)d')
parser.add_argument('--num-trimer-threshold', default=4, type=int,
help='Number of each kmer required to include ' +
'a read in read level averages. Default: %(default)d')

parser.add_argument('--pdf-filebase', default='Nanopore_read_coverage',
parser.add_argument('--read-mean', default=False, action='store_true',
help='Plot kmer event means across reads as opposed '
'to each event.')

parser.add_argument('--pdf-filebase', default='Nanopore_kmer_distribution',
help='Base for PDF to store plots (suffix depends ' +
'on plot type to avoid overwriting plots). ' +
'Default: %(default)s')
args = parser.parse_args()

return (args.fast5_basedir, args.corrected_group,
args.kmer_value, args.num_trimers_thresh, args.pdf_filebase)
args.kmer_length, args.num_trimer_threshold,
args.read_mean, args.pdf_filebase)

def main():
(filebase, corrected_group, kmer_val,
kmer_thresh, fn_base) = parse_arguments()
(filebase, corrected_group, kmer_len,
kmer_thresh, read_mean, fn_base) = parse_arguments()

files = [os.path.join(filebase, fn) for fn in os.listdir(filebase)]
plot_kmer_dist(files, corrected_group, kmer_val, kmer_thresh, fn_base)
plot_kmer_dist(files, corrected_group, read_mean, kmer_len, kmer_thresh, fn_base)

return

Expand Down

0 comments on commit fc31d1e

Please sign in to comment.