Skip to content

Commit

Permalink
Merge branch 'exp/indel-model' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Cooke committed Feb 4, 2019
2 parents 42fa364 + e39686c commit 8f40fc3
Show file tree
Hide file tree
Showing 38 changed files with 1,773 additions and 1,136 deletions.
154 changes: 154 additions & 0 deletions scripts/profiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python3

import argparse
import pandas as pd
from math import log10, ceil

def aggregate_errors(df, aggregators=['period', 'periods']):
if len(aggregators) == 0:
return df
errors_attributes = ['period', 'periods', 'tract_length', 'motif', 'indel_length', 'errors', 'reads']
errors_df = df[errors_attributes]
aggregation_functions = {'period': 'first', 'periods': 'first', 'errors': 'sum', 'reads': 'sum', 'tract_length': 'first'}
if 'indel_length' not in aggregators:
aggregation_functions['motif'] = 'first'
aggregation_functions['reads'] = 'first'
if 'motif' in aggregators:
errors_df = errors_df.groupby(aggregators).aggregate(aggregation_functions, axis='columns')
else:
errors_df = errors_df.groupby(aggregators + ['motif']).aggregate(aggregation_functions, axis='columns')
aggregation_functions['reads'] = 'sum'
else:
aggregation_functions['indel_length'] = 'first'
if 'motif' in aggregators:
aggregation_functions['motif'] = 'first'
errors_df = errors_df.groupby(aggregators).aggregate(aggregation_functions, axis='columns')
errors_df['error_rate'] = errors_df.apply(lambda row: row.errors / max(row.reads, 1), axis=1)
errors_attributes.append('error_rate')
if 'motif' not in aggregators:
del(errors_attributes[errors_attributes.index('motif')])
if 'indel_length' not in aggregators:
del(errors_attributes[errors_attributes.index('indel_length')])
return errors_df[errors_attributes]

def read_indel_profile(csv):
result = pd.read_csv(csv)
result['error_rate'] = result.apply(lambda row: row.errors / max(row.reads, 1), axis=1)
result['tract_length'] = result.apply(lambda row: row.period * row.periods, axis=1)
return result

def make_error_summaries(indel_profile_df):
result = {}
result['complex'] = indel_profile_df.query('motif == "N" and errors > 0')
result['homopolymer'] = indel_profile_df.query('period == 1 and periods > 0 and motif != "N"')
result['dinucleotide'] = indel_profile_df.query(
'period == 2 and periods > 0 and motif not in ["AA", "CC", "GG", "TT"]')
result['trinucleotide'] = indel_profile_df.query(
'period == 3 and periods > 0 and motif not in ["AAA", "CCC", "GGG", "TTT"]')
result['period'] = aggregate_errors(indel_profile_df)
result['length'] = aggregate_errors(indel_profile_df, ['periods', 'period', 'indel_length'])
motif_aggregators = ['periods', 'period', 'motif']
result['homopolymer-motif'] = aggregate_errors(result['homopolymer'], motif_aggregators)
result['dinucleotide-motif'] = aggregate_errors(result['dinucleotide'], motif_aggregators)
result['trinucleotide-motif'] = aggregate_errors(result['trinucleotide'], motif_aggregators)
return result

def read_indel_error_profile_summaries(indel_profile_df):
return make_error_summaries(indel_profile_df)

def read_indel_error_profile_and_summary(csv, model=None):
return read_indel_error_profile_summaries(read_indel_profile(csv))

def rate_to_phred(rate):
return -10 * log10(rate + 1e-100)

def complex_indel_penalty(profile_df):
return 43

def get_repeat_error_df(profile_df, pattern, max_periods):
profile_index = 'period'
query_condition = "periods <= " + str(max_periods)
if type(pattern) == int:
query_condition += " and period == " + str(pattern)
else:
if len(pattern) == 1:
motif = pattern[0]
motif_len = len(motif)
query_condition += " and motif == '" + motif + "'"
else:
motif_len = len(pattern[0])
query_condition += " and (motif == '" + pattern[0] + "'"
for motif in pattern[1:]:
" or motif == '" + motif + "'"
query_condition += ")"
if motif_len == 1:
profile_index = 'homopolymer-motif'
elif motif_len == 2:
profile_index = 'dinucleotide-motif'
elif motif_len == 3:
profile_index = 'trinucleotide-motif'
result = profile_df[profile_index].query(query_condition)
result['phred_error'] = result.apply(lambda row: rate_to_phred(row['error_rate']), axis=1)
return result

def make_empirical_indel_error_model_helper(profile_df, pattern, max_periods=50):
repeat_error_df = get_repeat_error_df(profile_df, pattern, max_periods)
complex_penalty = complex_indel_penalty(profile_df)
penalties = [complex_penalty, complex_penalty]
for periods in range(2, max_periods):
try:
penalties.append(float(list(repeat_error_df.query('periods == ' + str(periods))['phred_error'])[0]))
except IndexError:
break
return [max(int(ceil(penalty)) + 2, 3) for penalty in penalties]

def max_lt(seq, val):
return max(v for v in seq if v < val)

def smooth_empirical_model(open_model, extend_model=10):
# Rules: open[i] <= open[i + 1] + extend[i + 1]
head_penalties = open_model[:2]
max_penalty = max_lt(open_model[2:], 50)
result = [max_penalty, max_penalty]
for i, penalty in enumerate(open_model[2:], 2):
if i < len(open_model) - 1 and penalty < open_model[i + 1]:
penalty = int((result[i - 1] + min(result[i - 1], open_model[i + 1])) / 2)
result.append(max(min(penalty, result[i - 1]), result[i - 1] - extend_model))
result[:2] = head_penalties
return result

def make_octopus_indel_error_model(profile_df):
result = {}
for pattern in [('A', 'T'), ('C', 'G'), 2, 3]:
model = smooth_empirical_model(make_empirical_indel_error_model_helper(profile_df, pattern))
if type(pattern) == int:
result[pattern * 'N'] = model
else:
for motif in pattern:
result[motif] = model
if 'N' not in result:
result['N'] = result['A']
return result

def write_error_model(error_model, out_path):
with open(out_path, 'w') as file:
for motif, model in error_model.items():
file.write(motif + ":" + ','.join([str(p) for p in model]) + '\n')

def main(options):
profiles_df = read_indel_error_profile_and_summary(options.profile)
error_model = make_octopus_indel_error_model(profiles_df)
write_error_model(error_model, options.output)

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-P','--profile',
type=str,
required=True,
help='Octopus data profile')
parser.add_argument('-O','--output',
type=str,
required=True,
help='Output prefix')
parsed, unparsed = parser.parse_known_args()
main(parsed)
21 changes: 8 additions & 13 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -424,24 +424,20 @@ set(CORE_SOURCES
core/models/pairhmm/simd_pair_hmm.hpp
core/models/pairhmm/simd_pair_hmm.cpp

core/models/error/hiseq_indel_error_model.hpp
core/models/error/hiseq_indel_error_model.cpp
core/models/error/x10_indel_error_model.hpp
core/models/error/x10_indel_error_model.cpp
core/models/error/umi_indel_error_model.hpp
core/models/error/umi_indel_error_model.cpp
core/models/error/indel_error_model.hpp
core/models/error/indel_error_model.cpp
core/models/error/repeat_based_indel_error_model.hpp
core/models/error/repeat_based_indel_error_model.cpp
core/models/error/repeat_based_snv_error_model.hpp
core/models/error/repeat_based_snv_error_model.cpp
core/models/error/snv_error_model.hpp
core/models/error/snv_error_model.cpp
core/models/error/hiseq_snv_error_model.hpp
core/models/error/hiseq_snv_error_model.cpp
core/models/error/x10_snv_error_model.hpp
core/models/error/x10_snv_error_model.cpp
core/models/error/umi_snv_error_model.hpp
core/models/error/umi_snv_error_model.cpp
core/models/error/error_model_factory.hpp
core/models/error/error_model_factory.cpp
core/models/error/basic_repeat_based_indel_error_model.hpp
core/models/error/basic_repeat_based_indel_error_model.cpp
core/models/error/custom_repeat_based_indel_error_model.hpp
core/models/error/custom_repeat_based_indel_error_model.cpp

core/models/mutation/somatic_mutation_model.hpp
core/models/mutation/somatic_mutation_model.cpp
Expand All @@ -451,7 +447,6 @@ set(CORE_SOURCES
core/models/mutation/denovo_model.cpp
core/models/mutation/indel_mutation_model.hpp
core/models/mutation/indel_mutation_model.cpp


core/models/reference/individual_reference_likelihood_model.hpp
core/models/reference/individual_reference_likelihood_model.cpp
Expand Down
29 changes: 12 additions & 17 deletions src/config/option_collation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1598,21 +1598,17 @@ bool allow_flank_scoring(const OptionMap& options)
return options.at("inactive-flank-scoring").as<bool>() && !is_very_fast_mode(options);
}

auto make_indel_error_model(const OptionMap& options)
auto make_error_model(const OptionMap& options)
{
if (is_set("sequence-error-model", options)) {
return octopus::make_indel_error_model(options.at("sequence-error-model").as<std::string>());
} else {
return octopus::make_indel_error_model();
}
}

auto make_snv_error_model(const OptionMap& options)
{
if (is_set("sequence-error-model", options)) {
return octopus::make_snv_error_model(options.at("sequence-error-model").as<std::string>());
} else {
return octopus::make_snv_error_model();
const auto& model_label = options.at("sequence-error-model").as<std::string>();
try {
return octopus::make_error_model(model_label);
} catch (const UserError& err) {
try {
const auto model_path = resolve_path(model_label, options);
return octopus::make_error_model(model_path);
} catch (...) {}
throw;
}
}

Expand Down Expand Up @@ -1642,16 +1638,15 @@ AlignedRead::MappingQuality calculate_mapping_quality_cap_trigger(const OptionMa

HaplotypeLikelihoodModel make_likelihood_model(const OptionMap& options, const boost::optional<ReadSetProfile>& read_profile)
{
auto snv_error_model = make_snv_error_model(options);
auto indel_error_model = make_indel_error_model(options);
auto error_model = make_error_model(options);
HaplotypeLikelihoodModel::Config config {};
config.use_mapping_quality = options.at("model-mapping-quality").as<bool>();
config.use_flank_state = allow_flank_scoring(options);
if (config.use_mapping_quality) {
config.mapping_quality_cap = calculate_mapping_quality_cap(options, read_profile);
config.mapping_quality_cap_trigger = calculate_mapping_quality_cap_trigger(options, read_profile);
}
return HaplotypeLikelihoodModel {std::move(snv_error_model), std::move(indel_error_model), config};
return HaplotypeLikelihoodModel {std::move(error_model.snv), std::move(error_model.indel), config};
}

bool allow_model_filtering(const OptionMap& options)
Expand Down
4 changes: 2 additions & 2 deletions src/config/option_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,8 +519,8 @@ OptionMap parse_options(const int argc, const char** argv)
"Include the read mapping quality in the haplotype likelihood calculation")

("sequence-error-model",
po::value<std::string>()->default_value("HiSeq"),
"The sequencer error model to use (HiSeq or xTen)")
po::value<std::string>()->default_value("PCR-free.HiSeq-2500"),
"The sequencer error model to use")

("max-vb-seeds",
po::value<int>()->default_value(12),
Expand Down
1 change: 1 addition & 0 deletions src/core/csr/filters/variant_call_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ VcfRecord::Builder VariantCallFilter::construct_template(const VcfRecord& call)
}
if (output_config_.clear_existing_filters) {
result.clear_filter();
result.clear_all_sample_filters();
}
return result;
}
Expand Down
105 changes: 105 additions & 0 deletions src/core/models/error/basic_repeat_based_indel_error_model.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright (c) 2015-2018 Daniel Cooke
// Use of this source code is governed by the MIT license that can be found in the LICENSE file.

#include "basic_repeat_based_indel_error_model.hpp"

#include <vector>
#include <iterator>
#include <algorithm>

namespace octopus {

namespace {

template <typename T, std::size_t N>
void copy(const std::vector<T>& src, std::array<T, N>& dst) noexcept
{
auto itr = std::copy(std::cbegin(src), std::next(std::cbegin(src), std::min(src.size(), N)), std::begin(dst));
std::fill(itr, std::end(dst), src.back());
}

} // namespace

BasicRepeatBasedIndelErrorModel::BasicRepeatBasedIndelErrorModel(Parameters params)
{
copy(params.AT_homopolymer_open_penalities, AT_homopolymer_open_penalities_);
copy(params.CG_homopolymer_open_penalties, CG_homopolymer_open_penalties_);
copy(params.dinucleotide_repeat_open_penalties, dinucleotide_repeat_open_penalties_);
copy(params.trinucleotide_repeat_open_penalties, trinucleotide_repeat_open_penalties_);
copy(params.homopolymer_extend_penalties, homopolymer_extend_penalties_);
copy(params.dinucleotide_repeat_extend_penalties, dinucleotide_repeat_extend_penalties_);
copy(params.trinucleotide_repeat_extend_penalties, trinucleotide_repeat_extend_penalties_);
complex_open_penalty_ = dinucleotide_repeat_open_penalties_.front();
complex_extend_penalty_ = dinucleotide_repeat_extend_penalties_.front();
}

std::unique_ptr<IndelErrorModel> BasicRepeatBasedIndelErrorModel::do_clone() const
{
return std::make_unique<BasicRepeatBasedIndelErrorModel>(*this);
}

namespace {

template <typename C, typename T>
static auto get_min_penalty(const C& penalties, const T length) noexcept
{
return (length < penalties.size()) ? penalties[length] : penalties.back();
}

} // namespace

BasicRepeatBasedIndelErrorModel::PenaltyType
BasicRepeatBasedIndelErrorModel::get_default_open_penalty() const noexcept
{
return complex_open_penalty_;
}

BasicRepeatBasedIndelErrorModel::PenaltyType
BasicRepeatBasedIndelErrorModel::get_open_penalty(const Sequence& motif, const unsigned length) const noexcept
{
const auto period = motif.size();
const auto periodicity = length / period;
switch (period) {
case 1:
{
if (motif[0] == 'A' || motif[0] == 'T') {
return get_min_penalty(AT_homopolymer_open_penalities_, periodicity);
} else {
return get_min_penalty(CG_homopolymer_open_penalties_, periodicity);
}
}
case 2:
{
auto result = get_min_penalty(dinucleotide_repeat_open_penalties_, periodicity);
if (result > 7 && (motif == "CG" || motif == "GC")) result -= 2;
return result;
}
case 3:
{
return get_min_penalty(trinucleotide_repeat_open_penalties_, periodicity);
}
default:
return get_min_penalty(trinucleotide_repeat_open_penalties_, periodicity);
}
}

BasicRepeatBasedIndelErrorModel::PenaltyType
BasicRepeatBasedIndelErrorModel::get_default_extension_penalty() const noexcept
{
return complex_extend_penalty_;
}

BasicRepeatBasedIndelErrorModel::PenaltyType
BasicRepeatBasedIndelErrorModel::get_extension_penalty(const Sequence& motif, const unsigned length) const noexcept
{
const auto period = motif.size();
const auto periodicity = length / period;
switch (period) {
case 1: return get_min_penalty(homopolymer_extend_penalties_, periodicity);
case 2: return get_min_penalty(dinucleotide_repeat_extend_penalties_, periodicity);
case 3:
default: return get_min_penalty(trinucleotide_repeat_extend_penalties_, periodicity);
}
}

} // namespace octopus
Loading

0 comments on commit 8f40fc3

Please sign in to comment.