Merge branch 'exp/indel-model' into develop

luntergroup · Feb 4, 2019 · 8f40fc3 · 8f40fc3
2 parents 42fa364 + e39686c
commit 8f40fc3
Show file tree

Hide file tree

Showing 38 changed files with 1,773 additions and 1,136 deletions.
diff --git a/scripts/profiler.py b/scripts/profiler.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+from math import log10, ceil
+
+def aggregate_errors(df, aggregators=['period', 'periods']):
+    """Aggregate error and read counts of an indel profile over grouping columns.
+
+    Args:
+        df: DataFrame providing at least the columns named in
+            ``errors_attributes`` below.
+        aggregators: Column names to group by. An empty list returns ``df``
+            unchanged. The list default is never mutated in this function
+            (the only extension below builds a new list with ``+``).
+
+    Returns:
+        ``df`` itself when ``aggregators`` is empty. Otherwise the grouped
+        frame with an added ``error_rate`` column; ``motif`` and
+        ``indel_length`` are left out of the returned columns unless they are
+        among ``aggregators``.
+    """
+    if len(aggregators) == 0:
+        return df
+    errors_attributes = ['period', 'periods', 'tract_length', 'motif', 'indel_length', 'errors', 'reads']
+    errors_df = df[errors_attributes]
+    aggregation_functions = {'period': 'first', 'periods': 'first', 'errors': 'sum', 'reads': 'sum', 'tract_length': 'first'}
+    # NOTE(review): axis='columns' is forwarded to aggregate() in every call
+    # below; its effect depends on the pandas version in use -- confirm.
+    if 'indel_length' not in aggregators:
+        # First pass: collapse the rows of each motif, summing errors but
+        # keeping only the first read count of the group.
+        # NOTE(review): presumably every indel_length row of a motif repeats
+        # the same read count -- confirm against the profile format.
+        aggregation_functions['motif'] = 'first'
+        aggregation_functions['reads'] = 'first'
+        if 'motif' in aggregators:
+            errors_df = errors_df.groupby(aggregators).aggregate(aggregation_functions, axis='columns')
+        else:
+            errors_df = errors_df.groupby(aggregators + ['motif']).aggregate(aggregation_functions, axis='columns')
+        # The final grouping below sums the per-motif read counts.
+        aggregation_functions['reads'] = 'sum'
+    else:
+        aggregation_functions['indel_length'] = 'first'
+    if 'motif' in aggregators:
+        aggregation_functions['motif'] = 'first'
+    errors_df = errors_df.groupby(aggregators).aggregate(aggregation_functions, axis='columns')
+    # max(reads, 1) keeps the division defined for groups with zero reads.
+    errors_df['error_rate'] = errors_df.apply(lambda row: row.errors / max(row.reads, 1), axis=1)
+    errors_attributes.append('error_rate')
+    # Columns that were not grouping keys are dropped from the returned view.
+    if 'motif' not in aggregators:
+        del(errors_attributes[errors_attributes.index('motif')])
+    if 'indel_length' not in aggregators:
+        del(errors_attributes[errors_attributes.index('indel_length')])
+    return errors_df[errors_attributes]
+
+def read_indel_profile(csv):
+    result = pd.read_csv(csv)
+    result['error_rate'] = result.apply(lambda row: row.errors / max(row.reads, 1), axis=1)
+    result['tract_length'] = result.apply(lambda row: row.period * row.periods, axis=1)
+    return result
+
+def make_error_summaries(indel_profile_df):
+    result = {}
+    result['complex'] = indel_profile_df.query('motif == "N" and errors > 0')
+    result['homopolymer'] = indel_profile_df.query('period == 1 and periods > 0 and motif != "N"')
+    result['dinucleotide'] = indel_profile_df.query(
+        'period == 2 and periods > 0 and motif not in ["AA", "CC", "GG", "TT"]')
+    result['trinucleotide'] = indel_profile_df.query(
+        'period == 3 and periods > 0 and motif not in ["AAA", "CCC", "GGG", "TTT"]')
+    result['period'] = aggregate_errors(indel_profile_df)
+    result['length'] = aggregate_errors(indel_profile_df, ['periods', 'period', 'indel_length'])
+    motif_aggregators = ['periods', 'period', 'motif']
+    result['homopolymer-motif'] = aggregate_errors(result['homopolymer'], motif_aggregators)
+    result['dinucleotide-motif'] = aggregate_errors(result['dinucleotide'], motif_aggregators)
+    result['trinucleotide-motif'] = aggregate_errors(result['trinucleotide'], motif_aggregators)
+    return result
+
+def read_indel_error_profile_summaries(indel_profile_df):
+    return make_error_summaries(indel_profile_df)
+
+def read_indel_error_profile_and_summary(csv, model=None):
+    return read_indel_error_profile_summaries(read_indel_profile(csv))
+
+def rate_to_phred(rate):
+    return -10 * log10(rate + 1e-100)
+
+def complex_indel_penalty(profile_df):
+    return 43
+
+def get_repeat_error_df(profile_df, pattern, max_periods):
+    profile_index = 'period'
+    query_condition = "periods <= " + str(max_periods)
+    if type(pattern) == int:
+        query_condition += " and period == " + str(pattern)
+    else:
+        if len(pattern) == 1:
+            motif = pattern[0]
+            motif_len = len(motif)
+            query_condition += " and motif == '" + motif + "'"
+        else:
+            motif_len = len(pattern[0])
+            query_condition += " and (motif == '" + pattern[0] + "'"
+            for motif in pattern[1:]:
+                " or motif == '" + motif + "'"
+            query_condition += ")"
+        if motif_len == 1:
+            profile_index = 'homopolymer-motif'
+        elif motif_len == 2:
+            profile_index = 'dinucleotide-motif'
+        elif motif_len == 3:
+            profile_index = 'trinucleotide-motif'
+    result = profile_df[profile_index].query(query_condition)
+    result['phred_error'] = result.apply(lambda row: rate_to_phred(row['error_rate']), axis=1)
+    return result
+
+def make_empirical_indel_error_model_helper(profile_df, pattern, max_periods=50):
+    repeat_error_df = get_repeat_error_df(profile_df, pattern, max_periods)
+    complex_penalty = complex_indel_penalty(profile_df)
+    penalties = [complex_penalty, complex_penalty]
+    for periods in range(2, max_periods):
+        try:
+            penalties.append(float(list(repeat_error_df.query('periods == ' + str(periods))['phred_error'])[0]))
+        except IndexError:
+            break
+    return [max(int(ceil(penalty)) + 2, 3) for penalty in penalties]
+
+def max_lt(seq, val):
+    return max(v for v in seq if v < val)
+
+def smooth_empirical_model(open_model, extend_model=10):
+    """Smooth a list of open penalties so they never rise after index 1.
+
+    Args:
+        open_model: List of penalties; the first two entries are kept as-is.
+        extend_model: Largest allowed drop between consecutive smoothed
+            penalties.
+
+    Returns:
+        A new list of the same length as ``open_model``.
+
+    NOTE(review): ``max_lt`` raises ``ValueError`` when ``open_model[2:]``
+    holds no value below 50 (including when it is empty) -- confirm callers
+    never pass such a model.
+    """
+    # Rules: open[i] <= open[i + 1] + extend[i + 1]
+    head_penalties = open_model[:2]
+    max_penalty = max_lt(open_model[2:], 50)
+    # Placeholders so that result[i - 1] exists for i == 2; they are
+    # overwritten with the original head at the end.
+    result = [max_penalty, max_penalty]
+    for i, penalty in enumerate(open_model[2:], 2):
+        # A penalty lower than its successor is replaced by the integer mean
+        # of the previous smoothed value and min(previous smoothed, next raw).
+        if i < len(open_model) - 1 and penalty < open_model[i + 1]:
+            penalty = int((result[i - 1] + min(result[i - 1], open_model[i + 1])) / 2)
+        # Never above the previous value, and at most extend_model below it.
+        result.append(max(min(penalty, result[i - 1]), result[i - 1] - extend_model))
+    result[:2] = head_penalties
+    return result
+
+def make_octopus_indel_error_model(profile_df):
+    result = {}
+    for pattern in [('A', 'T'), ('C', 'G'), 2, 3]:
+        model = smooth_empirical_model(make_empirical_indel_error_model_helper(profile_df, pattern))
+        if type(pattern) == int:
+            result[pattern * 'N'] = model
+        else:
+            for motif in pattern:
+                result[motif] = model
+    if 'N' not in result:
+        result['N'] = result['A']
+    return result
+
+def write_error_model(error_model, out_path):
+    with open(out_path, 'w') as file:
+        for motif, model in error_model.items():
+            file.write(motif + ":" + ','.join([str(p) for p in model]) + '\n')
+
+def main(options):
+    profiles_df = read_indel_error_profile_and_summary(options.profile)
+    error_model = make_octopus_indel_error_model(profiles_df)
+    write_error_model(error_model, options.output)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-P','--profile',
+                        type=str,
+                        required=True,
+                        help='Octopus data profile')
+    parser.add_argument('-O','--output',
+                        type=str,
+                        required=True,
+                        help='Output prefix')
+    parsed, unparsed = parser.parse_known_args()
+    main(parsed)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -424,24 +424,20 @@ set(CORE_SOURCES
     core/models/pairhmm/simd_pair_hmm.hpp
     core/models/pairhmm/simd_pair_hmm.cpp
 
-    core/models/error/hiseq_indel_error_model.hpp
-    core/models/error/hiseq_indel_error_model.cpp
-    core/models/error/x10_indel_error_model.hpp
-    core/models/error/x10_indel_error_model.cpp
-    core/models/error/umi_indel_error_model.hpp
-    core/models/error/umi_indel_error_model.cpp
     core/models/error/indel_error_model.hpp
     core/models/error/indel_error_model.cpp
+    core/models/error/repeat_based_indel_error_model.hpp
+    core/models/error/repeat_based_indel_error_model.cpp
+    core/models/error/repeat_based_snv_error_model.hpp
+    core/models/error/repeat_based_snv_error_model.cpp
     core/models/error/snv_error_model.hpp
     core/models/error/snv_error_model.cpp
-    core/models/error/hiseq_snv_error_model.hpp
-    core/models/error/hiseq_snv_error_model.cpp
-    core/models/error/x10_snv_error_model.hpp
-    core/models/error/x10_snv_error_model.cpp
-    core/models/error/umi_snv_error_model.hpp
-    core/models/error/umi_snv_error_model.cpp
     core/models/error/error_model_factory.hpp
     core/models/error/error_model_factory.cpp
+    core/models/error/basic_repeat_based_indel_error_model.hpp
+    core/models/error/basic_repeat_based_indel_error_model.cpp
+    core/models/error/custom_repeat_based_indel_error_model.hpp
+    core/models/error/custom_repeat_based_indel_error_model.cpp
 
     core/models/mutation/somatic_mutation_model.hpp
     core/models/mutation/somatic_mutation_model.cpp
@@ -451,7 +447,6 @@ set(CORE_SOURCES
     core/models/mutation/denovo_model.cpp
     core/models/mutation/indel_mutation_model.hpp
     core/models/mutation/indel_mutation_model.cpp
-
 
     core/models/reference/individual_reference_likelihood_model.hpp
     core/models/reference/individual_reference_likelihood_model.cpp

diff --git a/src/config/option_collation.cpp b/src/config/option_collation.cpp
@@ -1598,21 +1598,17 @@ bool allow_flank_scoring(const OptionMap& options)
     return options.at("inactive-flank-scoring").as<bool>() && !is_very_fast_mode(options);
 }
 
-auto make_indel_error_model(const OptionMap& options)
+auto make_error_model(const OptionMap& options)
 {
-    if (is_set("sequence-error-model", options)) {
-        return octopus::make_indel_error_model(options.at("sequence-error-model").as<std::string>());
-    } else {
-        return octopus::make_indel_error_model();
-    }
-}
-
-auto make_snv_error_model(const OptionMap& options)
-{
-    if (is_set("sequence-error-model", options)) {
-        return octopus::make_snv_error_model(options.at("sequence-error-model").as<std::string>());
-    } else {
-        return octopus::make_snv_error_model();
+    const auto& model_label = options.at("sequence-error-model").as<std::string>();
+    try {
+        return octopus::make_error_model(model_label);
+    } catch (const UserError& err) {
+        try {
+            const auto model_path = resolve_path(model_label, options);
+            return octopus::make_error_model(model_path);
+        } catch (...) {}
+        throw;
     }
 }
 
@@ -1642,16 +1638,15 @@ AlignedRead::MappingQuality calculate_mapping_quality_cap_trigger(const OptionMa
 
 HaplotypeLikelihoodModel make_likelihood_model(const OptionMap& options, const boost::optional<ReadSetProfile>& read_profile)
 {
-    auto snv_error_model = make_snv_error_model(options);
-    auto indel_error_model = make_indel_error_model(options);
+    auto error_model = make_error_model(options);
     HaplotypeLikelihoodModel::Config config {};
     config.use_mapping_quality = options.at("model-mapping-quality").as<bool>();
     config.use_flank_state = allow_flank_scoring(options);
     if (config.use_mapping_quality) {
         config.mapping_quality_cap = calculate_mapping_quality_cap(options, read_profile);
         config.mapping_quality_cap_trigger = calculate_mapping_quality_cap_trigger(options, read_profile);
     }
-    return HaplotypeLikelihoodModel {std::move(snv_error_model), std::move(indel_error_model), config};
+    return HaplotypeLikelihoodModel {std::move(error_model.snv), std::move(error_model.indel), config};
 }
 
 bool allow_model_filtering(const OptionMap& options)

diff --git a/src/config/option_parser.cpp b/src/config/option_parser.cpp
@@ -519,8 +519,8 @@ OptionMap parse_options(const int argc, const char** argv)
      "Include the read mapping quality in the haplotype likelihood calculation")
 
     ("sequence-error-model",
-     po::value<std::string>()->default_value("HiSeq"),
-     "The sequencer error model to use (HiSeq or xTen)")
+     po::value<std::string>()->default_value("PCR-free.HiSeq-2500"),
+     "The sequencer error model to use")
 
     ("max-vb-seeds",
      po::value<int>()->default_value(12),

diff --git a/src/core/csr/filters/variant_call_filter.cpp b/src/core/csr/filters/variant_call_filter.cpp
@@ -364,6 +364,7 @@ VcfRecord::Builder VariantCallFilter::construct_template(const VcfRecord& call)
     }
     if (output_config_.clear_existing_filters) {
         result.clear_filter();
+        result.clear_all_sample_filters();
     }
     return result;
 }

diff --git a/src/core/models/error/basic_repeat_based_indel_error_model.cpp b/src/core/models/error/basic_repeat_based_indel_error_model.cpp
@@ -0,0 +1,105 @@
+// Copyright (c) 2015-2018 Daniel Cooke
+// Use of this source code is governed by the MIT license that can be found in the LICENSE file.
+
+#include "basic_repeat_based_indel_error_model.hpp"
+
+#include <vector>
+#include <iterator>
+#include <algorithm>
+
+namespace octopus {
+
+namespace {
+
+// Copies up to N leading elements of src into dst and sets every remaining
+// slot of dst to the last element of src.
+// NOTE(review): src.back() is evaluated even for an empty src, so callers
+// must pass a non-empty vector.
+template <typename T, std::size_t N>
+void copy(const std::vector<T>& src, std::array<T, N>& dst) noexcept
+{
+    const auto num_copied = std::min(src.size(), N);
+    const auto padding_begin = std::copy_n(std::cbegin(src), num_copied, std::begin(dst));
+    std::fill(padding_begin, std::end(dst), src.back());
+}
+
+} // namespace
+
+// Fills each fixed-size penalty table from the matching vector in params,
+// then takes the complex-indel penalties from the first dinucleotide entries.
+BasicRepeatBasedIndelErrorModel::BasicRepeatBasedIndelErrorModel(Parameters params)
+{
+    // Homopolymer tables
+    copy(params.AT_homopolymer_open_penalities, AT_homopolymer_open_penalities_);
+    copy(params.CG_homopolymer_open_penalties, CG_homopolymer_open_penalties_);
+    copy(params.homopolymer_extend_penalties, homopolymer_extend_penalties_);
+    // Dinucleotide tables
+    copy(params.dinucleotide_repeat_open_penalties, dinucleotide_repeat_open_penalties_);
+    copy(params.dinucleotide_repeat_extend_penalties, dinucleotide_repeat_extend_penalties_);
+    // Trinucleotide tables
+    copy(params.trinucleotide_repeat_open_penalties, trinucleotide_repeat_open_penalties_);
+    copy(params.trinucleotide_repeat_extend_penalties, trinucleotide_repeat_extend_penalties_);
+    // Must follow the dinucleotide copies above, which these read from.
+    complex_open_penalty_   = dinucleotide_repeat_open_penalties_.front();
+    complex_extend_penalty_ = dinucleotide_repeat_extend_penalties_.front();
+}
+
+// Returns a heap-allocated copy of this model, owned through the
+// IndelErrorModel base pointer.
+std::unique_ptr<IndelErrorModel> BasicRepeatBasedIndelErrorModel::do_clone() const
+{
+    return std::make_unique<BasicRepeatBasedIndelErrorModel>(*this);
+}
+
+namespace {
+
+// Returns penalties[length] when length is a valid index, otherwise the last
+// element of penalties.
+template <typename C, typename T>
+static auto get_min_penalty(const C& penalties, const T length) noexcept
+{
+    if (length < penalties.size()) {
+        return penalties[length];
+    }
+    return penalties.back();
+}
+
+} // namespace
+
+// Returns the complex-indel open penalty set in the constructor.
+BasicRepeatBasedIndelErrorModel::PenaltyType
+BasicRepeatBasedIndelErrorModel::get_default_open_penalty() const noexcept
+{
+    return complex_open_penalty_;
+}
+
+// Returns the gap-open penalty for a repeat of the given motif spanning
+// `length` bases. The table is chosen by motif size (1: homopolymer, split by
+// A/T versus other bases; 2: dinucleotide; anything else: trinucleotide) and
+// indexed by the number of motif copies (length / motif size). An empty motif
+// yields the default open penalty.
+BasicRepeatBasedIndelErrorModel::PenaltyType
+BasicRepeatBasedIndelErrorModel::get_open_penalty(const Sequence& motif, const unsigned length) const noexcept
+{
+    const auto period = motif.size();
+    // Guard: an empty motif would make the division below divide by zero,
+    // which is undefined behaviour inside this noexcept function.
+    if (period == 0) return get_default_open_penalty();
+    const auto periodicity = length / period;
+    switch (period) {
+        case 1:
+        {
+            if (motif[0] == 'A' || motif[0] == 'T') {
+                return get_min_penalty(AT_homopolymer_open_penalities_, periodicity);
+            } else {
+                return get_min_penalty(CG_homopolymer_open_penalties_, periodicity);
+            }
+        }
+        case 2:
+        {
+            auto result = get_min_penalty(dinucleotide_repeat_open_penalties_, periodicity);
+            // CG/GC repeats get a penalty 2 lower, but only when the table
+            // value is above 7.
+            if (result > 7 && (motif == "CG" || motif == "GC")) result -= 2;
+            return result;
+        }
+        case 3:
+        default:
+            // Motifs longer than three bases reuse the trinucleotide table.
+            return get_min_penalty(trinucleotide_repeat_open_penalties_, periodicity);
+    }
+}
+
+// Returns the complex-indel extension penalty set in the constructor.
+BasicRepeatBasedIndelErrorModel::PenaltyType
+BasicRepeatBasedIndelErrorModel::get_default_extension_penalty() const noexcept
+{
+    return complex_extend_penalty_;
+}
+
+// Returns the gap-extension penalty for a repeat of the given motif spanning
+// `length` bases. The table is chosen by motif size (1: homopolymer,
+// 2: dinucleotide, anything else: trinucleotide) and indexed by the number of
+// motif copies (length / motif size). An empty motif yields the default
+// extension penalty.
+BasicRepeatBasedIndelErrorModel::PenaltyType
+BasicRepeatBasedIndelErrorModel::get_extension_penalty(const Sequence& motif, const unsigned length) const noexcept
+{
+    const auto period = motif.size();
+    // Guard: an empty motif would make the division below divide by zero,
+    // which is undefined behaviour inside this noexcept function.
+    if (period == 0) return get_default_extension_penalty();
+    const auto periodicity = length / period;
+    switch (period) {
+        case 1: return get_min_penalty(homopolymer_extend_penalties_, periodicity);
+        case 2: return get_min_penalty(dinucleotide_repeat_extend_penalties_, periodicity);
+        case 3:
+        default: return get_min_penalty(trinucleotide_repeat_extend_penalties_, periodicity);
+    }
+}
+
+} // namespace octopus