-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'exp/indel-model' into develop
- Loading branch information
Showing
38 changed files
with
1,773 additions
and
1,136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
import pandas as pd | ||
from math import log10, ceil | ||
|
||
def aggregate_errors(df, aggregators=['period', 'periods']): | ||
if len(aggregators) == 0: | ||
return df | ||
errors_attributes = ['period', 'periods', 'tract_length', 'motif', 'indel_length', 'errors', 'reads'] | ||
errors_df = df[errors_attributes] | ||
aggregation_functions = {'period': 'first', 'periods': 'first', 'errors': 'sum', 'reads': 'sum', 'tract_length': 'first'} | ||
if 'indel_length' not in aggregators: | ||
aggregation_functions['motif'] = 'first' | ||
aggregation_functions['reads'] = 'first' | ||
if 'motif' in aggregators: | ||
errors_df = errors_df.groupby(aggregators).aggregate(aggregation_functions, axis='columns') | ||
else: | ||
errors_df = errors_df.groupby(aggregators + ['motif']).aggregate(aggregation_functions, axis='columns') | ||
aggregation_functions['reads'] = 'sum' | ||
else: | ||
aggregation_functions['indel_length'] = 'first' | ||
if 'motif' in aggregators: | ||
aggregation_functions['motif'] = 'first' | ||
errors_df = errors_df.groupby(aggregators).aggregate(aggregation_functions, axis='columns') | ||
errors_df['error_rate'] = errors_df.apply(lambda row: row.errors / max(row.reads, 1), axis=1) | ||
errors_attributes.append('error_rate') | ||
if 'motif' not in aggregators: | ||
del(errors_attributes[errors_attributes.index('motif')]) | ||
if 'indel_length' not in aggregators: | ||
del(errors_attributes[errors_attributes.index('indel_length')]) | ||
return errors_df[errors_attributes] | ||
|
||
def read_indel_profile(csv): | ||
result = pd.read_csv(csv) | ||
result['error_rate'] = result.apply(lambda row: row.errors / max(row.reads, 1), axis=1) | ||
result['tract_length'] = result.apply(lambda row: row.period * row.periods, axis=1) | ||
return result | ||
|
||
def make_error_summaries(indel_profile_df): | ||
result = {} | ||
result['complex'] = indel_profile_df.query('motif == "N" and errors > 0') | ||
result['homopolymer'] = indel_profile_df.query('period == 1 and periods > 0 and motif != "N"') | ||
result['dinucleotide'] = indel_profile_df.query( | ||
'period == 2 and periods > 0 and motif not in ["AA", "CC", "GG", "TT"]') | ||
result['trinucleotide'] = indel_profile_df.query( | ||
'period == 3 and periods > 0 and motif not in ["AAA", "CCC", "GGG", "TTT"]') | ||
result['period'] = aggregate_errors(indel_profile_df) | ||
result['length'] = aggregate_errors(indel_profile_df, ['periods', 'period', 'indel_length']) | ||
motif_aggregators = ['periods', 'period', 'motif'] | ||
result['homopolymer-motif'] = aggregate_errors(result['homopolymer'], motif_aggregators) | ||
result['dinucleotide-motif'] = aggregate_errors(result['dinucleotide'], motif_aggregators) | ||
result['trinucleotide-motif'] = aggregate_errors(result['trinucleotide'], motif_aggregators) | ||
return result | ||
|
||
def read_indel_error_profile_summaries(indel_profile_df): | ||
return make_error_summaries(indel_profile_df) | ||
|
||
def read_indel_error_profile_and_summary(csv, model=None): | ||
return read_indel_error_profile_summaries(read_indel_profile(csv)) | ||
|
||
def rate_to_phred(rate): | ||
return -10 * log10(rate + 1e-100) | ||
|
||
def complex_indel_penalty(profile_df): | ||
return 43 | ||
|
||
def get_repeat_error_df(profile_df, pattern, max_periods): | ||
profile_index = 'period' | ||
query_condition = "periods <= " + str(max_periods) | ||
if type(pattern) == int: | ||
query_condition += " and period == " + str(pattern) | ||
else: | ||
if len(pattern) == 1: | ||
motif = pattern[0] | ||
motif_len = len(motif) | ||
query_condition += " and motif == '" + motif + "'" | ||
else: | ||
motif_len = len(pattern[0]) | ||
query_condition += " and (motif == '" + pattern[0] + "'" | ||
for motif in pattern[1:]: | ||
" or motif == '" + motif + "'" | ||
query_condition += ")" | ||
if motif_len == 1: | ||
profile_index = 'homopolymer-motif' | ||
elif motif_len == 2: | ||
profile_index = 'dinucleotide-motif' | ||
elif motif_len == 3: | ||
profile_index = 'trinucleotide-motif' | ||
result = profile_df[profile_index].query(query_condition) | ||
result['phred_error'] = result.apply(lambda row: rate_to_phred(row['error_rate']), axis=1) | ||
return result | ||
|
||
def make_empirical_indel_error_model_helper(profile_df, pattern, max_periods=50): | ||
repeat_error_df = get_repeat_error_df(profile_df, pattern, max_periods) | ||
complex_penalty = complex_indel_penalty(profile_df) | ||
penalties = [complex_penalty, complex_penalty] | ||
for periods in range(2, max_periods): | ||
try: | ||
penalties.append(float(list(repeat_error_df.query('periods == ' + str(periods))['phred_error'])[0])) | ||
except IndexError: | ||
break | ||
return [max(int(ceil(penalty)) + 2, 3) for penalty in penalties] | ||
|
||
def max_lt(seq, val): | ||
return max(v for v in seq if v < val) | ||
|
||
def smooth_empirical_model(open_model, extend_model=10): | ||
# Rules: open[i] <= open[i + 1] + extend[i + 1] | ||
head_penalties = open_model[:2] | ||
max_penalty = max_lt(open_model[2:], 50) | ||
result = [max_penalty, max_penalty] | ||
for i, penalty in enumerate(open_model[2:], 2): | ||
if i < len(open_model) - 1 and penalty < open_model[i + 1]: | ||
penalty = int((result[i - 1] + min(result[i - 1], open_model[i + 1])) / 2) | ||
result.append(max(min(penalty, result[i - 1]), result[i - 1] - extend_model)) | ||
result[:2] = head_penalties | ||
return result | ||
|
||
def make_octopus_indel_error_model(profile_df): | ||
result = {} | ||
for pattern in [('A', 'T'), ('C', 'G'), 2, 3]: | ||
model = smooth_empirical_model(make_empirical_indel_error_model_helper(profile_df, pattern)) | ||
if type(pattern) == int: | ||
result[pattern * 'N'] = model | ||
else: | ||
for motif in pattern: | ||
result[motif] = model | ||
if 'N' not in result: | ||
result['N'] = result['A'] | ||
return result | ||
|
||
def write_error_model(error_model, out_path): | ||
with open(out_path, 'w') as file: | ||
for motif, model in error_model.items(): | ||
file.write(motif + ":" + ','.join([str(p) for p in model]) + '\n') | ||
|
||
def main(options): | ||
profiles_df = read_indel_error_profile_and_summary(options.profile) | ||
error_model = make_octopus_indel_error_model(profiles_df) | ||
write_error_model(error_model, options.output) | ||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument('-P','--profile', | ||
type=str, | ||
required=True, | ||
help='Octopus data profile') | ||
parser.add_argument('-O','--output', | ||
type=str, | ||
required=True, | ||
help='Output prefix') | ||
parsed, unparsed = parser.parse_known_args() | ||
main(parsed) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
105 changes: 105 additions & 0 deletions
105
src/core/models/error/basic_repeat_based_indel_error_model.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
// Copyright (c) 2015-2018 Daniel Cooke | ||
// Use of this source code is governed by the MIT license that can be found in the LICENSE file. | ||
|
||
#include "basic_repeat_based_indel_error_model.hpp" | ||
|
||
#include <vector> | ||
#include <iterator> | ||
#include <algorithm> | ||
|
||
namespace octopus { | ||
|
||
namespace { | ||
|
||
template <typename T, std::size_t N> | ||
void copy(const std::vector<T>& src, std::array<T, N>& dst) noexcept | ||
{ | ||
auto itr = std::copy(std::cbegin(src), std::next(std::cbegin(src), std::min(src.size(), N)), std::begin(dst)); | ||
std::fill(itr, std::end(dst), src.back()); | ||
} | ||
|
||
} // namespace | ||
|
||
BasicRepeatBasedIndelErrorModel::BasicRepeatBasedIndelErrorModel(Parameters params) | ||
{ | ||
copy(params.AT_homopolymer_open_penalities, AT_homopolymer_open_penalities_); | ||
copy(params.CG_homopolymer_open_penalties, CG_homopolymer_open_penalties_); | ||
copy(params.dinucleotide_repeat_open_penalties, dinucleotide_repeat_open_penalties_); | ||
copy(params.trinucleotide_repeat_open_penalties, trinucleotide_repeat_open_penalties_); | ||
copy(params.homopolymer_extend_penalties, homopolymer_extend_penalties_); | ||
copy(params.dinucleotide_repeat_extend_penalties, dinucleotide_repeat_extend_penalties_); | ||
copy(params.trinucleotide_repeat_extend_penalties, trinucleotide_repeat_extend_penalties_); | ||
complex_open_penalty_ = dinucleotide_repeat_open_penalties_.front(); | ||
complex_extend_penalty_ = dinucleotide_repeat_extend_penalties_.front(); | ||
} | ||
|
||
std::unique_ptr<IndelErrorModel> BasicRepeatBasedIndelErrorModel::do_clone() const | ||
{ | ||
return std::make_unique<BasicRepeatBasedIndelErrorModel>(*this); | ||
} | ||
|
||
namespace { | ||
|
||
template <typename C, typename T> | ||
static auto get_min_penalty(const C& penalties, const T length) noexcept | ||
{ | ||
return (length < penalties.size()) ? penalties[length] : penalties.back(); | ||
} | ||
|
||
} // namespace | ||
|
||
BasicRepeatBasedIndelErrorModel::PenaltyType | ||
BasicRepeatBasedIndelErrorModel::get_default_open_penalty() const noexcept | ||
{ | ||
return complex_open_penalty_; | ||
} | ||
|
||
BasicRepeatBasedIndelErrorModel::PenaltyType | ||
BasicRepeatBasedIndelErrorModel::get_open_penalty(const Sequence& motif, const unsigned length) const noexcept | ||
{ | ||
const auto period = motif.size(); | ||
const auto periodicity = length / period; | ||
switch (period) { | ||
case 1: | ||
{ | ||
if (motif[0] == 'A' || motif[0] == 'T') { | ||
return get_min_penalty(AT_homopolymer_open_penalities_, periodicity); | ||
} else { | ||
return get_min_penalty(CG_homopolymer_open_penalties_, periodicity); | ||
} | ||
} | ||
case 2: | ||
{ | ||
auto result = get_min_penalty(dinucleotide_repeat_open_penalties_, periodicity); | ||
if (result > 7 && (motif == "CG" || motif == "GC")) result -= 2; | ||
return result; | ||
} | ||
case 3: | ||
{ | ||
return get_min_penalty(trinucleotide_repeat_open_penalties_, periodicity); | ||
} | ||
default: | ||
return get_min_penalty(trinucleotide_repeat_open_penalties_, periodicity); | ||
} | ||
} | ||
|
||
BasicRepeatBasedIndelErrorModel::PenaltyType | ||
BasicRepeatBasedIndelErrorModel::get_default_extension_penalty() const noexcept | ||
{ | ||
return complex_extend_penalty_; | ||
} | ||
|
||
BasicRepeatBasedIndelErrorModel::PenaltyType | ||
BasicRepeatBasedIndelErrorModel::get_extension_penalty(const Sequence& motif, const unsigned length) const noexcept | ||
{ | ||
const auto period = motif.size(); | ||
const auto periodicity = length / period; | ||
switch (period) { | ||
case 1: return get_min_penalty(homopolymer_extend_penalties_, periodicity); | ||
case 2: return get_min_penalty(dinucleotide_repeat_extend_penalties_, periodicity); | ||
case 3: | ||
default: return get_min_penalty(trinucleotide_repeat_extend_penalties_, periodicity); | ||
} | ||
} | ||
|
||
} // namespace octopus |
Oops, something went wrong.