In [1]:
from scipy.stats import binom

# Model parameters for the k-mer detection probability estimates computed below.
ks = [41, 47, 51]  # k values (flank lengths, in bases) to evaluate
read_coverage = 5  # number of reads; used as the binomial trial count
count_threshold = 3  # minimum number of usable reads required (tail is taken at >= this value)
variant_rate = 1 / 1500  # per-base probability of a variant
error_rate = 1 / 1000  # per-base probability of a sequencing error
read_length = 150  # read length in bases

# Combined per-base rate of "variant or sequencing error"; the plain sum is used
# as the rate for the general case where either event disrupts the flank.
p = variant_rate + error_rate
q = 1 - p  # complement of the combined rate (not referenced in the visible part of this cell)

def calculate_probalities(p, k):
    """Return the probabilities that a k-base flank is free of an event.

    Args:
        p: Per-base probability of the disruptive event (e.g. a variant,
            a sequencing error, or their combined rate).
        k: Number of bases in a single flank.

    Returns:
        A 2-tuple ``(one_side, at_least_one_side)`` where ``one_side`` is the
        probability that one k-base flank contains no event (mutation of
        interest at the end), and ``at_least_one_side`` is the probability
        that at least one of the two k-base flanks contains no event
        (mutation of interest in the middle).
    """
    clean_one_side = (1 - p) ** k
    clean_two_sides = (1 - p) ** (2 * k)

    # Inclusion-exclusion: P(event on both sides)
    #   = 1 - 2 * P(one side clean) + P(both sides clean).
    hit_on_both_sides = 1 - 2 * clean_one_side + clean_two_sides

    # At least one clean side is the complement of "event on both sides".
    return clean_one_side, 1 - hit_on_both_sides

# For every k, estimate and print the probability that the mutation of interest
# is detectable under three scenarios: mutation at the edge of the k-mer window
# (one flank), mutation in the middle (two flanks, one clean flank suffices),
# and a linked variant sitting next to the mutation of interest.
for k in ks:
    # Flank-level probabilities for variants only, sequencing errors only,
    # and the combined rate p (variant or sequencing error).
    (
        prob_no_variant_when_mutation_of_interest_is_on_end,
        prob_no_variant_within_k_when_mutation_of_interest_is_in_middle,
    ) = calculate_probalities(variant_rate, k)
    (
        prob_no_sequencing_error_when_mutation_of_interest_is_on_end,
        prob_no_sequencing_error_within_k_when_mutation_of_interest_is_in_middle,
    ) = calculate_probalities(error_rate, k)
    (
        prob_no_variant_OR_sequencing_error_when_mutation_of_interest_is_on_end,
        prob_no_variant_OR_sequencing_error_within_k_when_mutation_of_interest_is_in_middle,
    ) = calculate_probalities(p, k)

    # Edge case: no variant in the single flank, so only sequencing error matters.
    # binom.sf(count_threshold - 1, n, p) = P(X >= count_threshold).
    prob_that_COUNT_THRESHOLD_out_of_READ_COVERAGE_reads_have_no_sequencing_error_flank = float(
        binom.sf(count_threshold - 1, read_coverage, prob_no_sequencing_error_when_mutation_of_interest_is_on_end)
    )
    prob_mutation_of_interest_is_detected_all_flank = (
        prob_no_variant_when_mutation_of_interest_is_on_end
        * prob_that_COUNT_THRESHOLD_out_of_READ_COVERAGE_reads_have_no_sequencing_error_flank
    )

    # Middle case: no variant on at least one side, so only sequencing error matters.
    prob_that_COUNT_THRESHOLD_out_of_READ_COVERAGE_reads_have_no_sequencing_error_both = float(
        binom.sf(count_threshold - 1, read_coverage, prob_no_sequencing_error_within_k_when_mutation_of_interest_is_in_middle)
    )
    prob_mutation_of_interest_is_detected_all_middle = (
        prob_no_variant_within_k_when_mutation_of_interest_is_in_middle
        * prob_that_COUNT_THRESHOLD_out_of_READ_COVERAGE_reads_have_no_sequencing_error_both
    )

    # Linked variants: we need at least count_threshold reads that (1) carry the
    # variant at a position between 1 and read_length - k, and (2) have no
    # sequencing error in the following k-1 bases. This assumes the linked
    # variant is before the variant of interest; the other case is symmetrical.
    P_pos = (read_length - k) / read_length  # Probability that variant is in a valid position in the read
    P_variant = variant_rate  # Kept for reference; not a factor of P_total below
    P_no_error = (1 - error_rate) ** (k - 1)  # Probability that the following k-1 bases are error-free
    P_total = P_pos * P_no_error  # Probability that a read is usable (valid position and error-free following bases)
    # Binomial upper tail: probability of getting >= count_threshold usable reads.
    # Uses sf (rather than 1 - cdf) to match the edge/middle cases above and to
    # avoid the extra subtraction.
    prob_at_least_threshold = float(binom.sf(count_threshold - 1, read_coverage, P_total))

    print(f"Probability of no variant within k={k} bases on one side: {prob_no_variant_when_mutation_of_interest_is_on_end:.6f} (FN = {1 - prob_no_variant_when_mutation_of_interest_is_on_end:.6f})")
    print(f"Probability of no variant within k={k} bases on at least one side: {prob_no_variant_within_k_when_mutation_of_interest_is_in_middle:.6f} (FN = {1 - prob_no_variant_within_k_when_mutation_of_interest_is_in_middle:.6f})")
    print(f"Probability of no sequencing error within k={k} bases on one side: {prob_no_sequencing_error_when_mutation_of_interest_is_on_end:.6f} (FN = {1 - prob_no_sequencing_error_when_mutation_of_interest_is_on_end:.6f})")
    print(f"Probability of no sequencing error within k={k} bases on at least one side: {prob_no_sequencing_error_within_k_when_mutation_of_interest_is_in_middle:.6f} (FN = {1 - prob_no_sequencing_error_within_k_when_mutation_of_interest_is_in_middle:.6f})")
    print(f"Probability of no variant AND no sequencing error within k={k} bases on one side: {prob_no_variant_OR_sequencing_error_when_mutation_of_interest_is_on_end:.6f} (FN = {1 - prob_no_variant_OR_sequencing_error_when_mutation_of_interest_is_on_end:.6f})")
    print(f"Probability of no variant AND no sequencing error within k={k} bases on at least one side: {prob_no_variant_OR_sequencing_error_within_k_when_mutation_of_interest_is_in_middle:.6f} (FN = {1 - prob_no_variant_OR_sequencing_error_within_k_when_mutation_of_interest_is_in_middle:.6f})")
    print(f"Probability of detecting mutation of interest near edge: {prob_mutation_of_interest_is_detected_all_flank:.6f} (FN = {1 - prob_mutation_of_interest_is_detected_all_flank:.6f})")
    print(f"Probability of detecting mutation of interest near middle: {prob_mutation_of_interest_is_detected_all_middle:.6f} (FN = {1 - prob_mutation_of_interest_is_detected_all_middle:.6f})")
    print(f"Probability a single read is with a linked variant is usable: {P_total:.6f}")
    print(f"Probability of getting at least {count_threshold} usable reads out of {read_coverage} reads with linked variant: {prob_at_least_threshold:.6f} (FN = {1 - prob_at_least_threshold:.6f})")
    print()

Probability of no variant within k=41 bases on one side: 0.973028 (FN = 0.026972)
Probability of no variant within k=41 bases on at least one side: 0.999273 (FN = 0.000727)
Probability of no sequencing error within k=41 bases on one side: 0.959809 (FN = 0.040191)
Probability of no sequencing error within k=41 bases on at least one side: 0.998385 (FN = 0.001615)
Probability of no variant AND no sequencing error within k=41 bases on one side: 0.933896 (FN = 0.066104)
Probability of no variant AND no sequencing error within k=41 bases on at least one side: 0.995630 (FN = 0.004370)
Probability of detecting mutation of interest near edge: 0.972434 (FN = 0.027566)
Probability of detecting mutation of interest near middle: 0.999272 (FN = 0.000728)
Probability a single read is with a linked variant is usable: 0.698160
Probability of getting at least 3 usable reads out of 5 reads with linked variant: 0.834477 (FN = 0.165523)

Probability of no variant within k=47 bases on one side: 0.969142 (FN