In [63]:
def posterior_seq_only(p_eg, p_neg, p_eng, p_neng, p_s_e, p_s_ne):
    """Posterior Pr{E, G | S}: engineered and growing, given a sequence flag.

    Bayes' rule over the four joint hypotheses. Each prior is weighted by the
    sequence-test likelihood picked by its engineered status only; the growth
    status does not affect the weight.

    Args:
        p_eg: Prior Pr{E, G}.
        p_neg: Prior Pr{NE, G}.
        p_eng: Prior Pr{E, NG}.
        p_neng: Prior Pr{NE, NG}.
        p_s_e: Likelihood Pr{S | E}.
        p_s_ne: Likelihood Pr{S | NE}.

    Returns:
        The (E, G) joint term divided by the sum of all four joint terms.
    """
    joint_eg = p_s_e * p_eg
    joint_neg = p_s_ne * p_neg
    joint_eng = p_s_e * p_eng
    joint_neng = p_s_ne * p_neng
    # Pr{S}: accumulated left to right in the order EG, NEG, ENG, NENG.
    evidence = joint_eg + joint_neg + joint_eng + joint_neng
    return joint_eg / evidence

def posterior_growth_only(p_eg, p_neg, p_eng, p_neng, p_c_g, p_c_ng):
    """Posterior Pr{E, G | C}: engineered and growing, given a count flag.

    Bayes' rule over the four joint hypotheses. Each prior is weighted by the
    count-test likelihood picked by its growth status only; the engineered
    status does not affect the weight.

    Args:
        p_eg: Prior Pr{E, G}.
        p_neg: Prior Pr{NE, G}.
        p_eng: Prior Pr{E, NG}.
        p_neng: Prior Pr{NE, NG}.
        p_c_g: Likelihood Pr{C | G}.
        p_c_ng: Likelihood Pr{C | NG}.

    Returns:
        The (E, G) joint term divided by the sum of all four joint terms.
    """
    joint_eg = p_c_g * p_eg
    joint_neg = p_c_g * p_neg
    joint_eng = p_c_ng * p_eng
    joint_neng = p_c_ng * p_neng
    # Pr{C}: accumulated left to right in the order EG, NEG, ENG, NENG.
    evidence = joint_eg + joint_neg + joint_eng + joint_neng
    return joint_eg / evidence

def posterior_combined(p_eg, p_neg, p_eng, p_neng, p_s_e, p_s_ne, p_c_g, p_c_ng):
    """Posterior Pr{E, G | S, C}: engineered and growing, given both flags.

    Bayes' rule over the four joint hypotheses. The likelihood of each
    hypothesis is the product of a sequence-test term (chosen by engineered
    status) and a count-test term (chosen by growth status).

    Args:
        p_eg: Prior Pr{E, G}.
        p_neg: Prior Pr{NE, G}.
        p_eng: Prior Pr{E, NG}.
        p_neng: Prior Pr{NE, NG}.
        p_s_e: Likelihood Pr{S | E}.
        p_s_ne: Likelihood Pr{S | NE}.
        p_c_g: Likelihood Pr{C | G}.
        p_c_ng: Likelihood Pr{C | NG}.

    Returns:
        The (E, G) joint term divided by the sum of all four joint terms.
    """
    # Pr{S, C | hypothesis} for each of the four hypotheses.
    lik_eg = p_s_e * p_c_g
    lik_neg = p_s_ne * p_c_g
    lik_eng = p_s_e * p_c_ng
    lik_neng = p_s_ne * p_c_ng

    joint_eg = lik_eg * p_eg
    # Pr{S, C}: accumulated left to right in the order EG, NEG, ENG, NENG.
    evidence = (
        joint_eg
        + lik_neg * p_neg
        + lik_eng * p_eng
        + lik_neng * p_neng
    )
    return joint_eg / evidence

E = engineered
NE = not engineered
G = growing
NG = not growing

Data:
S = sequence is flagged
C = count data is flagged

We want the posterior Pr{E, G | S} (sequence flag only), Pr{E, G | C} (count flag only), or Pr{E, G | S, C} (both flags).

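Likelihood (S and C are assumed conditionally independent given (E, G); S depends only on E, and C depends only on G):
Pr{S, C | E, G} = Pr{S | E, G} * Pr{C | E, G}
 = Pr{S | E} * Pr{C | G}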

Likelihood ratios:
Pr{S | E} / Pr{S | NE}
Pr{C | G} / Pr{C | NG}



Prior

In [23]:
# Counts the prior is built from.
sequences = 100_000.0  # all sequences
threats = 0.1  # engineered sequences
growing = 10_000.0  # growing sequences, threats included

# Joint prior over (engineered?, growing?).
# Anything engineered is assumed to be growing, so Pr{E, NG} is zero.
p_eng = 0.0
p_eg = threats / sequences
p_neg = (growing - threats) / sequences
# Whatever probability is left over is Pr{NE, NG}.
p_neng = ((1 - p_eg) - p_eng) - p_neg

Likelihood

In [64]:
# Sequence-based test: Pr{S | E}, Pr{S | NE}.
p_s_e, p_s_ne = 0.9, 0.2
# Count-based test: Pr{C | G}, Pr{C | NG}.
p_c_g, p_c_ng = 0.90, 0.10

# Likelihood ratio of each test, sequence test first.
print(p_s_e / p_s_ne, p_c_g / p_c_ng, sep="\n")

# Posterior Pr{E, G | flags} using each test alone and both together.
p_seq_only = posterior_seq_only(
    p_eg, p_neg, p_eng, p_neng, p_s_e, p_s_ne
)
p_growth_only = posterior_growth_only(
    p_eg, p_neg, p_eng, p_neng, p_c_g, p_c_ng
)
p_combined = posterior_combined(
    p_eg, p_neg, p_eng, p_neng, p_s_e, p_s_ne, p_c_g, p_c_ng
)
print(p_seq_only, p_growth_only, p_combined, sep="\n")

4.5
9.0
4.499984250055124e-06
4.9999999999999996e-06
2.24996062568905e-05


In [62]:
# Sequence-only posterior again, now with Pr{S | NE} = 0.05.
# Note: this rebinds the notebook-level p_s_e / p_s_ne.
p_s_e, p_s_ne = 0.9, 0.05
posterior_seq_only(p_eg, p_neg, p_eng, p_neng, p_s_e=p_s_e, p_s_ne=p_s_ne)

1.7999694005201907e-05

## The part we're using

Base rates (assume the threat is growing):

In [1]:
sequences = 1_000_000.0  # all sequences
threats = 1  # threat sequences (assumed to be growing)
growing = 10_000.0  # growing sequences

Likelihoods of the Sequence-based test:

In [2]:
# Sequence-based test likelihoods: Pr{S | E} and Pr{S | NE}.
p_s_e, p_s_ne = 0.99, 0.1

False and true positives of Sequence-based test alone:

In [3]:
# Expected number of sequences flagged by the sequence-based test:
# first line is non-threats, second line is threats.
print(
    p_s_ne * (sequences - threats),
    p_s_e * threats,
    sep="\n",
)

99999.90000000001
0.99


Likelihoods of the Count-based test:

In [4]:
# Count-based test likelihoods: Pr{C | G} and Pr{C | NG}.
p_c_g, p_c_ng = 0.95, 0.05

In [5]:
# Expected number of sequences flagged by the count-based test.
# First two lines are non-threats (growing, then not growing);
# the last line is threats.
print(
    p_c_g * (growing - threats),
    p_c_ng * (sequences - growing),
    p_c_g * threats,
    sep="\n",
)

9499.05
49500.0
0.95


Combined test (assumes flagging by the two tests is independent conditional on (E, G))

In [6]:
# Expected number of sequences flagged by BOTH tests:
# first line is non-threats (growing plus not growing), second line is threats.
print(
    p_s_ne * p_c_g * (growing - threats) + p_s_ne * p_c_ng * (sequences - growing),
    p_s_e * p_c_g * threats,
    sep="\n",
)

5899.905000000001
0.9405
