In [89]:
import collections
from collections import Counter
import math
import string
import logging
import numpy as np
import pandas as pd
from itertools import permutations
from sympy.utilities.iterables import partitions
import matplotlib.pyplot as plt 
pd.set_option('precision', 10)
%matplotlib inline

### Test Variables

In [4]:
prior = [1/2, 1/3, 1/6]
n = 10
k = 3

## Functions

---
**gen_col** generates a non-unform random sample of size n according to the prior

In [6]:
def gen_col(n, k, prior):
    return np.random.choice(np.arange(1,k+1), n, p=prior)

In [9]:
a = gen_col(n, k, prior)
print(a)

[2 1 3 2 2 2 1 1 1 1]


---
**col_letter_list** turns nparray of ints to a list of strings.

Uses **letter_dictionary** as helper function

In [77]:
def letter_dictionary():
    return dict(zip(range(1, k+1), string.ascii_lowercase))

In [78]:
letter_dict = letter_dictionary()
print(letter_dict)

{1: 'a', 2: 'b', 3: 'c'}


In [85]:
def col_letter_list(col):
    letter_list = []
    tuple_col = tuple(col)
    for x in tuple_col:
        letter_list.append(letter_dict[x])
    return letter_list

In [87]:
my_letter_list = col_letter_list(a)
print(my_letter_list)

['b', 'a', 'c', 'b', 'b', 'b', 'a', 'a', 'a', 'a']


---
**index_dict** is a way to keep track of indices in list to make guess

Not currently in use

In [110]:
def index_dict(letter_list):
    ind_x = []
    ind_y = []
    ind_z = []
    i = 0
    for x in letter_list:
        if x=='a':
            ind_x.append(i)
        elif x == 'b':
            ind_y.append(i)
        elif x == 'c':
            ind_z.append(i)
        else:
            raise Exception("x != a, b, c")
        i = i + 1
    index_dict = {'x': ind_x, 'y': ind_y, 'z': ind_z}
    return index_dict

In [112]:
my_ind_dict = index_dict(my_letter_list)
print(my_ind_dict)

{'x': [1, 6, 7, 8, 9], 'y': [0, 3, 4, 5], 'z': [2]}


---
**index_dicts** takes the letter lists and records the indices in 3 seperate lists

In [126]:
def index_dicts(letter_list):
    ind_x = []
    ind_y = []
    ind_z = []
    i = 0
    for x in letter_list:
        if x=='a':
            ind_x.append(i)
        elif x == 'b':
            ind_y.append(i)
        elif x == 'c':
            ind_z.append(i)
        else:
            raise Exception("x != a, b, c")
        i = i + 1
    return ind_x, ind_y, ind_z

In [127]:
x, y, z = index_dicts(my_letter_list)
print(x)
print(y)
print(z)

[1, 6, 7, 8, 9]
[0, 3, 4, 5]
[2]


---
**replace_letters** just replaces a, b, c with x, y, z. 

In [119]:
def replace_letters(letter_list):
    enc_list = []
    for x in letter_list:
        if x=='a':
            enc_list.append('x')
        elif x == 'b':
            enc_list.append('y')
        elif x == 'c':
            enc_list.append('z')
        else:
            raise Exception("x != a, b, c")
    return enc_list

In [121]:
my_enc_list = replace_letters(my_letter_list)
print(my_enc_list)

['y', 'x', 'z', 'y', 'y', 'y', 'x', 'x', 'x', 'x']


---
**gen_letter_counts** takes the column w/diseases and counts occurances, returns block sizes

In [160]:
def gen_letter_counts(letter_list):
    my_counter = Counter(letter_list)
    counter_list = sorted(list(dict(my_counter).values()), reverse=True)
    return counter_list

In [104]:
c = gen_letter_counts(my_letter_list)
print(c)

[5, 4, 1]


---
**gen_counts** returns just block sizes in a nparray

Not currently in use, counting off letter list directly

In [19]:
def gen_counts(a):
    unique, counts = np.unique(a, return_counts=True)
    unique_counts_dict = dict(zip(unique, counts)) # not using rn
    return counts

In [52]:
int_lista = gen_counts(a)
print(int_lista)

[5 4 1]


---
**k_dictionary**

Input: pi, the probability distribution

Output: a dictionary with letters for keys. The letters represent individual diseases. Can be used to keep track of individual diseases throughout the gain function calculations.

In [28]:
def k_dictionary(prior):
    return dict(zip(string.ascii_lowercase, prior))

In [30]:
pi_dict = k_dictionary(prior)
print(pi_dict)

{'a': 0.5, 'b': 0.3333333333333333, 'c': 0.16666666666666666}


---
**type_3** calculates all possible probabilies for each column type, given an integer partition as list

Input: 
- an integer partition as list, int_list 
- pi_dict, a dictionary of diseases and probabilities in k 
- k, the number of unique plaintext values/diseases

Output:
- df: columns represent the different blocks. Each row has a different permutation of diseases. The final column is the probability of that particular column type occuring

In [117]:
def type_3(int_list, pi_dict, k):
    int_list_len = len(int_list)
    data = list(permutations(pi_dict.keys(), min(k, int_list_len)))
    df = pd.DataFrame(data = data, columns=int_list)
    if int_list_len == 1:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0])
    elif int_list_len == 2:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1])
    elif int_list_len == 3:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1]) * pow(df.iloc[:,2].map(pi_dict), df.columns.values[2])
    elif int_list_len == 4:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1]) * pow(df.iloc[:,2].map(pi_dict), df.columns.values[2]) * pow(df.iloc[:,3].map(pi_dict), df.columns.values[3])
    elif int_list_len == 5:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1]) * pow(df.iloc[:,2].map(pi_dict), df.columns.values[2]) * pow(df.iloc[:,3].map(pi_dict), df.columns.values[3]) * pow(df.iloc[:,4].map(pi_dict), df.columns.values[4])
    else:
         raise ValueError("The integer partition doesn't make sense")
    return df

In [118]:
dfa = type_3(int_lista, pi_dict, k)
dfa

Unnamed: 0,5,4,1,prob
0,a,b,c,6.43004e-05
1,a,c,b,8.0376e-06
2,b,a,c,4.28669e-05
3,b,c,a,1.5877e-06
4,c,a,b,2.6792e-06
5,c,b,a,7.938e-07


In [139]:
def blocks_to_disease_guess(df):
    s = df.loc[df['prob'].idxmax()]
    guess_df = pd.DataFrame({'block': s.index, 'disease':s.values})
    return guess_df

In [140]:
blocks_to_disease_guess(dfa)

Unnamed: 0,block,disease
0,5,a
1,4,b
2,1,c
3,prob,6.430041152e-05


In [200]:
def make_guess(x, y, z, b2d, enc_list):
    guess = []
    ## Establish mapping
    x_len = len(x)
    x_val = b2d.loc[b2d['block'] == x_len, 'disease'].iloc[0]
    y_len = len(y)
    y_val = b2d.loc[b2d['block'] == y_len, 'disease'].iloc[0]
    if y_val == x_val:
        y_val = b2d.loc[b2d['block'] == y_len, 'disease'].iloc[1]
    z_len = len(z)
    z_val = b2d.loc[b2d['block'] == z_len, 'disease'].iloc[0]
    if z_val == y_val:
        z_val =  b2d.loc[b2d['block'] == y_len, 'disease'].iloc[1]
    n = x_len + y_len + z_len
    ## Fill in guess list
    for i in enc_list:
        if i == 'x':
            guess.append(x_val)
        elif i == 'y':
            guess.append(y_val)
        elif i == 'z':
            guess.append(z_val)
        else:
            raise Exception("i != x, y, z")
    return guess

In [202]:
guess = make_guess(x, y, z, b2d, enc_list)
print(guess)

['b', 'c', 'b', 'a', 'a', 'b', 'a', 'a', 'a', 'a']


In [186]:
def check_guess(letter_list, guess):
    return letter_list == guess

In [187]:
val = check_guess(letter_list, guess)
print(val)

True


In [188]:
col = gen_col(n, k, prior)
print("*** Variables ***")
print("N: ", n)
print("K: ", k)
print("Prior: ", prior)
print("*** Generated Column ***")
print(col)
letter_list = col_letter_list(col)
print("*** Column in Letters ***")
print(letter_list)
x, y, z = index_dicts(letter_list)
print("*** The indices ***")
print("x ", x)
print("y ", y)
print("z ", z)
enc_list = replace_letters(letter_list)
#print("*** The `Encrypted' List ***")
#print(enc_list)
## int_list = gen_counts(col)
int_list = gen_letter_counts(letter_list)
print("int list: ", int_list)
df = type_3(int_list, pi_dict, k)
print("*** DF ***")
print(df)
print("*** Most likely block:disease mapping ***")
b2d = blocks_to_disease_guess(df)
print(b2d)
guess = make_guess(x, y, z, b2d, enc_list)
val = check_guess(letter_list, guess)
print(val)

*** Variables ***
N:  10
K:  3
Prior:  [0.5, 0.3333333333333333, 0.16666666666666666]
*** Generated Column ***
[3 3 3 1 3 2 2 3 1 1]
*** Column in Letters ***
['c', 'c', 'c', 'a', 'c', 'b', 'b', 'c', 'a', 'a']
*** The indices ***
x  [3, 8, 9]
y  [5, 6]
z  [0, 1, 2, 4, 7]
int list:  [5, 3, 2]
*** DF ***
   5  3  2          prob
0  a  b  c  0.0000321502
1  a  c  b  0.0000160751
2  b  a  c  0.0000142890
3  b  c  a  0.0000047630
4  c  a  b  0.0000017861
5  c  b  a  0.0000011907
*** Most likely block:disease mapping ***
  block          disease
0     5                a
1     3                b
2     2                c
3  prob  3.215020576e-05
x  b
y  c
z  a
False


In [194]:
logging.getLogger().setLevel(logging.INFO)

In [None]:
logging.disabled = True

In [203]:
def bayes_test(n, k, prior):
    col = gen_col(n, k, prior)
    letter_list = col_letter_list(col)
    logging.info("*** Plaintext Column *** -\n {}".format(letter_list))
    x, y, z = index_dicts(letter_list)
    enc_list = replace_letters(letter_list)
    int_list = gen_letter_counts(letter_list)
    df = type_3(int_list, pi_dict, k)
    logging.info("*** DF *** -\n {}".format(df))
    b2d = blocks_to_disease_guess(df)
    logging.info("*** Most likely block:disease mapping *** -\n {}".format(b2d))
    guess = make_guess(x, y, z, b2d, enc_list)
    logging.info("*** Guess *** -\n {}".format(guess))
    val = check_guess(letter_list, guess)
    logging.info("*** Correct? *** -\n {}".format(val))
    return val

In [205]:
val = bayes_test(n, k, prior)
print("Correct?? ", val)

INFO:root:*** Plaintext Column *** -
 ['a', 'b', 'c', 'b', 'a', 'c', 'a', 'a', 'a', 'b']
INFO:root:*** DF *** -
    5  3  2          prob
0  a  b  c  0.0000321502
1  a  c  b  0.0000160751
2  b  a  c  0.0000142890
3  b  c  a  0.0000047630
4  c  a  b  0.0000017861
5  c  b  a  0.0000011907
INFO:root:*** Most likely block:disease mapping *** -
   block          disease
0     5                a
1     3                b
2     2                c
3  prob  3.215020576e-05
INFO:root:*** Guess *** -
 ['a', 'b', 'c', 'b', 'a', 'c', 'a', 'a', 'a', 'b']
INFO:root:*** Correct? *** -
 True


Correct??  True
