In [39]:
import collections
import math
import string
import logging
import numpy as np
import pandas as pd
from itertools import permutations
from sympy.utilities.iterables import partitions
import matplotlib.pyplot as plt 
pd.set_option('precision', 10)
%matplotlib inline

### Test Variables

In [4]:
prior = [1/2, 1/3, 1/6]
n = 10
k = 3

## Functions

---
**gen_col** generates a non-unform random sample of size n according to the prior

In [6]:
def gen_col(n, k, prior):
    return np.random.choice(np.arange(1,k+1), n, p=prior)

In [9]:
a = gen_col(n, k, prior)
print(a)

[2 1 3 2 2 2 1 1 1 1]


Next three blocks turn nparray of ints to a list of strings. May use later to make and check official guess.

In [77]:
def letter_dictionary():
    return dict(zip(range(1, k+1), string.ascii_lowercase))

In [78]:
letter_dict = letter_dictionary()
print(letter_dict)

{1: 'a', 2: 'b', 3: 'c'}


In [83]:
my_list = []
tuple_col = tuple(col)
for x in tuple_col:
    my_list.append(letter_dict[x])
print(my_list)

['b', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b']


---
**gen_counts** returns just block sizes in a nparray

In [19]:
def gen_counts(a):
    unique, counts = np.unique(a, return_counts=True)
    unique_counts_dict = dict(zip(unique, counts)) # not using rn
    return counts

In [52]:
int_lista = gen_counts(a)
print(int_lista)

[5 4 1]


---
**k_dictionary**

Input: pi, the probability distribution

Output: a dictionary with letters for keys. The letters represent individual diseases. Can be used to keep track of individual diseases throughout the gain function calculations.

In [28]:
def k_dictionary(prior):
    return dict(zip(string.ascii_lowercase, prior))

In [30]:
pi_dict = k_dictionary(prior)
print(pi_dict)

{'a': 0.5, 'b': 0.3333333333333333, 'c': 0.16666666666666666}


---
**type_3** calculates all possible probabilies for each column type, given an integer partition as list

Input: 
- an integer partition as list, int_list 
- pi_dict, a dictionary of diseases and probabilities in k 
- k, the number of unique plaintext values/diseases

Output:
- df: columns represent the different blocks. Each row has a different permutation of diseases. The final column is the probability of that particular column type occuring

In [55]:
def type_3(int_list, pi_dict, k):
    int_list_len = len(int_list)
    data = list(permutations(pi_dict.keys(), min(k, int_list_len)))
    df = pd.DataFrame(data = data, columns=int_list)
    if int_list_len == 1:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0])
    elif int_list_len == 2:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1])
    elif int_list_len == 3:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1]) * pow(df.iloc[:,2].map(pi_dict), df.columns.values[2])
    elif int_list_len == 4:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1]) * pow(df.iloc[:,2].map(pi_dict), df.columns.values[2]) * pow(df.iloc[:,3].map(pi_dict), df.columns.values[3])
    elif int_list_len == 5:
        df['prob']= pow(df.iloc[:,0].map(pi_dict), df.columns.values[0]) * pow(df.iloc[:,1].map(pi_dict), df.columns.values[1]) * pow(df.iloc[:,2].map(pi_dict), df.columns.values[2]) * pow(df.iloc[:,3].map(pi_dict), df.columns.values[3]) * pow(df.iloc[:,4].map(pi_dict), df.columns.values[4])
    else:
         raise ValueError("The integer partition doesn't make sense")
    return df

In [56]:
dfa = type_3(int_lista, pi_dict, k)
dfa

Unnamed: 0,5,4,1,prob
0,a,b,c,6.43004e-05
1,a,c,b,8.0376e-06
2,b,a,c,4.28669e-05
3,b,c,a,1.5877e-06
4,c,a,b,2.6792e-06
5,c,b,a,7.938e-07


In [48]:
dfa.loc[df['prob'].idxmax()]

5                     a
4                     b
1                     c
prob    6.430041152e-05
Name: 0, dtype: object

In [66]:
col = gen_col(n, k, prior)
print("N: ", n)
print("K: ", k)
print("*** Generated Column ***")
print(col)
int_list = gen_counts(col)
df = type_3(int_list, pi_dict, k)
print("*** DF ***")
print(df)
choices = df.loc[df['prob'].idxmax()]
print(choices)

N:  10
K:  3
*** Generated Column ***
[2 1 1 1 1 1 1 2 2 2]
*** DF ***
   6  4          prob
0  a  b  0.0001929012
1  a  c  0.0000120563
2  b  a  0.0000857339
3  b  c  0.0000010584
4  c  a  0.0000013396
5  c  b  0.0000002646
6                     a
4                     b
prob    0.0001929012346
Name: 0, dtype: object
