# Testing the BoolFunction class for probabilistic Boolean function learning

In [1]:
import babool as bb
import numpy as np
import pandas as pd

## Reading mushroom dataset

In [2]:
# Column names for the header-less mushroom data file: the class label 'y'
# first, followed by the 22 attribute names.
mushroom_columns = [
    'y',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

dfmush = pd.read_csv('data/agaricus-lepiota.data', header=None)
dfmush.columns = mushroom_columns
# 'veil-type' is excluded from the analysis
# (presumably because it carries no information in this dataset -- TODO confirm).
dfmush = dfmush.drop(columns=['veil-type'])

In [3]:
# One-hot encode every column of dfmush, keep the 'y_p' indicator as the
# binary target 'y', and discard the complementary 'y_e' indicator.
dfb = (
    pd.get_dummies(dfmush)
    .rename(columns={'y_p': 'y'})
    .drop(columns='y_e')
)


## Probabilistic Boolean function learning

### Create the model object

In [4]:
# Example instantiation; the cross-validation loop further down rebinds
# `model` with a fresh BoolFunction for every (theta, pgeo) pair.
model = bb.BoolFunction(pgeomm=0.5, theta=10)

### Train model

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [6]:
# Target vector: the 'y' indicator column of dfb.
y = dfb['y'].values
# Feature matrix: every other column of dfb, in dfb's column order.
M = dfb.drop(columns='y').values

In [17]:
# One random_state per train/test split (10 splits in total)
seeds = [
    1910, 1990, 1997, 1998, 2000,
    2005, 2011, 2012, 2015, 2017,
]

In [18]:
%%time
# Settings handed positionally to model.fit(...) below, in this order.
# NOTE(review): names suggest number of chains / parallel jobs / sampler steps /
# restarts -- confirm against the babool documentation.
nchains = 20
njobs = -1
nsteps = 10000
nstart = 1

# Hyperparameter grid: every (theta, pgeo) pair is fitted once per split.
thetas = [2, 5, 10, 30]
pgeos = [0.1, 0.5, 0.9]


# Cross validation
# res  -> one row per fit: [seed, theta, pgeo, len(psi), summed length of psi's elements, AUC]
# psis -> the fitted model.psi of each fit, appended in the same order as res.
# NOTE: psis lives only in this kernel; the pickle cell below stores res alone.
# The recorded run of this cell took roughly 5 h of wall time.
res = []
psis = []
for s in seeds:
    # Stratified 50/50 split of (M, y); the seed s fixes the split.
    Xtrain, Xtest, ytrain, ytest = train_test_split(M, y, test_size = 0.5, stratify = y, random_state = s)
    for theta in thetas:
        for pgeo in pgeos:
            # A fresh model per grid point; this rebinds the notebook-level `model`.
            model = bb.BoolFunction(pgeomm = pgeo, theta = theta)
            _ = model.fit(Xtrain, ytrain, nchains, njobs, nsteps, nstart )
            # binary = False: presumably returns scores rather than hard labels -- TODO confirm.
            ypred = model.predict(Xtest, binary = False)

            # Area under the ROC curve on the held-out half.
            false_positive_rate, true_positive_rate, thresholds = roc_curve(ytest, ypred)
            roc_auc = auc(false_positive_rate, true_positive_rate)
            f = model.psi
            # k: total number of entries over all elements of psi.
            k = sum([len(c) for c in f])
            res.append([s, theta, pgeo, len(f), k, roc_auc])
            psis.append(model.psi)

CPU times: user 4min 10s, sys: 2.56 s, total: 4min 12s
Wall time: 5h 17min 10s


In [20]:
import pickle

# Write the cross-validation rows held in `res` to disk.
# NOTE(review): only `res` is stored (not `psis`), and the following cell reads
# "mush1.pkl" rather than this file -- confirm which file is the intended one.
results_path = 'mush.pkl'
with open(results_path, 'wb') as out_file:
    pickle.dump(res, out_file)

In [20]:
import pickle

# Load saved result rows into `res`, replacing whatever `res` held in memory.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you
# produced yourself.
# NOTE(review): this reads "mush1.pkl" while the preceding cell writes
# 'mush.pkl' -- confirm which file is the intended one.
results_path = "mush1.pkl"
with open(results_path, 'rb') as in_file:
    res = pickle.load(in_file)

In [24]:
# Column names, in the positional order of the entries of each row of `res`.
result_columns = ['seed', 'theta', 'pgeo', 'm', 'sum_k', 'auc']

# One DataFrame column per position of the result rows.
dfres = pd.DataFrame({
    name: [row[pos] for row in res]
    for pos, name in enumerate(result_columns)
})
dfres

Unnamed: 0,seed,theta,pgeo,m,sum_k,auc
0,1910,2,0.1,7,14,1.000000
1,1910,2,0.5,8,16,1.000000
2,1910,2,0.9,8,15,0.999489
3,1910,5,0.1,6,14,0.998979
4,1910,5,0.5,6,14,0.998979
5,1910,5,0.9,6,18,0.998979
6,1910,10,0.1,6,16,0.998979
7,1910,10,0.5,6,18,1.000000
8,1910,10,0.9,6,13,0.998979
9,1910,30,0.1,7,43,1.000000


In [27]:
# Average m, sum_k and auc over the splits for each (theta, pgeo) pair.
dftbl = (
    dfres
    .drop(columns='seed')
    .groupby(['theta', 'pgeo'])
    .mean()
    .reset_index()
)

In [30]:
# Show the averaged results ordered by ascending mean AUC.
dftbl.sort_values(by='auc')

Unnamed: 0,theta,pgeo,m,sum_k,auc
11,30,0.9,5.8,24.6,0.998503
8,10,0.9,6.1,16.4,0.998728
7,10,0.5,6.3,17.2,0.99884
4,5,0.5,6.5,14.4,0.999142
1,2,0.5,6.9,13.3,0.999219
0,2,0.1,6.9,13.4,0.999301
2,2,0.9,7.0,13.3,0.999336
6,10,0.1,6.6,19.9,0.999392
5,5,0.9,6.7,15.7,0.999483
9,30,0.1,6.5,32.1,0.999821


In [28]:
def formatter(l):
    """Format a number as fixed-point text with four decimal places."""
    pattern = '%1.4f'
    return pattern % l

In [29]:
# Emit the averaged results as a LaTeX table; the 'auc' column goes through formatter.
latex_table = dftbl.to_latex(index=False, formatters={'auc': formatter})
print(latex_table)

\begin{tabular}{rrrrr}
\toprule
 theta &  pgeo &    m &  sum\_k &    auc \\
\midrule
     2 &   0.1 &  6.9 &   13.4 & 0.9993 \\
     2 &   0.5 &  6.9 &   13.3 & 0.9992 \\
     2 &   0.9 &  7.0 &   13.3 & 0.9993 \\
     5 &   0.1 &  6.7 &   14.9 & 0.9999 \\
     5 &   0.5 &  6.5 &   14.4 & 0.9991 \\
     5 &   0.9 &  6.7 &   15.7 & 0.9995 \\
    10 &   0.1 &  6.6 &   19.9 & 0.9994 \\
    10 &   0.5 &  6.3 &   17.2 & 0.9988 \\
    10 &   0.9 &  6.1 &   16.4 & 0.9987 \\
    30 &   0.1 &  6.5 &   32.1 & 0.9998 \\
    30 &   0.5 &  6.5 &   29.4 & 0.9999 \\
    30 &   0.9 &  5.8 &   24.6 & 0.9985 \\
\bottomrule
\end{tabular}



Converting the function $\psi$ into a logical classification rule

In [40]:
def convert_psi(psi, columns=None):
    """Render a learned function ``psi`` as a readable rule string.

    Every element of ``psi`` is turned into one parenthesised conjunction
    (names joined by ``" AND "``), and the conjunctions are joined by
    ``" OR "``, e.g. ``"(a AND b) OR (c)"``.

    Parameters
    ----------
    psi : iterable of iterables of int
        Each inner iterable holds the integer indices of one conjunction.
    columns : sequence of str, optional
        Names addressed directly by the indices in ``psi``. When omitted,
        names are looked up as ``dfb.columns[v + 1]`` on the notebook-level
        ``dfb`` (i.e. its first column is skipped), exactly as before.

    Returns
    -------
    str
        The rule string. An empty ``psi`` gives ``""``. An empty conjunction
        is rendered as ``"()"``; the previous slice-based implementation
        silently damaged the neighbouring terms in that case.
    """
    if columns is None:
        # Backward-compatible default: global dfb, shifted past its first column.
        names, offset = dfb.columns, 1
    else:
        names, offset = columns, 0

    terms = []
    for m in psi:
        conjunction = ' AND '.join(names[v + offset] for v in m)
        terms.append('(' + conjunction + ')')
    return ' OR '.join(terms)

In [42]:
##### Getting the estimated functions for theta = 5, pgeo = 0.1
# Positions of the result rows belonging to this configuration.
# NOTE(review): psis is indexed with positions taken from res, so both lists
# must come from the same run (res may have been reloaded from a pickle).
selected = [i for i, row in enumerate(res) if row[1] == 5 and row[2] == 0.1]

est_f = [psis[i] for i in selected]
aucs = [res[i][5] for i in selected]
ks = [res[i][4] for i in selected]
ms = [res[i][3] for i in selected]
# Readable rule string for each selected function.
expr = [convert_psi(f) for f in est_f]

In [44]:
# Per-split summary for the selected configuration.
# The column label is a raw string: '\s' is not a valid escape sequence, so the
# non-raw literal triggers a Deprecation/SyntaxWarning on current Python and is
# slated to become an error. The runtime value (backslash + 'sum k') is unchanged.
dftbl2 = pd.DataFrame({'Expression': expr, 'm': ms, r'\sum k': ks, 'AUC': aucs})

In [48]:
# Emit the per-split summary (without the long rule strings) as a LaTeX table.
summary_without_rules = dftbl2.drop(columns='Expression')
print(summary_without_rules.to_latex(index=True, formatters={'AUC': formatter}))

\begin{tabular}{lrrr}
\toprule
{} &  m &  \textbackslash sum k &    AUC \\
\midrule
0 &  6 &      14 & 0.9990 \\
1 &  7 &      22 & 1.0000 \\
2 &  7 &      14 & 1.0000 \\
3 &  6 &      15 & 1.0000 \\
4 &  7 &      14 & 1.0000 \\
5 &  6 &      14 & 1.0000 \\
6 &  7 &      15 & 1.0000 \\
7 &  7 &      13 & 1.0000 \\
8 &  7 &      15 & 1.0000 \\
9 &  7 &      13 & 1.0000 \\
\bottomrule
\end{tabular}



In [49]:
# Display the rule strings built by convert_psi for the selected configuration.
expr

['(stalk-root_c AND stalk-surface-below-ring_y) OR (cap-shape_x AND odor_c AND ring-type_p) OR (odor_p AND stalk-root_e) OR (odor_f AND veil-color_w) OR (stalk-surface-below-ring_s AND spore-print-color_r) OR (gill-size_n AND stalk-root_? AND ring-type_e)',
 '(bruises_f AND stalk-shape_e AND ring-type_n) OR (odor_f AND veil-color_w AND ring-number_o) OR (bruises_t AND gill-size_n AND stalk-shape_e) OR (gill-spacing_c AND stalk-color-above-ring_w AND spore-print-color_r) OR (cap-color_y AND gill-spacing_w AND stalk-color-below-ring_y) OR (odor_c AND gill-size_n AND stalk-surface-above-ring_s) OR (gill-spacing_c AND ring-type_e AND spore-print-color_w AND population_v)',
 '(stalk-surface-above-ring_k AND habitat_d) OR (bruises_t AND gill-size_n AND stalk-shape_e) OR (odor_f AND veil-color_w) OR (stalk-surface-above-ring_s AND spore-print-color_r) OR (stalk-surface-below-ring_y AND ring-type_e AND habitat_l) OR (gill-color_b) OR (odor_c)',
 '(stalk-root_? AND ring-number_o AND ring-type_e

In [None]:
\begin{enumerate}
\item (stalk-root = c AND stalk-surface-below-ring = y) OR (cap-shape = x AND odor = c AND ring-type = p) OR (odor = p AND stalk-root = e) OR (odor = f AND veil-color = w) OR (stalk-surface-below-ring = s AND spore-print-color = r) OR (gill-size = n AND stalk-root = ? AND ring-type = e)
\item (bruises = f AND stalk-shape = e AND ring-type = n) OR (odor = f AND veil-color = w AND ring-number = o) OR (bruises = t AND gill-size = n AND stalk-shape = e) OR (gill-spacing = c AND stalk-color-above-ring = w AND spore-print-color = r) OR (cap-color = y AND gill-spacing = w AND stalk-color-below-ring = y) OR (odor = c AND gill-size = n AND stalk-surface-above-ring = s) OR (gill-spacing = c AND ring-type = e AND spore-print-color = w AND population = v)
\item (stalk-surface-above-ring = k AND habitat = d) OR (bruises = t AND gill-size = n AND stalk-shape = e) OR (odor = f AND veil-color = w) OR (stalk-surface-above-ring = s AND spore-print-color = r) OR (stalk-surface-below-ring = y AND ring-type = e AND habitat = l) OR (gill-color = b) OR (odor = c)
\item (stalk-root = ? AND ring-number = o AND ring-type = e) OR (stalk-surface-below-ring = s AND spore-print-color = r) OR (odor = f AND gill-attachment = f AND veil-color = w) OR (odor = c) OR (bruises = t AND gill-size = n AND stalk-shape = e) OR (bruises = f AND stalk-root = c AND spore-print-color = w)
\item (stalk-surface-below-ring = s AND spore-print-color = r) OR (gill-spacing = w AND population = c AND habitat = l) OR (odor = c) OR (odor = f AND veil-color = w) OR (gill-spacing = c AND stalk-surface-above-ring = k) OR (gill-color = b) OR (odor = p AND stalk-shape = e AND stalk-surface-below-ring = s)
\item (bruises = f AND gill-spacing = c AND ring-type = e) OR (stalk-root = c AND spore-print-color = w) OR (bruises = t AND gill-size = n AND stalk-shape = e) OR (odor = f AND veil-color = w) OR (stalk-surface-above-ring = s AND spore-print-color = r) OR (odor = c AND stalk-root = b)
\item (gill-size = n AND stalk-root = ? AND spore-print-color = w) OR (stalk-shape = e AND stalk-surface-below-ring = s AND habitat = d) OR (gill-spacing = w AND population = c) OR (odor = f) OR (spore-print-color = r) OR (odor = p AND gill-attachment = f AND stalk-color-above-ring = w) OR (odor = m AND stalk-color-below-ring = c)
\item (gill-spacing = c AND gill-size = n AND spore-print-color = w) OR (bruises = f AND odor = c) OR (odor = m AND spore-print-color = w) OR (spore-print-color = r) OR (odor = p AND stalk-shape = e) OR (odor = f) OR (gill-spacing = w AND population = c)
\item (gill-size = b AND spore-print-color = h) OR (bruises = f AND stalk-root = b AND stalk-color-below-ring = w) OR (stalk-surface-above-ring = k AND habitat = d) OR (bruises = f AND gill-color = b) OR (gill-spacing = w AND population = c) OR (gill-size = b AND spore-print-color = r) OR (bruises = t AND odor = p)
\item (odor = p AND gill-attachment = f) OR (odor = c AND stalk-surface-above-ring = s) OR (gill-color = b) OR (odor = f) OR (gill-spacing = w AND population = c) OR (stalk-shape = e AND stalk-surface-above-ring = k AND stalk-surface-below-ring = y AND veil-color = w) OR (spore-print-color = r)
\end{enumerate}