In [10]:
import pandas as pd
import numpy as np
import itertools
import sys
sys.path.insert(0,'../..')
import g4l


In [2]:
# Notebook-wide parameters, shared by every cell below.

# Penalization constant of the BIC criterion
c = 0.5
# Alphabet: the five symbols 0..4
A = list(range(5))
# Maximum depth of the context tree
max_depth = 4

## 1. Loading the sample

In [3]:
# Read the sample from the fixture file, interpreting it over the alphabet A.
# NOTE(review): the path is relative to the working directory — confirm it
# resolves from where this notebook is actually run.
sample = g4l.data.Sample('./g4l/fixtures/folha.txt', A)
# Build the initial context tree for the sample, limited to max_depth.
initial_tree = g4l.tree.ContextTree(sample, max_depth=max_depth)

Context Trees are appropriately represented by a Data Frame. 
For any given sample `sample`, the following steps are performed:
* Populates Data Frame with all leaf nodes for a maximum depth as indicated in `max_depth`.
* Calculates the number of occurrences of each context (leaf node) in the sample (`node_freq`) 
* Calculates context's probability (`ps`) in the sample
* Calculates probability of transition to each symbol $a \in A$ (`child_probs`).
* Once all these values are computed, the log-likelihood of each context (`lps`) can be computed as well.

The resulting initial Data Frame is shown below:

In [4]:
# Inspect the contexts whose node string matches the regex '.334' and add an
# exploratory column `calc` = node_freq * lps.
# TODO: rename child_probs to next_symbol_transition_prob
dfx = initial_tree.df
# `.assign` returns a new frame, so the derived column is never written onto a
# filtered slice of `dfx` (which is what triggers pandas' SettingWithCopyWarning).
x = dfx[dfx.node.str.match('.334')].assign(
    calc=lambda rows: rows.node_freq * rows.lps
)
x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final,calc
745,4,590,334,2,-1.386294,2e-05,"[0.0, 0.0, 0.5, 0.5, 0.0]",0,0,-2.77259
746,4,591,1334,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.0
747,4,592,2334,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.0
748,4,593,3334,1,0.0,1e-05,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,0.0
749,4,594,4334,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.0


*Implementation notes: In the original algorithm, the steps described so far are part of the context tree estimation procedure. They have been moved out of the tree estimation, since they always produce the same results for a given sample. In the new implementation, these values are therefore precomputed and serve as input to any subsequent call to the BIC context tree estimator (computing time has decreased to about $\frac{1}{10}$ of that of the original algorithm).*

## 2. Computing the BIC context tree estimator

In [21]:
# Create the CTM estimator on top of the precomputed initial tree and run it
# with the penalization constant c defined in the configuration cell.
ctm = g4l.estimators.CTM(initial_tree)
bic_tree = ctm.execute(c)

# Show resulting tree
print('Resulting tree: ', bic_tree.to_str())

Resulting tree:  0000 001 0010 100 2 20 200 2000 201 2010 21 210 3 30 300 4


The CTM procedure computes the BIC context tree estimator efficiently. It receives the penalization constant `c` as a parameter and returns the estimated tree, which is backed by a DataFrame (shown below). The estimation starts from a DataFrame containing all nodes up to the maximum depth indicated in `max_depth`. For any given `sample`, this DataFrame initially holds the number of occurrences of each node in the sample (`node_freq`), along with its probability (`ps`) and the probability of transition to each symbol $a \in A$ (`child_probs`). Once these values are computed, the log-likelihood of each context (`lps`) can be computed as well.

In [22]:
# Display the DataFrame that backs the estimated tree.
bic_tree.df

Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final
155,4,0,0,943,-498.167922,0.009647,"[0.20254506892895016, 0.7974549310710498, 0.0,...",0,1
55,3,25,1,5841,-5667.415819,0.059755,"[0.6159904126005821, 0.0, 0.2722136620441705, ...",0,1
180,4,25,10,3598,-3355.740144,0.036809,"[0.030572540300166758, 0.0, 0.6700944969427459...",0,1
31,3,1,100,991,-857.654556,0.010138,"[0.0, 0.0, 0.6740665993945509, 0.2119071644803...",0,1
2,1,2,2,21830,-14738.41553,0.223323,"[0.5972972972972973, 0.4027027027027027, 0.0, ...",0,1
7,2,2,20,13039,-8990.185298,0.133391,"[0.4479638009049774, 0.5520361990950227, 0.0, ...",0,1
32,3,2,200,5841,-3963.412048,0.059755,"[0.40404040404040403, 0.5959595959595959, 0.0,...",0,1
157,4,2,2000,2360,-1499.97023,0.024144,"[0.31864406779661014, 0.6813559322033899, 0.0,...",0,1
57,3,27,201,7198,-6004.942636,0.073638,"[0.7167268685746041, 0.0, 0.18977493748263408,...",0,1
182,4,27,2010,5159,-5096.133751,0.052779,"[0.0684241132002326, 0.0, 0.6561349098662531, ...",0,1


### 2.1 - Executing the BIC tree estimator

The following steps demonstrate in detail how the BIC tree estimator arrives at the data frame shown above.

In [23]:
# Let's consider the initial tree with freqs/probs pre-calculated for the input Sample
data_frame = initial_tree.df.copy()

# These are the first 8 nodes in the Data Frame:
data_frame.head(8)

Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final
0,1,0,0,44111,-62840.152905,0.451259,"[0.23846659563374215, 0.29559520301058695, 0.3...",0,0
1,1,1,1,21830,-18794.382713,0.223323,"[0.6933577645442052, 0.0, 0.20998625744388455,...",0,0
2,1,2,2,21830,-14715.435172,0.223323,"[0.5972972972972973, 0.4027027027027027, 0.0, ...",0,0
3,1,3,3,7909,-6671.568838,0.08091,"[0.6849159185737768, 0.0, 0.2073587052724744, ...",0,0
4,1,4,4,2070,-1325.580428,0.021176,"[0.0, 0.0, 0.6608695652173913, 0.3381642512077...",0,0
5,2,0,0,10519,-11216.584131,0.107611,"[0.31400323224641125, 0.5552809202395665, 0.08...",0,0
6,2,1,10,15136,-14916.737004,0.154844,"[0.065473044397463, 0.0, 0.6489825581395349, 0...",0,0
7,2,2,20,13039,-8967.20494,0.133391,"[0.4479638009049774, 0.5520361990950227, 0.0, ...",0,0


In [24]:
# We instantiate the Context Tree Maximizer class by passing the pre-calculated tree as parameter
# (this rebinds `ctm`, which was already created in the cell that computed bic_tree)
ctm = g4l.estimators.CTM(initial_tree)

### 2.2 - Applying the penalization

We apply a penalization based on the constant $c$ (in this case `c = 0.5`). We update each context's `lps` value as follows:

`data_frame.lps -= np.log(n) * (degrees_of_freedom * c)`

where `n` is the sample size and `degrees_of_freedom` = $|A|-1$.

In [25]:
# Apply the penalization for constant c. No return value is captured: the
# `lps` column of data_frame is updated by the call itself.
# NOTE(review): presumably re-running this cell penalizes `lps` a second time —
# re-create data_frame from initial_tree.df.copy() before re-executing.
ctm.apply_penalization(c, data_frame)
data_frame

Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final
0,1,0,0,44111,-62863.133263,0.451259,"[0.23846659563374215, 0.29559520301058695, 0.3...",0,0
1,1,1,1,21830,-18817.363070,0.223323,"[0.6933577645442052, 0.0, 0.20998625744388455,...",0,0
2,1,2,2,21830,-14738.415530,0.223323,"[0.5972972972972973, 0.4027027027027027, 0.0, ...",0,0
3,1,3,3,7909,-6694.549196,0.080910,"[0.6849159185737768, 0.0, 0.2073587052724744, ...",0,0
4,1,4,4,2070,-1348.560785,0.021176,"[0.0, 0.0, 0.6608695652173913, 0.3381642512077...",0,0
...,...,...,...,...,...,...,...,...,...
775,4,620,0444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
776,4,621,1444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
777,4,622,2444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
778,4,623,3444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0


### 2.3 Selecting candidate contexts

For every context that occurs more than once in the sample, we sum the `lps` of its child contexts and compare the sum to the context's own `lps`. When the sum is greater than the context's own value, we update the context's `lps` with the sum and "flag" this context to be removed later.

In [26]:
# Candidate selection step: updates the `lps` and `flag` columns of data_frame
# through the call (no return value is captured).
ctm.block2(data_frame) # TODO: rename this method
data_frame

Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final
0,1,0,0,44111,-36117.259084,0.451259,"[0.23846659563374215, 0.29559520301058695, 0.3...",1,0
1,1,1,1,21830,-18722.227112,0.223323,"[0.6933577645442052, 0.0, 0.20998625744388455,...",1,0
2,1,2,2,21830,-14738.415530,0.223323,"[0.5972972972972973, 0.4027027027027027, 0.0, ...",0,0
3,1,3,3,7909,-6694.549196,0.080910,"[0.6849159185737768, 0.0, 0.2073587052724744, ...",0,0
4,1,4,4,2070,-1348.560785,0.021176,"[0.0, 0.0, 0.6608695652173913, 0.3381642512077...",0,0
...,...,...,...,...,...,...,...,...,...
775,4,620,0444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
776,4,621,1444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
777,4,622,2444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
778,4,623,3444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0


### 2.4 Selecting the final nodes

*Need help to describe this step*

In [36]:
# Final-node selection step: after this call the `final` column of data_frame
# is set to 1 for the selected nodes.
# NOTE(review): the selection rule is not visible here — document it once confirmed.
ctm.block3(data_frame)

In [37]:
# Display data_frame again to inspect the `final` column.
data_frame

Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final
0,1,0,0,44111,-36117.259084,0.451259,"[0.23846659563374215, 0.29559520301058695, 0.3...",1,0
1,1,1,1,21830,-18722.227112,0.223323,"[0.6933577645442052, 0.0, 0.20998625744388455,...",1,0
2,1,2,2,21830,-14738.415530,0.223323,"[0.5972972972972973, 0.4027027027027027, 0.0, ...",0,1
3,1,3,3,7909,-6694.549196,0.080910,"[0.6849159185737768, 0.0, 0.2073587052724744, ...",0,1
4,1,4,4,2070,-1348.560785,0.021176,"[0.0, 0.0, 0.6608695652173913, 0.3381642512077...",0,1
...,...,...,...,...,...,...,...,...,...
775,4,620,0444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
776,4,621,1444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
777,4,622,2444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0
778,4,623,3444,0,-22.980357,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0,0


### 2.5 - Resulting context tree

In [38]:
# Build the resulting tree from the processed data_frame; the displayed rows
# all have final == 1.
ctm.final_tree(data_frame)

Unnamed: 0,l,node_idx,node,node_freq,lps,ps,child_probs,flag,final
155,4,0,0,943,-498.167922,0.009647,"[0.20254506892895016, 0.7974549310710498, 0.0,...",0,1
55,3,25,1,5841,-5667.415819,0.059755,"[0.6159904126005821, 0.0, 0.2722136620441705, ...",0,1
180,4,25,10,3598,-3355.740144,0.036809,"[0.030572540300166758, 0.0, 0.6700944969427459...",0,1
31,3,1,100,991,-857.654556,0.010138,"[0.0, 0.0, 0.6740665993945509, 0.2119071644803...",0,1
2,1,2,2,21830,-14738.41553,0.223323,"[0.5972972972972973, 0.4027027027027027, 0.0, ...",0,1
7,2,2,20,13039,-8990.185298,0.133391,"[0.4479638009049774, 0.5520361990950227, 0.0, ...",0,1
32,3,2,200,5841,-3963.412048,0.059755,"[0.40404040404040403, 0.5959595959595959, 0.0,...",0,1
157,4,2,2000,2360,-1499.97023,0.024144,"[0.31864406779661014, 0.6813559322033899, 0.0,...",0,1
57,3,27,201,7198,-6004.942636,0.073638,"[0.7167268685746041, 0.0, 0.18977493748263408,...",0,1
182,4,27,2010,5159,-5096.133751,0.052779,"[0.0684241132002326, 0.0, 0.6561349098662531, ...",0,1


### String representation

The resulting object also provides a string representation of the selected context tree:

In [39]:
# String form of the tree: its contexts separated by single spaces.
bic_tree.to_str()

'0000 001 0010 100 2 20 200 2000 201 2010 21 210 3 30 300 4'

### Comparing context trees

A handy method is available for comparing trees:

In [40]:
# Sanity check: a tree compared with itself.
bic_tree.equals_to(bic_tree)

True

In [41]:
# Estimate three trees from the same initial tree, varying only the
# penalization constant passed to execute().
tree1 = g4l.estimators.CTM(initial_tree).execute(0.2)    # c = 0.2
tree2 = g4l.estimators.CTM(initial_tree).execute(3000)   # c = 3000
tree3 = g4l.estimators.CTM(initial_tree).execute(4000)   # c = 4000

# Print each tree's string representation.
print("tree1:", tree1.to_str(), "\n")
print("tree2:", tree2.to_str(), "\n")
print("tree3:", tree3.to_str(), "\n\n")

# Pairwise comparisons between the three trees.
print("tree1 == tree2 ?", tree1.equals_to(tree2))
print("tree1 == tree3 ?", tree1.equals_to(tree3))
print("tree2 == tree3 ?", tree2.equals_to(tree3))

tree1: 0000 0001 0010 0210 03 030 100 1210 13 130 2 20 200 2000 2001 201 2010 21 300 3210 33 330 4 4210 43 430 

tree2: 0 1 2 3 4 

tree3: 0 1 2 3 4 


tree1 == tree2 ? False
tree1 == tree3 ? False
tree2 == tree3 ? True


In [42]:
# Log-likelihood reported by the tree estimated with c = 0.5.
bic_tree.log_likelihood()

-77621.0117069683