In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [33]:
from creme import datasets
from creme import metrics
from creme import model_selection
from creme import tree

# Score that gets reported for this run.
metric = metrics.LogLoss()

# Settings of the decision tree, kept together so they are easy to tweak
# between runs.
tree_params = dict(
    patience=2000,
    confidence=1e-5,
    criterion='gini',
)
model = tree.DecisionTreeClassifier(**tree_params)

# Elec2 dataset; iterating over it yields pairs that the next cell unpacks
# as (x, y).
X_y = datasets.Elec2()

# Last expression of the cell, so whatever it returns is what gets displayed.
model_selection.progressive_val_score(X_y, model, metric)

LogLoss: 0.562813

In [30]:
# Take the very first item of the dataset and unpack it into features and target.
first_sample = next(iter(X_y))
x, y = first_sample

# Ask the tree's root for the leaf associated with x, then display that leaf's
# naive Bayes prediction for the same x (last expression -> cell output).
leaf = model.root.get_leaf(x)
leaf.predict_naive_bayes(x)

{True: 0.2278395725429478, False: 0.7721604274570522}

## Batch

In [2]:
from creme import datasets
from creme import tree
import pandas as pd

# Index of the last sample to consume: the loop handles indices 0 through 1600,
# i.e. 1601 samples in total.
LAST_INDEX = 1600

t = tree.DecisionTreeClassifier()

# Plain Python accumulators; the pandas objects are built once, after the loop.
feature_rows = []
targets = []

for i, (x, y) in enumerate(datasets.Elec2()):
    # Train the tree incrementally on exactly the samples that are recorded,
    # so the batch data and the tree have seen the same observations.
    t.fit_one(x, y)
    feature_rows.append(list(x.values()))
    targets.append(y)

    if i == LAST_INDEX:
        break

# Column names come from the keys of the last sample seen in the loop.
X = pd.DataFrame(feature_rows, columns=list(x.keys()))
Y = pd.Series(targets)

X.head()

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer
0,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912
1,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912
2,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912
3,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912
4,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912


## Bayes rule

In [3]:
feature = 'nswprice'
threshold = 0.051

# Boolean masks for the two sides of the split. X (DataFrame of features) and
# Y (Series of boolean targets) are built in the batch cell above.
is_below = X[feature] < threshold
is_above = X[feature] >= threshold

# The truth: share of positive targets on each side of the threshold.
# The mean of the boolean targets equals count(True) / (count(True) + count(False))
# but, unlike indexing value_counts() by label, it does not raise KeyError when
# one side of the split happens to contain a single class only.
print(f'P(y | x < t) = {Y[is_below].mean():.5f}')
print(f'P(y | x >= t) = {Y[is_above].mean():.5f}')
print()

# The knowledge: the three quantities needed to apply Bayes' rule.
# Among the positive samples, the fraction that falls below the threshold.
P_X_Y = is_below[Y].mean()
print(f'P(x < t | y) = {P_X_Y:.5f}')

P_Y = Y.mean()
print(f'P(y) = {P_Y:.5f}')

P_X = is_below.mean()
print(f'P(x < t) = {P_X:.5f}')
print()

# Bayes' rule on both sides of the threshold; the complements (1 - P_X_Y) and
# (1 - P_X) stand for P(x >= t | y) and P(x >= t).
print(f'P(y | x < t) ~ P(x < t | y) * P(y) / P(x < t) = {P_X_Y * P_Y / P_X:.5f}')
print(f'P(y | x >= t) ~ P(x >= t | y) * P(y) / P(x >= t) = {(1 - P_X_Y) * P_Y / (1 - P_X):.5f}')

P(y | x < t) = 0.05234
P(y | x >= t) = 0.67543

P(x < t | y) = 0.06041
P(y) = 0.39288
P(x < t) = 0.45347

P(y | x < t) ~ P(x < t | y) * P(y) / P(x < t) = 0.05234
P(y | x >= t) ~ P(x >= t | y) * P(y) / P(x >= t) = 0.67543


In [4]:
# Statistics the tree's root keeps for `feature`; `feature` and `threshold`
# are the values set in the previous cell.
# NOTE(review): presumably `ss.hists` maps each class to a histogram of the
# feature's values for that class -- confirm against the creme source.
ss = t.root.split_enums[feature]

# The truth, recomputed from the batch data for side-by-side comparison.
# The mean of the boolean targets equals count(True) / (count(True) + count(False))
# but does not raise KeyError when one side of the split holds a single class.
print(f'P(y | x < t) = {Y[X[feature] < threshold].mean():.5f}')
print(f'P(y | x >= t) = {Y[X[feature] >= threshold].mean():.5f}')
print()

# The knowledge: the same quantities, this time read from what the tree has
# accumulated while being fitted one sample at a time.
P_X_Y = ss.hists[True].cdf(x=threshold)
print(f'P(x < t | y) = {P_X_Y:.5f}')

P_Y = t.root.target_dist.pmf(True)
print(f'P(y) = {P_Y:.5f}')

# Law of total probability:
#   P(x < t) = P(x < t | y) * P(y) + P(x < t | not y) * P(not y)
# P_Y and P_X_Y are reused instead of querying the tree a second time
# (assumes pmf() and cdf() are side-effect free).
P_X_not_Y = ss.hists[False].cdf(x=threshold)
P_not_Y = t.root.target_dist.pmf(False)
P_X = P_Y * P_X_Y + P_not_Y * P_X_not_Y
print(f'P(x < t) = {P_X:.5f}')
print()

# Bayes' rule on both sides of the threshold; the complements (1 - P_X_Y) and
# (1 - P_X) stand for P(x >= t | y) and P(x >= t).
print(f'P(y | x < t) ~ P(x < t | y) * P(y) / P(x < t) = {P_X_Y * P_Y / P_X:.5f}')
print(f'P(y | x >= t) ~ P(x >= t | y) * P(y) / P(x >= t) = {(1 - P_X_Y) * P_Y / (1 - P_X):.5f}')

P(y | x < t) = 0.05234
P(y | x >= t) = 0.67543

P(x < t | y) = 0.06041
P(y) = 0.39288
P(x < t) = 0.44810

P(y | x < t) ~ P(x < t | y) * P(y) / P(x < t) = 0.05297
P(y | x >= t) ~ P(x >= t | y) * P(y) / P(x >= t) = 0.66886
