In [84]:
import numpy as np
import pandas as pd

beta = np.linspace(1, 9, 5).astype(int)
top_p = np.linspace(0.5, 1, 10)
pd.DataFrame(
    [
        [f"{entropy([p, 1 - p]) / beta:.0%}" for beta in beta]
        for p in top_p
    ],
    index=pd.Index([f"{p:.0%}" for p in top_p], name="top_p"),
    columns=pd.Index(beta, name="beta"),
)

beta,1,3,5,7,9
top_p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
50%,100%,33%,20%,14%,11%
56%,99%,33%,20%,14%,11%
61%,96%,32%,19%,14%,11%
67%,92%,31%,18%,13%,10%
72%,85%,28%,17%,12%,9%
78%,76%,25%,15%,11%,8%
83%,65%,22%,13%,9%,7%
89%,50%,17%,10%,7%,6%
94%,31%,10%,6%,4%,3%
100%,-0%,-0%,-0%,-0%,-0%


In [59]:
import math

def entropy(P):
    return -sum(p * math.log2(p) for p in P if p > 0)

entropy(P=[0.5, 0.5])

1.0

In [65]:
entropy(P=[0.33, 0.33, 0.33]) / math.log2(3)

0.9990567278352848

In [43]:
other_classes = 0.1
for p in np.linspace((1 - other_classes) / 2, 1 - other_classes, 10):
    best = p
    second_best = 1 - p
    beta_mean = second_best / (best + second_best)
    print(f"{best:>4.0%}", '\t', f"{second_best:>4.0%}", '\t', f"{beta_mean:>4.0%}")

 45% 	  55% 	  55%
 50% 	  50% 	  50%
 55% 	  45% 	  45%
 60% 	  40% 	  40%
 65% 	  35% 	  35%
 70% 	  30% 	  30%
 75% 	  25% 	  25%
 80% 	  20% 	  20%
 85% 	  15% 	  15%
 90% 	  10% 	  10%


# Active learning

## Online active learning

In [1]:
from river import active
from river import datasets
from river import feature_extraction
from river import linear_model
from river import metrics
from river import preprocessing

dataset = datasets.SMSSpam()
metric = metrics.Accuracy()
model = (
    feature_extraction.TFIDF(on='body') |
    linear_model.LogisticRegression()
)
model = active.EntropySampler(model, seed=42)

for x, y in dataset:
    y_pred, ask = model.predict_proba_one(x)
    metric.update(y, y_pred)
    if ask:
        model.learn_one(x, y)
    
metric

Accuracy: 86.54%

This is well summarized in the following schema from [Online Active Learning Methods for Fast Label-Efficient Spam Filtering](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=6fef6272cd72292e2f5a54d02d6e5352664e20cb).

<div align="center">
    <img width="50%" src="../img/online_active_learning.png" />
</div>

In [1]:
from river import linear_model

model = linear_model.LogisticRegression()
model

LogisticRegression (
  optimizer=SGD (
    lr=Constant (
      learning_rate=0.01
    )
  )
  loss=Log (
    weight_pos=1.
    weight_neg=1.
  )
  l2=0.
  l1=0.
  intercept_init=0.
  intercept_lr=Constant (
    learning_rate=0.01
  )
  clip_gradient=1e+12
  initializer=Zeros ()
)

## Reduce training time

#

## Production considerations