In [1]:
import numpy as np
import plotly.express as px
import polars as pl
import sys
sys.path.append('../../python')
import pdstools
settingsShowTSSpread=pl.DataFrame({"n": 5, "positives": range(1, 1001)}).with_columns(
    p=[0.01, 0.05, 0.1]
).explode("p").with_columns(evidence=pl.col("positives") / pl.col("p"))
def betaDistribution(structcol):
    return structcol.apply(
        lambda x: np.random.beta(
            x["p"] * x["evidence"], (1 - x["p"]) * x["evidence"], x["n"]
        ).tolist()
    )


thompsonSamplingSimulation = settingsShowTSSpread.with_columns(
    sampled_propensity=betaDistribution(pl.struct(["n", "p", "evidence"]))
).explode('sampled_propensity').with_columns(positives = pl.col('evidence')*pl.col('p'))

In [2]:
px.scatter(
    thompsonSamplingSimulation.to_pandas(),
    x="positives",
    y="sampled_propensity",
    color="p",
    opacity=0.6,
    labels={
        "sampled_propensity": "Sampled Propensity",
        "positives": "Number of positive responses in the Adaptive Model",
        "p":"Propensity"
    },
    range_y=[0, 0.2],
    title='Thompson Sampling',
    template="pega",
).update_coloraxes(showscale=False).update_traces(marker={"size":3})

In [3]:
s = thompsonSamplingSimulation['positives']
thompsonSamplingSimulation2 = thompsonSamplingSimulation.hstack(s.cut(bins=np.array(range(int(s.min()), int(s.max())+20, 20))-1).select(bin='category'))
s = thompsonSamplingSimulation2.groupby("p", "bin").agg(
    n=pl.count(),
    n90=(((pl.col("sampled_propensity") - pl.col("p")) / pl.col("p")) < 0.1).sum(),
    positives=pl.min("positives"),
).with_columns(pct=pl.col('n90')/pl.col('n')).sort('p', 'bin').with_columns(pl.col('p').cast(pl.Utf8).cast(pl.Categorical))


In [4]:
px.line(s.to_pandas(), x='positives', y='pct', color='p', template='none', line_group='p', title='Percentage of Sampled Propensities <br><sup>that are within 10% of the Model Propensities</sup>', labels={'pct':'Percentage', 'positives':'Percentage of positive responses in the Adaptive Model', 'p':'Propensity'})

In [5]:
settings1 = (
    pl.DataFrame({"n": 100000})
    .with_columns(
        p=[0.01, 0.05, 0.1], evidence=[2000, 200, 100000], ypeak=[200, 50, 500]
    )
    .explode(["p"])
    .explode("evidence")
    .explode("ypeak")
).with_columns(
    sampled_propensity=betaDistribution(pl.struct(["n", "p", "evidence"]))
).explode('sampled_propensity').with_columns(positives = pl.col('evidence')*pl.col('p'))


In [6]:
from scipy.stats import gaussian_kde
series = settings1.filter(pl.col('p')==0.01)['sampled_propensity']

In [7]:
results = {}
for p, series in settings1.groupby('p'):
    results[p] = gaussian_kde(series['sampled_propensity'])(np.arange(0,1,0.01))

In [8]:
settings1.groupby('p').select(pl.col('sampled_propensity').map(lambda x: gaussian_kde(x)(np.arange(0,1,0.01))))

ComputeError: AttributeError: 'numpy.ndarray' object has no attribute 'over'

In [None]:
results

{0.01: array([1.31418200e+00, 3.96054952e+02, 4.90726614e+00, 9.04439094e-01,
        1.66552267e-01, 2.50914726e-02, 1.05400074e-02, 4.60435676e-04,
        3.41421316e-56, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.

In [None]:
kernel = gaussian_kde(series)

In [None]:
px.area(kernel(np.arange(0,1,0.01)))