# Estimating Conditional Probabilities

## Binary variables

In [82]:
import pandas as pd
import numpy as np
from scipy.stats import binom

# Seed NumPy's global generator before drawing the simulated sample.
np.random.seed(37)

N = 1_000

# Two independent Bernoulli features: x_0 with p=0.1, x_1 with p=0.5.
x_0 = np.random.binomial(1, 0.1, N)
x_1 = np.random.binomial(1, 0.5, N)

# The binary label is drawn from a logistic model of both features.
log_odds = 0.1 + 2.3 * x_0 - 3.4 * x_1
y = binom.rvs(1, 1 / (1 + np.exp(-log_odds)))

Xy = pd.DataFrame(dict(x_0=x_0, x_1=x_1, y=y))

# Split into the feature frame and the label series (y is rebound to a Series here).
X = Xy[['x_0', 'x_1']]
y = Xy['y']
X.shape, y.shape

((1000, 2), (1000,))

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fit both classifiers on the same (X, y) and keep the fitted estimators under
# short keys: 'L' for logistic regression, 'R' for the random forest.
M = {
    key: estimator.fit(X, y)
    for key, estimator in (
        ('L', LogisticRegression(solver='saga', random_state=37, n_jobs=-1)),
        ('R', RandomForestClassifier(random_state=37, n_jobs=-1, n_estimators=20)),
    )
}

In [84]:
def get_conditional_probs(m, data=None):
    """Compare a model's predicted P(y | x_0, x_1) with the empirical frequencies.

    Parameters
    ----------
    m : fitted classifier
        Must expose ``predict_proba`` accepting a frame with columns ``x_0`` and
        ``x_1`` and returning one column per class. Column 0 is reported as
        ``m_p_0`` and column 1 as ``m_p_1``.
    data : pandas.DataFrame, optional
        Observations with columns ``x_0``, ``x_1`` and ``y``. When omitted, the
        notebook-level ``Xy`` is looked up at call time (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per (x_0, x_1) in {0, 1} x {0, 1}, indexed by
        ``['x_0', 'x_1', 'y_0', 'y_1']`` where ``y_0``/``y_1`` are the constant
        labels 0 and 1, with columns ``m_p_0, d_p_0, m_p_1, d_p_1``.
        ``d_p_k`` is the share of rows with that (x_0, x_1) whose ``y`` equals k;
        it is NaN when the combination never occurs in the data.
    """
    # Resolve the data source late so an explicit frame always wins over the global.
    frame = Xy if data is None else data

    grid = pd.DataFrame({'x_0': [0, 0, 1, 1], 'x_1': [0, 1, 0, 1]})

    # Model side: one predict_proba call for the whole grid.
    proba = np.asarray(m.predict_proba(grid))

    # Data side: the group mean of a 0/1 indicator is count(y == k) / count(group).
    indicators = frame[['x_0', 'x_1']].assign(
        d_p_0=frame['y'].eq(0).to_numpy(dtype=float),
        d_p_1=frame['y'].eq(1).to_numpy(dtype=float),
    )
    # Reindexing onto the grid keeps its row order and yields NaN for
    # combinations with no rows, instead of dividing by zero.
    empirical = (
        indicators
        .groupby(['x_0', 'x_1'])
        .mean()
        .reindex(pd.MultiIndex.from_frame(grid))
    )

    return (
        grid
        .assign(
            y_0=0,
            y_1=1,
            m_p_0=proba[:, 0],
            d_p_0=empirical['d_p_0'].to_numpy(),
            m_p_1=proba[:, 1],
            d_p_1=empirical['d_p_1'].to_numpy(),
        )
        .set_index(['x_0', 'x_1', 'y_0', 'y_1'])
    )

In [85]:
# Logistic regression: predicted (m_p_*) vs. empirical (d_p_*) conditional probabilities.
get_conditional_probs(M['L'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,m_p_0,d_p_0,m_p_1,d_p_1
x_0,x_1,y_0,y_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,1,0.451874,0.448661,0.548126,0.551339
0,1,0,1,0.936365,0.9447,0.063635,0.0553
1,0,0,1,0.09249,0.068966,0.90751,0.931034
1,1,0,1,0.645277,0.633333,0.354723,0.366667


In [86]:
# Random forest: predicted (m_p_*) vs. empirical (d_p_*) conditional probabilities.
get_conditional_probs(M['R'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,m_p_0,d_p_0,m_p_1,d_p_1
x_0,x_1,y_0,y_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,1,0.443748,0.448661,0.556252,0.551339
0,1,0,1,0.943741,0.9447,0.056259,0.0553
1,0,0,1,0.07087,0.068966,0.92913,0.931034
1,1,0,1,0.627743,0.633333,0.372257,0.366667


## Multinomial variables

In [88]:
# Draw each categorical feature as N one-hot rows (one multinomial trial per row).
x_0 = np.random.multinomial(1, [0.25, 0.35, 0.4], N)
x_1 = np.random.multinomial(1, [0.35, 0.4, 0.25], N)

# Logistic model over the stacked one-hot columns: the first three weights apply
# to x_0's columns, the last three to x_1's.
weights = np.array([-1, 0.5, 0.37, -0.4, 0.8, 0.1])
log_odds = 0.5 + np.hstack([x_0, x_1]).dot(weights)
y = binom.rvs(1, 1 / (1 + np.exp(-log_odds)))

# Collapse every one-hot row to the label of its active column.
level_names = {0: 'a', 1: 'b', 2: 'c'}
x_0 = pd.Series(x_0.argmax(axis=1)).map(level_names)
x_1 = pd.Series(x_1.argmax(axis=1)).map(level_names)

Xy = pd.DataFrame(dict(x_0=x_0, x_1=x_1, y=y))

# Rebinds the notebook-level Xy, X and y to the categorical data set.
X = Xy[['x_0', 'x_1']]
y = Xy['y']
X.shape, y.shape

((1000, 2), (1000,))

In [91]:
# Train the forest on the dummy (one-hot) encoding of the categorical features.
X_dummies = pd.get_dummies(X)
m = RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37)
m.fit(X_dummies, y)

In [100]:
import itertools

# Score every 0/1 assignment of the dummy columns. The grid width is taken from
# the encoded columns instead of a hard-coded 6, so the frame still builds if the
# number of levels changes.
# NOTE(review): this enumerates all 2**k rows, including rows that are not valid
# one-hot encodings (all zeros, or several levels of one feature set to 1).
dummy_columns = pd.get_dummies(X).columns
_X = pd.DataFrame(
    list(itertools.product([0, 1], repeat=len(dummy_columns))),
    columns=dummy_columns,
)
_y = m.predict_proba(_X)

In [112]:
# Attach the two predict_proba columns to the grid rows they were computed for.
_X.assign(
    y_0=_y[:, 0],
    y_1=_y[:, 1],
)

Unnamed: 0,x_0_a,x_0_b,x_0_c,x_1_a,x_1_b,x_1_c,y_0,y_1
0,0,0,0,0,0,0,0.431127,0.568873
1,0,0,0,0,0,1,0.479480,0.520520
2,0,0,0,0,1,0,0.288129,0.711871
3,0,0,0,0,1,1,0.342478,0.657522
4,0,0,0,1,0,0,0.480711,0.519289
...,...,...,...,...,...,...,...,...
59,1,1,1,0,1,1,0.357823,0.642177
60,1,1,1,1,0,0,0.474246,0.525754
61,1,1,1,1,0,1,0.456696,0.543304
62,1,1,1,1,1,0,0.343861,0.656139


In [116]:
# Share of observations in each class of y, ordered by class label.
y.value_counts().sort_index() / len(y)

0    0.369
1    0.631
Name: y, dtype: float64