In [30]:
import pandas as pd
import numpy as np
from scipy.stats import binom

# Seed NumPy's global random state before drawing the simulated data.
np.random.seed(37)

# Two binary features and a binary outcome whose log-odds are linear in
# the features: logit P(y = 1) = 0.1 + 2.3 * x_0 - 3.4 * x_1.
N = 1_000
x_0 = np.random.binomial(1, 0.1, N)
x_1 = np.random.binomial(1, 0.5, N)
log_odds = 0.1 + 2.3 * x_0 - 3.4 * x_1
y = binom.rvs(1, 1 / (1 + np.exp(-log_odds)))

# Single frame holding features and outcome; later cells read it directly.
Xy = pd.DataFrame({'x_0': x_0, 'x_1': x_1, 'y': y})

# Feature matrix and target (note: `y` is rebound from ndarray to Series).
X = Xy[['x_0', 'x_1']]
y = Xy['y']
X.shape, y.shape

((1000, 2), (1000,))

In [61]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the two binary features. `fit` returns the
# estimator itself, so construction and fitting are chained.
m = LogisticRegression(
    n_jobs=-1,
    random_state=37,
    solver='saga',
).fit(X, y)

# Show the fitted intercept and coefficients.
m.intercept_, m.coef_

(array([0.19310115]), array([[ 2.09050255, -2.88194471]]))

In [90]:
def get_conditional_probs(m, data=None):
    """Compare a model's predicted class probabilities with observed frequencies.

    For each of the four combinations of the binary features ``x_0`` and
    ``x_1`` the result holds the probabilities returned by
    ``m.predict_proba`` next to the empirical conditional frequencies of
    ``y == 0`` and ``y == 1`` computed from ``data``.

    Parameters
    ----------
    m : object
        Fitted model exposing ``predict_proba``. It is called with a
        DataFrame that has the columns ``x_0`` and ``x_1``. Column 0 of the
        result is reported as ``m_p_0`` and column 1 as ``m_p_1`` (this
        presumes the model's classes are ordered ``[0, 1]`` -- confirm for
        models trained on other labels).
    data : pandas.DataFrame, optional
        Frame with the columns ``x_0``, ``x_1`` and ``y`` from which the
        empirical frequencies are computed. Defaults to the notebook-level
        ``Xy`` frame.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(x_0, x_1, y_0, y_1)`` (``y_0`` is always 0 and ``y_1``
        always 1) with the columns ``m_p_0``, ``d_p_0``, ``m_p_1`` and
        ``d_p_1``. A feature combination that never occurs in ``data`` gets
        ``NaN`` in the ``d_p_*`` columns instead of raising
        ``ZeroDivisionError``.
    """
    if data is None:
        data = Xy  # notebook-level frame built in the data-simulation cell

    keys = ['x_0', 'x_1']

    # Every combination of the two binary features, in a fixed order.
    grid = pd.DataFrame({'x_0': [0, 0, 1, 1], 'x_1': [0, 1, 0, 1]})

    # Model side: one predict_proba call, then slice out the two columns.
    proba = np.asarray(m.predict_proba(grid))
    m_pred = grid.assign(m_p_0=proba[:, 0], m_p_1=proba[:, 1])

    # Data side: the mean of an indicator within a group is the share of
    # rows in that group having the given outcome.
    d_pred = (
        data
        .assign(
            d_p_0=data['y'].eq(0).astype(float),
            d_p_1=data['y'].eq(1).astype(float),
        )
        .groupby(keys)[['d_p_0', 'd_p_1']]
        .mean()
        .reset_index()
    )

    # A left merge keeps all four grid rows in grid order; combinations
    # absent from the data end up with NaN frequencies.
    return (
        m_pred
        .merge(d_pred, on=keys, how='left')
        .assign(y_0=0, y_1=1)
        .set_index(['x_0', 'x_1', 'y_0', 'y_1'])
        [['m_p_0', 'd_p_0', 'm_p_1', 'd_p_1']]
    )

In [91]:
# Show predicted (m_p_*) next to observed (d_p_*) conditional probabilities
# for the model currently bound to `m`.
get_conditional_probs(m)

x_0,x_1,y_0,y_1,m_p_0,d_p_0,m_p_1,d_p_1
0,0,0,1,0.451874,0.448661,0.548126,0.551339
0,1,0,1,0.936365,0.9447,0.063635,0.0553
1,0,0,1,0.09249,0.068966,0.90751,0.931034
1,1,0,1,0.645277,0.633333,0.354723,0.366667


In [92]:
from sklearn.ensemble import RandomForestClassifier

# Rebind `m` to a 20-tree random forest trained on the same features.
# NOTE(review): the output recorded for this cell is a ValueError about an
# incompatible tree node dtype; that looks like a scikit-learn version
# mismatch in the environment rather than a problem in this code -- confirm.
m = RandomForestClassifier(
    n_estimators=20,
    n_jobs=-1,
    random_state=37,
)
m.fit(X, y)

ValueError: node array from the pickle has an incompatible dtype:
- expected: {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}
- got     : [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]