In [45]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy.special import expit

$$
Y_i^* = \alpha + \beta_1 \, \text{call}_i + \left(\beta_2^\top X_i\right) \text{call}_i + \theta^\top X_i
$$

$$
Y_i \sim \text{Bernoulli}(p_i), \qquad p_i = \dfrac{1}{1+e^{-Y_i^*}}
$$

In [59]:
# Simulation size: number of observations and number of covariates.
n = 100000
n_x = 10

# Seed NumPy's global generator; every draw below depends on this and on
# the order in which the draws are made.
np.random.seed(123)

# True parameters of the latent index:
#   alpha - intercept
#   beta1 - main effect of the call
#   beta2 - per-covariate interaction between the call and X
#   theta - per-covariate main effect of X
alpha = -2
beta1 = np.random.uniform(0, 1)
beta2 = np.random.uniform(-1, 1, n_x)
theta = np.random.uniform(-1, 1, n_x)

# Simulated data: a fair-coin treatment indicator and standard-normal covariates.
call = np.random.binomial(1, 0.5, n)
X = np.random.normal(0, 1, (n, n_x))

# Latent index, then a Bernoulli outcome whose success probability is expit(Y_star).
call_main_effect = beta1*call
call_interaction = call*np.dot(X, beta2)
covariate_effect = np.dot(X, theta)
Y_star = alpha + call_main_effect + call_interaction + covariate_effect
Y = np.random.binomial(1, expit(Y_star))

# One row per observation: outcome, treatment, then covariates x0..x{n_x-1}.
covariate_names = [f"x{i}" for i in range(n_x)]
outcome_frame = pd.DataFrame({"agreement": Y, "call": call})
covariate_frame = pd.DataFrame(X, columns=covariate_names)
df = outcome_frame.join(covariate_frame)

df

Unnamed: 0,agreement,call,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,0,1,1.564163,0.091429,1.388867,0.240799,0.154027,-1.050472,2.312805,-1.026806,0.210003,0.037489
1,1,1,-1.492450,-0.675429,-0.656971,1.023002,-0.774588,0.507497,0.355742,-0.437936,-1.080081,1.360770
2,0,1,-0.216074,-0.404754,0.179214,-0.342899,-0.524485,-1.193807,-0.040148,0.128821,-0.988354,-0.605981
3,0,1,0.002662,-0.354355,-1.624813,0.849674,-0.167226,-0.884153,-0.153367,0.392268,0.455826,0.756879
4,0,0,-0.148254,-0.579394,-0.736772,-1.012188,-0.126697,1.160276,0.846815,2.236606,-0.106393,0.547150
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,1,0.746418,0.523360,1.744976,-0.159363,-1.361788,-0.317357,0.700818,0.474245,1.420319,-1.049133
99996,0,1,0.548423,1.992458,-0.149136,0.393278,0.877541,1.400714,0.263317,-0.364506,-0.000415,1.163906
99997,0,1,-0.734112,-0.430433,0.103465,0.578520,1.374382,-0.380007,-1.023640,0.652050,1.027991,-0.488377
99998,0,1,-1.252847,-1.075358,-0.295104,-0.182125,-1.069035,0.266058,-0.549152,1.055482,-0.791112,-0.347339


In [47]:
# Column means of the simulated frame: outcome rate, treatment share, covariate means.
# NOTE(review): the stored output under this cell reports an agreement mean of ~0.85,
# while the data cell sets alpha = -2, shows mostly 0 outcomes in its preview, and
# carries a later execution count. The output looks stale; confirm by re-running
# the notebook top to bottom.
df.mean()

agreement    0.853980
call         0.500960
x0          -0.002751
x1           0.001546
x2          -0.003561
x3           0.003214
x4           0.001600
x5          -0.002346
x6           0.005613
x7          -0.002390
x8           0.004779
x9           0.001502
dtype: float64

## Criando um Modelo Preditivo

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

# Show the (rows, columns) of each partition.
(train.shape, test.shape)

((80000, 12), (20000, 12))

In [49]:
from lightgbm import LGBMClassifier

# Gradient-boosted classifier with the library's default hyperparameters.
clf = LGBMClassifier()

# Names of the covariate columns x0..x{n_x-1}. The count comes from n_x (set in
# the data-generation cell) instead of a literal 10, so the feature list stays
# in step with the simulated frame if the number of covariates changes.
# NOTE: this rebinds X, which previously held the covariate matrix, to a list
# of column names; later cells index DataFrames with it.
X = [f"x{i}" for i in range(n_x)]

# Fit on the covariates only; `call` is not among the features.
clf.fit(train[X], train["agreement"])

LGBMClassifier()

In [50]:
# Predicted probability of the positive class (column 1 of predict_proba)
# for each partition.
train_scores = clf.predict_proba(train[X])[:, 1]
test_scores = clf.predict_proba(test[X])[:, 1]

# Copies of each partition with the score attached as a `prediction` column.
train_pred = train.assign(prediction=train_scores)
test_pred = test.assign(prediction=test_scores)

train_pred.head()


Unnamed: 0,agreement,call,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,prediction
78689,1,1,-1.202416,-0.280416,-1.810579,-0.882309,0.622271,0.911877,0.44506,-0.191113,-0.577777,0.766118,0.956417
76423,1,1,-0.805696,-1.232675,-0.760623,0.221013,0.366721,0.143864,-2.010781,0.69328,0.571393,-0.752304,0.974702
86945,1,1,-0.850826,-0.982348,-1.233703,-2.310498,1.078682,0.586545,0.100963,-0.744785,-1.819578,-0.355326,0.971572
57427,1,0,-0.238013,0.522953,1.027834,-1.299981,-0.458112,1.211409,-1.098646,0.289592,-0.975876,1.028182,0.779198
34616,1,0,-0.854292,0.989608,-0.277672,0.67671,0.239035,0.961658,-1.529399,-0.285434,1.462993,-0.722879,0.927606


In [51]:
from sklearn.metrics import roc_auc_score

# Report ROC AUC on the training partition first, then on the held-out partition.
for label, scored in (("Train AUC:", train_pred), ("Test AUC:", test_pred)):
    print(label, roc_auc_score(scored["agreement"], scored["prediction"]))

Train AUC: 0.8181206278544151
Test AUC: 0.7642554212464638


# Priorizando

In [52]:
# Turn the predicted probability into 0-based ranks via a double argsort.
scores = test_pred["prediction"]

# Rank 0 = lowest predicted probability.
ascending_rank = np.argsort(scores).argsort()
# Rank 0 = highest predicted probability.
descending_rank = np.argsort(-1*scores).argsort()

test_ranked = test_pred.assign(
    lost_cause_1st=ascending_rank,
    sure_thing_1st=descending_rank,
)

In [53]:
# Display the test rows ordered from highest to lowest predicted probability
# (sure_thing_1st == 0 is the highest score).
test_ranked.sort_values(["sure_thing_1st"])

Unnamed: 0,agreement,call,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,prediction,lost_cause_1st,sure_thing_1st
67534,1,0,1.723651,1.145491,-2.714713,-0.137719,0.101587,-0.385853,-2.629268,-1.162119,-0.228837,0.011167,0.996588,19999,0
30330,1,1,0.148807,-1.065254,-2.152062,1.459757,-0.796461,-0.673396,-2.239162,-0.150769,0.781312,0.136687,0.995829,19998,1
25973,1,0,-0.939034,0.552453,-3.246118,1.284368,0.506694,-0.127436,-2.329455,-0.244106,-0.044151,0.541396,0.995442,19997,2
69081,1,1,-0.317000,-0.786124,-3.111926,0.140252,-0.002939,-0.573890,-1.536846,-0.033591,-0.339201,3.004093,0.995317,19996,3
65368,1,0,0.215321,-1.165824,-1.859961,-0.467916,-0.469505,-0.145950,-3.245424,-0.157003,0.086914,1.777500,0.995278,19995,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20142,0,0,-0.427885,1.111556,2.390521,0.178255,-1.384017,0.929317,1.725965,-0.947224,0.411649,-1.556848,0.202904,4,19995
71972,0,0,0.356397,2.696516,3.133174,-0.671220,0.435666,-1.480976,-0.172448,-1.495032,-1.101001,1.069223,0.201295,3,19996
33631,0,1,-0.235599,0.833632,3.398472,-0.106204,-1.968615,-0.192665,0.262232,-1.117790,-0.420411,-0.225667,0.188792,2,19997
54483,0,1,-1.010004,1.505689,1.095830,0.333214,-2.914065,-0.825166,0.115243,0.804730,1.473137,-2.341130,0.173514,1,19998


# Qual Ordenamento é Melhor? (Qini Curve)

In [54]:
# Most likely to agree: rows ranked 0-9999 by descending predicted probability.
# Agreement rate is computed separately for call == 0 and call == 1.
most_likely = test_ranked.query("sure_thing_1st<10000")
most_likely.groupby(["call"])["agreement"].mean()

call
0    0.935070
1    0.956687
Name: agreement, dtype: float64

In [55]:
# Difference in agreement rate (call == 1 minus call == 0) among the rows
# ranked 0-9999 by descending predicted probability. The first entry of
# .diff() is NaN by construction.
most_likely_rates = (
    test_ranked
    .query("sure_thing_1st<10000")
    .groupby(["call"])["agreement"]
    .mean()
)
most_likely_rates.diff()

call
0         NaN
1    0.021616
Name: agreement, dtype: float64

In [56]:
# Least likely to agree: rows ranked 10000 and above by descending predicted
# probability. The bound is inclusive (>=) so this group is the exact complement
# of the "sure_thing_1st<10000" group; a strict > would drop the row ranked
# 10000 from both halves.
test_ranked.query("sure_thing_1st>=10000").groupby(["call"])["agreement"].mean()

call
0    0.688608
1    0.846229
Name: agreement, dtype: float64

In [57]:
# Difference in agreement rate (call == 1 minus call == 0) among the rows
# ranked 10000 and above by descending predicted probability. The bound is
# inclusive (>=) so the row ranked exactly 10000 is not left out; a strict >
# would exclude it from both the "<10000" group and this one.
# The first entry of .diff() is NaN by construction.
test_ranked.query("sure_thing_1st>=10000").groupby(["call"])["agreement"].mean().diff()

call
0         NaN
1    0.157621
Name: agreement, dtype: float64