In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

  return f(*args, **kwds)
  return f(*args, **kwds)


In [5]:
def read_train_test():
    X_train = pd.read_csv(
        "../data/processed/two_models/X_train.csv", index_col="client_id"
    )
    y_train = pd.read_csv(
        "../data/processed/two_models/y_train.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    train_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_train_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_valid = pd.read_csv("../data/processed/two_models/X_valid.csv", index_col="client_id")
    y_valid = pd.read_csv(
        "../data/processed/two_models/y_valid.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    valid_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_valid_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_test = pd.read_csv("../data/processed/two_models/X_test.csv", index_col="client_id")

    return X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test


def join_train_validation(X_train, X_valid, y_train, y_valid):
    X_train = pd.concat([X_train, X_valid], ignore_index=False)
    y_train = pd.concat([y_train, y_valid], ignore_index=False)
    return X_train, y_train


def split_control_treatment(X, y, is_treatment):
    X_control = X[is_treatment == 0]
    X_treatment = X[is_treatment == 1]
    y_control = y[is_treatment == 0]
    y_treatment = y[is_treatment == 1]
    return X_control, X_treatment, y_control, y_treatment

In [3]:
def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Подсчет Uplift Score
    """
    try:
        order = np.argsort(-prediction.values)
    except Exception:
        order = np.argsort(-prediction)
    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()
    control_n = int((treatment == 0).sum() * rate)
    control_p = target[order][treatment[order] == 0][:control_n].mean()
    score = treatment_p - control_p
    return score

### Prepare data

In [77]:
X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test = read_train_test()
X_train, y_train = join_train_validation(X_train, X_valid, y_train, y_valid)
train_is_treatment = pd.concat([train_is_treatment, valid_is_treatment], ignore_index=False)

# Let's compare baselines

## Two models

In [8]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=142)

In [10]:
metrics = {
    "control_acc": [],
    "treatment_acc": [],
    "control_auc": [],
    "treatment_auc": [],
    "uplift": []
}

In [12]:
for i, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i + 1}/5")
    
    train_data = X_train.iloc[train_idx]
    train_target = y_train.iloc[train_idx]
    train_data_is_treatment = train_is_treatment.iloc[train_idx]

    test_data = X_train.iloc[test_idx]
    test_target = y_train.iloc[test_idx]
    test_data_is_treatment = train_is_treatment.iloc[test_idx]

    X_train_control, X_train_treatment, y_train_control, y_train_treatment = split_control_treatment(
        train_data, train_target, train_data_is_treatment
    )
    X_valid_control, X_valid_treatment, y_valid_control, y_valid_treatment = split_control_treatment(
        test_data, test_target, test_data_is_treatment
    )

    clf_control = xgb.XGBClassifier(objective="binary:logistic").fit(X_train_control, y_train_control)
    clf_treatment = xgb.XGBClassifier(objective="binary:logistic").fit(X_train_treatment, y_train_treatment)
    
    treatment_proba = clf_treatment.predict_proba(test_data)[:, 1]
    control_proba = clf_control.predict_proba(test_data)[:, 1]
    uplift_prediction = treatment_proba - control_proba
    up_score = uplift_score(uplift_prediction, test_target, test_data_is_treatment)

    metrics["control_acc"].append(clf_control.score(test_data, test_target))
    metrics["treatment_acc"].append(clf_treatment.score(test_data, test_target))
    metrics["control_auc"].append(roc_auc_score(test_target, control_proba))
    metrics["treatment_auc"].append(roc_auc_score(test_target, treatment_proba))
    metrics["uplift"].append(up_score)

print(metrics)
print(dict(zip(metrics.keys(), list(map(np.mean, metrics.values())))))

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
{'control_acc': [0.7117398585318303, 0.7148141668124672, 0.7164746169420352, 0.7117398585318303, 0.7148141668124672, 0.7164746169420352, 0.7123753343164946, 0.712850251206039], 'treatment_acc': [0.7124147066909945, 0.7144142567922217, 0.71544979628565, 0.7124147066909945, 0.7144142567922217, 0.71544979628565, 0.7140500412427825, 0.7148748968930437], 'control_auc': [0.7684184387577829, 0.7708315914299072, 0.7734410232872382, 0.7684184387577829, 0.7708315914299072, 0.7734410232872382, 0.7719671463905629, 0.7732457768281585], 'treatment_auc': [0.7698255958036173, 0.7718591765837445, 0.7736627033496883, 0.7698255958036173, 0.7718591765837445, 0.7736627033496883, 0.7716824753880323, 0.7749180171164807], 'uplift': [0.047263862765340614, 0.06438784866098163, 0.0656723612073557, 0.047263862765340614, 0.06438784866098163, 0.0656723612073557, 0.056691379936172126, 0.06466479755625015]}
{'control_acc': 0.7139103587618999, 'treatment_acc': 0.71418530720

## Leave "is_treatment" as feature

### eval_metric = log_loss

In [78]:
metrics = {
    "accuracy": [],
    "roc_auc": [],
    "logloss": [],
    "uplift": []
}

for i, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i + 1}/5")
    
    train_data = X_train.iloc[train_idx]
    train_target = y_train.iloc[train_idx]
    train_data["is_treatment"] = train_is_treatment.iloc[train_idx]
    
    test_data = X_train.iloc[test_idx]
    test_target = y_train.iloc[test_idx]
    test_data["is_treatment"] = train_is_treatment.iloc[test_idx]
    
    eval_set = [(train_data, train_target), (test_data, test_target)]
    clf = xgb.XGBClassifier(objective="binary:logistic").fit(
        train_data, train_target, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True
    )
    
    treatment_data = test_data.copy()
    treatment_data["is_treatment"] = 1
    
    control_data = test_data.copy()
    control_data["is_treatment"] = 1
    
    treatment_proba = clf.predict_proba(treatment_data)[:, 1]
    control_proba = clf.predict_proba(control_data)[:, 1]
    uplift_prediction = treatment_proba - control_proba
    up_score = uplift_score(uplift_prediction, test_target, test_data["is_treatment"])

    metrics["accuracy"].append(clf.score(test_data, test_target))
    metrics["roc_auc"].append(roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["logloss"].append(log_loss(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["uplift"].append(up_score)

print(metrics)
print(dict(zip(metrics.keys(), list(map(np.mean, metrics.values())))))

Fold 1/5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-logloss:0.670386	validation_1-logloss:0.670668
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.651754	validation_1-logloss:0.652297
[2]	validation_0-logloss:0.636294	validation_1-logloss:0.637089
[3]	validation_0-logloss:0.623491	validation_1-logloss:0.624521
[4]	validation_0-logloss:0.612788	validation_1-logloss:0.614026
[5]	validation_0-logloss:0.603771	validation_1-logloss:0.605206
[6]	validation_0-logloss:0.59622	validation_1-logloss:0.597848
[7]	validation_0-logloss:0.589635	validation_1-logloss:0.591409
[8]	validation_0-logloss:0.584145	validation_1-logloss:0.586047
[9]	validation_0-logloss:0.57937	validation_1-logloss:0.581395
[10]	validation_0-logloss:0.575403	validation_1-logloss:0.57761
[11]	validation_0-logloss:0.571907	validation_1-logloss:0.574245
[12]	validation_0-logloss:0.568975	validation_1-logloss:0.571425
[13

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-logloss:0.670539	validation_1-logloss:0.670655
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.652072	validation_1-logloss:0.652338
[2]	validation_0-logloss:0.63656	validation_1-logloss:0.636909
[3]	validation_0-logloss:0.623701	validation_1-logloss:0.624036
[4]	validation_0-logloss:0.612981	validation_1-logloss:0.613425
[5]	validation_0-logloss:0.603897	validation_1-logloss:0.604391
[6]	validation_0-logloss:0.596295	validation_1-logloss:0.596831
[7]	validation_0-logloss:0.589874	validation_1-logloss:0.590485
[8]	validation_0-logloss:0.584325	validation_1-logloss:0.585037
[9]	validation_0-logloss:0.579699	validation_1-logloss:0.580471
[10]	validation_0-logloss:0.575673	validation_1-logloss:0.576486
[11]	validation_0-logloss:0.572235	validation_1-logloss:0.57307
[12]	validation_0-logloss:0.56929	validation_1-logloss:0.570163
[13

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-logloss:0.670734	validation_1-logloss:0.6707
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.652402	validation_1-logloss:0.652324
[2]	validation_0-logloss:0.636987	validation_1-logloss:0.636741
[3]	validation_0-logloss:0.624207	validation_1-logloss:0.623748
[4]	validation_0-logloss:0.613522	validation_1-logloss:0.612964
[5]	validation_0-logloss:0.604608	validation_1-logloss:0.603952
[6]	validation_0-logloss:0.59698	validation_1-logloss:0.596161
[7]	validation_0-logloss:0.590546	validation_1-logloss:0.589608
[8]	validation_0-logloss:0.58499	validation_1-logloss:0.583967
[9]	validation_0-logloss:0.580355	validation_1-logloss:0.579274
[10]	validation_0-logloss:0.576346	validation_1-logloss:0.575243
[11]	validation_0-logloss:0.572917	validation_1-logloss:0.571809
[12]	validation_0-logloss:0.569999	validation_1-logloss:0.568851
[13]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-logloss:0.670444	validation_1-logloss:0.670503
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.651954	validation_1-logloss:0.652079
[2]	validation_0-logloss:0.636446	validation_1-logloss:0.636643
[3]	validation_0-logloss:0.623574	validation_1-logloss:0.623733
[4]	validation_0-logloss:0.612865	validation_1-logloss:0.613106
[5]	validation_0-logloss:0.603837	validation_1-logloss:0.604072
[6]	validation_0-logloss:0.596295	validation_1-logloss:0.596613
[7]	validation_0-logloss:0.589827	validation_1-logloss:0.590214
[8]	validation_0-logloss:0.584354	validation_1-logloss:0.584756
[9]	validation_0-logloss:0.579657	validation_1-logloss:0.580059
[10]	validation_0-logloss:0.575617	validation_1-logloss:0.576102
[11]	validation_0-logloss:0.572132	validation_1-logloss:0.572583
[12]	validation_0-logloss:0.569188	validation_1-logloss:0.569667


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-logloss:0.670519	validation_1-logloss:0.670425
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.652033	validation_1-logloss:0.651747
[2]	validation_0-logloss:0.636764	validation_1-logloss:0.636379
[3]	validation_0-logloss:0.623918	validation_1-logloss:0.623496
[4]	validation_0-logloss:0.613293	validation_1-logloss:0.612727
[5]	validation_0-logloss:0.604388	validation_1-logloss:0.603662
[6]	validation_0-logloss:0.596698	validation_1-logloss:0.595919
[7]	validation_0-logloss:0.590272	validation_1-logloss:0.589448
[8]	validation_0-logloss:0.584886	validation_1-logloss:0.583964
[9]	validation_0-logloss:0.580135	validation_1-logloss:0.579202
[10]	validation_0-logloss:0.576143	validation_1-logloss:0.575223
[11]	validation_0-logloss:0.572753	validation_1-logloss:0.571821
[12]	validation_0-logloss:0.569845	validation_1-logloss:0.568917


### eval_metric = roc_auc

In [79]:
X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test = read_train_test()
X_train, y_train = join_train_validation(X_train, X_valid, y_train, y_valid)
train_is_treatment = pd.concat([train_is_treatment, valid_is_treatment], ignore_index=False)

In [80]:
metrics = {
    "accuracy": [],
    "roc_auc": [],
    "logloss": [],
    "uplift": []
}

for i, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i + 1}/5")
    
    train_data = X_train.iloc[train_idx]
    train_target = y_train.iloc[train_idx]
    train_data["is_treatment"] = train_is_treatment.iloc[train_idx]
    
    test_data = X_train.iloc[test_idx]
    test_target = y_train.iloc[test_idx]
    test_data["is_treatment"] = train_is_treatment.iloc[test_idx]
    
    eval_set = [(train_data, train_target), (test_data, test_target)]
    clf = xgb.XGBClassifier(objective="binary:logistic").fit(
        train_data, train_target, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set, verbose=True
    )
    
    treatment_data = test_data.copy()
    treatment_data["is_treatment"] = 1
    
    control_data = test_data.copy()
    control_data["is_treatment"] = 1
    
    treatment_proba = clf.predict_proba(treatment_data)[:, 1]
    control_proba = clf.predict_proba(control_data)[:, 1]
    uplift_prediction = treatment_proba - control_proba
    up_score = uplift_score(uplift_prediction, test_target, test_data["is_treatment"])

    metrics["accuracy"].append(clf.score(test_data, test_target))
    metrics["roc_auc"].append(roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["logloss"].append(log_loss(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["uplift"].append(up_score)

print(metrics)
print(dict(zip(metrics.keys(), list(map(np.mean, metrics.values())))))

Fold 1/5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-auc:0.758057	validation_1-auc:0.753703
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.764214	validation_1-auc:0.759895
[2]	validation_0-auc:0.765238	validation_1-auc:0.760661
[3]	validation_0-auc:0.765768	validation_1-auc:0.761209
[4]	validation_0-auc:0.766442	validation_1-auc:0.761919
[5]	validation_0-auc:0.766404	validation_1-auc:0.761874
[6]	validation_0-auc:0.76685	validation_1-auc:0.762243
[7]	validation_0-auc:0.767393	validation_1-auc:0.76288
[8]	validation_0-auc:0.767632	validation_1-auc:0.763123
[9]	validation_0-auc:0.768045	validation_1-auc:0.763598
[10]	validation_0-auc:0.768277	validation_1-auc:0.763766
[11]	validation_0-auc:0.768709	validation_1-auc:0.764058
[12]	validation_0-auc:0.769025	validation_1-auc:0.764381
[13]	validation_0-auc:0.769241	validation_1-auc:0.764598
[14]	validation_0-auc:0.76937	validation_1-auc:0.764713
[

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-auc:0.756294	validation_1-auc:0.754358
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.760342	validation_1-auc:0.758299
[2]	validation_0-auc:0.764273	validation_1-auc:0.762514
[3]	validation_0-auc:0.765434	validation_1-auc:0.763983
[4]	validation_0-auc:0.766019	validation_1-auc:0.764488
[5]	validation_0-auc:0.766654	validation_1-auc:0.765202
[6]	validation_0-auc:0.766988	validation_1-auc:0.765609
[7]	validation_0-auc:0.767151	validation_1-auc:0.765758
[8]	validation_0-auc:0.76751	validation_1-auc:0.766116
[9]	validation_0-auc:0.767847	validation_1-auc:0.766424
[10]	validation_0-auc:0.768196	validation_1-auc:0.766824
[11]	validation_0-auc:0.768223	validation_1-auc:0.76683
[12]	validation_0-auc:0.768377	validation_1-auc:0.767094
[13]	validation_0-auc:0.768602	validation_1-auc:0.767284
[14]	validation_0-auc:0.768784	validation_1-auc:0.767484


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-auc:0.754518	validation_1-auc:0.756134
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.757855	validation_1-auc:0.759189
[2]	validation_0-auc:0.762202	validation_1-auc:0.764205
[3]	validation_0-auc:0.763622	validation_1-auc:0.766482
[4]	validation_0-auc:0.764862	validation_1-auc:0.767498
[5]	validation_0-auc:0.765136	validation_1-auc:0.767668
[6]	validation_0-auc:0.765802	validation_1-auc:0.768536
[7]	validation_0-auc:0.765871	validation_1-auc:0.768629
[8]	validation_0-auc:0.766476	validation_1-auc:0.769208
[9]	validation_0-auc:0.76665	validation_1-auc:0.769326
[10]	validation_0-auc:0.766977	validation_1-auc:0.769536
[11]	validation_0-auc:0.767171	validation_1-auc:0.769558
[12]	validation_0-auc:0.767434	validation_1-auc:0.769684
[13]	validation_0-auc:0.767861	validation_1-auc:0.77005
[14]	validation_0-auc:0.767979	validation_1-auc:0.7702
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-auc:0.75762	validation_1-auc:0.75618
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.757998	validation_1-auc:0.756567
[2]	validation_0-auc:0.763664	validation_1-auc:0.762709
[3]	validation_0-auc:0.765866	validation_1-auc:0.765494
[4]	validation_0-auc:0.7663	validation_1-auc:0.765639
[5]	validation_0-auc:0.766455	validation_1-auc:0.765857
[6]	validation_0-auc:0.766765	validation_1-auc:0.766043
[7]	validation_0-auc:0.767068	validation_1-auc:0.766361
[8]	validation_0-auc:0.767483	validation_1-auc:0.766797
[9]	validation_0-auc:0.767751	validation_1-auc:0.767057
[10]	validation_0-auc:0.767867	validation_1-auc:0.76711
[11]	validation_0-auc:0.767994	validation_1-auc:0.767325
[12]	validation_0-auc:0.768317	validation_1-auc:0.767608
[13]	validation_0-auc:0.768591	validation_1-auc:0.767903
[14]	validation_0-auc:0.768599	validation_1-auc:0.767983
[15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-auc:0.756887	validation_1-auc:0.758053
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.763224	validation_1-auc:0.765217
[2]	validation_0-auc:0.763281	validation_1-auc:0.764958
[3]	validation_0-auc:0.764411	validation_1-auc:0.765985
[4]	validation_0-auc:0.765219	validation_1-auc:0.766925
[5]	validation_0-auc:0.765371	validation_1-auc:0.767317
[6]	validation_0-auc:0.766155	validation_1-auc:0.768086
[7]	validation_0-auc:0.766723	validation_1-auc:0.768359
[8]	validation_0-auc:0.766721	validation_1-auc:0.768399
[9]	validation_0-auc:0.767071	validation_1-auc:0.768716
[10]	validation_0-auc:0.767174	validation_1-auc:0.768931
[11]	validation_0-auc:0.767239	validation_1-auc:0.768876
[12]	validation_0-auc:0.767384	validation_1-auc:0.768916
[13]	validation_0-auc:0.767689	validation_1-auc:0.769151
[14]	validation_0-auc:0.768043	validation_1-auc:0.76950

### eval_metric = uplift_score

In [51]:
def uplift_score_eval(predt: np.ndarray, dtrain: xgb.DMatrix, treatment):
    treatment = treatment[len(predt)]
    try:
        order = np.argsort(-predt.values)
    except Exception:
        order = np.argsort(-predt)
    rate = .3
    y = dtrain.get_label()
    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = y[order][treatment[order] == 1][:treatment_n].mean()
    control_n = int((treatment == 0).sum() * rate)
    control_p = y[order][treatment[order] == 0][:control_n].mean()
    score = treatment_p - control_p
    return "uplift-score", float(-score)

In [52]:
from functools import partial

In [55]:
metrics = {
    "accuracy": [],
    "roc_auc": [],
    "logloss": [],
    "uplift": []
}

for i, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i + 1}/5")
    
    train_data = X_train.iloc[train_idx]
    train_target = y_train.iloc[train_idx]
    train_data["is_treatment"] = train_is_treatment.iloc[train_idx]
    
    test_data = X_train.iloc[test_idx]
    test_target = y_train.iloc[test_idx]
    test_data["is_treatment"] = train_is_treatment.iloc[test_idx]
    
    eval_set = [(train_data, train_target), (test_data, test_target)]
    treatment = {
        len(train_data): train_is_treatment.iloc[train_idx],
        len(test_data): train_is_treatment.iloc[test_idx]
    }
    eval_uplift = partial(uplift_score_eval, treatment=treatment)

    clf = xgb.XGBClassifier(objective="binary:logistic").fit(
        train_data, train_target, early_stopping_rounds=10, eval_metric=eval_uplift, eval_set=eval_set, verbose=True
    )
    
    treatment_data = test_data.copy()
    treatment_data["is_treatment"] = 1
    
    control_data = test_data.copy()
    control_data["is_treatment"] = 1
    
    treatment_proba = clf.predict_proba(treatment_data)[:, 1]
    control_proba = clf.predict_proba(control_data)[:, 1]
    uplift_prediction = treatment_proba - control_proba
    up_score = uplift_score(uplift_prediction, test_target, test_data["is_treatment"])

    metrics["accuracy"].append(clf.score(test_data, test_target))
    metrics["roc_auc"].append(roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["logloss"].append(log_loss(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["uplift"].append(up_score)

print(metrics)
print(dict(zip(metrics.keys(), list(map(np.mean, metrics.values())))))

Fold 1/5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-error:0.293183	validation_1-error:0.297908	validation_0-uplift-score:-0.01767	validation_1-uplift-score:-0.014839
Multiple eval metrics have been passed: 'validation_1-uplift-score' will be used for early stopping.

Will train until validation_1-uplift-score hasn't improved in 10 rounds.
[1]	validation_0-error:0.288408	validation_1-error:0.291759	validation_0-uplift-score:-0.01621	validation_1-uplift-score:-0.023191
[2]	validation_0-error:0.288408	validation_1-error:0.291759	validation_0-uplift-score:-0.014557	validation_1-uplift-score:-0.017648
[3]	validation_0-error:0.288408	validation_1-error:0.291759	validation_0-uplift-score:-0.014851	validation_1-uplift-score:-0.020145
[4]	validation_0-error:0.288408	validation_1-error:0.291759	validation_0-uplift-score:-0.016306	validation_1-uplift-score:-0.017141
[5]	validation_0-error:0.288408	validation_1-error:0.291759	validation_0-uplift-score:-0.016267	validation_1-uplift-score:-0.019319
[6]	validation_0-error:0.288408	val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-error:0.297425	validation_1-error:0.300682	validation_0-uplift-score:-0.01478	validation_1-uplift-score:-0.026834
Multiple eval metrics have been passed: 'validation_1-uplift-score' will be used for early stopping.

Will train until validation_1-uplift-score hasn't improved in 10 rounds.
[1]	validation_0-error:0.291133	validation_1-error:0.291759	validation_0-uplift-score:-0.015532	validation_1-uplift-score:-0.026159
[2]	validation_0-error:0.289008	validation_1-error:0.28936	validation_0-uplift-score:-0.014868	validation_1-uplift-score:-0.02549
[3]	validation_0-error:0.288858	validation_1-error:0.289735	validation_0-uplift-score:-0.011798	validation_1-uplift-score:-0.022478
[4]	validation_0-error:0.288858	validation_1-error:0.289735	validation_0-uplift-score:-0.013629	validation_1-uplift-score:-0.021968
[5]	validation_0-error:0.288858	validation_1-error:0.289735	validation_0-uplift-score:-0.012216	validation_1-uplift-score:-0.021309
[6]	validation_0-error:0.288565	vali

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-error:0.294953	validation_1-error:0.293399	validation_0-uplift-score:-0.018289	validation_1-uplift-score:-0.013497
Multiple eval metrics have been passed: 'validation_1-uplift-score' will be used for early stopping.

Will train until validation_1-uplift-score hasn't improved in 10 rounds.
[1]	validation_0-error:0.294953	validation_1-error:0.293399	validation_0-uplift-score:-0.018289	validation_1-uplift-score:-0.014664
[2]	validation_0-error:0.292866	validation_1-error:0.291449	validation_0-uplift-score:-0.017747	validation_1-uplift-score:-0.014666
[3]	validation_0-error:0.289292	validation_1-error:0.28645	validation_0-uplift-score:-0.015247	validation_1-uplift-score:-0.011192
[4]	validation_0-error:0.289292	validation_1-error:0.28645	validation_0-uplift-score:-0.016664	validation_1-uplift-score:-0.014683
[5]	validation_0-error:0.289386	validation_1-error:0.287	validation_0-uplift-score:-0.015997	validation_1-uplift-score:-0.013684
[6]	validation_0-error:0.289061	valida

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-error:0.294341	validation_1-error:0.294548	validation_0-uplift-score:-0.018444	validation_1-uplift-score:-0.01418
Multiple eval metrics have been passed: 'validation_1-uplift-score' will be used for early stopping.

Will train until validation_1-uplift-score hasn't improved in 10 rounds.
[1]	validation_0-error:0.294341	validation_1-error:0.294548	validation_0-uplift-score:-0.018025	validation_1-uplift-score:-0.011984
[2]	validation_0-error:0.289698	validation_1-error:0.28775	validation_0-uplift-score:-0.016558	validation_1-uplift-score:-0.013429
[3]	validation_0-error:0.289573	validation_1-error:0.287875	validation_0-uplift-score:-0.017052	validation_1-uplift-score:-0.013781
[4]	validation_0-error:0.288755	validation_1-error:0.288974	validation_0-uplift-score:-0.016259	validation_1-uplift-score:-0.013133
[5]	validation_0-error:0.288755	validation_1-error:0.288974	validation_0-uplift-score:-0.0153	validation_1-uplift-score:-0.012127
[6]	validation_0-error:0.288755	valid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-error:0.294304	validation_1-error:0.293424	validation_0-uplift-score:-0.017181	validation_1-uplift-score:-0.014761
Multiple eval metrics have been passed: 'validation_1-uplift-score' will be used for early stopping.

Will train until validation_1-uplift-score hasn't improved in 10 rounds.
[1]	validation_0-error:0.289161	validation_1-error:0.288749	validation_0-uplift-score:-0.016807	validation_1-uplift-score:-0.014761
[2]	validation_0-error:0.289161	validation_1-error:0.288749	validation_0-uplift-score:-0.018635	validation_1-uplift-score:-0.011259
[3]	validation_0-error:0.289161	validation_1-error:0.288749	validation_0-uplift-score:-0.017182	validation_1-uplift-score:-0.012583
[4]	validation_0-error:0.289161	validation_1-error:0.288749	validation_0-uplift-score:-0.017932	validation_1-uplift-score:-0.011256
[5]	validation_0-error:0.289161	validation_1-error:0.288749	validation_0-uplift-score:-0.017307	validation_1-uplift-score:-0.013086
[6]	validation_0-error:0.288511	v

## Class transformation

In [81]:
X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test = read_train_test()
X_train, y_train = join_train_validation(X_train, X_valid, y_train, y_valid)
train_is_treatment = pd.concat([train_is_treatment, valid_is_treatment], ignore_index=False)

In [82]:
y_train.value_counts()

1    124002
0     76037
Name: target, dtype: int64

In [83]:
y_train[((train_is_treatment == 1) & (y_train == 1)) | ((train_is_treatment == 0) & (y_train == 0))].value_counts()

1    63639
0    39695
Name: target, dtype: int64

In [84]:
y_train[((train_is_treatment == 1) & (y_train == 0)) | ((train_is_treatment == 0) & (y_train == 1))].value_counts()

1    60363
0    36342
Name: target, dtype: int64

In [85]:
y_train.loc[((train_is_treatment == 1) & (y_train == 1)) | ((train_is_treatment == 0) & (y_train == 0))] = 1
y_train.loc[((train_is_treatment == 1) & (y_train == 0)) | ((train_is_treatment == 0) & (y_train == 1))] = 0
y_train.value_counts()

0    136400
1     63639
Name: target, dtype: int64

In [88]:
metrics = {
    "accuracy": [],
    "roc_auc": [],
    "logloss": [],
    "uplift": []
}

for i, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i + 1}/5")
    
    train_data = X_train.iloc[train_idx]
    train_target = y_train.iloc[train_idx]
    
    test_data = X_train.iloc[test_idx]
    test_target = y_train.iloc[test_idx]
    
    eval_set = [(train_data, train_target), (test_data, test_target)]
    clf = xgb.XGBClassifier(objective="binary:logistic").fit(
        train_data, train_target, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set, verbose=True
    )
    
    proba = clf.predict_proba(test_data)[:, 1]
    uplift_prediction = 2 * proba - 1
    up_score = uplift_score(uplift_prediction, test_target, train_is_treatment.iloc[test_idx])

    metrics["accuracy"].append(clf.score(test_data, test_target))
    metrics["roc_auc"].append(roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["logloss"].append(log_loss(test_target, clf.predict_proba(test_data)[:, 1]))
    metrics["uplift"].append(up_score)

print(metrics)
print(dict(zip(metrics.keys(), list(map(np.mean, metrics.values())))))

Fold 1/5
[0]	validation_0-auc:0.633641	validation_1-auc:0.642601
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.636137	validation_1-auc:0.64563
[2]	validation_0-auc:0.639011	validation_1-auc:0.648871
[3]	validation_0-auc:0.640374	validation_1-auc:0.650059
[4]	validation_0-auc:0.640773	validation_1-auc:0.650442
[5]	validation_0-auc:0.641618	validation_1-auc:0.651056
[6]	validation_0-auc:0.641792	validation_1-auc:0.651005
[7]	validation_0-auc:0.642196	validation_1-auc:0.65094
[8]	validation_0-auc:0.642269	validation_1-auc:0.65125
[9]	validation_0-auc:0.642507	validation_1-auc:0.651157
[10]	validation_0-auc:0.642705	validation_1-auc:0.651299
[11]	validation_0-auc:0.642991	validation_1-auc:0.65152
[12]	validation_0-auc:0.643318	validation_1-auc:0.651605
[13]	validation_0-auc:0.643424	validation_1-auc:0.6517
[14]	validation_0-auc:0.643689	validation_1-auc:0.65