In [17]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
def read_train_test():
    X_train = pd.read_csv(
        "../data/processed/two_models/X_train.csv", index_col="client_id"
    )
    y_train = pd.read_csv(
        "../data/processed/two_models/y_train.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    train_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_train_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_valid = pd.read_csv("../data/processed/two_models/X_valid.csv", index_col="client_id")
    y_valid = pd.read_csv(
        "../data/processed/two_models/y_valid.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    valid_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_valid_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_test = pd.read_csv("../data/processed/two_models/X_test.csv", index_col="client_id")

    return X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test


def join_train_validation(X_train, X_valid, y_train, y_valid):
    X_train = pd.concat([X_train, X_valid], ignore_index=False)
    y_train = pd.concat([y_train, y_valid], ignore_index=False)
    return X_train, y_train


def split_control_treatment(X, y, is_treatment):
    X_control = X[is_treatment == 0]
    X_treatment = X[is_treatment == 1]
    y_control = y[is_treatment == 0]
    y_treatment = y[is_treatment == 1]
    return X_control, X_treatment, y_control, y_treatment

In [4]:
def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Подсчет Uplift Score
    """
    order = np.argsort(-prediction)
    treatment_n = int((treatment == 1).sum() * rate)
    print(f"    number of treatment users: {treatment_n}")
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()
    print(f"    treatment p: {treatment_p}")
    control_n = int((treatment == 0).sum() * rate)
    print(f"    number of control users: {treatment_n}")
    control_p = target[order][treatment[order] == 0][:control_n].mean()
    print(f"    control p: {control_p}")
    score = treatment_p - control_p
    return score

In [5]:
X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test = read_train_test()

In [8]:
X_train.fillna(-999, inplace=True)
X_valid.fillna(-999, inplace=True)

In [9]:
X_train_control, X_train_treatment, y_train_control, y_train_treatment = split_control_treatment(
    X_train, y_train, train_is_treatment
)
X_valid_control, X_valid_treatment, y_valid_control, y_valid_treatment = split_control_treatment(
    X_valid, y_valid, valid_is_treatment
)

In [10]:
neigh_control = KNeighborsClassifier(n_neighbors=3)
neigh_control.fit(X_train_control, y_train_control)
neigh_control.score(X_valid_control, y_valid_control)

0.6014587525150905

In [11]:
neigh_treatment = KNeighborsClassifier(n_neighbors=3)
neigh_treatment.fit(X_train_treatment, y_train_treatment)
neigh_treatment.score(X_valid_treatment, y_valid_treatment)

0.622714626391097

In [13]:
predict_valid_control = neigh_control.predict_proba(X_valid)[:, 1]
predict_valid_treatment = neigh_treatment.predict_proba(X_valid)[:, 1]
predict_valid_uplift = predict_valid_treatment - predict_valid_control
uplift_score(predict_valid_uplift, valid_is_treatment, y_valid)

    number of treatment users: 6038
    treatment p: 0.6414375621066578
    number of control users: 6038
    control p: 0.6116700201207244


0.029767541985933454

In [18]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [19]:
X_train_control, X_train_treatment, y_train_control, y_train_treatment = split_control_treatment(
    X_train_scaled, y_train, train_is_treatment
)
X_valid_control, X_valid_treatment, y_valid_control, y_valid_treatment = split_control_treatment(
    X_valid_scaled, y_valid, valid_is_treatment
)

In [20]:
neigh_control = KNeighborsClassifier(n_neighbors=3)
neigh_control.fit(X_train_control, y_train_control)
neigh_control.score(X_valid_control, y_valid_control)

0.6213782696177063

In [21]:
neigh_treatment = KNeighborsClassifier(n_neighbors=3)
neigh_treatment.fit(X_train_treatment, y_train_treatment)
neigh_treatment.score(X_valid_treatment, y_valid_treatment)

0.6397058823529411

In [22]:
predict_valid_control = neigh_control.predict_proba(X_valid)[:, 1]
predict_valid_treatment = neigh_treatment.predict_proba(X_valid)[:, 1]
predict_valid_uplift = predict_valid_treatment - predict_valid_control
uplift_score(predict_valid_uplift, valid_is_treatment, y_valid)

    number of treatment users: 6038
    treatment p: 0.649387214309374
    number of control users: 6038
    control p: 0.6190476190476191


0.030339595261754893

In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [25]:
X_train_control, X_train_treatment, y_train_control, y_train_treatment = split_control_treatment(
    X_train_scaled, y_train, train_is_treatment
)
X_valid_control, X_valid_treatment, y_valid_control, y_valid_treatment = split_control_treatment(
    X_valid_scaled, y_valid, valid_is_treatment
)

In [26]:
neigh_control = KNeighborsClassifier(n_neighbors=3)
neigh_control.fit(X_train_control, y_train_control)
neigh_control.score(X_valid_control, y_valid_control)

0.6477867203219316

In [27]:
neigh_treatment = KNeighborsClassifier(n_neighbors=3)
neigh_treatment.fit(X_train_treatment, y_train_treatment)
neigh_treatment.score(X_valid_treatment, y_valid_treatment)

0.6598767885532592

In [28]:
predict_valid_control = neigh_control.predict_proba(X_valid)[:, 1]
predict_valid_treatment = neigh_treatment.predict_proba(X_valid)[:, 1]
predict_valid_uplift = predict_valid_treatment - predict_valid_control
uplift_score(predict_valid_uplift, valid_is_treatment, y_valid)

    number of treatment users: 6038
    treatment p: 0.6503809208347134
    number of control users: 6038
    control p: 0.6158618376928237


0.034519083141889784

In [23]:
n_neighbors_list = list(range(1, 26))

In [None]:
scores = {}
for k in n_neighbors_list:
    print(k)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    scores[str(k)] = knn.score(X_valid_scaled, y_valid)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_control = LogisticRegression()
lr_control.fit(X_train_control, y_train_control)
lr_control.score(X_valid_control, y_valid_control)

In [None]:
lr_treatment = LogisticRegression()
lr_treatment.fit(X_train_treatment, y_train_treatment)
lr_treatment.score(X_valid_treatment, y_valid_treatment)

In [None]:
predict_valid_control = lr_control.predict_proba(X_valid)[:, 1]
predict_valid_treatment = lr_treatment.predict_proba(X_valid)[:, 1]
predict_valid_uplift = predict_valid_treatment - predict_valid_control
uplift_score(predict_valid_uplift, valid_is_treatment, y_valid)