In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklift.metrics import uplift_auc_score
from sklift.models import ClassTransformation

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Read the raw training table and show its first rows as the cell output.
train_csv = "data/train.csv"
df = pd.read_csv(train_csv)
df.head(5)

Unnamed: 0,application_1,cc_1,cc_2,cc_3,cc_4,feature_1,mb_1,cc_5,cc_6,feature_2,...,cc_21,application_15,feature_25,feature_26,cc_22,partner_24,application_16,retro_date,successful_utilization,treatment
0,,1.0,Representatives,123.0,Первичная ДК,1,14.0,147000.0,PLT,0.0,...,,0,1.0,0.0,-1.2,1.0,0,2024-07-04,0,1
1,,1.0,Offline,43.0,Airports,0,1.0,120000.0,PLT,0.0,...,1.0,0,1.0,0.0,-1.2,1.0,0,2024-06-06,0,1
2,0.0,1.0,Web,2.0,seo,0,,15000.0,PLT,0.0,...,,0,,0.0,-1.2,1.0,0,2024-07-21,1,1
3,0.0,1.0,MB,2.0,One Click Offer,0,91.0,260000.0,PLT,0.0,...,1.0,0,,1.0,-1.2,1.0,0,2024-05-23,0,1
4,0.0,1.0,Representatives,123.0,Первичная ДК,1,1.0,130000.0,PLT,0.0,...,,0,1.0,0.0,-1.2,1.0,0,2024-06-28,0,1


In [9]:
# Check the number of unique values in each column
# for col in df.columns:
#     print(col, len(df[col].value_counts()))

In [23]:
# Reload the raw training table and discard the columns excluded from modelling.
df = pd.read_csv("data/train.csv").drop(columns=["retro_date", "cc_1", "feature_6"])
# NOTE(review): the CSV's unnamed first column (shown as `Unnamed: 0` in the preview)
# is not dropped here, so it stays in the feature matrix — confirm that is intended.

# Separate the treatment flag and the outcome from the feature columns.
treatment = df["treatment"]
y = df["successful_utilization"]
features_raw = df.drop(columns=["treatment", "successful_utilization"])

# Heuristic: a column with fewer than 76 distinct non-null values is treated as categorical.
cat_features = [name for name in features_raw.columns if features_raw[name].nunique() < 76]
numeric_raw = features_raw.drop(columns=cat_features)

# Categorical part: fill gaps with the most frequent value, then cast everything to str
# so the values can be passed to CatBoost as categorical features.
cat_imp = SimpleImputer(strategy="most_frequent")
X_cat = pd.DataFrame(
    cat_imp.fit_transform(features_raw[cat_features]),
    columns=cat_features,
).astype(str)

# Numeric part: impute with SimpleImputer's default strategy, then standardise.
# NOTE(review): both imputers and the scaler are fitted on the full table, before the
# train/validation split performed in the next cell, so validation rows influence the
# preprocessing statistics.
another = SimpleImputer()
scaler = StandardScaler()
X_num = pd.DataFrame(
    scaler.fit_transform(another.fit_transform(numeric_raw)),
    columns=numeric_raw.columns,
)

# Final feature matrix: numeric columns first, categorical columns after them.
X = pd.concat([X_num, X_cat], axis=1)

# Disabled experiment: PCA projection of the feature matrix.
# n_components = 100
# pca = PCA(n_components=n_components)
# X = pca.fit_transform(X)



In [27]:
# Hold out 30% of the rows for validation. Stratification uses the treatment flag
# and the outcome together, passed as a two-column frame.
stratify_cols = pd.concat([treatment, y], axis="columns")

split_parts = train_test_split(
    X, treatment, y,
    test_size=0.3,
    random_state=42,
    stratify=stratify_cols,
)
# train_test_split returns a (train, validation) pair for every array passed in.
X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = split_parts

In [28]:
# Base classifier for the uplift model; progress is logged every 100 iterations.
catboost_params = dict(
    verbose=100,
    random_state=42,
    thread_count=4,
    cat_features=cat_features,
)
estimator = CatBoostClassifier(**catboost_params)

# Wrap the classifier in scikit-uplift's ClassTransformation and fit on the training part.
ct_model = ClassTransformation(estimator=estimator)
ct_model.fit(X=X_train, y=y_train, treatment=trmnt_train)

# Score the validation part and report the uplift AUC.
uplift_ct = ct_model.predict(X_val)
auuc = uplift_auc_score(y_true=y_val, uplift=uplift_ct, treatment=trmnt_val)
print(f"Uplift auc score: {auuc:.4f}")

Learning rate set to 0.10065
0:	learn: 0.6680698	total: 847ms	remaining: 14m 6s
100:	learn: 0.5356039	total: 53.1s	remaining: 7m 53s
200:	learn: 0.5264610	total: 1m 44s	remaining: 6m 54s
300:	learn: 0.5204786	total: 2m 34s	remaining: 5m 59s
400:	learn: 0.5158943	total: 3m 25s	remaining: 5m 6s
500:	learn: 0.5109628	total: 4m 16s	remaining: 4m 15s
600:	learn: 0.5066411	total: 5m 6s	remaining: 3m 23s
700:	learn: 0.5025869	total: 5m 56s	remaining: 2m 32s
800:	learn: 0.4989305	total: 6m 46s	remaining: 1m 41s
900:	learn: 0.4951979	total: 7m 35s	remaining: 50.1s
999:	learn: 0.4917199	total: 8m 24s	remaining: 0us
Uplift auc score: 0.0549


# Тест

In [347]:
# Final training run: refit the uplift model on the whole training table after
# rebalancing the outcome classes. The fitted `ct_model` and the three imputers
# (`imp_cc_2`, `imp_cc_4`, `imp_cc_6`) are reused by the prediction cell below.
SEED = 42
CAT_COLS = ["cc_2", "cc_4", "cc_6"]

df = pd.read_csv("data/train.csv")
df = df.drop(["retro_date", "cc_1", "feature_6", "application_16"], axis=1)
# NOTE(review): the CSV's unnamed first column (`Unnamed: 0`) is not dropped and is
# therefore used as a feature — confirm that is intended.

# Oversample the positive class by repeating each positive row `rat` times, where
# `rat` is the integer negative-to-positive ratio.
df_0 = df.loc[df["successful_utilization"] == 0]
df_1 = df.loc[df["successful_utilization"] == 1]
# Clamp to 1: when negatives are fewer than positives the integer ratio is 0, and
# repeating rows 0 times would silently remove every positive example.
rat = max(1, len(df_0) // len(df_1))
df_1 = df_1.loc[df_1.index.repeat(rat)]
# Shuffle with a fixed seed so the row order (and hence the fitted model) is
# reproducible, consistent with the seeded CatBoost estimator below.
df = pd.concat([df_0, df_1]).sample(frac=1, random_state=SEED)

# One most-frequent imputer per categorical column, fitted on the rebalanced table.
imp_cc_2 = SimpleImputer(strategy="most_frequent")
imp_cc_4 = SimpleImputer(strategy="most_frequent")
imp_cc_6 = SimpleImputer(strategy="most_frequent")
for col, imp in zip(CAT_COLS, (imp_cc_2, imp_cc_4, imp_cc_6)):
    # The imputer expects a 2-D array; flatten the single-column result back to 1-D.
    df[col] = imp.fit_transform(np.array(df[col]).reshape(-1, 1)).ravel()

X = df.drop(["successful_utilization", "treatment"], axis=1)
y = df["successful_utilization"]
treatment = df["treatment"]

estimator = CatBoostClassifier(verbose=100,
                               depth=7,
                               random_state=SEED,
                               iterations=600,
                               l2_leaf_reg=0.2,
                               thread_count=2,
                               cat_features=CAT_COLS)

ct_model = ClassTransformation(estimator=estimator)

ct_model.fit(
    X=X,
    y=y,
    treatment=treatment
)

0:	learn: 0.6901918	total: 470ms	remaining: 4m 41s
100:	learn: 0.6274498	total: 44.5s	remaining: 3m 39s
200:	learn: 0.6179452	total: 1m 28s	remaining: 2m 56s
300:	learn: 0.6124796	total: 2m 6s	remaining: 2m 5s
400:	learn: 0.6082984	total: 2m 42s	remaining: 1m 20s
500:	learn: 0.6044125	total: 3m 19s	remaining: 39.4s
599:	learn: 0.6006114	total: 3m 55s	remaining: 0us


In [348]:
# Load the test table and drop the same columns that were removed before training.
X_test = pd.read_csv("data/test.csv").drop(
    columns=["retro_date", "cc_1", "feature_6", "application_16"]
)

# Fill gaps in the categorical columns with the imputers fitted in the training cell.
fitted_imputers = (("cc_2", imp_cc_2), ("cc_4", imp_cc_4), ("cc_6", imp_cc_6))
for column, imputer in fitted_imputers:
    # The imputer expects a 2-D array; flatten the single-column result back to 1-D.
    X_test[column] = imputer.transform(X_test[column].to_numpy().reshape(-1, 1)).ravel()

# Predict uplift for every test row and write it out (row index included) as the submission.
test_pred = ct_model.predict(X_test)
submission = pd.DataFrame({"successful_utilization": test_pred})
submission.to_csv("data/submission.csv")

In [246]:
# Disabled: would write a constant baseline to data/submission_2.csv, with `target`
# set to 1 for every row of data/sample_submission2.csv.
# res = pd.read_csv("data/sample_submission2.csv").drop("Unnamed: 0", axis=1)
# res["target"] = np.ones((res.shape[0]), dtype=int)
# res.to_csv("data/submission_2.csv")