In [60]:
import pandas as pd
import os
import json
import xgboost as xgb
import numpy as np
from sklearn import metrics

In [43]:
# Load every parquet shard found under `data_dir` and stack them into one frame.
data_dir = "sample_data"
frames = []
# List `data_dir` itself: the files are read from there, so listing the current
# working directory would pick up the wrong names. Sorting keeps the row order
# of the concatenated frame stable across runs and machines.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".parquet"):
        continue
    frames.append(pd.read_parquet(os.path.join(data_dir, file), engine="pyarrow", use_threads=True))
if not frames:
    # Fail with a clear message instead of pd.concat's generic error on an empty list.
    raise FileNotFoundError(f"no .parquet files found in {data_dir!r}")
df = pd.concat(frames).reset_index(drop=True)

In [211]:
# Quick look at the `audio_status` column of the loaded frame.
df.loc[:, "audio_status"]

0          True
1          True
2         False
3          True
4          True
          ...  
205787     True
205788     True
205789     True
205790     True
205791     True
Name: audio_status, Length: 205792, dtype: bool

In [70]:
# Time-based split on the `time` column: rows before the cutoff go to training,
# rows on/after the cutoff are held out for evaluation.
split_cutoff = "2023-05-16"
train_df = df[df["time"] < split_cutoff]
eval_df = df[df["time"] >= split_cutoff]
eval_df.shape, train_df.shape

((56416, 119), (149376, 119))

In [45]:
# Target column(s) used as the training label.
label_cols = ["label"]

# Windowed counters: every prefix below exists once per time window, and the
# columns are ordered prefix-major (all windows of "like", then "dt_liker", ...).
_count_prefixes = (
    "like",
    "dt_liker",
    "cheer_spent",
    "comment",
    "dt_commenter",
    "cohost_req",
    "dt_cohost_req",
    "gifts",
    "dt_gifter",
    "share",
    "dt_shares",
    "follow",
)
_time_windows = ("2m", "4m", "8m", "16m", "30m", "1d")
_windowed_cols = [
    f"{prefix}_{window}"
    for prefix in _count_prefixes
    for window in _time_windows
]

# Columns that are not broken down by time window.
_aggregate_cols = ["act_vc", "agg_vc", "avg_ts"]

# Columns whose names start with "live-".
_live_cols = [
    "live-cross-feature-user-timespent-gifting-affinity-model-v1",
    "live-promotion_retool_Planned_Live",
    "live-cross-feature-affinity-model-v2",
    "live-similar-creator-affinity",
    "live-short-video-creator-affinity",
    "live-promotion_retool_Celeb_Live",
    "live-min-views-agency",
    "live-promotion_backend_super_power_creator",
    "live-promotion_retool_Creator_Push",
    "live-promotion_retool_Good_Content",
    "live-min-views",
    "live-popular",
    "live-popularity-embeddings-affinity-v3",
]

# Full feature list, in the order: windowed counters, aggregates, "live-" columns.
input_cols = _windowed_cols + _aggregate_cols + _live_cols

In [215]:
# Restrict the feature set to the two status columns; this replaces any
# previously defined `input_cols` for the cells that follow.
input_cols = ["audio_status", "video_status"]

In [216]:
# Slice features and labels for each split, then wrap them as XGBoost DMatrix objects.
train_data, train_labels = train_df[input_cols], train_df[label_cols]
val_data, val_labels = eval_df[input_cols], eval_df[label_cols]
dtrain = xgb.DMatrix(train_data, label=train_labels)
dval = xgb.DMatrix(val_data, label=val_labels)

# Datasets whose metrics are reported during training, with their display names.
evallist = [(dtrain, "train"), (dval, "eval")]

# Booster hyper-parameters: shallow trees, small learning rate, binary classifier.
params = dict(
    max_depth=2,
    eval_metric=["auc", "error", "logloss"],
    eta=0.01,
    objective="binary:logistic",
)

In [217]:
# Train the booster with early stopping, monitoring the sets in `evallist`.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    evals=evallist,
    num_boost_round=2000,
    # Larger than num_boost_round, so only the first and the final round are logged.
    verbose_eval=50000,
    early_stopping_rounds=100,
)

# `Booster.best_ntree_limit` was removed in XGBoost 2.0; derive the same value
# from `best_iteration` (0-based) so the cell runs on both old and new versions.
# NOTE(review): assumes the default num_parallel_tree=1 — confirm if params change.
best_ntree_limit = model.best_iteration + 1
print(f"model best iteration at {model.best_iteration}, best_score: {model.best_score} and best_ntree_limit: {best_ntree_limit} ")

# The native Booster.predict scores with every tree, including the rounds grown
# after the early-stopping optimum; restrict it to the best iteration explicitly.
preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
y_true = val_labels["label"].to_numpy()

# ROC AUC of the model on the evaluation split (displayed as the cell output).
fpr, tpr, thresholds = metrics.roc_curve(y_true, preds, pos_label=1)
metrics.auc(fpr, tpr)

[0]	train-auc:0.50636	train-error:0.10103	train-logloss:0.68681	eval-auc:0.49761	eval-error:0.09636	eval-logloss:0.68674
[1363]	train-auc:0.50822	train-error:0.10103	train-logloss:0.32725	eval-auc:0.51338	eval-error:0.09636	eval-logloss:0.31718
model best iteration at 1264, best_score: 0.31718258022476503 and best_ntree_limit: 1265 


0.5133794190164173

In [126]:
# Share of evaluation rows where the v2 cross-feature affinity score is positive.
# Read the column from `eval_df`: `val_data` only holds the columns listed in
# `input_cols`, so indexing it here raises KeyError whenever this score is not
# one of the model features.
np.mean(eval_df["live-cross-feature-affinity-model-v2"] > 0)

0.09564662507090187

In [65]:
# v3 auc: ROC AUC obtained by ranking the evaluation rows directly by the
# `live-popularity-embeddings-affinity-v3` score (no model involved).
# The score is taken from `eval_df` rather than `val_data`, because `val_data`
# only contains the columns in `input_cols` and may not include this one.
fpr, tpr, thresholds = metrics.roc_curve(y_true, eval_df["live-popularity-embeddings-affinity-v3"], pos_label=1)
metrics.auc(fpr, tpr)

0.539385716555945

In [72]:
# Restrict the evaluation split to rows with a non-zero v3 affinity score.
eval_df2 = eval_df[eval_df["live-popularity-embeddings-affinity-v3"] != 0]
val_data2 = eval_df2[input_cols]
val_labels2 = eval_df2[label_cols]
dval2 = xgb.DMatrix(val_data2, val_labels2)

# Model scores on the subset. The native Booster.predict uses every tree by
# default, so limit it to the early-stopping optimum explicitly.
preds = model.predict(dval2, iteration_range=(0, model.best_iteration + 1))

# Baseline AUC of the raw v3 score on the same subset. The score is read from
# `eval_df2` instead of `val_data2`, which only holds the `input_cols` columns
# and raises KeyError when the v3 score is not one of the model features.
fpr, tpr, thresholds = metrics.roc_curve(val_labels2["label"], eval_df2["live-popularity-embeddings-affinity-v3"], pos_label=1)
metrics.auc(fpr, tpr)

0.6237975925396698

In [73]:
# ROC AUC of the model scores in `preds` against the labels of the filtered subset.
subset_labels = val_labels2["label"]
fpr, tpr, thresholds = metrics.roc_curve(subset_labels, preds, pos_label=1)
metrics.auc(fpr, tpr)

0.6676674640920899