## フル特徴量で予測

In [1]:
from os.path import dirname
import os
import datetime
from dateutil.relativedelta import relativedelta
import sys
from functools import reduce

import pandas as pd
import swifter
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
# Show up to 200 columns when a DataFrame is displayed.
pd.options.display.max_columns = 200

In [3]:
%reload_ext autoreload
from brandear_est import utils, dataset as bds, models, evals as ev, submit
from brandear_est.preprocess import pp_est_rank as pp

In [4]:
# Data directories, built from the current working directory of the kernel.
_cwd = os.getcwd()
SUB_DIR = os.path.join(_cwd, "../../data/submit/")
IMD_DIR = os.path.join(_cwd, "../../data/intermediate/")
INPUT_DIR = os.path.join(_cwd, "../../data/input")

# Sub-directories of the intermediate data directory.
IMD_INPUT_DIR, IMD_EST_WEEKLY_DIR, IMD_ARCHIVE_DIR = (
    os.path.join(IMD_DIR, sub_dir)
    for sub_dir in ("pickled_inputs", "rank_weekly", "arc_rank")
)

# Four consecutive one-week windows starting 2019-09-03 00:00:00; the "newest"
# bound of each window equals the "oldest" bound of the next one
# (train -> valid_for_train -> valid_for_sub -> submission, ending 2019-10-01).
_first_oldest = datetime.datetime(2019, 9, 3, 0, 0, 0)
_one_week = datetime.timedelta(days=7)
dset_to_period = {
    dset_name: {
        "oldest": _first_oldest + week_no * _one_week,
        "newest": _first_oldest + (week_no + 1) * _one_week,
    }
    for week_no, dset_name in enumerate(
        ["train", "valid_for_train", "valid_for_sub", "submission"]
    )
}

## インプットデータ整理

In [5]:
def _read_input_pickle(file_name):
    """Load one pickled table from IMD_INPUT_DIR."""
    return pd.read_pickle(os.path.join(IMD_INPUT_DIR, file_name))


def _read_master_csv(file_name):
    """Load one CSV from INPUT_DIR through utils.read_csv."""
    return utils.read_csv(os.path.join(INPUT_DIR, file_name))


# Load the raw input tables.
auction = _read_input_pickle("auction.pkl")
watch = _read_input_pickle("watch.pkl")
bid = _read_input_pickle("bid.pkl")
bid_success = _read_input_pickle("bid_success.pkl")
sample_sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submission.csv"))
# Distinct users listed in the sample submission.
sub_users = sample_sub[["KaiinID"]].drop_duplicates()

# Master tables; the notes record what each one is meant to contribute
# (translated from the original comments).
# Brand creation date.
brand = _read_master_csv("brand.csv")
# ItemColorID (coarse colour category).
color = _read_master_csv("color.csv")
# CategoryID, plus the ItemShouID needed to attach ItemDaiID.
genre = _read_master_csv("genre.csv")
# ItemShouID and ItemDaiID.
itemshou = _read_master_csv("itemshou.csv")
# ItemLineID.
line = _read_master_csv("line.csv")
# Member registration date and date of birth.
kaiin = _read_master_csv("kaiin.csv")

# Enrich the auction table with the master tables.
auction_mst = pp.build_auction_mst(
    auction=auction,
    itemshou=itemshou,
    genre=genre,
    brand=brand,
    color=color,
    line=line,
)

Mem. usage decreased to  0.32 Mb (15.0% reduction)
Mem. usage decreased to  0.01 Mb (32.2% reduction)
Mem. usage decreased to  0.00 Mb (45.4% reduction)
Mem. usage decreased to  0.00 Mb (33.4% reduction)
Mem. usage decreased to  0.01 Mb (41.3% reduction)
Mem. usage decreased to  5.83 Mb (31.2% reduction)


## 特徴量エンジニアリング

In [8]:
# Settings for feature building.
# Dataset splits to build; the full list is kept for reference.
# dataset_types = ["valid_for_train", "valid_for_sub", "submission"]
dataset_types = ["valid_for_train"]

# Flag choosing which candidate source(s) to use: the product (Shouhin) linkage,
# the est_rank_weekly output, or both.
# inputs_type = ["Shouhin", "rank_weekly"]
inputs_type = ["Shouhin"]

# Rank thresholds handed to pp.build_target_candidate.
rank_th = 1600
rank_weekly_th = 1600

# Input tables bundled under the keys pp.build_target_candidate receives.
data_dict = dict(
    watch=watch,
    bid=bid,
    bid_success=bid_success,
    auction=auction_mst,
    sub_users=sub_users,
)

In [9]:
%%time
pp.build_target_candidate(
    dataset_types=dataset_types, inputs_type=inputs_type, data_dict=data_dict,
    dset_to_period=dset_to_period, rank_th=rank_th, rank_weekly_th=rank_weekly_th,
    input_est_weekly_dir=IMD_EST_WEEKLY_DIR, output_dir=IMD_ARCHIVE_DIR
)

['Shouhin']
##################
start cross count
['AuctionID', 'BrandID', 'ItemShouID', 'ShouhinID', ['KaiinID', 'AuctionID'], ['KaiinID', 'BrandID'], ['KaiinID', 'ItemShouID'], ['KaiinID', 'ShouhinID']]
AuctionID
BrandID
ItemShouID
ShouhinID
['KaiinID', 'AuctionID']
['KaiinID', 'BrandID']
['KaiinID', 'ItemShouID']
['KaiinID', 'ShouhinID']
##################
start cross count
['AuctionID', 'BrandID', 'ItemShouID', 'ShouhinID', ['KaiinID', 'AuctionID'], ['KaiinID', 'BrandID'], ['KaiinID', 'ItemShouID'], ['KaiinID', 'ShouhinID']]
AuctionID
BrandID
ItemShouID
ShouhinID
['KaiinID', 'AuctionID']
['KaiinID', 'BrandID']
['KaiinID', 'ItemShouID']
['KaiinID', 'ShouhinID']


in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…




in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


CPU times: user 1min 4s, sys: 33.2 s, total: 1min 37s
Wall time: 2min 17s


In [14]:
# Base feature table for the valid_for_train split, read from the archive directory.
# The file name carries a hard-coded timestamp, so it must match an existing archive file.
vt_feature_path = IMD_ARCHIVE_DIR + "/valid_for_train_feature_1600_202002231640.pkl"
vt_dataset_base = pd.read_pickle(vt_feature_path)
# vs_dataset_base = pd.read_pickle(IMD_ARCHIVE_DIR + "/valid_for_sub_feature_1600_202002231500.pkl")
# sub_dataset_base = pd.read_pickle(IMD_ARCHIVE_DIR + "/submission_feature_1600_202002231509.pkl")

# Dataset type -> base feature table to extend; the other splits are currently disabled.
add_confs = dict(
    valid_for_train=vt_dataset_base,
    # valid_for_sub=vs_dataset_base,
    # submission=sub_dataset_base,
)

def calc_timedelta(df, dtime_col, delta_col, oldest_dtime):
    """Add to ``df``, in place, the whole days between each timestamp and ``oldest_dtime``.

    The value written is ``(oldest_dtime - df[dtime_col]).days`` for every row:
    positive for timestamps before ``oldest_dtime``, floored towards negative
    infinity for timestamps after it, and NaN where the timestamp is missing.

    Args:
        df: DataFrame that holds ``dtime_col``; it is modified in place.
        dtime_col: Name of the datetime column to measure from.
        delta_col: Name of the column that receives the day counts.
        oldest_dtime: Reference datetime the timestamps are subtracted from.

    Returns:
        None. The result is stored in ``df[delta_col]``.
    """
    # Vectorized datetime arithmetic replaces a row-by-row Python lambda; it gives
    # the same day counts without the per-row overhead and also copes with an
    # empty frame.
    elapsed = pd.Timestamp(oldest_dtime) - pd.to_datetime(df[dtime_col])
    df[delta_col] = elapsed.dt.days
    
# For every configured dataset type: add per-user activity counts, mean
# watch/bid intervals and three ratio features to the base feature table, then
# hand the result to be.df2pkl under a timestamped file name.
for dset_type, dataset_base  in add_confs.items():
    print(dset_type)
    # NOTE(review): `arrange_inputs` is neither defined nor imported in the visible
    # notebook, so this raises NameError on a fresh kernel - presumably it lives in
    # the preprocess module; confirm and qualify the call.
    # NOTE(review): the raw `auction` table is passed here, not `auction_mst` - confirm
    # that this is intended.
    watch_arranged, bid_arranged, bid_success_arranged, auction_arranged = (
        arrange_inputs(watch=watch, bid=bid, bid_success=bid_success, auction=auction, period=dset_to_period[dset_type])
    )
    # Number of rows per KaiinID in the arranged watch / bid tables.
    # These two names stay in the global namespace after the loop; a later
    # diagnostics cell merges on them, so they reflect the LAST dset_type processed.
    watch_cnt = watch_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "watch_cnt"})
    bid_cnt = bid_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "bid_cnt"})
    # Left-join the counts onto the feature table. The trailing fillna(0) is applied
    # to the whole merged frame, so NaN in any other column becomes 0 as well.
    dataset_base = dataset_base.merge(bid_cnt, on="KaiinID", how="left").merge(watch_cnt, on="KaiinID", how="left").fillna(0)
    
    # Adds "watch_elapsed_days" to watch_arranged in place: whole days from each
    # TourokuDate to the period's "oldest" bound.
    calc_timedelta(watch_arranged, "TourokuDate", "watch_elapsed_days", dset_to_period[dset_type]["oldest"])
    # Per user: mean difference between consecutive elapsed-day values after an
    # ascending sort. A user with a single row yields NaN (no consecutive pair).
    watch_interval = (
        watch_arranged[["KaiinID", "watch_elapsed_days"]]
        .sort_values(["KaiinID", "watch_elapsed_days"])
        .groupby("KaiinID")["watch_elapsed_days"].apply(lambda s: (s - s.shift()).mean())
    )    
    # Users with no interval (absent from the table or NaN) get the sentinel 999.
    dataset_base["watch_interval"] = dataset_base["KaiinID"].map(watch_interval).fillna(999)
    
    # Same interval feature for bids, measured from ShudouNyuusatsuDate.
    calc_timedelta(bid_arranged, "ShudouNyuusatsuDate", "bid_elapsed_days", dset_to_period[dset_type]["oldest"])
    bid_interval = (
        bid_arranged[["KaiinID", "bid_elapsed_days"]]
        .sort_values(["KaiinID", "bid_elapsed_days"])
        .groupby("KaiinID")["bid_elapsed_days"].apply(lambda s: (s - s.shift()).mean())
    )    
    dataset_base["bid_interval"] = dataset_base["KaiinID"].map(bid_interval).fillna(999)
    
    
    # Ratio features relative to "Auction_elapsed_days". Both that column and
    # "KaiinID_watch_elapsed_day_min" must already exist in the loaded feature table.
    # NOTE(review): fillna(0) only replaces NaN (e.g. 0/0); a non-zero numerator over
    # a zero Auction_elapsed_days gives +/-inf, which is kept - confirm this is acceptable.
    dataset_base["left_day_interval_rate"] = (
        dataset_base["KaiinID_watch_elapsed_day_min"] / dataset_base["Auction_elapsed_days"]
    ).fillna(0)
    dataset_base["left_day_watch_interval_rate"] = (
        dataset_base["watch_interval"] / dataset_base["Auction_elapsed_days"]
    ).fillna(0)
    dataset_base["left_day_bid_interval_rate"] = (
        dataset_base["bid_interval"] / dataset_base["Auction_elapsed_days"]
    ).fillna(0)
    
    # Minute-resolution timestamp used in the output file name.
    now = datetime.datetime.now().strftime("%Y%m%d%H%M")
    # NOTE(review): `be` is not defined or imported in the visible notebook (the
    # package modules are imported as utils/bds/models/ev/submit/pp) - presumably a
    # stale alias; confirm where df2pkl lives now.
    be.df2pkl(dataset_base, IMD_ARCHIVE_DIR, f"{dset_type}_feature_{rank_th}_cnts_{now}.pkl")

valid_for_train


## 予測

In [6]:
# Feature tables with the per-user count/interval columns, read from the archive
# directory; the file names carry hard-coded timestamps.
# vt_dataset_base = pd.read_pickle(IMD_ARCHIVE_DIR + "/valid_for_train_feature_1600_cnts_202002231645.pkl")
vs_dataset_base, sub_dataset_base = (
    pd.read_pickle(IMD_ARCHIVE_DIR + archived_name)
    for archived_name in (
        "/valid_for_sub_feature_1600_cnts_202002231516.pkl",
        "/submission_feature_1600_cnts_202002231517.pkl",
    )
)

In [7]:
# Auction attribute ID columns.
auc_foreign_ids = [
    "BrandID", "CategoryID", "ColorID", "ConditionID", "DanjobetsuID",
    "GenreGroupID", "GenreID", "ItemColorID", "ItemDaiID", "ItemLineID",
    "ItemShouID", "LineID", "ShouhinID", "ShouhinShubetsuID",
]
# Columns passed to bds.DataSet as drop_cols: keys, dates, rank, both action
# flags, the per-user counts and every auction attribute ID.
drop_cols = [
    "KaiinID", "AuctionID", "BrandCreateDate", "LineCreateDate", "CreateDate",
    "rank", "watch_actioned", "bid_actioned", "watch_cnt", "bid_cnt",
    *auc_foreign_ids,
]

# Users are split by activity: "private" rows have fewer than 1000 watches and
# fewer than 50 bids; "business" rows are the complement.
_PRIVATE_USERS = "(watch_cnt < 1000) & (bid_cnt < 50)"
_BUSINESS_USERS = "(watch_cnt >= 1000) | (bid_cnt >= 50)"


def _segment_dataset(base, user_query):
    """Wrap the rows of ``base`` matching ``user_query`` in a bds.DataSet."""
    return bds.DataSet(
        data=base.query(user_query), drop_cols=drop_cols, target_col="watch_actioned"
    )


# vt_private_dataset = DataSet(data=vt_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)"),
#                                 drop_cols=drop_cols, target_col="watch_actioned")
# vt_business_dataset = DataSet(data=vt_dataset_base.query("(watch_cnt >= 1000) | (bid_cnt >= 50)"),
#                                 drop_cols=drop_cols, target_col="watch_actioned")

vs_private_dataset = _segment_dataset(vs_dataset_base, _PRIVATE_USERS)
vs_business_dataset = _segment_dataset(vs_dataset_base, _BUSINESS_USERS)

sub_private_dataset = _segment_dataset(sub_dataset_base, _PRIVATE_USERS)
sub_business_dataset = _segment_dataset(sub_dataset_base, _BUSINESS_USERS)


In [19]:
%%time

target_col = "watch_actioned"

datasets_conf = {
    "private": {"in_train_dataset": vs_private_dataset, "sub_dataset": sub_private_dataset,
    "sample_posi": 4, "sample_nega": 400},
    "business": {"in_train_dataset": vs_business_dataset, "sub_dataset": sub_business_dataset,
    "sample_posi": 40, "sample_nega": 600}
}

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    "ndcg_at": 20,
    "nround": 500,
    "learning_rate": 0.01,
    "max_depth": 6,
    "num_leaves": 127
}

ensemble_pri_buf = {}
ensemble_bus_buf = {}
ensemble_buf = {
    "private": ensemble_pri_buf, "business": ensemble_bus_buf
}

rank_buf = {}

for dataset_name, dataset_conf in datasets_conf.items():

    in_train_dataset = dataset_conf["in_train_dataset"]
    sub_dataset = dataset_conf["sub_dataset"]

    seed_buf = {}
    
    for seed in range(5):
        # train test split
        sampled_train = bds.DataSet(
            pd.concat([
                in_train_dataset.data.query("(watch_actioned == 1) | (bid_actioned == 1)").sample(frac=1)
                .groupby("KaiinID").head(dataset_conf["sample_posi"]),
                in_train_dataset.data.query("(watch_actioned != 1) & (bid_actioned != 1)").sample(frac=1)
                .groupby("KaiinID").head(dataset_conf["sample_nega"]),
            ]),
            drop_cols=drop_cols, target_col=target_col
        )
        # target encoding
        cat_cols = ["BrandID", "ItemShouID", "AuctionID"]
        for cat_col in cat_cols:
            sampled_train.add_target_encode(cat_col=cat_col)

        params["random_state"] = seed
        lgb_rank = models.LgbLambdaLank(params=params)
        lgb_rank.train(train_dataset=sampled_train, desc=True)

        for cat_col in cat_cols:
            bds.target_encode_for_test(
                train_dataset=in_train_dataset, test_dataset=sub_dataset, cat_col=cat_col
            )
        
        seed_buf[seed] = lgb_rank.predict(sub_dataset)

    sampled_train = bds.DataSet(
        pd.concat([
            in_train_dataset.data.query("(watch_actioned == 1) | (bid_actioned == 1)").sample(frac=1)
            .groupby("KaiinID").head(dataset_conf["sample_posi"]),
            in_train_dataset.data.query("(watch_actioned != 1) & (bid_actioned != 1)").sample(frac=1)
            .groupby("KaiinID").head(dataset_conf["sample_nega"]),
        ]),
        drop_cols=drop_cols, target_col=target_col
    )        
    
    for cat_col in cat_cols:
        sampled_train.add_target_encode(cat_col=cat_col)        
        bds.target_encode_for_test(
            train_dataset=in_train_dataset, test_dataset=sub_dataset, cat_col=cat_col
        )

    lgb_bin = models.LgbBinaryClassifier()
    lgb_bin.train(train_dataset=sampled_train)
    ensemble_buf[dataset_name]["lgb_bin"] = lgb_bin.predict(dataset=sub_dataset)
                
    rfc_bin = models.RfcBinaryClassifier()
    rfc_bin.train(train_dataset=sampled_train)
    ensemble_buf[dataset_name]["rfc_bin"] = rfc_bin.predict(dataset=sub_dataset)
        
    ensemble_buf[dataset_name]["lgb_rank"] = reduce(lambda a,b: a+b, [v for v in seed_buf.values()])



##################
                                             importance
SaishuppinKaisuu                                    204
SankouKakaku                                         43
watch_AuctionID_cnt                                 245
watch_BrandID_cnt                                   270
watch_ItemShouID_cnt                                 95
watch_ShouhinID_cnt                                 144
watch_KaiinID_AuctionID_cnt_x                        52
watch_KaiinID_BrandID_cnt                            86
watch_KaiinID_ItemShouID_cnt                        106
watch_KaiinID_ShouhinID_cnt                         125
bid_AuctionID_cnt                                    82
bid_BrandID_cnt                                     360
bid_ItemShouID_cnt                                   65
bid_ShouhinID_cnt                                    43
bid_KaiinID_AuctionID_cnt                             0
bid_KaiinID_BrandID_cnt                             118
bid_KaiinID_ItemShouID_cnt 



##################
                                             importance
SaishuppinKaisuu                                    130
SankouKakaku                                         57
watch_AuctionID_cnt                                 253
watch_BrandID_cnt                                   318
watch_ItemShouID_cnt                                124
watch_ShouhinID_cnt                                 178
watch_KaiinID_AuctionID_cnt_x                        41
watch_KaiinID_BrandID_cnt                           113
watch_KaiinID_ItemShouID_cnt                        136
watch_KaiinID_ShouhinID_cnt                         127
bid_AuctionID_cnt                                    82
bid_BrandID_cnt                                     317
bid_ItemShouID_cnt                                   35
bid_ShouhinID_cnt                                    38
bid_KaiinID_AuctionID_cnt                             0
bid_KaiinID_BrandID_cnt                              90
bid_KaiinID_ItemShouID_cnt 



##################
                                             importance
SaishuppinKaisuu                                    280
SankouKakaku                                         78
watch_AuctionID_cnt                                 141
watch_BrandID_cnt                                   244
watch_ItemShouID_cnt                                 63
watch_ShouhinID_cnt                                 164
watch_KaiinID_AuctionID_cnt_x                       121
watch_KaiinID_BrandID_cnt                           325
watch_KaiinID_ItemShouID_cnt                        153
watch_KaiinID_ShouhinID_cnt                         120
bid_AuctionID_cnt                                    65
bid_BrandID_cnt                                     279
bid_ItemShouID_cnt                                   48
bid_ShouhinID_cnt                                    55
bid_KaiinID_AuctionID_cnt                             0
bid_KaiinID_BrandID_cnt                             167
bid_KaiinID_ItemShouID_cnt 

In [20]:
# Blend the models per segment: the summed per-seed ranking predictions
# ("lgb_rank") plus 5 times the sum of the two classifier predictions.
for _segment, _sub_ds in (
    ("private", sub_private_dataset),
    ("business", sub_business_dataset),
):
    _scores = ensemble_buf[_segment]
    _sub_ds.set_pred(
        _scores["lgb_rank"] + 5 * (_scores["lgb_bin"] + _scores["rfc_bin"])
    )

In [21]:
# Stack the scored rows of both segments and expose the prediction as "score".
pred_cols = ["KaiinID", "AuctionID", "pred"]
pred = pd.concat(
    [sub_private_dataset.data[pred_cols], sub_business_dataset.data[pred_cols]]
).rename(columns={"pred": "score"})

# Shape the scores into the submission layout for the sample-submission users.
sub = submit.adjust_sub_form(sub_users, pred, drop=True)
# sub.to_csv(SUB_DIR + datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)
sub.head()

Unnamed: 0,KaiinID,AuctionID
865,24,910664
77,24,24763
520,24,302925
610,24,426583
1559,24,4084555


### 予測結果可視化

### 過去データセットでndcg計算

In [233]:
# y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
# sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(),
#                       pd.concat([vs_private_dataset.data[["KaiinID", "AuctionID", "pred"]], vs_business_dataset.data[["KaiinID", "AuctionID", "pred"]]])
#                       .rename(columns={"pred":"score"}), drop=True
# )
# ndcg_score = be.calc_ndcg(y_true, sub)
# print("ndcg_score : ", ndcg_score)

ndcg_score :  0.061767156609773444


### ユーザー毎のdcgとその他の変数の関係性を調査

In [33]:
# Actual actions in the valid_for_sub window.
y_true = pp.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])

# NOTE(review): no visible cell calls set_pred on the vs_* datasets, so the "pred"
# column used here depends on state from another run - confirm before relying on it.
vs_pred_cols = ["KaiinID", "AuctionID", "pred"]
vs_scores = pd.concat(
    [vs_private_dataset.data[vs_pred_cols], vs_business_dataset.data[vs_pred_cols]]
).rename(columns={"pred":"score"})

# Note: this rebinds `sub`, replacing the submission frame built earlier.
sub = submit.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(), vs_scores, drop=True)

# DCG values from ev.calc_dcgs, with the index moved back into columns.
dcgs = ev.calc_dcgs(y_true, sub).reset_index()

In [34]:
# Join per-user DCG with activity counts, summed target actions and the
# "cheat" DCG, then print summary statistics for several user subsets.
# NOTE(review): `cheat_dcgs` is not defined in the visible notebook, and
# `watch_cnt` / `bid_cnt` are leftovers from the feature-building loop (computed
# for whichever dataset type ran last) - confirm they match the valid_for_sub period.
_action_totals = y_true.drop("AuctionID", axis=1).groupby("KaiinID", as_index=False).sum()

dcg_infos = dcgs
for _extra in (
    watch_cnt,
    bid_cnt,
    _action_totals,
    cheat_dcgs.rename(columns={"score": "cheat_score"}),
):
    dcg_infos = dcg_infos.merge(_extra, on="KaiinID", how="left")
dcg_infos = dcg_infos.fillna(0)

print(dcg_infos.corr())
# Zero-score users, scored users, then the private and business segments.
for _subset in (
    "score == 0",
    "score != 0",
    "(watch_cnt < 1000) & (bid_cnt < 50)",
    "(watch_cnt >= 1000) | (bid_cnt >= 50)",
):
    print(dcg_infos.query(_subset).describe())
# Number of users whose cheat score is 0.
print((dcg_infos["cheat_score"] == 0).sum())

                 KaiinID     score  watch_cnt   bid_cnt  watch_actioned  \
KaiinID         1.000000  0.004205  -0.012328 -0.003893       -0.016975   
score           0.004205  1.000000   0.025938  0.031368        0.020287   
watch_cnt      -0.012328  0.025938   1.000000  0.509239        0.622164   
bid_cnt        -0.003893  0.031368   0.509239  1.000000        0.325908   
watch_actioned -0.016975  0.020287   0.622164  0.325908        1.000000   
bid_actioned   -0.017098  0.091771   0.248743  0.616189        0.386692   
cheat_score    -0.001473  0.450758   0.194837  0.115459        0.182354   

                bid_actioned  cheat_score  
KaiinID            -0.017098    -0.001473  
score               0.091771     0.450758  
watch_cnt           0.248743     0.194837  
bid_cnt             0.616189     0.115459  
watch_actioned      0.386692     0.182354  
bid_actioned        1.000000     0.108280  
cheat_score         0.108280     1.000000  
             KaiinID   score     watch_cnt     

In [325]:
# ランダムな予測の場合
# y_true = pp.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
# random_pred = watch_vs_dataset.data[["KaiinID", "AuctionID"]].copy()
# random_pred["score"] = pd.Series(np.random.random(len(random_pred)), index=random_pred.index)
# random_sub = submit.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(), random_pred, drop=True)
# ndcg_score = ev.calc_dcgs(y_true, random_sub)
# print("ndcg_score : ", ndcg_score.mean())

ndcg_score :  0.012993657402914974
