- データの作成過程毎にモデルを分ける
- 落札の特徴量を入れる
- クーポンの特徴量を入れる
- データセットを作り直す

In [1]:
from os.path import dirname
import os
import datetime
from dateutil.relativedelta import relativedelta
import sys

import pandas as pd
import swifter
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
pd.set_option('display.max_columns', 200)

In [3]:
%reload_ext autoreload
import brandear_est as be

In [4]:
SUB_DIR = os.path.join(os.getcwd(), "../../data/submit/")
IMD_DIR = os.path.join(os.getcwd(), "../../data/intermediate/")
INPUT_DIR = os.path.join(os.getcwd(), "../../data/input")

IMD_INPUT_DIR = os.path.join(IMD_DIR, "pickled_inputs")
IMD_EST_WEEKLY_DIR = os.path.join(IMD_DIR, "rank_weekly")
IMD_ARCHIVE_DIR = os.path.join(IMD_DIR, "arc_rank")


dset_to_period = {
    "train": {"oldest": datetime.datetime(2019, 9, 3, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 10, 0, 0, 0)},
    "valid_for_train": {"oldest": datetime.datetime(2019, 9, 10, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 17, 0, 0, 0),},    
    "valid_for_sub": {"oldest": datetime.datetime(2019, 9, 17, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 24, 0, 0, 0),},
    "submission": {"oldest": datetime.datetime(2019, 9, 24, 0, 0, 0),
              "newest": datetime.datetime(2019, 10, 1, 0, 0, 0),},    
}

In [5]:
# 元データ読み込み
auction = pd.read_pickle(os.path.join(IMD_INPUT_DIR , "auction.pkl"))
watch = pd.read_pickle(os.path.join(IMD_INPUT_DIR , "watch.pkl"))
bid = pd.read_pickle(os.path.join(IMD_INPUT_DIR , "bid.pkl"))
bid_success = pd.read_pickle(os.path.join(IMD_INPUT_DIR , "bid_success.pkl"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submission.csv"))
sub_users = sample_sub[["KaiinID"]].drop_duplicates()

# ブランド作成日
brand = be.read_csv(os.path.join(INPUT_DIR , "brand.csv"))
# itemcolorid付与(色の大分類)
color = be.read_csv(os.path.join(INPUT_DIR , "color.csv"))
# カテゴリーid,itemdailID付与のためのItemShouID付与
genre = be.read_csv(os.path.join(INPUT_DIR , "genre.csv"))
# itemshowID,itemdailID付与
itemshou = be.read_csv(os.path.join(INPUT_DIR , "itemshou.csv"))
# itemlineID付与
line = be.read_csv(os.path.join(INPUT_DIR , "line.csv"))
# 会員登録日、生年月日付与
kaiin = be.read_csv(os.path.join(INPUT_DIR , "kaiin.csv"))
# 入荷お知らせ
nyuuka_oshirase = be.read_csv(os.path.join(INPUT_DIR , "nyuuka_oshirase.csv"))
# 検索履歴
search_log = be.read_csv(os.path.join(INPUT_DIR , "search_log.tsv"), sep="\t")

Mem. usage decreased to  0.32 Mb (15.0% reduction)
Mem. usage decreased to  0.01 Mb (32.2% reduction)
Mem. usage decreased to  0.00 Mb (45.4% reduction)
Mem. usage decreased to  0.00 Mb (33.4% reduction)
Mem. usage decreased to  0.01 Mb (41.3% reduction)
Mem. usage decreased to  5.83 Mb (31.2% reduction)
Mem. usage decreased to 12.62 Mb (43.7% reduction)


  if (await self.run_code(code, result,  async_=asy)):


Mem. usage decreased to 872.24 Mb (40.6% reduction)


In [6]:
# オークション情報拡充
def rename(df, prefix):
    target_columns = ["ModifyDate", "CreateDate"]
    for target_column in target_columns:
        if target_column in df.columns:
            df.rename(columns={target_column: prefix + target_column}, inplace=True)
    return df

itemshou = rename(itemshou, prefix="ItemShow")
genre = rename(genre, prefix="Genre")
brand = rename(brand, prefix="Brand")
color = rename(color, prefix="Color")
line = rename(line, prefix="Line")
kaiin = rename(kaiin, prefix="Kaiin")

genre_mst = (
    genre[["GenreID", "ItemShouID", "CategoryID"]]
    .merge(itemshou[["ItemShouID", "ItemDaiID"]], on="ItemShouID", how="inner")
)

auction_mst = (
    auction
    .merge(genre_mst, on="GenreID", how="left")
    .merge(brand[["BrandID", "BrandCreateDate"]], on="BrandID", how="left")    
    .merge(color[["ColorID", "ItemColorID"]], on="ColorID", how="left")    
    .merge(line[["LineID", "ItemLineID", "LineCreateDate"]], on="LineID", how="left")        
    .fillna(0)
)
auction_mst = be.to_datetime(auction_mst[sorted(list(auction_mst.columns))])

In [7]:
# kaiin["KaiinCreateDate"] = pd.to_datetime(kaiin["KaiinCreateDate"], format='%Y-%m-%d %H:%M:%S')
# search_log["TourokuTime"] = pd.to_datetime(search_log["TourokuTime"], format='%Y-%m-%d %H:%M:%S')
# nyuuka_oshirase = be.to_datetime(nyuuka_oshirase)
# search_log = be.to_datetime(search_log)

In [25]:
def extract_target_actions(watch, bid, period):
    watch_actioned = (
        watch.loc[(watch["TourokuDate"] >= period["oldest"]) & (watch["TourokuDate"] < period["newest"]),
                  ["KaiinID", "AuctionID"]]
    )
    bid_actioned = (
        bid.loc[(bid["ShudouNyuusatsuDate"] >= period["oldest"]) & (bid["ShudouNyuusatsuDate"] < period["newest"]),
                ["KaiinID", "AuctionID"]]
    )
    # 学習用データの際は正解データを作成する
    watch_actioned["watch_actioned"] = 1
    bid_actioned["bid_actioned"] = 1

    target_actions = (
        watch_actioned
        .merge(bid_actioned, on=["KaiinID", "AuctionID"], how="outer")
        .drop_duplicates()
        .fillna(0)
    )

    return target_actions


def arrange_dtime_condition(watch, bid, bid_success, auction, period):
    retval = (
        watch[watch["TourokuDate"] <= period["oldest"]],
        bid[bid["ShudouNyuusatsuDate"] <= period["oldest"]],
        bid_success[bid_success["RakusatsuDate"] < period["oldest"]],        
        auction[auction["CreateDate"] < period["newest"]]        
    )
    return retval

def arrange_inputs(watch, bid, bid_success, auction, period):
    watch_t_arranged, bid_t_arranged, bid_success_t_arranged, auction_arranged = (
        arrange_dtime_condition(watch=watch, bid=bid, bid_success=bid_success,auction=auction, period=period)
    )
    
    retval =  (
        watch_t_arranged.merge(auction, on="AuctionID", how="left"),
        bid_t_arranged.merge(auction, on="AuctionID", how="left"),
        bid_success_t_arranged.merge(auction, on="AuctionID", how="left"),
        auction_arranged
    )
    return retval


def sort_columns(df):
    return df[sorted(list(df.columns))]

def extract_similar_aucs(target_users, auction, actions,):
    similar_aucs = (
        actions.merge(target_users, on="KaiinID")
            .merge(auction, on="ShouhinID")
    )
    return similar_aucs

In [9]:
def add_user_feature(df, feature_df, col_prefix):
    user_feature = calc_user_feature(feature_df)
    user_feature.columns = [col_prefix + "_KaiinID_" + col if col != "KaiinID" else "KaiinID" for col in user_feature.columns]
    
    return df.merge(user_feature, on="KaiinID", how="left").fillna(0)

def calc_user_feature(feature_df):
    user_feature = (
        feature_df.groupby("KaiinID")
        .agg({
            "AuctionID": {"AuctionID_cnt": "count"},
            "ConditionID": {"ConditionID_mean": "mean"},
            "SaishuppinKaisuu": {
                "SaishuppinKaisuu_mean": "mean",
                "SaishuppinKaisuu_std": "std",
                "SaishuppinKaisuu_sum": "sum"},
            "SankouKakaku": {
                "SankouKakaku_mean": "mean",
                "SankouKakaku_std": "std",
                "SankouKakaku_sum": "sum"}
        })
    ).fillna(0)
    user_feature.columns = user_feature.columns.droplevel(0)
    user_feature = user_feature.reset_index()
    return user_feature

def cross_counts(df, col_set, col_name=None):
    if col_name is not None:
        cnt_col_name = col_name
    elif isinstance(col_set, str):
        cnt_col_name = col_set + "_cnt"
    elif isinstance(col_set, list):
        cnt_col_name = "_".join(col_set) + "_cnt"
    cnts = (
        df.groupby(col_set, as_index=False).size().reset_index()
        .rename(columns={0: cnt_col_name})
    )
    return cnts


In [10]:
# 特徴量作成の設定
# dataset_types = ["valid_for_train", "valid_for_sub", "submission"]
dataset_types = ["valid_for_train"]
rank_th = 1600
# 商品紐付けとest_rank_weeklyの出力のどちら、または両方使うかのフラグ
inputs_type = ["Shouhin", "rank_weekly"]
rank_weekly_th = 1600

In [11]:
# %%time

for dset_type in dataset_types:
        
    print(inputs_type)
    
    # データの時系列整理
    watch_arranged, bid_arranged, auction_arranged = (
        arrange_inputs(watch=watch, bid=bid, auction=auction_mst, period=dset_to_period[dset_type])
    )

    target_actions = extract_target_actions(watch=watch, bid=bid, period=dset_to_period[dset_type])    
    target_users = None
    if dset_type == "submission":
        target_users = sub_users
    else:
        target_users = target_actions[["KaiinID"]].drop_duplicates()
    
    # inputとする候補の分岐
    dataset_base = None    
    # est_rank_weeklyの出力のみ使う
    if "rank_weekly" in inputs_type and "Shouhin" not in inputs_type:
        dataset_base = pd.read_pickle(IMD_EST_WEEKLY_DIR + f"/watch_{dset_type}_{rank_th}.pkl")
        dataset_base = dataset_base.query(f"rank <= {rank_weekly_th}")
    # 商品紐付けのみ使う
    elif "rank_weekly" not in inputs_type and "Shouhin" in inputs_type:
        dataset_base = extract_similar_aucs(
            target_users=target_users,
             auction=auction_arranged,
             actions=pd.concat([watch_arranged[["KaiinID", "ShouhinID"]], bid_arranged[["KaiinID", "ShouhinID"]]]).drop_duplicates()
        )
    # 両方使う
    elif "rank_weekly" in inputs_type and "Shouhin" in inputs_type:
        ranked_weekly = pd.read_pickle(IMD_EST_WEEKLY_DIR + f"/watch_{dset_type}_{rank_th}.pkl")
        ranked_weekly = ranked_weekly.query(f"rank <= {rank_weekly_th}")
        similar_aucs = extract_similar_aucs(
            target_users=target_users,
             auction=auction_arranged,
             actions=pd.concat([watch_arranged[["KaiinID", "ShouhinID"]], bid_arranged[["KaiinID", "ShouhinID"]]]).drop_duplicates()
        )    
        dataset_base = (
            pd.concat([ranked_weekly[["AuctionID", "KaiinID"]], similar_aucs[["AuctionID", "KaiinID"]]]).drop_duplicates()
        )
    else:
        raise ValueError()
    
    if "targets" in inputs_type:
        dataset_base = pd.concat([dataset_base, target_actions[["KaiinID", "AuctionID"]]], sort=False).drop_duplicates()
    
    # 正解付与
    dataset_base = dataset_base.merge(target_actions, on=["KaiinID", "AuctionID"], how="left").fillna(0)
    
    # オークション情報付与
    dataset_base_a = dataset_base.merge(
        auction_arranged[[col for col in auction_arranged.columns if not col in dataset_base.columns] + ["AuctionID"]],
        on="AuctionID", how="left"
    )

    dataset_base_a = sort_columns(dataset_base_a)
    
    # クロス集計
    w_cate_col = ["AuctionID", "BrandID", "ItemShouID", "LineID", "ShouhinID", "ShouhinShubetsuID"]
    b_cate_col = ["AuctionID", "BrandID", "ItemShouID", "ShouhinID"]
    def add_cate_with_user(cate_col):
        cate_with_user = [["KaiinID", col] for col in w_cate_col]
        return cate_col + cate_with_user
    
    w_cnt_colsets = add_cate_with_user(w_cate_col)
    b_cnt_colsets = add_cate_with_user(b_cate_col)

    dataset_base_cwb = dataset_base_a

    for cross_conf in [[watch_arranged, "watch", w_cnt_colsets], [bid_arranged, "bid", b_cnt_colsets]]:    
        dataset_base_cwb = be.add_cross_counts(
            df=dataset_base_cwb, feature_df=cross_conf[0], prefix=cross_conf[1], col_sets=cross_conf[2]
        )
    
    # ユーザーの特徴量付与
    # 何回watch/bid/successしたか
    # 再出品回数、価格の平均、分散、今回との割合
    dataset_base_u = dataset_base_cwb

    dataset_base_u = add_user_feature(df=dataset_base_u, feature_df=watch_arranged, col_prefix="watch")

    for col in ["SaishuppinKaisuu", "SankouKakaku"]:
        dataset_base_u[f"watch_KaiinID_rate_mean_to_{col}"] = (
            dataset_base_u[f"watch_KaiinID_{col}_mean"] / dataset_base_u[col]
        )    
    
    # 時間系の特緒量付与
    oldest_dtime = dset_to_period[dset_type]["oldest"]
    
    def calc_timedelta(df, dtime_col, delta_col):
        df[delta_col] = df[dtime_col].swifter.apply(lambda d: (oldest_dtime - d).days)
    
    calc_timedelta(dataset_base_u, "CreateDate", "Auction_elapsed_days")
    calc_timedelta(dataset_base_u, "BrandCreateDate", "Brand_elapsed_days")
    calc_timedelta(dataset_base_u, "LineCreateDate", "Line_elapsed_days")
    calc_timedelta(watch_arranged, "TourokuDate", "watch_elapsed_day")    
        
    def agg_time_feature(df, agg_key, na_value):
        time_agg = df.groupby(agg_key).agg({
            "watch_elapsed_day": {
                f"{agg_key}_watch_elapsed_day_max": "max",
                f"{agg_key}_watch_elapsed_day_min": "min",
                f"{agg_key}_watch_elapsed_day_std": "std",
            }    
        }).fillna(na_value)
        time_agg.columns = time_agg.columns.droplevel(0)
        time_agg = time_agg.reset_index()
        return time_agg
        
    w_k_d = agg_time_feature(df=watch_arranged, agg_key="KaiinID", na_value=999)
    w_a_d = agg_time_feature(df=watch_arranged, agg_key="AuctionID", na_value=0)
    
    dataset_base_u = (
        dataset_base_u.merge(w_k_d, on="KaiinID", how="left").fillna(999)
        .merge(w_a_d, on="AuctionID", how="left").fillna(0)
    )
    
    # お気に入り合計に対する該当オークションのお気に入り数

    dataset_base_b = dataset_base_u
    dataset_base_b["watch_BrandID_KaiinID_rate"] = (
        dataset_base_b["watch_KaiinID_BrandID_cnt"] / dataset_base_b["watch_KaiinID_AuctionID_cnt_y"]
    )
    brand_ave = (
        auction_arranged[["BrandID", "SankouKakaku"]].groupby("BrandID", as_index=False).mean()
        .rename(columns={"SankouKakaku": "BrandID_SankouKakaku"})
    )
    dataset_base_b = dataset_base_b.merge(brand_ave, on="BrandID", how="left")
    dataset_base_b["SankouKakaku_rate_to_BrandID"] = (
        dataset_base_b["SankouKakaku"] / dataset_base_b["BrandID_SankouKakaku"]
    )
    
    now = datetime.datetime.now().strftime("%Y%m%d%H%M")
    be.df2pkl(dataset_base_b, IMD_ARCHIVE_DIR, f"{dset_type}_feature_{rank_th}_{now}.pkl")    

['Shouhin', 'rank_weekly']
##################
start cross count
['AuctionID', 'BrandID', 'ItemShouID', 'LineID', 'ShouhinID', 'ShouhinShubetsuID', ['KaiinID', 'AuctionID'], ['KaiinID', 'BrandID'], ['KaiinID', 'ItemShouID'], ['KaiinID', 'LineID'], ['KaiinID', 'ShouhinID'], ['KaiinID', 'ShouhinShubetsuID']]
AuctionID
BrandID
ItemShouID
LineID
ShouhinID
ShouhinShubetsuID
['KaiinID', 'AuctionID']
['KaiinID', 'BrandID']
['KaiinID', 'ItemShouID']
['KaiinID', 'LineID']
['KaiinID', 'ShouhinID']
['KaiinID', 'ShouhinShubetsuID']
##################
start cross count
['AuctionID', 'BrandID', 'ItemShouID', 'ShouhinID', ['KaiinID', 'AuctionID'], ['KaiinID', 'BrandID'], ['KaiinID', 'ItemShouID'], ['KaiinID', 'LineID'], ['KaiinID', 'ShouhinID'], ['KaiinID', 'ShouhinShubetsuID']]
AuctionID
BrandID
ItemShouID
ShouhinID
['KaiinID', 'AuctionID']
['KaiinID', 'BrandID']
['KaiinID', 'ItemShouID']
['KaiinID', 'LineID']
['KaiinID', 'ShouhinID']
['KaiinID', 'ShouhinShubetsuID']


in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…




in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


In [31]:
vt_dataset_base = pd.read_pickle(IMD_ARCHIVE_DIR + "/valid_for_train_feature_1600_shouhin_plus_1600_cands.pkl")
vs_dataset_base = pd.read_pickle(IMD_ARCHIVE_DIR + "/valid_for_sub_feature_1600_shouhin_plus_1600_cands.pkl")
# sub_dataset_base = pd.read_pickle(IMD_ARCHIVE_DIR + "/submission_feature_1600_shouhin_plus_1600_cands.pkl")

auc_foreign_ids = ["BrandID", "CategoryID", "ColorID", "ConditionID", "DanjobetsuID", "GenreGroupID",
            "GenreID", "ItemColorID", "ItemDaiID", "ItemLineID", "ItemShouID", "LineID", "ShouhinID", "ShouhinShubetsuID"]
drop_cols = (
    ["KaiinID", "AuctionID", "BrandCreateDate", "LineCreateDate", "CreateDate", "rank", "watch_actioned", "bid_actioned"]
    + auc_foreign_ids
)

In [None]:
dset_type = "valid_for_train"
watch_arranged, bid_arranged, bid_success_arranged, auction_arranged = (
    arrange_inputs(watch=watch, bid=bid, bid_success=bid_success, auction=auction, period=dset_to_period[dset_type])
)
watch_cnt = watch_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "watch_cnt"})
bid_cnt = bid_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "bid_cnt"})
vt_dataset_base = (
    be.left_anti_join(
        vt_dataset_base.merge(watch_cnt, on="KaiinID", how="left")
        .merge(bid_cnt, on="KaiinID", how="left")
        .fillna(0),
        bid_success_arranged, "AuctionID", "AuctionID"
    )
)

dset_type = "valid_for_sub"
watch_arranged, bid_arranged, bid_success_arranged, auction_arranged = (
    arrange_inputs(watch=watch, bid=bid, bid_success=bid_success, auction=auction, period=dset_to_period[dset_type])
)
watch_cnt = watch_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "watch_cnt"})
bid_cnt = bid_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "bid_cnt"})
vs_dataset_base = (
    be.left_anti_join(
        vt_dataset_base.merge(watch_cnt, on="KaiinID", how="left")
        .merge(bid_cnt, on="KaiinID", how="left")
        .fillna(0),
        bid_success_arranged, "AuctionID", "AuctionID"
    )
)

# dset_type = "submission"
# watch_arranged, bid_arranged, auction_arranged = (
#     arrange_inputs(watch=watch, bid=bid, auction=auction, period=dset_to_period[dset_type])
# )
# watch_cnt = watch_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "watch_cnt"})
# bid_cnt = bid_arranged.groupby("KaiinID").size().reset_index().rename(columns={0: "bid_cnt"})
# sub_dataset_base = (
#     sub_dataset_base.merge(watch_cnt, on="KaiinID", how="left")
#     .merge(bid_cnt, on="KaiinID", how="left")
#     .fillna(0)
# )

In [None]:
# print(vt_dataset_base.shape[0])
# print(vt_dataset_base.query("watch_cnt < 1000").shape[0])
# print(vt_dataset_base.query("bid_cnt < 50").shape[0])
# print(vt_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)").shape[0])
# print(vt_dataset_base[["watch_actioned", "bid_actioned"]].sum())
# print(vt_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)")[["watch_actioned", "bid_actioned"]].sum())
# print(vt_dataset_base[["KaiinID"]].nunique())
# print(vt_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)")[["KaiinID"]].nunique())

In [None]:
vt_private_dataset = be.DataSet(data=vt_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)"),
                                drop_cols=drop_cols, target_col="watch_actioned")
vt_business_dataset = be.DataSet(data=vt_dataset_base.query("(watch_cnt >= 1000) | (bid_cnt >= 50)"),
                                drop_cols=drop_cols, target_col="watch_actioned")
vs_private_dataset = be.DataSet(data=vs_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)"),
                                drop_cols=drop_cols, target_col="watch_actioned")
vs_business_dataset = be.DataSet(data=vs_dataset_base.query("(watch_cnt >= 1000) | (bid_cnt >= 50)"),
                                drop_cols=drop_cols, target_col="watch_actioned")

# sub_private_dataset = be.DataSet(data=sub_dataset_base.query("(watch_cnt < 1000) & (bid_cnt < 50)"),
#                                 drop_cols=drop_cols, target_col="watch_actioned")
# sub_business_dataset = be.DataSet(data=sub_dataset_base.query("(watch_cnt >= 1000) | (bid_cnt >= 50)"),
#                                 drop_cols=drop_cols, target_col="watch_actioned")


In [None]:
in_train_dataset = vt_private_dataset
sub_dataset = vs_private_dataset

# train test split
train_dataset, test_dataset =be.DataSet.train_test_split(in_train_dataset)
# sampled_train = be.DataSet.under_sampling(train_dataset, rate=100)
sampled_train = be.DataSet(
#     train_dataset.data.sample(frac=1).groupby(["KaiinID", "watch_actioned", "bid_actioned"]).head(40),
    pd.concat([
        train_dataset.data.query("(watch_actioned == 1) | (bid_actioned == 1)").sample(frac=1).groupby("KaiinID").head(40),
        train_dataset.data.query("(watch_actioned != 1) & (bid_actioned != 1)").sample(frac=1).groupby("KaiinID").head(200),        
    ]),
    drop_cols=train_dataset.drop_cols, target_col=train_dataset.target_col
)
# target encoding
cat_cols = ["BrandID", "GenreID"]
for cat_col in cat_cols:
    sampled_train.add_target_encode(cat_col=cat_col)
    be.target_encode_for_test(train_dataset=sampled_train, test_dataset=test_dataset, cat_col=cat_col)    

lgb_rank = be.LgbLambdaLank()
lgb_rank.train(train_dataset=sampled_train, valid_dataset=test_dataset, desc=True)

for cat_col in cat_cols:
    be.target_encode_for_test(train_dataset=in_train_dataset, test_dataset=sub_dataset, cat_col=cat_col)        
sub_dataset.set_pred(lgb_rank.predict(sub_dataset))

In [None]:
tmp = lgb_rank.get_model_info(sub_dataset)
tmp.sort_values("importance", ascending=False).head(10)

In [None]:
in_train_dataset = vt_business_dataset
sub_dataset = vs_business_dataset

# train test split
train_dataset, test_dataset =be.DataSet.train_test_split(in_train_dataset)
sampled_train = be.DataSet(
#     train_dataset.data.sample(frac=1).groupby(["KaiinID", "watch_actioned", "bid_actioned"]).head(40),
    pd.concat([
        train_dataset.data.query("(watch_actioned == 1) | (bid_actioned == 1)").sample(frac=1).groupby("KaiinID").head(40),
        train_dataset.data.query("(watch_actioned != 1) & (bid_actioned != 1)").sample(frac=1).groupby("KaiinID").head(200),        
    ]),    
    drop_cols=train_dataset.drop_cols, target_col=train_dataset.target_col
)

# target encoding
cat_cols = ["BrandID", "GenreID"]
for cat_col in cat_cols:
    sampled_train.add_target_encode(cat_col=cat_col)
    be.target_encode_for_test(train_dataset=sampled_train, test_dataset=test_dataset, cat_col=cat_col)    

lgb_rank = be.LgbLambdaLank()
lgb_rank.train(train_dataset=sampled_train, valid_dataset=test_dataset, desc=True)

for cat_col in cat_cols:
    be.target_encode_for_test(train_dataset=in_train_dataset, test_dataset=sub_dataset, cat_col=cat_col)        

sub_dataset.set_pred(lgb_rank.predict(sub_dataset))

In [None]:
tmp = lgb_rank.get_model_info(sampled_train)
tmp.sort_values("importance", ascending=False).head(10)

In [None]:
# pred = (
#     pd.concat([sub_private_dataset.data[["KaiinID", "AuctionID", "pred"]], sub_business_dataset.data[["KaiinID", "AuctionID", "pred"]]])
#     .rename(columns={"pred": "score"})
# )
# sub = be.adjust_sub_form(sub_users, pred, drop=True)
# sub.to_csv(SUB_DIR + datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)
# sub.head()


In [None]:
# 手元のデータセットでndcg計算
y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(),
                      pd.concat([vs_private_dataset.data[["KaiinID", "AuctionID", "pred"]], vs_business_dataset.data[["KaiinID", "AuctionID", "pred"]]])
                      .rename(columns={"pred":"score"}), drop=True
)
ndcg_score = be.calc_ndcg(y_true, sub)
print("ndcg_score : ", ndcg_score)

In [320]:
# 手元のデータで最善の予測をした際の結果
y_true = extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
cheat_pred = be.get_cheat_pred(pd.concat([vs_private_dataset.data[["KaiinID", "AuctionID", "pred"]], vs_business_dataset.data[["KaiinID", "AuctionID", "pred"]]]), y_true)
cheat_sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(), cheat_pred, drop=True)
cheat_dcgs = calc_ndcg(y_true, cheat_sub)
print(cheat_dcgs.mean())
cheat_dcgs = cheat_dcgs.reset_index()

0.15527827996086205


In [321]:
# 各データのdcg確認
y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(),
                      pd.concat([vs_private_dataset.data[["KaiinID", "AuctionID", "pred"]], vs_business_dataset.data[["KaiinID", "AuctionID", "pred"]]])
                      .rename(columns={"pred":"score"}), drop=True
)
dcgs = calc_ndcg(y_true, sub)
dcgs = dcgs.reset_index()

In [322]:
dcg_infos = (
    dcgs.merge(watch_cnt, on="KaiinID", how="left")
    .merge(bid_cnt, on="KaiinID", how="left")
    .merge(y_true.drop("AuctionID", axis=1).groupby("KaiinID", as_index=False).sum(), on="KaiinID", how="left")
    .merge(cheat_dcgs.rename(columns={"score": "cheat_score"}), on="KaiinID", how="left")
    .fillna(0)
)
print(dcg_infos.corr())
print(dcg_infos.query("score == 0").describe())
print(dcg_infos.query("score != 0").describe())
print(dcg_infos.query("(watch_cnt < 1000) & (bid_cnt < 50)").describe())
print(dcg_infos.query("(watch_cnt >= 1000) | (bid_cnt >= 50)").describe())

                 KaiinID     score  watch_cnt   bid_cnt  watch_actioned  \
KaiinID         1.000000 -0.005421  -0.012328 -0.003893       -0.016975   
score          -0.005421  1.000000   0.046678  0.049824        0.045194   
watch_cnt      -0.012328  0.046678   1.000000  0.509239        0.622164   
bid_cnt        -0.003893  0.049824   0.509239  1.000000        0.325908   
watch_actioned -0.016975  0.045194   0.622164  0.325908        1.000000   
bid_actioned   -0.017098  0.120509   0.248743  0.616189        0.386692   
cheat_score    -0.006767  0.761916   0.288007  0.184718        0.258121   

                bid_actioned  cheat_score  
KaiinID            -0.017098    -0.006767  
score               0.120509     0.761916  
watch_cnt           0.248743     0.288007  
bid_cnt             0.616189     0.184718  
watch_actioned      0.386692     0.258121  
bid_actioned        1.000000     0.238901  
cheat_score         0.238901     1.000000  
             KaiinID   score     watch_cnt     

In [84]:
def stack_target_actions(target_actions):
    watch_target = target_actions.query("(watch_actioned == 1)")[["KaiinID", "AuctionID"]]
    bid_target = target_actions.query("(bid_actioned == 1)")[["KaiinID", "AuctionID"]]
    watch_target["score"] = 1
    bid_target["score"] = 2
    stacked_target_actions = pd.concat([watch_target, bid_target], sort=False)
    return stacked_target_actions

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    return np.sum((2 ** r - 1) / np.log2(np.arange(2, r.size + 2)))


def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max


def calc_ndcg(y_true, y_pred, k=20):
    y_pred_cp = y_pred.copy()

    actione_true = stack_target_actions(y_true)
    actione_true["rank"] = 100

    y_pred_cp['rank'] = y_pred_cp.groupby('KaiinID')['AuctionID'].cumcount()

    scored_pred = (
        y_pred_cp.merge(actione_true[["KaiinID", "AuctionID", "score"]], on=["KaiinID", "AuctionID"],
                        how="left").fillna(0))

    unchoiced_actiones = (
        be.left_anti_join(actione_true, y_pred_cp, ["KaiinID", "AuctionID"], ["KaiinID", "AuctionID"]))

    scored_actiones = (
        pd.concat([scored_pred, unchoiced_actiones], sort=False)
            .sort_values(["KaiinID", "rank"], ascending=["True", "True"]))

    dcgs = scored_actiones.groupby("KaiinID")["score"].apply(lambda s: ndcg_at_k(s.tolist(), k=20))
#     ndcg = dcgs.mean()

    return dcgs


In [325]:
# ランダムな予測の場合
y_true = extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
random_pred = watch_vs_dataset.data[["KaiinID", "AuctionID"]].copy()
random_pred["score"] = pd.Series(np.random.random(len(random_pred)), index=random_pred.index)
random_sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(), random_pred, drop=True)
ndcg_score = calc_ndcg(y_true, random_sub)
print("ndcg_score : ", ndcg_score.mean())

ndcg_score :  0.012993657402914974


In [130]:
%%time
# watch学習

dataset_pairs = [
    [watch_vt_dataset, watch_vs_dataset]
]

sample_flg = True

# 学習の様子可視化
for in_train_dataset, sub_dataset in dataset_pairs:
    
    # train test split
    train_dataset, test_dataset =be.DataSet.train_test_split(in_train_dataset)
    
    lgb_rank = be.LgbLambdaLank()
    
    if sample_flg:
        sampled_train = be.DataSet.under_sampling(train_dataset, rate=100)

        # target encoding
        cat_cols = ["BrandID", "GenreID"]
        for cat_col in cat_cols:
            sampled_train.add_target_encode(cat_col=cat_col)
            be.target_encode_for_test(train_dataset=sampled_train, test_dataset=test_dataset, cat_col=cat_col)    
    
        lgb_rank.train(train_dataset=sampled_train, valid_dataset=test_dataset, desc=True)

    else:
        # target encoding
        cat_cols = ["BrandID", "GenreID"]
        for cat_col in cat_cols:
            train_dataset.add_target_encode(cat_col=cat_col)
            be.target_encode_for_test(train_dataset=train_dataset, test_dataset=test_dataset, cat_col=cat_col)    
        lgb_rank.train(train_dataset=train_dataset, valid_dataset=test_dataset, desc=True)        

    for cat_col in cat_cols:
        be.target_encode_for_test(train_dataset=in_train_dataset, test_dataset=sub_dataset, cat_col=cat_col)        
    sub_dataset.set_pred(lgb_rank.predict(sub_dataset))    

[1]	valid_0's ndcg@20: 0.895492
[2]	valid_0's ndcg@20: 0.899167
[3]	valid_0's ndcg@20: 0.901706
[4]	valid_0's ndcg@20: 0.90204
[5]	valid_0's ndcg@20: 0.902014
[6]	valid_0's ndcg@20: 0.904161
[7]	valid_0's ndcg@20: 0.904586
[8]	valid_0's ndcg@20: 0.904388
[9]	valid_0's ndcg@20: 0.903982
[10]	valid_0's ndcg@20: 0.904519
[11]	valid_0's ndcg@20: 0.904724
[12]	valid_0's ndcg@20: 0.904628
[13]	valid_0's ndcg@20: 0.905696
[14]	valid_0's ndcg@20: 0.906099
[15]	valid_0's ndcg@20: 0.9062
[16]	valid_0's ndcg@20: 0.906725
[17]	valid_0's ndcg@20: 0.906832
[18]	valid_0's ndcg@20: 0.906856
[19]	valid_0's ndcg@20: 0.906618
[20]	valid_0's ndcg@20: 0.906919
[21]	valid_0's ndcg@20: 0.906747
[22]	valid_0's ndcg@20: 0.90672
[23]	valid_0's ndcg@20: 0.906723
[24]	valid_0's ndcg@20: 0.9068
[25]	valid_0's ndcg@20: 0.907222
[26]	valid_0's ndcg@20: 0.907357
[27]	valid_0's ndcg@20: 0.907309
[28]	valid_0's ndcg@20: 0.90717
[29]	valid_0's ndcg@20: 0.907048
[30]	valid_0's ndcg@20: 0.907188
[31]	valid_0's ndcg@20: 0.

In [133]:
# 手元のデータセットでndcg計算
y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(),
                      watch_vs_dataset.data[["KaiinID", "AuctionID", "pred"]].rename(columns={"pred":"score"}), drop=True
)
ndcg_score = be.calc_ndcg(y_true, sub)
print("ndcg_score : ", ndcg_score)

ndcg_score :  0.05488949924815673


In [138]:
# 手元のデータで最善の予測をした際の結果
y_true = extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
cheat_pred = be.get_cheat_pred(pd.concat([vs_private_dataset.data[["KaiinID", "AuctionID", "pred"]], vs_business_dataset.data[["KaiinID", "AuctionID", "pred"]]]), y_true)
cheat_sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(), cheat_pred, drop=True)
cheat_dcgs = calc_ndcg(y_true, cheat_sub)
cheat_dcgs = cheat_dcgs.reset_index()

In [145]:
# 各データのdcg確認
# 手元のデータセットでndcg計算
y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
sub = be.adjust_sub_form(y_true[["KaiinID"]].drop_duplicates(),
                      watch_vs_dataset.data[["KaiinID", "AuctionID", "pred"]].rename(columns={"pred":"score"}), drop=True
)
dcgs = calc_ndcg(y_true, sub)
dcgs = dcgs.reset_index()

In [146]:
dcg_infos = (
    dcgs.merge(watch_cnt, on="KaiinID", how="left")
    .merge(bid_cnt, on="KaiinID", how="left")
    .merge(y_true.drop("AuctionID", axis=1).groupby("KaiinID", as_index=False).sum(), on="KaiinID", how="left")
    .merge(cheat_dcgs.rename(columns={"score": "cheat_score"}), on="KaiinID", how="left")
    .fillna(0)
)
print(dcg_infos.corr())
print(dcg_infos.query("score == 0").describe())
print(dcg_infos.query("score != 0").describe())
print(dcg_infos.query("(watch_cnt < 1000) & (bid_cnt < 50)").describe())
print(dcg_infos.query("(watch_cnt >= 1000) | (bid_cnt >= 50)").describe())

                 KaiinID     score  watch_cnt   bid_cnt  watch_actioned  \
KaiinID         1.000000 -0.004682  -0.012328 -0.003893       -0.016975   
score          -0.004682  1.000000   0.052040  0.033016        0.046012   
watch_cnt      -0.012328  0.052040   1.000000  0.509239        0.622164   
bid_cnt        -0.003893  0.033016   0.509239  1.000000        0.325908   
watch_actioned -0.016975  0.046012   0.622164  0.325908        1.000000   
bid_actioned   -0.017098  0.081521   0.248743  0.616189        0.386692   
cheat_score    -0.006767  0.717838   0.288007  0.184718        0.258121   

                bid_actioned  cheat_score  
KaiinID            -0.017098    -0.006767  
score               0.081521     0.717838  
watch_cnt           0.248743     0.288007  
bid_cnt             0.616189     0.184718  
watch_actioned      0.386692     0.258121  
bid_actioned        1.000000     0.238901  
cheat_score         0.238901     1.000000  
             KaiinID   score     watch_cnt     

In [None]:
tmp = lgb_rank.get_model_info(sampled_train)
tmp.sort_values("importance", ascending=False).head(30)

In [43]:
%%time
# 予測値取得
dataset_pairs = [
    [watch_vs_dataset, watch_sub_dataset]
]

rank_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    "ndcg_at": 20,
    "nround": 500,
    "learning_rate": 0.01,
    "max_depth": 6,
    "num_leaves": 127,
    "num_iteration": 500
}

sample_flg = True

# 学習の様子可視化
for in_train_dataset, sub_dataset in dataset_pairs:

    # train test split
    train_dataset, test_dataset =be.DataSet.train_test_split(in_train_dataset)
    
    lgb_rank = LgbLambdaLank()    
    
    if sample_flg:
        sampled_train = be.DataSet.under_sampling(train_dataset, rate=10)

        # target encoding
        cat_cols = ["BrandID", "GenreID"]
        for cat_col in cat_cols:
            sampled_train.add_target_encode(cat_col=cat_col)
            be.target_encode_for_test(train_dataset=train_dataset, test_dataset=test_dataset, cat_col=cat_col)            
        lgb_rank.train(train_dataset=sampled_train, valid_dataset=test_dataset, desc=True)
    else:
        # target encoding
        cat_cols = ["BrandID", "GenreID"]
        for cat_col in cat_cols:
            train_dataset.add_target_encode(cat_col=cat_col)
            be.target_encode_for_test(train_dataset=train_dataset, test_dataset=test_dataset, cat_col=cat_col)    
        lgb_rank.train(train_dataset=train_dataset, valid_dataset=test_dataset, desc=True)        

    for cat_col in cat_cols:
        be.target_encode_for_test(train_dataset=in_train_dataset, test_dataset=sub_dataset, cat_col=cat_col)        
    sub_dataset.set_pred(lgb_rank.predict(sub_dataset))    

[1]	valid_0's ndcg@20: 0.565636
[2]	valid_0's ndcg@20: 0.576263
[3]	valid_0's ndcg@20: 0.594657
[4]	valid_0's ndcg@20: 0.598639
[5]	valid_0's ndcg@20: 0.612305
[6]	valid_0's ndcg@20: 0.612833
[7]	valid_0's ndcg@20: 0.613805
[8]	valid_0's ndcg@20: 0.614455
[9]	valid_0's ndcg@20: 0.614003
[10]	valid_0's ndcg@20: 0.61465
[11]	valid_0's ndcg@20: 0.616131
[12]	valid_0's ndcg@20: 0.618102
[13]	valid_0's ndcg@20: 0.621269
[14]	valid_0's ndcg@20: 0.622527
[15]	valid_0's ndcg@20: 0.625764
[16]	valid_0's ndcg@20: 0.626662
[17]	valid_0's ndcg@20: 0.627856
[18]	valid_0's ndcg@20: 0.629533
[19]	valid_0's ndcg@20: 0.631363
[20]	valid_0's ndcg@20: 0.631108
[21]	valid_0's ndcg@20: 0.630578
[22]	valid_0's ndcg@20: 0.632222
[23]	valid_0's ndcg@20: 0.632136
[24]	valid_0's ndcg@20: 0.632413
[25]	valid_0's ndcg@20: 0.632302
[26]	valid_0's ndcg@20: 0.632676
[27]	valid_0's ndcg@20: 0.632572
[28]	valid_0's ndcg@20: 0.632266
[29]	valid_0's ndcg@20: 0.633787
[30]	valid_0's ndcg@20: 0.633711
[31]	valid_0's ndcg@

In [44]:
tmp = lgb_rank.get_model_info(sub_dataset)
tmp.sort_values("importance", ascending=False).head(30)

Unnamed: 0,importance
Auction_elapsed_days,648
SankouKakaku,619
AuctionID_watch_elapsed_day_min,469
watch_KaiinID_ShouhinID_cnt,390
watch_BrandID_KaiinID_rate,330
SaishuppinKaisuu,276
watch_AuctionID_cnt,224
bid_KaiinID_ShouhinID_cnt,205
watch_KaiinID_ItemShouID_cnt,204
watch_KaiinID_SankouKakaku_mean,200


In [46]:
pred = watch_sub_dataset.data[["KaiinID", "AuctionID", "pred"]].rename(columns={"pred": "score"})
sub = adjust_sub_form(sub_users, pred, drop=True)
sub.to_csv(SUB_DIR + datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)
sub.head()

Unnamed: 0,KaiinID,AuctionID
259,24,93537
1571,24,4090309
1436,24,3345576
230,24,83252
640,24,475279


In [89]:
sub.drop_duplicates()

Unnamed: 0,KaiinID,AuctionID
33,24,3664401.0
20,24,2503413.0
1,24,111774.0
2,24,508537.0
3,24,596848.0
...,...,...
3603349,277891,607545.0
3603226,277891,234592.0
3603335,277891,564888.0
3603925,277891,2750836.0


In [90]:
sub.shape

(120320, 2)

In [55]:
sub.groupby(["KaiinID", "AuctionID"]).size().reset_index().sort_values(0, ascending=False)

Unnamed: 0,KaiinID,AuctionID,0
66772,153257,3337329.0,2
30351,70083,2503413.0,2
90659,208911,111774.0,2
80209,184030,1757415.0,1
80220,184130,648018.0,1
...,...,...,...
40101,93650,508537.0,1
40100,93650,461819.0,1
40099,93650,111774.0,1
40098,93581,3647003.0,1


In [64]:
%%time
# アンサンブル
# watch学習

dataset_pairs = [
    [watch_vt_dataset, watch_vs_dataset]
]

# 学習の様子可視化
for in_train_dataset, sub_dataset in dataset_pairs:
    
    # train test split
    undersampled_datasets = [
        {"train_dataset": splited[0], "valid_dataset": splited[1]}
        for splited in 
        [be.DataSet.train_test_split(be.DataSet.under_sampling(in_train_dataset, rate=100))
        for _ in range(4)]
    ]
        
    # target encoding
#     cat_cols = ["BrandID", "GenreID"]
#     for cat_col in cat_cols:
#         sampled_train.add_target_encode(cat_col=cat_col)
#         be.target_encode_for_test(train_dataset=sampled_train, test_dataset=test_dataset, cat_col=cat_col)    
    
    models = be.DataEmsembler(LgbLambdaLank)
    models.train(undersampled_datasets)
    
    sub_dataset.set_pred(models.predict(sub_dataset))    

[1]	valid_0's ndcg@20: 0.892296
[2]	valid_0's ndcg@20: 0.900662
[3]	valid_0's ndcg@20: 0.902642
[4]	valid_0's ndcg@20: 0.902543
[5]	valid_0's ndcg@20: 0.903634
[6]	valid_0's ndcg@20: 0.90407
[7]	valid_0's ndcg@20: 0.903476
[8]	valid_0's ndcg@20: 0.906179
[9]	valid_0's ndcg@20: 0.906513
[10]	valid_0's ndcg@20: 0.906211
[11]	valid_0's ndcg@20: 0.906645
[12]	valid_0's ndcg@20: 0.906717
[13]	valid_0's ndcg@20: 0.906789
[14]	valid_0's ndcg@20: 0.906407
[15]	valid_0's ndcg@20: 0.906738
[16]	valid_0's ndcg@20: 0.907029
[17]	valid_0's ndcg@20: 0.907427
[18]	valid_0's ndcg@20: 0.907248
[19]	valid_0's ndcg@20: 0.907019
[20]	valid_0's ndcg@20: 0.907291
[21]	valid_0's ndcg@20: 0.90717
[22]	valid_0's ndcg@20: 0.907242
[23]	valid_0's ndcg@20: 0.907145
[24]	valid_0's ndcg@20: 0.907292
[25]	valid_0's ndcg@20: 0.907445
[26]	valid_0's ndcg@20: 0.907496
[27]	valid_0's ndcg@20: 0.90757
[28]	valid_0's ndcg@20: 0.90753
[29]	valid_0's ndcg@20: 0.907704
[30]	valid_0's ndcg@20: 0.907758
[31]	valid_0's ndcg@20:

In [56]:
%%time
# bid学習

# 学習の様子可視化
for target, dataset in {"bid": bid_vs_dataset}.items():
    print(target)
    
    # train test split
    train_dataset, test_dataset =be.DataSet.train_test_split(dataset)
    sampled_train = be.DataSet.under_sampling(train_dataset)

    # target encoding
#     cat_cols = ["BrandID", "GenreID"]
#     # userも組み合わせる
#     for cat_col in cat_cols:
#         sampled_train.add_target_encode(cat_col=cat_col)
#         be.target_encode_for_test(train_dataset=sampled_train, test_dataset=test_dataset, cat_col=cat_col)    
    
    lgb_bi = be.LgbBinaryClassifier()
    lgb_bi.train(train_dataset=sampled_train, valid_dataset=test_dataset, desc=True)    

bid
[1]	valid_0's auc: 0.967723
[2]	valid_0's auc: 0.969615
[3]	valid_0's auc: 0.969867
[4]	valid_0's auc: 0.97008
[5]	valid_0's auc: 0.970028
[6]	valid_0's auc: 0.970128
[7]	valid_0's auc: 0.970536
[8]	valid_0's auc: 0.970549
[9]	valid_0's auc: 0.970813
[10]	valid_0's auc: 0.970847
[11]	valid_0's auc: 0.970968
[12]	valid_0's auc: 0.971024
[13]	valid_0's auc: 0.971418
[14]	valid_0's auc: 0.971452
[15]	valid_0's auc: 0.97176
[16]	valid_0's auc: 0.971855
[17]	valid_0's auc: 0.971855
[18]	valid_0's auc: 0.971889
[19]	valid_0's auc: 0.971999
[20]	valid_0's auc: 0.972026
[21]	valid_0's auc: 0.971983
[22]	valid_0's auc: 0.972125
[23]	valid_0's auc: 0.972186
[24]	valid_0's auc: 0.97229
[25]	valid_0's auc: 0.972349
[26]	valid_0's auc: 0.972334
[27]	valid_0's auc: 0.972438
[28]	valid_0's auc: 0.971929
[29]	valid_0's auc: 0.971969
[30]	valid_0's auc: 0.971986
[31]	valid_0's auc: 0.971998
[32]	valid_0's auc: 0.97201
[33]	valid_0's auc: 0.972008
[34]	valid_0's auc: 0.97202
[35]	valid_0's auc: 0.97

In [57]:
tmp = lgb_bi.get_model_info(sampled_train)
tmp.sort_values("importance", ascending=False).head(30)

Unnamed: 0,importance
AuctionID_watch_elapsed_day_min,650
SaishuppinKaisuu,402
BrandID_SankouKakaku,370
watch_ShouhinID_cnt,341
Auction_elapsed_days,309
AuctionID_watch_elapsed_day_max,302
KaiinID_watch_elapsed_day_min,270
watch_BrandID_cnt,245
bid_AuctionID_cnt,203
bid_KaiinID_BrandID_cnt,194


In [58]:
%%time
# watch学習
# params = {
#     "objective": "binary",
#     'metric': 'binary_logloss',
#     "nround": 500,
#     "learning_rate": 0.01,
#     "max_depth": 6,
#     "num_leaves": 127
# }
# 学習の様子可視化
for target, dataset in {"watch": watch_vs_dataset}.items():
    print(target)
    
    # train test split
    train_dataset, test_dataset =be.DataSet.train_test_split(dataset)
    sampled_train = be.DataSet.under_sampling(train_dataset)

    # target encoding
    cat_cols = ["BrandID", "GenreID"]
    for cat_col in cat_cols:
        sampled_train.add_target_encode(cat_col=cat_col)
        be.target_encode_for_test(train_dataset=sampled_train, test_dataset=test_dataset, cat_col=cat_col)    
    
    lgb_bi = be.LgbBinaryClassifier()
    lgb_bi.train(train_dataset=sampled_train, valid_dataset=test_dataset, desc=True)


watch
[1]	valid_0's auc: 0.952835
[2]	valid_0's auc: 0.953211
[3]	valid_0's auc: 0.956015
[4]	valid_0's auc: 0.956183
[5]	valid_0's auc: 0.956196
[6]	valid_0's auc: 0.956116
[7]	valid_0's auc: 0.956201
[8]	valid_0's auc: 0.956405
[9]	valid_0's auc: 0.956599
[10]	valid_0's auc: 0.956641
[11]	valid_0's auc: 0.956557
[12]	valid_0's auc: 0.956557
[13]	valid_0's auc: 0.956544
[14]	valid_0's auc: 0.956488
[15]	valid_0's auc: 0.956492
[16]	valid_0's auc: 0.956395
[17]	valid_0's auc: 0.956541
[18]	valid_0's auc: 0.956532
[19]	valid_0's auc: 0.956653
[20]	valid_0's auc: 0.956694
[21]	valid_0's auc: 0.956825
[22]	valid_0's auc: 0.95692
[23]	valid_0's auc: 0.957064
[24]	valid_0's auc: 0.957152
[25]	valid_0's auc: 0.957189
[26]	valid_0's auc: 0.957338
[27]	valid_0's auc: 0.957423
[28]	valid_0's auc: 0.957503
[29]	valid_0's auc: 0.95762
[30]	valid_0's auc: 0.957656
[31]	valid_0's auc: 0.957674
[32]	valid_0's auc: 0.957747
[33]	valid_0's auc: 0.957871
[34]	valid_0's auc: 0.957904
[35]	valid_0's auc:

In [59]:
tmp = lgb_bi.get_model_info(sampled_train)
tmp.sort_values("importance", ascending=False).head(30)

Unnamed: 0,importance
AuctionID_watch_elapsed_day_min,420
BrandID_target_mean,323
Auction_elapsed_days,284
watch_KaiinID_AuctionID_cnt_x,260
watch_KaiinID_SaishuppinKaisuu_sum,212
bid_AuctionID_cnt,197
SaishuppinKaisuu,178
watch_KaiinID_ShouhinID_cnt,165
watch_KaiinID_BrandID_cnt,163
KaiinID_watch_elapsed_day_min,162
