In [399]:
from os.path import dirname
import os
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [400]:
pd.set_option('display.max_columns', 50)

In [401]:
%load_ext autoreload
%autoreload 2
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [402]:
MIMIC_DIR = os.path.join(os.getcwd(), "../../data/mimic/")
INPUT_DIR = os.path.join(os.getcwd(), "../../data/input/")
IMD_DIR = os.path.join(os.getcwd(), "../../data/intermediate/")

In [403]:
sample_sub = pd.read_csv(INPUT_DIR + "sample_submission.csv")
sub_users = sample_sub[["KaiinID"]].drop_duplicates()

In [423]:
access_date = datetime.datetime(2019, 10, 1, 0, 0, 0)
access_period = 7

In [415]:
auction = pd.read_csv(INPUT_DIR + "auction.csv")
watch = pd.read_csv(INPUT_DIR + "watchlist.csv")
bid = pd.read_csv(INPUT_DIR + "shudounyuusatsu.csv")
bid_success = pd.read_csv(INPUT_DIR + "rakusatsu.csv")

auction = to_datetime(auction)
auction["CreateDateDelta"] = auction["CreateDate"].apply(lambda d: (access_date - d).days)
watch = to_datetime(watch)
watch["TourokuDateDelta"] = watch["TourokuDate"].apply(lambda d: (access_date - d).days)
watch = watch.merge(auction, on="AuctionID")
bid = to_datetime(bid)
bid["ShudouNyuusatsuDateDelta"] = bid["ShudouNyuusatsuDate"].apply(lambda d: (access_date - d).days)
bid = bid.merge(auction, on="AuctionID")
bid_success = to_datetime(bid_success)
bid_success["RakusatsuDateDelta"] = bid_success["RakusatsuDate"].apply(lambda d: (access_date - d).days)
bid_success = bid_success.merge(auction, on="AuctionID")

to_pickle(auction, IMD_DIR, "auction.pkl")
to_pickle(watch, IMD_DIR, "watch.pkl")
to_pickle(bid, IMD_DIR, "bid.pkl")
to_pickle(bid_success, IMD_DIR, "bid_success.pkl")

In [421]:
def to_pickle(obj, dirname, filename):
    now = datetime.datetime.now().strftime("%Y%m%d%H%M")
    print(filename + " : " + now)
    obj.to_pickle(dirname + now + "_" + filename)

In [370]:
auction= pd.read_pickle("auction.pkl")
watch= pd.read_pickle("watch.pkl")
bid= pd.read_pickle("bid.pkl")
bid_success= pd.read_pickle("bid_success.pkl")

In [468]:
def extract_target_aucs(watch, bid, oldest_dtime, newest_dtime):

    watch_actioned = (
        watch[(watch["TourokuDate"] >= oldest_dtime) & (watch["TourokuDate"] < newest_dtime)]
    )
    bid_actioned = (
        bid[(bid["ShudouNyuusatsuDate"] >= oldest_dtime) & (bid["ShudouNyuusatsuDate"] < newest_dtime)]
    )
    target_users = (
        pd.concat([watch_actioned, bid_actioned], sort=False)[["KaiinID"]]
        .drop_duplicates()
    )
    
    return target_users

In [494]:
dset_to_period = {
    "train": {"oldest": datetime.datetime(2019, 9, 10, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 17, 0, 0, 0)},
    "valid": {"oldest": datetime.datetime(2019, 9, 17, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 24, 0, 0, 0),},
    "submission": {"oldest": datetime.datetime(2019, 9, 24, 0, 0, 0),
              "newest": datetime.datetime(2019, 10, 1, 0, 0, 0),},    
}

In [501]:
def build_dataset(watch, bid, auction, dset_type, dset_to_period, target_users=None):

    oldest_dtime, newest_dtime = (
        dset_to_period[dset_type]["oldest"],
        dset_to_period[dset_type]["newest"]           
    )
    
    # データセット作成の対象となるユーザー一覧
    if dset_type != "submission":
        watch_actioned = (
            watch[(watch["TourokuDate"] >= oldest_dtime) & (watch["TourokuDate"] < newest_dtime)]
        )
        bid_actioned = (
            bid[(bid["ShudouNyuusatsuDate"] >= oldest_dtime) & (bid["ShudouNyuusatsuDate"] < newest_dtime)]
        )
        target_users = (
            pd.concat([watch_actioned, bid_actioned], sort=False)[["KaiinID"]]
            .drop_duplicates()
        )
    
    # リークを防ぐため、特徴量、choiced_auc作成用のデータから正解データ抽出期間時のデータを削除する
    watch_train = watch[watch["TourokuDate"] < oldest_dtime]
    bid_train = bid[bid["ShudouNyuusatsuDate"] < oldest_dtime]
    bid_success_train = bid_success[bid_success["RakusatsuDate"] < oldest_dtime]
    
    # 予測対象とするオークションをルール,0次ベースのロジックで限定する
    dataset = merge_choiced_aucs(target_users, watch_train, bid_train, auction)
    
    # 特徴量付与
    dataset = add_features(dataset, watch_train, bid_train, oldest_dtime)
    
    # 正解データ付与
    if dset_type != "submission":        
        watch_actioned["watch_actioned"] = 1
        bid_actioned["bid_actioned"] = 1
        dataset = (
            dataset
            .merge(watch_actioned[["KaiinID", "AuctionID", "watch_actioned"]], on=["KaiinID", "AuctionID"], how="left")
            .merge(bid_actioned[["KaiinID", "AuctionID", "bid_actioned"]], on=["KaiinID", "AuctionID"], how="left")
            .fillna(0)
        )
    
    return dataset

In [497]:
def merge_choiced_aucs(target_users, watch, bid, auction):
    # choiced_auc付与部分
    # あるユーザーが現在までにアクションをした商品と同じ商品IDのオークションを抽出
    # これが関数内で作成する学習データの大元
    target_aucs = (
        pd.concat([watch[["KaiinID", "ShouhinID"]], bid[["KaiinID", "ShouhinID"]]])
        .drop_duplicates()
        .merge(auction[["AuctionID", "ShouhinID"]], on="ShouhinID")[["KaiinID", "AuctionID"]]
        .drop_duplicates()
    )
    
    auc_cols = (
        ['AuctionID', 'ShouhinShubetsuID', 'ShouhinID', 'SaishuppinKaisuu',
       'ConditionID', 'BrandID', 'GenreID', 'GenreGroupID', 'LineID',
       'DanjobetsuID']
    )
    
    # 今回の対象ユーザーに絞る
    target_data = (
        target_users
        .merge(target_aucs, on="KaiinID")
        .merge(auction[auc_cols], on="AuctionID")
    )
    
    return target_data

def add_features(dataset, watch, bid, oldest_dtime):

    # 特徴量追加部分
    
    dataset = add_time_features(dataset, watch, "TourokuDate", "watch", oldest_dtime)
    dataset = add_value_counts(
        dataset, watch, [["AuctionID"], ["ShouhinID"], ["BrandID"], ["LineID"], ["KaiinID", "ShouhinID"], 
        ["KaiinID", "BrandID"], ["KaiinID", "GenreGroupID"], ["KaiinID", "LineID"]], "watch"
    )

    dataset = add_time_features(dataset, bid, "ShudouNyuusatsuDate", "bid", oldest_dtime)
    dataset = add_value_counts(
        dataset, bid, [["AuctionID"], ["ShouhinID"], ["BrandID"], ["LineID"], ["KaiinID", "ShouhinID"], 
        ["KaiinID", "BrandID"], ["KaiinID", "GenreGroupID"], ["KaiinID", "LineID"]], "bid"
    )    
    
    drop_cols = ["ShouhinShubetsuID", "ShouhinID", "BrandID", "GenreID", "GenreGroupID", "LineID"]
    
    dataset = dataset.drop(drop_cols, axis=1).fillna(-1)
    
    return dataset


In [498]:
def add_time_features(df, feature_df, time_col, prefix, oldest_dtime):
    tmp_time_col = f"Tmp{time_col}Delta"
    key_cols = ["KaiinID", "AuctionID"]
    feature_df[tmp_time_col] = feature_df[time_col].apply(lambda d: (oldest_dtime - d).days)
    time_features = (
        feature_df
        .groupby(key_cols)[tmp_time_col]
        .agg(["count", "max", "min"])
        .rename(columns={"count": f"{prefix}_ua_cnt", "max": f"{prefix}_ua_newest", "min": f"{prefix}_ua_oldest"})
    )
    time_features[f"{prefix}_period"] = time_features[f"{prefix}_ua_newest"] - time_features[f"{prefix}_ua_oldest"]
    output = df.merge(time_features, on=key_cols, how="left")
    return output

def add_value_counts(df, feature_df, colsets, prefix):
    df_cp = df.copy()
    for colset in colsets:
        if len(colset) == 2:
            cnts = (
                feature_df[colset + ["AuctionID"]].groupby(colset, as_index=False).count()
                .rename(columns={"AuctionID": f"{prefix}_{colset[0]}_{colset[1]}_cnt"})
            )
        elif len(colset) == 1: 
            col = colset[0]
            cnts = feature_df[col].value_counts().reset_index().rename(columns={"index": col, col: f"{prefix}_{col}_cnt"})
        
        df_cp = df_cp.merge(cnts, on=colset, how="left")
    return df_cp


In [505]:
%%time
train_dataset = build_dataset(watch, bid, auction, "train", dset_to_period)
valid_dataset = build_dataset(watch, bid, auction, "valid", dset_to_period)
sub_dataset = build_dataset(watch, bid, auction, "submission", dataset_type_to_period, sub_users.sample(frac=0.04))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [510]:
to_pickle(train_dataset, IMD_DIR, "train_dataset.pkl")
to_pickle(valid_dataset, IMD_DIR, "valid_dataset.pkl")
to_pickle(sub_dataset, IMD_DIR, "sub_dataset.pkl")

train_dataset.pkl : 202001141557


In [515]:
sub_dataset = build_dataset(watch, bid, auction, "submission", dataset_type_to_period, sub_users.sample(frac=0.04))

In [516]:
to_pickle(sub_dataset, IMD_DIR, "sub_dataset.pkl")

sub_dataset.pkl : 202001141717


In [517]:
sub_dataset.head()

Unnamed: 0,KaiinID,AuctionID,SaishuppinKaisuu,ConditionID,DanjobetsuID,watch_ua_cnt,watch_ua_newest,watch_ua_oldest,watch_period,watch_AuctionID_cnt,watch_ShouhinID_cnt,watch_BrandID_cnt,watch_LineID_cnt,watch_KaiinID_ShouhinID_cnt,watch_KaiinID_BrandID_cnt,watch_KaiinID_GenreGroupID_cnt,watch_KaiinID_LineID_cnt,bid_ua_cnt,bid_ua_newest,bid_ua_oldest,bid_period,bid_AuctionID_cnt,bid_ShouhinID_cnt,bid_BrandID_cnt,bid_LineID_cnt,bid_KaiinID_ShouhinID_cnt,bid_KaiinID_BrandID_cnt,bid_KaiinID_GenreGroupID_cnt,bid_KaiinID_LineID_cnt
0,20003,1847096,0,7,0,2.0,38.0,34.0,4.0,9.0,9.0,54457.0,3956908,2.0,2.0,2.0,11.0,1.0,33.0,33.0,0.0,1.0,1.0,4706.0,572160.0,1.0,1.0,1.0,4.0
1,59387,1847096,0,7,0,1.0,44.0,44.0,0.0,9.0,9.0,54457.0,3956908,1.0,85.0,166.0,407.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,4706.0,572160.0,-1.0,-1.0,1.0,2.0
2,20003,3775407,0,6,0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,15071.0,3956908,-1.0,6.0,3.0,11.0,1.0,43.0,43.0,0.0,1.0,1.0,2468.0,572160.0,1.0,3.0,1.0,4.0
3,20003,3826672,0,6,0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,15071.0,3956908,-1.0,6.0,3.0,11.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,2468.0,572160.0,1.0,3.0,1.0,4.0
4,20003,310142,0,6,0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,15071.0,3956908,-1.0,6.0,3.0,11.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,2468.0,572160.0,1.0,3.0,1.0,4.0


In [518]:
train_dataset = pd.read_pickle(IMD_DIR + "202001141557" + "_train_dataset.pkl")
valid_dataset = pd.read_pickle(IMD_DIR + "202001141715" + "_valid_dataset.pkl")
sub_dataset = pd.read_pickle(IMD_DIR + "202001141717" + "_sub_dataset.pkl")

In [523]:
lbg_params = {
    "objective": "binary",
    "nround": 500,
    "learning_rate": 0.01,
    "max_depth": 6,
    "num_leaves": 127
}

drop_cols = ["KaiinID", "AuctionID", "watch_actioned", "bid_actioned"]

watch_model = lgb.train(
    params=lbg_params,
    train_set=lgb.Dataset(np.array(train_dataset.drop(drop_cols, axis=1)),label = np.array(train_dataset["watch_actioned"])),
    valid_sets=lgb.Dataset(np.array(valid_dataset.drop(drop_cols, axis=1)),label = np.array(valid_dataset["watch_actioned"])),    
)
nyuusatsu_model = lgb.train(
    params=lbg_params,
    train_set=lgb.Dataset(np.array(train_dataset.drop(drop_cols, axis=1)),label = np.array(train_dataset["bid_actioned"])),
    valid_sets=lgb.Dataset(np.array(valid_dataset.drop(drop_cols, axis=1)),label = np.array(valid_dataset["bid_actioned"])),    
)    

[1]	valid_0's binary_logloss: 0.00830571
[2]	valid_0's binary_logloss: 0.00827365
[3]	valid_0's binary_logloss: 0.00823343
[4]	valid_0's binary_logloss: 0.008197
[5]	valid_0's binary_logloss: 0.00816266
[6]	valid_0's binary_logloss: 0.00813186
[7]	valid_0's binary_logloss: 0.00810127
[8]	valid_0's binary_logloss: 0.00807245
[9]	valid_0's binary_logloss: 0.00804358
[10]	valid_0's binary_logloss: 0.00801887
[11]	valid_0's binary_logloss: 0.00799502
[12]	valid_0's binary_logloss: 0.00797294
[13]	valid_0's binary_logloss: 0.00795189
[14]	valid_0's binary_logloss: 0.00793205
[15]	valid_0's binary_logloss: 0.00791273
[16]	valid_0's binary_logloss: 0.00789435
[17]	valid_0's binary_logloss: 0.00787609
[18]	valid_0's binary_logloss: 0.00785996
[19]	valid_0's binary_logloss: 0.00784385
[20]	valid_0's binary_logloss: 0.00782667
[21]	valid_0's binary_logloss: 0.00780986
[22]	valid_0's binary_logloss: 0.00779581
[23]	valid_0's binary_logloss: 0.00778076
[24]	valid_0's binary_logloss: 0.00776552
[25

[97]	valid_0's binary_logloss: 0.00593099
[98]	valid_0's binary_logloss: 0.00592496
[99]	valid_0's binary_logloss: 0.00591936
[100]	valid_0's binary_logloss: 0.00591379


In [524]:
watch_pred = watch_model.predict(np.array(sub_dataset.drop(["AuctionID", "KaiinID"], axis=1)))
nyuusatsu_pred = nyuusatsu_model.predict(np.array(sub_dataset.drop(["AuctionID", "KaiinID"], axis=1)))

In [525]:
sub_dataset["watch_pred"] = watch_pred
sub_dataset["nyuusatsu_pred"] = nyuusatsu_pred

In [526]:
sub_pred = sub_dataset[["KaiinID", "AuctionID", "watch_pred", "nyuusatsu_pred"]]
sub_pred["score"] = sub_pred["watch_pred"] * 0.2 + sub_pred["nyuusatsu_pred"] * 0.8
sub_ranks = sub_users.merge(sub_pred, on="KaiinID", how="left")[["KaiinID", "AuctionID", "score"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [527]:
def comple_submit_auc(df):
    candidate_aucs = (
        df[["AuctionID", "score"]].groupby("AuctionID", as_index=False).mean().sort_values("score", ascending=False).iloc[:40,:]
    )
    candidate_aucs["score"] = 0
    target_users = df.groupby("KaiinID", as_index=False).count().query("score < 20")["KaiinID"].tolist()
    buf = []
    for user in target_users:
        candidate_aucs_tmp = candidate_aucs.copy()
        candidate_aucs_tmp["KaiinID"] = user
        buf.append(candidate_aucs_tmp)
    df_comple = pd.concat(buf)
    df_colmled = pd.concat([df, df_comple])
    return df_colmled

In [528]:
sub_ranks = comple_submit_auc(sub_ranks)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]


In [529]:
sub_ranks.sort_values(['KaiinID', 'score'], ascending=[True, False], inplace=True)
sub_ranks['rank'] = sub_ranks.groupby('KaiinID')['score'].cumcount()
sub_valid = sub_ranks.query("rank < =19")
sub_valid.sort_values(['KaiinID', 'score'], ascending=[True, False], inplace=True)
sub_valid[["KaiinID", "AuctionID"]].to_csv(datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
