In [1]:
from os.path import dirname
import os
import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
pd.set_option('display.max_columns', 50)

In [51]:
# %load_ext autoreload
# %autoreload 2
import brandear_est as be

In [5]:
MIMIC_DIR = os.path.join(os.getcwd(), "../../data/mimic/")
INPUT_DIR = os.path.join(os.getcwd(), "../../data/input/")
IMD_DIR = os.path.join(os.getcwd(), "../../data/intermediate/")
SUBMIT_DIR = os.path.join(os.getcwd(), "../../data/submit/")

dset_to_period = {
    "train": {"oldest": datetime.datetime(2019, 9, 3, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 10, 0, 0, 0)},
    "valid_for_train": {"oldest": datetime.datetime(2019, 9, 10, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 17, 0, 0, 0),},    
    "valid_for_sub": {"oldest": datetime.datetime(2019, 9, 17, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 24, 0, 0, 0),},
    "submission": {"oldest": datetime.datetime(2019, 9, 24, 0, 0, 0),
              "newest": datetime.datetime(2019, 10, 1, 0, 0, 0),},    
}

In [6]:
# 元データ読み込み
try:
    auction= pd.read_pickle(IMD_DIR + "202001171749" + "_auction.pkl")
    watch= pd.read_pickle(IMD_DIR + "202001171749" + "_watch.pkl")
    bid= pd.read_pickle(IMD_DIR + "202001171749" + "_bid.pkl")
    bid_success= pd.read_pickle(IMD_DIR + "202001171749" + "_bid_success.pkl")
    sample_sub = pd.read_csv(INPUT_DIR + "sample_submission.csv")    
    
    sub_users = sample_sub[["KaiinID"]].drop_duplicates()
    
except:
    auction = be.read_csv(INPUT_DIR + "auction.csv")
    watch = be.read_csv(INPUT_DIR + "watchlist.csv")
    bid = be.read_csv(INPUT_DIR + "shudounyuusatsu.csv")
    bid_success = be.read_csv(INPUT_DIR + "rakusatsu.csv")
    sample_sub = be.read_csv(INPUT_DIR + "sample_submission.csv")

    auction = be.to_datetime(auction)
    watch = be.to_datetime(watch)
    bid = be.to_datetime(bid)
    bid_success = be.to_datetime(bid_success)

    sub_users = sample_sub[["KaiinID"]].drop_duplicates()

In [35]:
# 特徴量計算対象データセット作成
try:
    train_dataset_base= pd.read_pickle(IMD_DIR + "202001201749" + "_train_dataset_base.pkl")
    vt_dataset_base= pd.read_pickle(IMD_DIR + "202001201749" + "_vt_dataset_base.pkl")
    vs_dataset_base= pd.read_pickle(IMD_DIR + "202001201749" + "_vs_dataset_base.pkl")
    sub_dataset_base= pd.read_pickle(IMD_DIR + "202001201749" + "_sub_dataset_base.pkl") 
except:
    train_dataset_base = be.build_dataset_base(watch, bid, auction, bid_success, "train", dset_to_period["train"], target_users=None)
    vt_dataset_base = be.build_dataset_base(watch, bid, auction, bid_success, "valid_for_train", dset_to_period["valid_for_train"], target_users=None)
    vs_dataset_base = be.build_dataset_base(watch, bid, auction, bid_success, "valid_for_sub", dset_to_period["valid_for_sub"], target_users=None)
    sub_dataset_base = be.build_dataset_base(watch, bid, auction, bid_success, "submission", dset_to_period["submission"], target_users=sub_users)    

In [7]:
# 特徴量計算
try:
#     train_dataset = pd.read_pickle(IMD_DIR + "202001201925" + "_train_dataset.pkl")
    vt_dataset = pd.read_pickle(IMD_DIR + "202001211552" + "_vt_dataset.pkl")
    vs_dataset = pd.read_pickle(IMD_DIR + "202001211552" + "_vs_dataset.pkl")
    sub_dataset = pd.read_pickle(IMD_DIR + "202001211552" + "_sub_dataset.pkl") 
except:
    train_dataset = be.add_features(train_dataset_base, watch, bid, auction, dset_to_period["train"])
    vt_dataset = be.add_features(vt_dataset_base, watch, bid, auction, dset_to_period["valid_for_train"])
    vs_dataset = be.add_features(vs_dataset_base, watch, bid, auction, dset_to_period["valid_for_sub"])    
    sub_dataset = be.add_features(sub_dataset_base, watch, bid, auction, dset_to_period["submission"])    

In [90]:
# 学習
lgb_rank = be.LgbLambdaLank()
lgb_rank.train(train_data=vt_dataset, valid_data=vs_dataset)

[1]	valid_0's ndcg@20: 0.701726
[2]	valid_0's ndcg@20: 0.701726
[3]	valid_0's ndcg@20: 0.701726
[4]	valid_0's ndcg@20: 0.701726
[5]	valid_0's ndcg@20: 0.701726
[6]	valid_0's ndcg@20: 0.701726


KeyboardInterrupt: 

In [11]:
drop_cols = ["KaiinID", "AuctionID", "watch_actioned", "bid_actioned",
             "CreateDate", "watch_ua_cnt", "watch_ua_newest", "watch_ua_oldest", "watch_period",
             "bid_ua_cnt", "bid_ua_newest", "bid_ua_oldest", "bid_period"]
importance = pd.DataFrame(lgb_rank.valid_model.feature_importance(), 
                          index=be.drop(vt_dataset, drop_cols).columns,
                          columns=['importance'])
importance

Unnamed: 0,importance
SaishuppinKaisuu,345
ConditionID,158
ColorID,317
DanjobetsuID,26
SankouKakaku,590
watch_AuctionID_cnt,429
watch_ShouhinID_cnt,267
watch_BrandID_cnt,316
watch_LineID_cnt,1
watch_KaiinID_ShouhinID_cnt,245


In [68]:
# 手元のデータセットでndcg計算
lgb_rank.retrain(train_data=vt_dataset)
pred = lgb_rank.predict(vs_dataset)
sub = be.adjust_sub_form(vs_dataset[["KaiinID"]].drop_duplicates(), pred, drop=True)
y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
ndcg_score = be.calc_ndcg(y_true, sub)
print("ndcg_score : ", ndcg_score)

ndcg_score :  0.024913193360575472


In [96]:
# 手元のデータで最善の予測をした際の結果
cheat_pred = be.eval.get_cheat_pred(vs_dataset, y_true)
cheat_sub = be.adjust_sub_form(vs_dataset[["KaiinID"]].drop_duplicates(), cheat_pred, drop=True)
ndcg_score = be.calc_ndcg(y_true, cheat_sub)
print("ndcg_score : ", ndcg_score)

ndcg_score :  0.1552900258660804


In [107]:
# ランダムな予測の場合
random_pred = vs_dataset[["KaiinID", "AuctionID"]]
random_pred["score"] = pd.Series(np.random.random(len(random_pred)), index=random_pred.index)
random_sub = be.adjust_sub_form(vs_dataset[["KaiinID"]].drop_duplicates(), random_pred, drop=True)
ndcg_score = be.calc_ndcg(y_true, random_sub)
print("ndcg_score : ", ndcg_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


ndcg_score :  0.0075125090014460634


In [19]:
# 再学習、予測、提出
lgb_rank.retrain(train_data=vs_dataset)
pred = lgb_rank.predict(sub_dataset)
sub = be.adjust_sub_form(sub_users, pred, drop=True)
sub.to_csv(SUBMIT_DIR + datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)
sub.head()

Unnamed: 0,KaiinID,AuctionID
1523900,24,3952189.0
1349885,24,3501540.0
473052,24,1226061.0
758010,24,1965456.0
1424337,24,3693939.0


In [108]:
# 学習
lgb_bi = be.LgbBinaryClassifier()
lgb_bi.train(train_data=vt_dataset, valid_data=vs_dataset)

[1]	valid_0's auc: 0.840685
[2]	valid_0's auc: 0.84175
[3]	valid_0's auc: 0.843543
[4]	valid_0's auc: 0.844674
[5]	valid_0's auc: 0.845012
[6]	valid_0's auc: 0.845288
[7]	valid_0's auc: 0.845751
[8]	valid_0's auc: 0.845961
[9]	valid_0's auc: 0.846216
[10]	valid_0's auc: 0.846126
[11]	valid_0's auc: 0.847416
[12]	valid_0's auc: 0.848057
[13]	valid_0's auc: 0.848696
[14]	valid_0's auc: 0.84878
[15]	valid_0's auc: 0.849014
[16]	valid_0's auc: 0.849282
[17]	valid_0's auc: 0.849922
[18]	valid_0's auc: 0.850012
[19]	valid_0's auc: 0.850269
[20]	valid_0's auc: 0.850366
[21]	valid_0's auc: 0.850152
[22]	valid_0's auc: 0.850134
[23]	valid_0's auc: 0.850365
[24]	valid_0's auc: 0.851024
[25]	valid_0's auc: 0.851298
[26]	valid_0's auc: 0.851581
[27]	valid_0's auc: 0.852005
[28]	valid_0's auc: 0.852369
[29]	valid_0's auc: 0.852749
[30]	valid_0's auc: 0.852988
[31]	valid_0's auc: 0.853181
[32]	valid_0's auc: 0.853591
[33]	valid_0's auc: 0.853558
[34]	valid_0's auc: 0.853794
[35]	valid_0's auc: 0.854

In [169]:
drop_cols = ["KaiinID", "AuctionID", "watch_actioned", "bid_actioned",
             "CreateDate", "watch_ua_cnt", "watch_ua_newest", "watch_ua_oldest", "watch_period",
             "bid_ua_cnt", "bid_ua_newest", "bid_ua_oldest", "bid_period"]
importance = pd.DataFrame(lgb_bi.watch_valid_model.feature_importance(), 
                          index=train_dataset.drop(drop_cols, axis=1).columns,
                          columns=['importance'])
importance

Unnamed: 0,importance
SaishuppinKaisuu,375
ConditionID,238
ColorID,29
DanjobetsuID,10
SankouKakaku,277
watch_AuctionID_cnt,353
watch_ShouhinID_cnt,270
watch_BrandID_cnt,194
watch_LineID_cnt,3
watch_KaiinID_ShouhinID_cnt,339


In [184]:
# 再学習、予測、提出
lgb_bi.retrain(train_data=vt_dataset)
pred = lgb_bi.predict(vs_dataset)
sub = be.adjust_sub_form(vs_dataset[["KaiinID"]].drop_duplicates(), pred, drop=True)
y_true = be.extract_target_actions(watch, bid, dset_to_period["valid_for_sub"])
ndcg_score = be.calc_ndcg(y_true, sub)
print("ndcg_score : ", ndcg_score)

ndcg_score :  0.023283969272730128


In [186]:
# 再学習、予測、提出
lgb_bi.retrain(train_data=vs_dataset)
pred = lgb_bi.predict(sub_dataset)
sub = be.adjust_sub_form(sub_users, pred, drop=True)
sub.to_csv(SUBMIT_DIR + datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)
sub.head()

Unnamed: 0,KaiinID,AuctionID
3,24,2693996
6,24,155861
5,24,1372202
2,24,2533430
1,24,2705691
