In [1]:
from os.path import dirname
import os
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
pd.set_option('display.max_columns', 50)

In [3]:
%load_ext autoreload
%autoreload 2
import brandear_est as be

In [5]:
MIMIC_DIR = os.path.join(os.getcwd(), "../../data/mimic/")
INPUT_DIR = os.path.join(os.getcwd(), "../../data/input/")
IMD_DIR = os.path.join(os.getcwd(), "../../data/intermediate/")

dset_to_period = {
    "train": {"oldest": datetime.datetime(2019, 9, 10, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 17, 0, 0, 0)},
    "valid": {"oldest": datetime.datetime(2019, 9, 17, 0, 0, 0),
              "newest": datetime.datetime(2019, 9, 24, 0, 0, 0),},
    "submission": {"oldest": datetime.datetime(2019, 9, 24, 0, 0, 0),
              "newest": datetime.datetime(2019, 10, 1, 0, 0, 0),},    
}

In [13]:
try:
    auction= pd.read_pickle(IMD_DIR + "202001151119" + "_auction.pkl")
    watch= pd.read_pickle(IMD_DIR + "202001151119" + "_watch.pkl")
    bid= pd.read_pickle(IMD_DIR + "202001151119" + "_bid.pkl")
    bid_success= pd.read_pickle(IMD_DIR + "202001151119" + "_bid_success.pkl")
    sample_sub = pd.read_csv(INPUT_DIR + "sample_submission.csv")    
    
    sub_users = sample_sub[["KaiinID"]].drop_duplicates()
    
except:
    auction = pd.read_csv(INPUT_DIR + "auction.csv")
    watch = pd.read_csv(INPUT_DIR + "watchlist.csv")
    bid = pd.read_csv(INPUT_DIR + "shudounyuusatsu.csv")
    bid_success = pd.read_csv(INPUT_DIR + "rakusatsu.csv")
    sample_sub = pd.read_csv(INPUT_DIR + "sample_submission.csv")

    auction = to_datetime(auction)
    watch = to_datetime(watch)
    bid = to_datetime(bid)
    bid_success = to_datetime(bid_success)

    watch = watch.merge(auction, on="AuctionID")
    bid = bid.merge(auction, on="AuctionID")
    bid_success = bid_success.merge(auction, on="AuctionID")

    sub_users = sample_sub[["KaiinID"]].drop_duplicates()

    to_pickle(auction, IMD_DIR, "auction.pkl")
    to_pickle(watch, IMD_DIR, "watch.pkl")
    to_pickle(bid, IMD_DIR, "bid.pkl")
    to_pickle(bid_success, IMD_DIR, "bid_success.pkl")

In [31]:
%%time
try:
    train_dataset = pd.read_pickle(IMD_DIR + "202001151205" + "_train_dataset.pkl")
    valid_dataset = pd.read_pickle(IMD_DIR + "202001151205" + "_valid_dataset.pkl")
    sub_dataset = pd.read_pickle(IMD_DIR + "202001151205" + "_sub_dataset.pkl")    
except:
    train_dataset = be.build_dataset(watch, bid, auction, "train", dset_to_period)
    valid_dataset = be.build_dataset(watch, bid, auction, "valid", dset_to_period)
    sub_dataset = be.build_dataset(watch, bid, auction, "submission", dset_to_period, sub_users)
    
    be.to_pickle(train_dataset, IMD_DIR, "train_dataset.pkl")
    be.to_pickle(valid_dataset, IMD_DIR, "valid_dataset.pkl")
    be.to_pickle(sub_dataset, IMD_DIR, "sub_dataset.pkl")

CPU times: user 753 ms, sys: 2.45 s, total: 3.2 s
Wall time: 3.57 s


In [122]:
train_sampled_dataset = (
    pd.concat([train_dataset.query("(watch_actioned == 0) & (bid_actioned == 0)").sample(n=70000),
               train_dataset.query("(watch_actioned == 1) | (bid_actioned == 1)")])
)

In [141]:
train_sampled_dataset.sort_values("KaiinID", inplace=True)
train_label = np.array(train_sampled_dataset[["watch_actioned", "bid_actioned"]].astype(int)).max(axis=1)
train_weight = (
    np.stack([
        np.array(train_sampled_dataset["watch_actioned"].astype(int)),
        (np.array(train_sampled_dataset["bid_actioned"]).astype(int) * 2),
        np.ones((train_sampled_dataset.shape[0], ))
    ], 1).max(axis=1)
)
train_group = train_sampled_dataset[["KaiinID", "AuctionID"]].groupby("KaiinID", as_index=False).count().sort_values("KaiinID")["AuctionID"]

In [142]:
valid_dataset.sort_values("KaiinID", inplace=True)
valid_label = np.array(valid_dataset[["watch_actioned", "bid_actioned"]].astype(int)).max(axis=1)
valid_weight = (
    np.stack([np.array(valid_dataset["watch_actioned"].astype(int)),
              (np.array(valid_dataset["bid_actioned"]).astype(int) * 2),
            np.ones((valid_dataset.shape[0], ))             
    ], 1).max(axis=1)
)
valid_group = valid_dataset[["KaiinID", "AuctionID"]].groupby("KaiinID", as_index=False).count().sort_values("KaiinID")["AuctionID"]

In [150]:
lbg_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    "ndcg_at": 20,
    "nround": 500,
    "learning_rate": 0.01,
    "max_depth": 6,
    "num_leaves": 127,
}

drop_cols = ["KaiinID", "AuctionID", "watch_actioned", "bid_actioned"]

lgbtrain = lgb.Dataset(data=np.array(train_sampled_dataset.drop(drop_cols, axis=1)),
                      label=train_label,
                      weight=train_weight,
                      group=train_group
                      )
lgbvalid = lgb.Dataset(data=np.array(valid_dataset.drop(drop_cols, axis=1)),
                      label=valid_label,
                      weight=valid_weight,
                      group=valid_group)

lgb_clf = lgb.train(
    params=lbg_params,
    train_set=lgbtrain,
    valid_sets=lgbvalid
)



[1]	valid_0's ndcg@20: 0.737658
[2]	valid_0's ndcg@20: 0.738882
[3]	valid_0's ndcg@20: 0.741733
[4]	valid_0's ndcg@20: 0.741705
[5]	valid_0's ndcg@20: 0.741677
[6]	valid_0's ndcg@20: 0.742088
[7]	valid_0's ndcg@20: 0.742183
[8]	valid_0's ndcg@20: 0.74203
[9]	valid_0's ndcg@20: 0.741954
[10]	valid_0's ndcg@20: 0.741849
[11]	valid_0's ndcg@20: 0.742061
[12]	valid_0's ndcg@20: 0.742985
[13]	valid_0's ndcg@20: 0.743133
[14]	valid_0's ndcg@20: 0.743317
[15]	valid_0's ndcg@20: 0.743046
[16]	valid_0's ndcg@20: 0.742989
[17]	valid_0's ndcg@20: 0.743157
[18]	valid_0's ndcg@20: 0.743048
[19]	valid_0's ndcg@20: 0.743058
[20]	valid_0's ndcg@20: 0.742925
[21]	valid_0's ndcg@20: 0.743015
[22]	valid_0's ndcg@20: 0.742908
[23]	valid_0's ndcg@20: 0.742749
[24]	valid_0's ndcg@20: 0.743044
[25]	valid_0's ndcg@20: 0.743435
[26]	valid_0's ndcg@20: 0.744038
[27]	valid_0's ndcg@20: 0.743971
[28]	valid_0's ndcg@20: 0.744405
[29]	valid_0's ndcg@20: 0.744396
[30]	valid_0's ndcg@20: 0.744327
[31]	valid_0's ndcg@

[246]	valid_0's ndcg@20: 0.748256
[247]	valid_0's ndcg@20: 0.748255
[248]	valid_0's ndcg@20: 0.748289
[249]	valid_0's ndcg@20: 0.748237
[250]	valid_0's ndcg@20: 0.748247
[251]	valid_0's ndcg@20: 0.748181
[252]	valid_0's ndcg@20: 0.748192
[253]	valid_0's ndcg@20: 0.748092
[254]	valid_0's ndcg@20: 0.748134
[255]	valid_0's ndcg@20: 0.748116
[256]	valid_0's ndcg@20: 0.748071
[257]	valid_0's ndcg@20: 0.748218
[258]	valid_0's ndcg@20: 0.748181
[259]	valid_0's ndcg@20: 0.74821
[260]	valid_0's ndcg@20: 0.748357
[261]	valid_0's ndcg@20: 0.748444
[262]	valid_0's ndcg@20: 0.748479
[263]	valid_0's ndcg@20: 0.748301
[264]	valid_0's ndcg@20: 0.74827
[265]	valid_0's ndcg@20: 0.748348
[266]	valid_0's ndcg@20: 0.748481
[267]	valid_0's ndcg@20: 0.748569
[268]	valid_0's ndcg@20: 0.748552
[269]	valid_0's ndcg@20: 0.748579
[270]	valid_0's ndcg@20: 0.748481
[271]	valid_0's ndcg@20: 0.748539
[272]	valid_0's ndcg@20: 0.748585
[273]	valid_0's ndcg@20: 0.748556
[274]	valid_0's ndcg@20: 0.748445
[275]	valid_0's 

[490]	valid_0's ndcg@20: 0.749446
[491]	valid_0's ndcg@20: 0.749378
[492]	valid_0's ndcg@20: 0.749305
[493]	valid_0's ndcg@20: 0.749339
[494]	valid_0's ndcg@20: 0.749284
[495]	valid_0's ndcg@20: 0.749415
[496]	valid_0's ndcg@20: 0.749521
[497]	valid_0's ndcg@20: 0.749563
[498]	valid_0's ndcg@20: 0.749513
[499]	valid_0's ndcg@20: 0.749523
[500]	valid_0's ndcg@20: 0.749563


In [179]:
importance = pd.DataFrame(lgb_clf.feature_importance(), 
                          index=train_sampled_dataset.drop(drop_cols, axis=1).columns,
                          columns=['importance'])
importance

Unnamed: 0,importance
SaishuppinKaisuu,2167
ConditionID,729
DanjobetsuID,566
watch_ua_cnt,575
watch_ua_newest,1537
watch_ua_oldest,1646
watch_period,121
watch_AuctionID_cnt,1418
watch_ShouhinID_cnt,1631
watch_BrandID_cnt,1913


In [170]:
valid_sampled_dataset = (
    pd.concat([valid_dataset.query("(watch_actioned == 0) & (bid_actioned == 0)").sample(n=70000),
               valid_dataset.query("(watch_actioned == 1) | (bid_actioned == 1)")])
)

valid_sampled_dataset.sort_values("KaiinID", inplace=True)
valid_label = np.array(valid_sampled_dataset[["watch_actioned", "bid_actioned"]].astype(int)).max(axis=1)
valid_weight = (
    np.stack([np.array(valid_sampled_dataset["watch_actioned"].astype(int)),
              (np.array(valid_sampled_dataset["bid_actioned"]).astype(int) * 2),
            np.ones((valid_sampled_dataset.shape[0], ))             
    ], 1).max(axis=1)
)
valid_group = valid_sampled_dataset[["KaiinID", "AuctionID"]].groupby("KaiinID", as_index=False).count().sort_values("KaiinID")["AuctionID"]

In [171]:
lgbvalid = lgb.Dataset(data=np.array(valid_sampled_dataset.drop(drop_cols, axis=1)),
                      label=valid_label,
                      weight=valid_weight,
                      group=valid_group)

In [173]:
lgb_for_sub = lgb.train(
    params=lbg_params,
    train_set=lgbvalid,
)
pred = lgb_for_sub.predict(
    data=np.array(sub_dataset.sort_values(["KaiinID", "AuctionID"]).drop(["AuctionID", "KaiinID"], axis=1)),
    group=np.array(sub_dataset[["KaiinID", "AuctionID"]].groupby("KaiinID", as_index=False).count().sort_values("KaiinID")["AuctionID"])
)
pd.Series(pred).unique()

array([-2.22717918, -1.91036766, -0.38131559, ...,  1.13413547,
        1.74235853,  0.86493182])

In [165]:
# lbg_params = {
#     "objective": "binary",
#     "nround": 500,
#     "learning_rate": 0.01,
#     "max_depth": 6,
#     "num_leaves": 127
# }

# drop_cols = ["KaiinID", "AuctionID", "watch_actioned", "bid_actioned"]

# watch_model = lgb.train(
#     params=lbg_params,
#     train_set=lgb.Dataset(np.array(train_dataset.drop(drop_cols, axis=1)),label = np.array(train_dataset["watch_actioned"])),
#     valid_sets=lgb.Dataset(np.array(valid_dataset.drop(drop_cols, axis=1)),label = np.array(valid_dataset["watch_actioned"])),    
# )
# nyuusatsu_model = lgb.train(
#     params=lbg_params,
#     train_set=lgb.Dataset(np.array(train_dataset.drop(drop_cols, axis=1)),label = np.array(train_dataset["bid_actioned"])),
#     valid_sets=lgb.Dataset(np.array(valid_dataset.drop(drop_cols, axis=1)),label = np.array(valid_dataset["bid_actioned"])),    
# )    
# watch_pred = watch_model.predict(np.array(sub_dataset.drop(["AuctionID", "KaiinID"], axis=1)))
# nyuusatsu_pred = nyuusatsu_model.predict(np.array(sub_dataset.drop(["AuctionID", "KaiinID"], axis=1)))

# sub_dataset["watch_pred"] = watch_pred
# sub_dataset["nyuusatsu_pred"] = nyuusatsu_pred
# sub_pred = sub_dataset[["KaiinID", "AuctionID", "watch_pred", "nyuusatsu_pred"]]
# sub_pred["score"] = sub_pred["watch_pred"] * 0.2 + sub_pred["nyuusatsu_pred"] * 0.8

In [174]:
sub_pred = sub_dataset[["KaiinID", "AuctionID"]]
sub_pred["score"] = pred
sub_ranks = sub_users.merge(sub_pred, on="KaiinID", how="left")[["KaiinID", "AuctionID", "score"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [176]:
sub_ranks = comple_submit_auc(sub_ranks)
sub_ranks.sort_values(['KaiinID', 'score'], ascending=[True, False], inplace=True)
sub_ranks['rank'] = sub_ranks.groupby('KaiinID')['score'].cumcount()
sub_valid = sub_ranks.query("rank < =19")
sub_valid.sort_values(['KaiinID', 'score'], ascending=[True, False], inplace=True)
sub_valid[["KaiinID", "AuctionID"]].to_csv(datetime.datetime.now().strftime("%Y%m%d%H%M") + "_submit.csv", index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
