In [37]:
from os.path import dirname
import os
from typing import Any
import re

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 50)

In [2]:
INPUT_DIR = os.path.join(os.getcwd(), "../../data/input/")

In [3]:
auction = pd.read_csv(INPUT_DIR + "auction.csv")
brand = pd.read_csv(INPUT_DIR + "brand.csv")
category = pd.read_csv(INPUT_DIR + "category.csv")
color = pd.read_csv(INPUT_DIR + "color.csv")
danjobetsu = pd.read_csv(INPUT_DIR + "danjobetsu.csv")
genre = pd.read_csv(INPUT_DIR + "genre.csv")
genregroup = pd.read_csv(INPUT_DIR + "genregroup.csv")
itemshou = pd.read_csv(INPUT_DIR + "itemshou.csv")
line = pd.read_csv(INPUT_DIR + "line.csv")
kaiin = pd.read_csv(INPUT_DIR + "kaiin.csv")
watchlist = pd.read_csv(INPUT_DIR + "watchlist.csv")
shudounyuusatsu = pd.read_csv(INPUT_DIR + "shudounyuusatsu.csv")
rakusatsu = pd.read_csv(INPUT_DIR + "rakusatsu.csv")
nyuuka_oshirase = pd.read_csv(INPUT_DIR + "nyuuka_oshirase.csv")
search_log = pd.read_csv(INPUT_DIR + "search_log.tsv", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def rename(df, prefix):
    target_columns = ["ModifyDate", "CreateDate"]
    df_renamed = df.copy()
    
    for target_column in target_columns:
        if target_column in df_renamed.columns:
            df_renamed.rename(columns={target_column: prefix + target_column}, inplace=True)
    
    return df_renamed

In [5]:
itemshou = rename(itemshou, prefix="ItemShow")
category = rename(category, prefix="Category")
genregroup = rename(genregroup, prefix="GenreGroup")
genre = rename(genre, prefix="Genre")

brand = rename(brand, prefix="Brand")
color = rename(color, prefix="Color")
danjobetsu = rename(danjobetsu, prefix="Danjobetsu")
line = rename(line, prefix="Line")

kaiin = rename(kaiin, prefix="Kaiin")
nyuuka_oshirase = rename(nyuuka_oshirase, prefix="Nyuuka")

# line.drop(["BrandID"], axis=1, inplace=True)
watchlist["target_flg"] = 1
watchlist.rename(columns={"TourokuDate": "Date"}, inplace=True)
shudounyuusatsu["target_flg"] = 2
shudounyuusatsu.rename(columns={"ShudouNyuusatsuDate": "Date"}, inplace=True)

In [6]:
genre_mst = (
    genre
    .merge(itemshou, on="ItemShouID", how="inner")
    .merge(category, on="CategoryID", how="inner")
    .merge(genregroup, on="GenreGroupID", how="inner")    
)
genre_mst.drop(["ShouhinShubetsuID", "GenreGroupID"], axis=1, inplace=True)
genre_mst.head()

Unnamed: 0,GenreID,GenreName,ItemShouID,CategoryID,GenreModifyDate,GenreCreateDate,ItemShouName,ItemDaiID,ItemShowModifyDate,ItemShowCreateDate,CategoryName,CategoryModifyDate,CategoryCreateDate,GenreGroupName,GenreGroupModifyDate,GenreGroupCreateDate
0,1,その他アクセサリー,36,4,2010-09-15 15:29:28,2010-04-02 13:25:06,その他,3,2010-04-02 13:25:06,2010-04-02 13:25:06,アクセサリー,2010-09-15 14:55:36,2010-09-15 14:55:36,アクセサリー（他）,2010-04-02 13:25:06,2010-04-02 13:25:06
1,107,スカーフリング,36,4,2010-09-15 15:29:28,2010-04-02 13:25:06,その他,3,2010-04-02 13:25:06,2010-04-02 13:25:06,アクセサリー,2010-09-15 14:55:36,2010-09-15 14:55:36,アクセサリー（他）,2010-04-02 13:25:06,2010-04-02 13:25:06
2,152,ペンダントトップ,36,4,2010-09-15 15:29:28,2010-04-02 13:25:06,その他,3,2010-04-02 13:25:06,2010-04-02 13:25:06,アクセサリー,2010-09-15 14:55:36,2010-09-15 14:55:36,アクセサリー（他）,2010-04-02 13:25:06,2010-04-02 13:25:06
3,349,ブローチ,35,4,2010-09-15 15:29:28,2010-04-02 13:25:06,ブローチ,3,2010-04-02 13:25:06,2010-04-02 13:25:06,アクセサリー,2010-09-15 14:55:36,2010-09-15 14:55:36,アクセサリー（他）,2010-04-02 13:25:06,2010-04-02 13:25:06
4,3,イヤリング,32,4,2010-09-15 15:29:28,2010-04-02 13:25:06,イヤリング,3,2010-04-02 13:25:06,2010-04-02 13:25:06,アクセサリー,2010-09-15 14:55:36,2010-09-15 14:55:36,アクセサリー（耳）,2010-04-02 13:25:06,2010-04-02 13:25:06


In [7]:
auction_mst = (
    auction
    .merge(genre_mst, on="GenreID", how="inner")    
    .merge(brand, on="BrandID", how="inner")    
    .merge(color, on="ColorID", how="inner")    
    .merge(danjobetsu, on="DanjobetsuID", how="inner")    
    .merge(line, on="LineID", how="inner")        
)
auction_mst.head()

Unnamed: 0,AuctionID,ShouhinShubetsuID,ShouhinID,SaishuppinKaisuu,ConditionID,BrandID_x,GenreID,GenreGroupID,LineID,ColorID,...,ColorModifyDate,ColorCreateDate,DanjobetsuName,DanjobetsuModifyDate,DanjobetsuCreateDate,LineName,BrandID_y,ItemLineID,LineModifyDate,LineCreateDate
0,2715892,1,664658,315,5,114,340,35,17,15,...,2010-04-02 13:25:06,2010-04-02 13:25:06,レディース,2010-04-02 13:25:07,2010-04-02 13:25:07,ラブコレクション,114,8,2010-04-02 13:25:06,2010-04-02 13:25:06
1,3964507,1,185380,0,6,114,338,35,19,15,...,2010-04-02 13:25:06,2010-04-02 13:25:06,レディース,2010-04-02 13:25:07,2010-04-02 13:25:07,マヒナ,114,8,2010-04-02 13:25:06,2010-04-02 13:25:06
2,797248,1,2260447,0,6,114,340,35,19,6,...,2010-04-02 13:25:06,2010-04-02 13:25:06,レディース,2010-04-02 13:25:07,2010-04-02 13:25:07,マヒナ,114,8,2010-04-02 13:25:06,2010-04-02 13:25:06
3,2395596,1,2260447,0,6,114,340,35,19,6,...,2010-04-02 13:25:06,2010-04-02 13:25:06,レディース,2010-04-02 13:25:07,2010-04-02 13:25:07,マヒナ,114,8,2010-04-02 13:25:06,2010-04-02 13:25:06
4,3805257,1,2260447,0,6,114,340,35,19,6,...,2010-04-02 13:25:06,2010-04-02 13:25:06,レディース,2010-04-02 13:25:07,2010-04-02 13:25:07,マヒナ,114,8,2010-04-02 13:25:06,2010-04-02 13:25:06


In [8]:
nyuuka_oshirase_tr = (
    nyuuka_oshirase.merge(kaiin, on="KaiinID", how="inner")
)
nyuuka_oshirase_tr

Unnamed: 0,KaiinID,BrandID,CategoryID,GenreID,KaishikakakuLow,KaishikakakuHigh,ItemColorID,ConditionID,conditionFrom,conditionTo,SearchFullText,Size,MailSendFlag,NyuukaCreateDate,RepeaterFlag,SeinengappiDate,KaiinCreateDate
0,273964,9712.0,,,,,,,,,,,0,2019-09-01 00:06:25,0.0,,2019-08-31 16:45:30
1,64255,283.0,,,,2000.0,,,7.0,4.0,,,0,2019-09-01 00:10:20,1.0,,2018-02-24 09:56:04
2,64255,100.0,5.0,86,,,,,,,,,0,2019-09-22 23:52:29,1.0,,2018-02-24 09:56:04
3,64255,,4.0,,3000.0,20000.0,,,2.0,1.0,,,0,2019-08-07 00:40:01,1.0,,2018-02-24 09:56:04
4,274994,,,,,,,,,,ファー,,0,2019-09-01 00:14:43,1.0,,2019-08-20 10:41:20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209960,31310,3597.0,,,,,,,,,,,0,2019-08-31 23:08:47,1.0,,2019-08-27 08:10:24
209961,31310,,,,,,,,,,Diagram GRACE CONTINENTAL,,0,2019-08-31 23:08:47,1.0,,2019-08-27 08:10:24
209962,31310,1347.0,,,,,,,,,,,0,2019-08-31 23:08:47,1.0,,2019-08-27 08:10:24
209963,31310,9525.0,,,,,,,,,,,0,2019-08-31 23:08:47,1.0,,2019-08-27 08:10:24


In [9]:
watch_nyuusatsu = pd.concat([watchlist, shudounyuusatsu], sort=False)

In [10]:
train = (
    watch_nyuusatsu
    .merge(auction_mst, on="AuctionID", how="inner")
)
train.head()

Unnamed: 0,KaiinID,AuctionID,Date,SakujoFlag,target_flg,Kingaku,Suuryou,SokketsuFlag,ShouhinShubetsuID,ShouhinID,...,ColorModifyDate,ColorCreateDate,DanjobetsuName,DanjobetsuModifyDate,DanjobetsuCreateDate,LineName,BrandID_y,ItemLineID,LineModifyDate,LineCreateDate
0,151286,1493264,2018-09-02 15:44:35,1,1,,,,1,212407,...,2010-04-02 13:25:07,2010-04-02 13:25:07,メンズ,2010-04-02 13:25:07,2010-04-02 13:25:07,ダミエ,114,5,2010-04-02 13:25:06,2010-04-02 13:25:06
1,260315,1493264,2018-09-06 08:15:10,1,1,,,,1,212407,...,2010-04-02 13:25:07,2010-04-02 13:25:07,メンズ,2010-04-02 13:25:07,2010-04-02 13:25:07,ダミエ,114,5,2010-04-02 13:25:06,2010-04-02 13:25:06
2,41167,1493264,2018-09-23 07:44:24,1,1,,,,1,212407,...,2010-04-02 13:25:07,2010-04-02 13:25:07,メンズ,2010-04-02 13:25:07,2010-04-02 13:25:07,ダミエ,114,5,2010-04-02 13:25:06,2010-04-02 13:25:06
3,156595,1493264,2018-09-25 19:44:00,1,1,,,,1,212407,...,2010-04-02 13:25:07,2010-04-02 13:25:07,メンズ,2010-04-02 13:25:07,2010-04-02 13:25:07,ダミエ,114,5,2010-04-02 13:25:06,2010-04-02 13:25:06
4,258936,2062203,2018-09-03 22:24:58,1,1,,,,1,1277730,...,2010-04-02 13:25:06,2010-04-02 13:25:06,レディース,2010-04-02 13:25:07,2010-04-02 13:25:07,ダミエ,114,5,2010-04-02 13:25:06,2010-04-02 13:25:06


In [11]:
def to_datetime(df):
    cols = df.columns
    datestr_cols = [col for col in cols if re.search('[Dd]ate$', col)]
    for datestr_col in datestr_cols:
        df[datestr_col] = pd.to_datetime(df[datestr_col], format='%Y-%m-%d %H:%M:%S')
    return df

def add_datepart(df: pd.DataFrame, field_name: str,
                 prefix: str = None, drop: bool = True, time: bool = True, date: bool = True):
    """
    Helper function that adds columns relevant to a date in the column `field_name` of `df`.
    from fastai: https://github.com/fastai/fastai/blob/master/fastai/tabular/transform.py#L55
    dtのカラム(field_name)から年月、月初などの特徴量を作成する関数
    """
    field = df[field_name]
    prefix = re.sub('[Dd]ate$', '', field_name)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Is_month_end', 'Is_month_start']
    if time:
        attr = attr + ['Hour', 'Minute']
    for n in attr:
        df[prefix + n] = getattr(field.dt, n.lower())
    df.drop(field_name, axis=1, inplace=True)        

In [24]:
drop_candidate = ["Date", "GenreName", "ItemShouName", "CategoryName", 'GenreGroupName',
             'BrandName1', 'BrandName2', 'ColorName', 'DanjobetsuName', 'LineName',
        'CreateDate', 'GenreModifyDate', 'GenreCreateDate',
       'ItemShowModifyDate', 'ItemShowCreateDate', 'CategoryModifyDate',
       'CategoryCreateDate', 'GenreGroupModifyDate', 'GenreGroupCreateDate',
       'BrandModifyDate', 'BrandCreateDate', 'ColorModifyDate',
       'ColorCreateDate', 'DanjobetsuModifyDate', 'DanjobetsuCreateDate',
       'LineModifyDate', 'LineCreateDate']
drop_cols = [col for col in drop_candidate if col in train.columns]
train.drop(drop_cols, axis=1, inplace=True)

In [38]:
train.head()

Unnamed: 0,KaiinID,AuctionID,SakujoFlag,target_flg,Kingaku,Suuryou,SokketsuFlag,ShouhinShubetsuID,ShouhinID,SaishuppinKaisuu,ConditionID,BrandID_x,GenreID,GenreGroupID,LineID,ColorID,DanjobetsuID,SankouKakaku,ItemShouID,CategoryID,ItemDaiID,ItemColorID,BrandID_y,ItemLineID
0,151286,1493264,1,1,,,,1,212407,0,6,114,22,22,3,297,1,6000,17,3,2,12,114,5
1,260315,1493264,1,1,,,,1,212407,0,6,114,22,22,3,297,1,6000,17,3,2,12,114,5
2,41167,1493264,1,1,,,,1,212407,0,6,114,22,22,3,297,1,6000,17,3,2,12,114,5
3,156595,1493264,1,1,,,,1,212407,0,6,114,22,22,3,297,1,6000,17,3,2,12,114,5
4,258936,2062203,1,1,,,,1,1277730,0,6,114,340,35,3,3,2,12750,59,10,8,1,114,5


In [48]:
X = train.drop(['KaiinID', 'AuctionID', 'target_flg'], axis=1)
y = train[["target_flg"]]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [49]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [50]:
lgbm_params = {
    'objective': 'binary'
}

In [51]:
model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval)

[1]	valid_0's binary_logloss: 0
[2]	valid_0's binary_logloss: 0
[3]	valid_0's binary_logloss: 0
[4]	valid_0's binary_logloss: 0
[5]	valid_0's binary_logloss: 0
[6]	valid_0's binary_logloss: 0
[7]	valid_0's binary_logloss: 0
[8]	valid_0's binary_logloss: 0
[9]	valid_0's binary_logloss: 0
[10]	valid_0's binary_logloss: 0
[11]	valid_0's binary_logloss: 0
[12]	valid_0's binary_logloss: 0
[13]	valid_0's binary_logloss: 0
[14]	valid_0's binary_logloss: 0
[15]	valid_0's binary_logloss: 0
[16]	valid_0's binary_logloss: 0
[17]	valid_0's binary_logloss: 0
[18]	valid_0's binary_logloss: 0
[19]	valid_0's binary_logloss: 0
[20]	valid_0's binary_logloss: 0
[21]	valid_0's binary_logloss: 0
[22]	valid_0's binary_logloss: 0
[23]	valid_0's binary_logloss: 0
[24]	valid_0's binary_logloss: 0
[25]	valid_0's binary_logloss: 0
[26]	valid_0's binary_logloss: 0
[27]	valid_0's binary_logloss: 0
[28]	valid_0's binary_logloss: 0
[29]	valid_0's binary_logloss: 0
[30]	valid_0's binary_logloss: 0
[31]	valid_0's bina

In [52]:
y_pred = model.predict(X_test)
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1.])

In [53]:
print(y_pred.shape)
y_pred.sum()

(36,)


35.999999999999964