# import library

In [1]:
# import 
import numpy as np
import pandas as pd
import os
from math import sqrt
from pathlib import Path
from tqdm import tqdm
tqdm.pandas() # これでDataFrameにprogress_apply()が差し込まれるらしい

import datetime

In [3]:
class DataSet:
    """Loader for the H&M Personalized Fashion Recommendations tables.

    After one of the ``read_*`` methods has run, the instance exposes:

    * ``df``     - transaction log (``price`` multiplied by ``PRICE_SCALE``)
    * ``dfu``    - customer metadata
    * ``dfi``    - article metadata
    * ``df_sub`` - sample submission with an added ``customer_id_short`` column
    * ``cid``    - one-column frame holding only ``customer_id_short``
    """

    # Class-level data locations.
    DRIVE_DIR = r'/content/drive/MyDrive/Colab Notebooks/kaggle/H_and_M_Personalized_Fashion_Recommendations'
    INPUT_DIR = os.path.join(DRIVE_DIR, 'input')

    # Factor applied to the raw ``price`` column on load. The original author
    # scaled it so that the integer and fractional parts are easier to separate.
    PRICE_SCALE = 10 ** 3

    def __init__(self) -> None:
        """Initialise the containers; the data frames are filled by ``read_*``."""
        self.ALL_ITEMS = []
        self.ALL_USERS = []
        self.df_val: pd.DataFrame

    @staticmethod
    def _to_short_id(customer_ids: pd.Series) -> pd.Series:
        """Turn hex customer ids into uint64 using their last 16 hex digits."""
        return customer_ids.apply(lambda s: int(s[-16:], 16)).astype("uint64")

    @classmethod
    def _ensure_short_id(cls, frame: pd.DataFrame) -> pd.DataFrame:
        """Add ``customer_id_short`` to ``frame`` when it can be derived and is missing."""
        if "customer_id_short" not in frame.columns and "customer_id" in frame.columns:
            frame["customer_id_short"] = cls._to_short_id(frame["customer_id"])
        return frame

    def _read_submission(self) -> None:
        """Load ``sample_submission.csv`` into ``self.df_sub`` and add the short id."""
        csv_sub = os.path.join(DataSet.INPUT_DIR, 'sample_submission.csv')
        self.df_sub = pd.read_csv(csv_sub)
        self.df_sub["customer_id_short"] = self._to_short_id(
            self.df_sub["customer_id"])

    def read_data(self, c_id_short: bool = True):
        """Load the full data set.

        Args:
            c_id_short: When true, read the parquet files located directly under
                ``DRIVE_DIR`` and rename their ``customer_id`` column to
                ``customer_id_short`` (the original author notes those ids are
                already in the short form). Otherwise read the raw CSVs from
                ``INPUT_DIR`` and derive ``customer_id_short`` from the hex id.
        """
        if c_id_short:
            short_name = {'customer_id': 'customer_id_short'}
            # Transaction log.
            self.df = pd.read_parquet(os.path.join(
                DataSet.DRIVE_DIR, 'transactions_train.parquet')).rename(columns=short_name)
            # Customer metadata.
            self.dfu = pd.read_parquet(os.path.join(
                DataSet.DRIVE_DIR, 'customers.parquet')).rename(columns=short_name)
            # Article metadata.
            self.dfi = pd.read_parquet(os.path.join(
                DataSet.DRIVE_DIR, 'articles.parquet'))
        else:
            csv_train = os.path.join(DataSet.INPUT_DIR, 'transactions_train.csv')
            csv_users = os.path.join(DataSet.INPUT_DIR, 'customers.csv')
            csv_items = os.path.join(DataSet.INPUT_DIR, 'articles.csv')

            # article_id is kept as a string; t_dat is parsed as datetime.
            self.df = pd.read_csv(csv_train, dtype={'article_id': str},
                                  parse_dates=['t_dat'])
            self.dfu = pd.read_csv(csv_users)
            self.dfi = pd.read_csv(csv_items, dtype={'article_id': str})

            self.df['customer_id_short'] = self._to_short_id(self.df["customer_id"])
            self.dfu['customer_id_short'] = self._to_short_id(self.dfu["customer_id"])

        self.df['price'] = self.df['price'] * DataSet.PRICE_SCALE

        # Sample submission; ``cid`` lists every customer that needs a prediction.
        self._read_submission()
        self.cid = pd.DataFrame(self.df_sub["customer_id_short"])

    def read_data_sampled(self, sampling_percentage: float = 5):
        """Load the down-sampled data set from ``INPUT_DIR/sampling_dir``.

        Args:
            sampling_percentage: Value interpolated into the file names, e.g.
                ``transactions_train_sample5.csv.gz`` for the default.

        ``customer_id_short`` is derived for the transaction and customer
        tables when the sampled files do not already contain it, so building
        ``cid`` from the customer table no longer fails on such files.
        """
        sampled_data_dir = os.path.join(DataSet.INPUT_DIR, 'sampling_dir')
        path_transactions = os.path.join(
            sampled_data_dir, f'transactions_train_sample{sampling_percentage}.csv.gz')
        path_article = os.path.join(
            sampled_data_dir, f'articles_train_sample{sampling_percentage}.csv.gz')
        path_customers = os.path.join(
            sampled_data_dir, f'customers_sample{sampling_percentage}.csv.gz')

        # article_id is kept as a string; t_dat is parsed as datetime.
        self.df = pd.read_csv(path_transactions,
                              dtype={'article_id': str},
                              parse_dates=['t_dat'])
        self.df['price'] = self.df['price'] * DataSet.PRICE_SCALE
        self.dfi = pd.read_csv(path_article, dtype={'article_id': str})
        self.dfu = pd.read_csv(path_customers)

        self._ensure_short_id(self.df)
        self._ensure_short_id(self.dfu)

        # The sample submission is not sampled; it is read as is.
        self._read_submission()

        # Here ``cid`` covers only the sampled customers.
        self.cid = pd.DataFrame(self.dfu["customer_id_short"].copy())

In [4]:
# Switch between the Colab environment (full data via DataSet) and the local
# machine (5% sampled CSV files).
Colab_bool = False

if Colab_bool:
    # Production environment (= Colab): load everything through DataSet.
    dataset = DataSet()
    dataset.read_data(c_id_short=True)

    df_t = dataset.df        # transaction log
    df_sub = dataset.df_sub  # sample submission
    df_u = dataset.dfu       # customer metadata
    df_i = dataset.dfi       # article metadata
else:
    # Local environment: read the sampled files directly.
    df_t = pd.read_csv(r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\transactions_train_sample5.csv.gz')
    df_i = pd.read_csv(r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\articles_train_sample5.csv.gz')
    df_u = pd.read_csv(r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\customers_sample5.csv.gz')

# Make sure the transaction date is datetime regardless of the source.
df_t['t_dat'] = pd.to_datetime(df_t['t_dat'])

In [13]:
# Two item-feature tables to compare: a parquet file and a CSV file.
item_feature_origin_path = r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\item_features.parquet'
item_feature_my_path = r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\item_features_my_fullT.csv'

item_feature_origin = pd.read_parquet(item_feature_origin_path)
item_feature_my = pd.read_csv(item_feature_my_path)

# Show the first rows of both tables, one after the other.
display(item_feature_origin.head(), item_feature_my.head())

Unnamed: 0_level_0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,...,product_group_name_3,graphical_appearance_name_3,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108775015,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,0,1
108775044,0,0,0,1,1,1,0,0,0,0,...,1,1,1,1,1,0,1,1,0,1
108775051,0,0,1,2,2,1,0,0,0,0,...,1,0,0,1,1,0,1,1,0,1
110065001,1,1,0,0,0,0,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0
110065002,1,1,0,1,1,1,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0


Unnamed: 0,article_id,mean_item_price,std_item_price,max_item_price,min_item_price,median_item_price,sum_item_price,max_minus_min_item_price,max_minus_mean_item_price,mean_minus_min_item_price,...,max_item_price_over_point,min_item_price_under_point,min_item_price_over_point,median_item_price_under_point,median_item_price_over_point,sum_item_price_under_point,sum_item_price_over_point,item_mean_offline_or_online,item_median_offline_or_online,item_sum_offline_or_online
0,108775015,8.141582,0.652115,9.152542,1.338983,8.457627,88262.88,7.81356,1.010961,6.802598,...,9.0,0.338983,1.0,0.457627,8.0,0.881356,88262.0,1.770778,2.0,19197.0
1,108775044,8.113955,0.735318,8.508474,1.423729,8.457627,58826.168,7.084746,0.39452,6.690226,...,8.0,0.423729,1.0,0.457627,8.0,0.169492,58826.0,1.710207,2.0,12399.0
2,108775051,4.97974,0.543883,8.457627,3.237288,5.067797,1070.644,5.220339,3.477887,1.742452,...,8.0,0.237288,3.0,0.067797,5.0,0.644068,1070.0,1.995349,2.0,429.0
3,110065001,20.219105,6.843639,25.40678,2.525424,24.415255,21108.746,22.881355,5.187674,17.693682,...,25.0,0.525424,2.0,0.415254,24.0,0.745763,21108.0,1.375479,1.0,1436.0
4,110065002,18.205465,6.003564,25.40678,4.542373,16.932203,9812.746,20.864407,7.201314,13.663093,...,25.0,0.542373,4.0,0.932203,16.0,0.745763,9812.0,1.654916,2.0,892.0


In [21]:
# Number of columns in each item-feature table, one per line.
print(item_feature_origin.shape[1], item_feature_my.shape[1], sep='\n')


28
24


In [22]:
# Number of distinct values in each column of item_feature_origin.
print(item_feature_origin.nunique())

product_type_name                   130
product_group_name                   19
graphical_appearance_name            30
colour_group_name                    50
perceived_colour_value_name           8
perceived_colour_master_name         20
department_name                     250
index_name                           10
index_group_name                      5
section_name                         56
garment_group_name                   21
count                              3750
sum_price                         81999
mean_price                        87127
min_price                          3187
max_price                          1143
prod_name_3                           2
product_type_name_3                   2
product_group_name_3                  2
graphical_appearance_name_3           2
colour_group_name_3                   2
perceived_colour_value_name_3         2
perceived_colour_master_name_3        2
department_name_3                     2
index_name_3                          2


In [None]:
# Most recent date present in the transaction log.
last_ts = df_t['t_dat'].max()

# One-column copy of t_dat plus a day-of-week column.
# `.dt.dayofweek` returns the weekday index (Monday=0, Sunday=6).
tmp = df_t[['t_dat']].assign(dow=lambda frame: frame['t_dat'].dt.dayofweek)

print(last_ts)
tmp.head()

Unnamed: 0,t_dat,dow
0,2018-09-20,3
1,2018-09-20,3
2,2018-09-20,3
3,2018-09-20,3
4,2018-09-20,3


In [42]:
# Which weekday is the last day of the log? => 1 = Tuesday
last_ts.day_of_week



1

In [30]:
# Preview the weekday indices, then the day offsets (dow - 1) as timedeltas.
dow_minus_one = tmp['dow'] - 1
print(tmp['dow'].head())
pd.TimedeltaIndex(data=dow_minus_one, unit='D')

0    3
1    3
2    3
3    3
4    3
Name: dow, dtype: int64


TimedeltaIndex(['2 days', '2 days', '2 days', '2 days', '2 days', '2 days',
                '2 days', '2 days', '2 days', '2 days',
                ...
                '0 days', '0 days', '0 days', '0 days', '0 days', '0 days',
                '0 days', '0 days', '0 days', '0 days'],
               dtype='timedelta64[ns]', name='dow', length=1584950, freq=None)

In [None]:

# ldbw ("last day of bought week"): shift every t_dat by (dow - 1) days.
#   - t_dat on Tuesday (dow=1): ldbw = t_dat
#   - t_dat on Monday  (dow=0): ldbw = t_dat - (-1 day) = t_dat + 1 day = Tuesday
# pd.to_timedelta replaces pd.TimedeltaIndex(..., unit='D'), whose `unit`
# argument is deprecated in recent pandas releases.
tmp['ldbw'] = tmp['t_dat'] - pd.to_timedelta(tmp['dow'] - 1, unit='D')

# ldbw2: floor the distance to last_ts down to a multiple of 7 days, then
# step back from last_ts by that amount. Computed on the whole column at once
# instead of row by row with progress_apply; the values are the same.
tmp['ldbw2'] = last_ts - (last_ts - tmp['t_dat']).dt.floor('7D')


In [28]:
# Inspect the floored distance to last_ts for rows 1000-1999 only.
sample_dates = tmp['t_dat'].iloc[1000:2000]
# Timedelta.floor('7D') rounds the distance down to a multiple of 7 days.
sample_dates.progress_apply(lambda d: (last_ts - d).floor('7D'))

100%|██████████| 1000/1000 [00:00<00:00, 4277.81it/s]


1000   728 days
1001   728 days
1002   728 days
1003   728 days
1004   728 days
         ...   
1995   728 days
1996   728 days
1997   728 days
1998   728 days
1999   728 days
Name: t_dat, Length: 1000, dtype: timedelta64[ns]

In [22]:
# Rows whose transaction date is a Friday (dow == 4).
tmp.loc[tmp['dow'].eq(4)]

Unnamed: 0,t_dat,dow,ldbw,ldbw2
2320,2018-09-21,4,2018-09-18,2018-09-21
2321,2018-09-21,4,2018-09-18,2018-09-21
2322,2018-09-21,4,2018-09-18,2018-09-21
2323,2018-09-21,4,2018-09-18,2018-09-21
2324,2018-09-21,4,2018-09-18,2018-09-21
...,...,...,...,...
1578178,2020-09-18,4,2020-09-15,2020-09-18
1578179,2020-09-18,4,2020-09-15,2020-09-18
1578180,2020-09-18,4,2020-09-15,2020-09-18
1578181,2020-09-18,4,2020-09-15,2020-09-18


In [31]:

# For t_dat on Wednesday or later (dow >= 2) the week ends on the following
# Tuesday, so ldbw is the (dow - 1)-day shift plus 7 days.
# ldbw is rebuilt from t_dat and dow here rather than incremented in place, so
# re-running this cell does not keep adding another 7 days to the column.
week_base = tmp['t_dat'] - pd.to_timedelta(tmp['dow'] - 1, unit='D')
after_tuesday = tmp['dow'] >= 2
tmp['ldbw'] = week_base.where(~after_tuesday, week_base + pd.Timedelta(days=7))
tmp.head()

Unnamed: 0,t_dat,dow,ldbw,ldbw2
0,2018-09-20,3,2018-09-25,2018-09-20
1,2018-09-20,3,2018-09-25,2018-09-20
2,2018-09-20,3,2018-09-25,2018-09-20
3,2018-09-20,3,2018-09-25,2018-09-20
4,2018-09-20,3,2018-09-25,2018-09-20


In [35]:
# Preview only (nothing is assigned): ldbw + 7 days for rows with dow >= 2,
# using a TimedeltaIndex of matching length.
late_in_week = tmp['dow'] >= 2
seven_days = pd.TimedeltaIndex(np.ones(int(late_in_week.sum())) * 7, unit='D')
tmp.loc[late_in_week, 'ldbw'] + seven_days

0         2018-10-02
1         2018-10-02
2         2018-10-02
3         2018-10-02
4         2018-10-02
             ...    
1581697   2020-09-29
1581698   2020-09-29
1581699   2020-09-29
1581700   2020-09-29
1581701   2020-09-29
Length: 1171315, dtype: datetime64[ns]

In [36]:
# Same preview using a scalar datetime.timedelta: ldbw + 7 days for rows
# with dow >= 2 (nothing is assigned).
tmp.loc[tmp['dow'] >= 2, 'ldbw'] + datetime.timedelta(days=7)
# tmp.loc[tmp['dow'] >= 2]

0         2018-10-02
1         2018-10-02
2         2018-10-02
3         2018-10-02
4         2018-10-02
             ...    
1581697   2020-09-29
1581698   2020-09-29
1581699   2020-09-29
1581700   2020-09-29
1581701   2020-09-29
Name: ldbw, Length: 1171315, dtype: datetime64[ns]

In [13]:
# Sanity check: weekday index of 2018-09-18 (Monday=0, so 1 means Tuesday).
pd.to_datetime('2018-09-18').day_of_week

1

In [43]:
# Left-pad an integer with zeros to a width of 10 characters.
x = 19929
f'{x:010d}'

'0000019929'