# import library

In [1]:
# import 
import numpy as np
import pandas as pd
import os
from math import sqrt
from pathlib import Path
from tqdm import tqdm
tqdm.pandas() # これでDataFrameにprogress_apply()が差し込まれるらしい

import datetime

In [2]:
class DataSet:
    # クラス変数の定義
    DRIVE_DIR = r'/content/drive/MyDrive/Colab Notebooks/kaggle/H_and_M_Personalized_Fashion_Recommendations'
    INPUT_DIR = os.path.join(DRIVE_DIR, 'input')

    def __init__(self) -> None:
        # インスタンス変数(属性の初期化)
        self.ALL_ITEMS = []
        self.ALL_USERS = []
        self.df_val: pd.DataFrame
        pass

    def read_data(self, c_id_short: bool = True):

        # ファイルパスを用意
        csv_train = os.path.join(DataSet.INPUT_DIR, 'transactions_train.csv')
        csv_sub = os.path.join(DataSet.INPUT_DIR, 'sample_submission.csv')
        csv_users = os.path.join(DataSet.INPUT_DIR, 'customers.csv')
        csv_items = os.path.join(DataSet.INPUT_DIR, 'articles.csv')

        # データをDataFrame型で読み込み
        if c_id_short == True:
            # 実際の購買記録の情報
            self.df = pd.read_parquet(os.path.join(
                DataSet.DRIVE_DIR, 'transactions_train.parquet'))
            # dfのcustomer_idはshort版に加工されてるから、カラム名を変更しておく
            self.df.rename(
                columns={'customer_id': 'customer_id_short'}, inplace=True)

            # dfのarticle_idを文字列に為ておく?
            # 各顧客の情報(メタデータ)
            self.dfu = pd.read_parquet(os.path.join(
                DataSet.DRIVE_DIR, 'customers.parquet'))
            self.dfu.rename(
                columns={'customer_id': 'customer_id_short'}, inplace=True)
            # 各商品の情報(メタデータ)
            self.dfi = pd.read_parquet(os.path.join(
                DataSet.DRIVE_DIR, 'articles.parquet'))
        else:
            self.df = pd.read_csv(csv_train, dtype={'article_id': str},
                                  parse_dates=['t_dat']  # datetime型で読み込み
                                  )
            self.dfu = pd.read_csv(csv_users)  # 各顧客の情報(メタデータ)
            self.dfi = pd.read_csv(
                csv_items, dtype={'article_id': str})  # 各商品の情報(メタデータ)

            # customer_id_shortカラムを生成
            self.df['customer_id_short'] = self.df["customer_id"].apply(lambda s: int(s[-16:], 16)).astype("uint64")
            self.dfu['customer_id_short'] =self.dfu["customer_id"].apply(lambda s: int(s[-16:], 16)).astype("uint64")

        # price カラムを×10^3しておく...その方が、小数点以下と整数で分けやすい??
        self.df['price'] = self.df['price'] * (10 **3)

        # 提出用のサンプル
        self.df_sub = pd.read_csv(csv_sub)
        

        # customer_idカラムのみのpd.DataFrameを作っておく(たぶん色々便利なので)
        self.df_sub["customer_id_short"] = pd.DataFrame(
            self.df_sub["customer_id"].apply(lambda s: int(s[-16:], 16))).astype("uint64")
        self.cid = pd.DataFrame(self.df_sub["customer_id_short"])

    def read_data_sampled(self, sampling_percentage: float = 5):
        # ファイルパスを用意
        sampled_data_dir = os.path.join(DataSet.INPUT_DIR, 'sampling_dir')
        path_transactions = os.path.join(
            sampled_data_dir, f'transactions_train_sample{sampling_percentage}.csv.gz')
        path_article = os.path.join(
            sampled_data_dir, f'articles_train_sample{sampling_percentage}.csv.gz')
        path_customers = os.path.join(
            sampled_data_dir, f'customers_sample{sampling_percentage}.csv.gz')

        # インスタンス変数として読み込み
        self.df = pd.read_csv(path_transactions,
                              dtype={'article_id': str},
                              parse_dates=['t_dat']  # datetime型で読み込み
                              )
        # price カラムを×10^3しておく...その方が、小数点以下と整数で分けやすい??
        self.df['price'] = self.df['price'] * (10 **3)
        self.dfi = pd.read_csv(path_article, dtype={'article_id': str})
        self.dfu = pd.read_csv(path_customers)
        # df_subはそのまま
        csv_sub = os.path.join(DataSet.INPUT_DIR, 'sample_submission.csv')
        self.df_sub = pd.read_csv(csv_sub)
        # customer_id_shortカラムを作る.
        self.df_sub["customer_id_short"] = pd.DataFrame(
            self.df_sub["customer_id"].apply(lambda s: int(s[-16:], 16))).astype("uint64")

        # customer_idカラムのみのpd.DataFrameを作っておく(たぶん色々便利なので)
        self.cid = pd.DataFrame(self.dfu["customer_id_short"].copy())
        print(self.cid)

In [3]:
Colab_bool = False
# Load data
if Colab_bool==False:
    df_t = pd.read_csv(r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\transactions_train_sample5.csv.gz')
    df_i = pd.read_csv(r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\articles_train_sample5.csv.gz')
    df_u = pd.read_csv(r'C:\Users\Masat\デスクトップ_Instead\webアプリ開発\H_and_M_Personalized_Fashion_Recommendations\input\customers_sample5.csv.gz')

# 本番環境(=colab)では...
if Colab_bool :
    # DataSetオブジェクトの読み込み
    dataset = DataSet()
    # DataFrameとしてデータ読み込み
    dataset.read_data(c_id_short=True)

    # データをDataFrame型で読み込み
    df_t = dataset.df
    df_sub = dataset.df_sub # 提出用のサンプル
    df_u = dataset.dfu # 各顧客の情報(メタデータ)
    df_i = dataset.dfi # 各商品の情報(メタデータ)

# datetime型に変換
df_t['t_dat'] = pd.to_datetime(df_t['t_dat'])

In [None]:
# トランザクションログの最終日を取得
last_ts = df_t['t_dat'].max()
# トランザクションログのt_datを一列のDataFrameとしてコピー
tmp = df_t[['t_dat']].copy()

# 曜日カラムを生成。dayofweek属性は、曜日のindex(月曜=0, 日曜=6)を返す。
tmp['dow'] = tmp['t_dat'].dt.dayofweek
print(last_ts)
tmp.head()

Unnamed: 0,t_dat,dow
0,2018-09-20,3
1,2018-09-20,3
2,2018-09-20,3
3,2018-09-20,3
4,2018-09-20,3


In [24]:

# t_datの週内最終日をlast_day_of_bought_weekとして取得.
tmp['ldbw'] = tmp['t_dat'] - \
    pd.TimedeltaIndex(data=tmp['dow'] - 1, unit='D')
tmp.head()
tmp['ldbw2'] = tmp['t_dat'].progress_apply(
            # datetime型.floor():切り下げ（今回は7days毎）
            lambda d: last_ts - (last_ts - d).floor('7D')
        )


  2%|▏         | 38499/1584950 [00:18<12:27, 2069.97it/s]


KeyboardInterrupt: 

In [26]:
tmp['t_dat'].iloc[1000:2000].progress_apply(
            # datetime型.floor():切り下げ（今回は7days毎）
            lambda d: last_ts - (last_ts - d).floor('7D')
            # lambda d: d.floor('7D')
        )

100%|██████████| 1000/1000 [00:00<00:00, 3710.05it/s]


1000   2018-09-20
1001   2018-09-20
1002   2018-09-20
1003   2018-09-20
1004   2018-09-20
          ...    
1995   2018-09-20
1996   2018-09-20
1997   2018-09-20
1998   2018-09-20
1999   2018-09-20
Name: t_dat, Length: 1000, dtype: datetime64[ns]

In [22]:
tmp[tmp['dow'] == 4]

Unnamed: 0,t_dat,dow,ldbw,ldbw2
2320,2018-09-21,4,2018-09-18,2018-09-21
2321,2018-09-21,4,2018-09-18,2018-09-21
2322,2018-09-21,4,2018-09-18,2018-09-21
2323,2018-09-21,4,2018-09-18,2018-09-21
2324,2018-09-21,4,2018-09-18,2018-09-21
...,...,...,...,...
1578178,2020-09-18,4,2020-09-15,2020-09-18
1578179,2020-09-18,4,2020-09-15,2020-09-18
1578180,2020-09-18,4,2020-09-15,2020-09-18
1578181,2020-09-18,4,2020-09-15,2020-09-18


In [9]:

# 水曜日以降のt_datのldbwに対して
tmp.loc[tmp['dow'] >= 2, 'ldbw'] = tmp.loc[tmp['dow'] >= 2, 'ldbw'] + \
    pd.TimedeltaIndex(
        np.ones(len(tmp.loc[tmp['dow'] >= 2])) * 7, unit='D')
tmp.head()

Unnamed: 0,t_dat,dow,ldbw
0,2018-09-20,3,2018-09-25
1,2018-09-20,3,2018-09-25
2,2018-09-20,3,2018-09-25
3,2018-09-20,3,2018-09-25
4,2018-09-20,3,2018-09-25


In [13]:
pd.to_datetime('2018-09-18').day_of_week

1