In [3]:

import pandas as pd
from dateutil.relativedelta import *
from model.TimeBasedCV import TimeBasedCV


In [2]:
# Load the transaction, customer and article tables stored under data/HM_parquet.
# Forward slashes are used (as in the other paths of this notebook) so the paths
# resolve on every OS; backslash-separated paths only work on Windows.
transactions = pd.read_parquet('data/HM_parquet/transactions_train.parquet')
customers = pd.read_parquet('data/HM_parquet/customers.parquet')
articles = pd.read_parquet('data/HM_parquet/articles.parquet')

# Project-local time-based cross-validation splitter, configured with freq='days'.
tscv = TimeBasedCV(freq='days')
    

# implicit.ALS

In [3]:
# Project-local class ImplicitALS, aliased to `als` (presumably an ALS recommender wrapper — confirm in model/ImplicitALS).
from model.ImplicitALS import ImplicitALS as als

# Single instance reused by the cells below for its data_preprocess / train_ALS methods.
model = als()

## 測試一個月的資料

In [4]:
# Smoke test on one month of data: read the pre-cut train/validation files,
# preprocess both, then score a single fold.
train_one_month, val_one_parquet = (
    pd.read_parquet('data/train_one_month.parquet'),
    pd.read_parquet('data/val_one_month.parquet'),
)
train_one_month, val_one_parquet = (
    model.data_preprocess(train_one_month, customers, articles),
    model.data_preprocess(val_one_parquet, customers, articles),
)

# Window settings recorded alongside the scores.
train_period = 30
val_period = 7
stride = 30

# start_val=0 is a placeholder: this fold does not come from a dated split.
one_fold_scores = model.train_ALS(
    train_one_month, val_one_parquet, train_period, val_period, stride, start_val=0
)

# The empty frame puts the three bookkeeping columns first in the result.
scores_one_month = pd.concat(
    [pd.DataFrame(columns=["train_period","val_period","stride"]), one_fold_scores],
    axis=0,
    ignore_index=True,
)

scores_one_month


{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 3, 'regularization': 0.01, 'map12': 0.003136429035660723}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 10, 'regularization': 0.01, 'map12': 0.004087349085210472}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'map12': 0.004186328448466547}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 50, 'iterations': 3, 'regularization': 0.01, 'map12': 0.0037242955926242703}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 50, 'iterations': 10, 'regularization': 0.01, 'map12': 0.004775751555119398}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 50, 'iterations': 20, 'regularization': 0.01, 'map12': 0.004815271571309318}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 

Unnamed: 0,train_period,val_period,stride,start_val,factors,iterations,regularization,map12
0,30,7,30,0.0,25.0,3.0,0.01,0.003136
1,30,7,30,0.0,25.0,10.0,0.01,0.004087
2,30,7,30,0.0,25.0,20.0,0.01,0.004186
3,30,7,30,0.0,50.0,3.0,0.01,0.003724
4,30,7,30,0.0,50.0,10.0,0.01,0.004776
5,30,7,30,0.0,50.0,20.0,0.01,0.004815
6,30,7,30,0.0,100.0,3.0,0.01,0.004235
7,30,7,30,0.0,100.0,10.0,0.01,0.00526
8,30,7,30,0.0,100.0,20.0,0.01,0.005238
9,30,7,30,0.0,200.0,3.0,0.01,0.004667


In [5]:
# Re-display the one-month score table produced by the previous cell.
scores_one_month

Unnamed: 0,train_period,val_period,stride,start_val,factors,iterations,regularization,map12
0,30,7,30,0.0,25.0,3.0,0.01,0.003136
1,30,7,30,0.0,25.0,10.0,0.01,0.004087
2,30,7,30,0.0,25.0,20.0,0.01,0.004186
3,30,7,30,0.0,50.0,3.0,0.01,0.003724
4,30,7,30,0.0,50.0,10.0,0.01,0.004776
5,30,7,30,0.0,50.0,20.0,0.01,0.004815
6,30,7,30,0.0,100.0,3.0,0.01,0.004235
7,30,7,30,0.0,100.0,10.0,0.01,0.00526
8,30,7,30,0.0,100.0,20.0,0.01,0.005238
9,30,7,30,0.0,200.0,3.0,0.01,0.004667


## 分別取不同時間段

In [6]:
# Preprocess the full transaction table together with the customer and article tables.
# NOTE(review): `transactions` is rebound to its own preprocessed form, so this cell is
# not idempotent — re-running it feeds already-preprocessed data back into data_preprocess.
transactions = model.data_preprocess(transactions,customers,articles)

In [8]:
# Time-based split: 30-day training window, 7-day validation window, 30-day stride.
train_period, val_period, stride = 30, 7, 30
index_output = tscv.split(
    transactions,
    date_column='t_dat',
    train_period=train_period,
    val_period=val_period,
    stride=stride,
    show_progress=True,
)

# Time-based CV: score every fold, then combine the per-fold frames in one concat
# instead of re-copying the accumulated frame on every iteration.
# The leading empty frame keeps the three bookkeeping columns first (and present
# even if the splitter yields no folds).
fold_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    # Renumber validation rows from 0 on a fresh frame rather than mutating the slice in place.
    val_data = transactions.loc[val_index].reset_index(drop=True)
    # Date of the first validation row, recorded as the fold's validation start.
    start_val = val_data['t_dat'].iloc[0]
    # Train and evaluate on this fold; train_ALS returns the fold's score rows.
    fold_scores.append(
        model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    )

scores = pd.concat(fold_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_30.parquet')

{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2018, 10, 20), 'factors': 25, 'iterations': 3, 'regularization': 0.01, 'map12': 0.002593905433116847}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2018, 10, 20), 'factors': 25, 'iterations': 10, 'regularization': 0.01, 'map12': 0.0032684788594467564}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2018, 10, 20), 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'map12': 0.003290259035402417}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2018, 10, 20), 'factors': 50, 'iterations': 3, 'regularization': 0.01, 'map12': 0.002736789168495201}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2018, 10, 20), 'factors': 50, 'iterations': 10, 'regularization': 0.01, 'map12': 0.0036098906723746357}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2018, 10, 2

In [11]:
# Time-based split: 60-day training window, 7-day validation window, 60-day stride.
train_period, val_period, stride = 60, 7, 60
index_output = tscv.split(
    transactions,
    date_column='t_dat',
    train_period=train_period,
    val_period=val_period,
    stride=stride,
    show_progress=True,
)

# Time-based CV: score every fold, then combine the per-fold frames in one concat
# instead of re-copying the accumulated frame on every iteration.
# The leading empty frame keeps the three bookkeeping columns first (and present
# even if the splitter yields no folds).
fold_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    # Renumber validation rows from 0 on a fresh frame rather than mutating the slice in place.
    val_data = transactions.loc[val_index].reset_index(drop=True)
    # Date of the first validation row, recorded as the fold's validation start.
    start_val = val_data['t_dat'].iloc[0]
    # Train and evaluate on this fold; train_ALS returns the fold's score rows.
    fold_scores.append(
        model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    )

scores = pd.concat(fold_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_60.parquet')

Train period: 2018-09-20 - 2018-11-19 , val period 2018-11-19 - 2018-11-26 # train records 2629140 , # val records 454319
Train period: 2018-11-19 - 2019-01-18 , val period 2019-01-18 - 2019-01-25 # train records 2489250 , # val records 262688
Train period: 2019-01-18 - 2019-03-19 , val period 2019-03-19 - 2019-03-26 # train records 2423234 , # val records 285183
Train period: 2019-03-19 - 2019-05-18 , val period 2019-05-18 - 2019-05-25 # train records 2879624 , # val records 333628
Train period: 2019-05-18 - 2019-07-17 , val period 2019-07-17 - 2019-07-24 # train records 3544435 , # val records 389212
Train period: 2019-07-17 - 2019-09-15 , val period 2019-09-15 - 2019-09-22 # train records 2660828 , # val records 229440
Train period: 2019-09-15 - 2019-11-14 , val period 2019-11-14 - 2019-11-21 # train records 2313117 , # val records 192719
Train period: 2019-11-14 - 2020-01-13 , val period 2020-01-13 - 2020-01-20 # train records 2291415 , # val records 225283
Train period: 2020-01-13

In [12]:
# Time-based split: 90-day training window, 7-day validation window, 90-day stride.
train_period, val_period, stride = 90, 7, 90
index_output = tscv.split(
    transactions,
    date_column='t_dat',
    train_period=train_period,
    val_period=val_period,
    stride=stride,
    show_progress=True,
)

# Time-based CV: score every fold, then combine the per-fold frames in one concat
# instead of re-copying the accumulated frame on every iteration.
# The leading empty frame keeps the three bookkeeping columns first (and present
# even if the splitter yields no folds).
fold_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    # Renumber validation rows from 0 on a fresh frame rather than mutating the slice in place.
    val_data = transactions.loc[val_index].reset_index(drop=True)
    # Date of the first validation row, recorded as the fold's validation start.
    start_val = val_data['t_dat'].iloc[0]
    # Train and evaluate on this fold; train_ALS returns the fold's score rows.
    fold_scores.append(
        model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    )

scores = pd.concat(fold_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_90.parquet')

Train period: 2018-09-20 - 2018-12-19 , val period 2018-12-19 - 2018-12-26 # train records 3868529 , # val records 318840
Train period: 2018-12-19 - 2019-03-19 , val period 2019-03-19 - 2019-03-26 # train records 3673095 , # val records 285183
Train period: 2019-03-19 - 2019-06-17 , val period 2019-06-17 - 2019-06-24 # train records 4392180 , # val records 560076
Train period: 2019-06-17 - 2019-09-15 , val period 2019-09-15 - 2019-09-22 # train records 4692707 , # val records 229440
Train period: 2019-09-15 - 2019-12-14 , val period 2019-12-14 - 2019-12-21 # train records 3484312 , # val records 296902
Train period: 2019-12-14 - 2020-03-13 , val period 2020-03-13 - 2020-03-20 # train records 3254449 , # val records 183518
Train period: 2020-03-13 - 2020-06-11 , val period 2020-06-11 - 2020-06-18 # train records 3712913 , # val records 367286
Train period: 2020-06-11 - 2020-09-09 , val period 2020-09-09 - 2020-09-16 # train records 4214587 , # val records 255241
{'train_period': 90, 'va

In [13]:
# Time-based split: 180-day training window, 7-day validation window, 180-day stride.
train_period, val_period, stride = 180, 7, 180
index_output = tscv.split(
    transactions,
    date_column='t_dat',
    train_period=train_period,
    val_period=val_period,
    stride=stride,
    show_progress=True,
)

# Time-based CV: score every fold, then combine the per-fold frames in one concat
# instead of re-copying the accumulated frame on every iteration.
# The leading empty frame keeps the three bookkeeping columns first (and present
# even if the splitter yields no folds).
fold_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    # Renumber validation rows from 0 on a fresh frame rather than mutating the slice in place.
    val_data = transactions.loc[val_index].reset_index(drop=True)
    # Date of the first validation row, recorded as the fold's validation start.
    start_val = val_data['t_dat'].iloc[0]
    # Train and evaluate on this fold; train_ALS returns the fold's score rows.
    fold_scores.append(
        model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    )

scores = pd.concat(fold_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_180.parquet')

Train period: 2018-09-20 - 2019-03-19 , val period 2019-03-19 - 2019-03-26 # train records 7541624 , # val records 285183
Train period: 2019-03-19 - 2019-09-15 , val period 2019-09-15 - 2019-09-22 # train records 9084887 , # val records 229440
Train period: 2019-09-15 - 2020-03-13 , val period 2020-03-13 - 2020-03-20 # train records 6738761 , # val records 183518
Train period: 2020-03-13 - 2020-09-09 , val period 2020-09-09 - 2020-09-16 # train records 7927500 , # val records 255241
{'train_period': 180, 'val_period': 7, 'stride': 180, 'start_val': datetime.date(2019, 3, 19), 'factors': 25, 'iterations': 3, 'regularization': 0.01, 'map12': 0.0028284422534116753}
{'train_period': 180, 'val_period': 7, 'stride': 180, 'start_val': datetime.date(2019, 3, 19), 'factors': 25, 'iterations': 10, 'regularization': 0.01, 'map12': 0.0036855363976268794}
{'train_period': 180, 'val_period': 7, 'stride': 180, 'start_val': datetime.date(2019, 3, 19), 'factors': 25, 'iterations': 20, 'regularization':

In [14]:
# Time-based split: 270-day training window, 7-day validation window, 270-day stride.
train_period, val_period, stride = 270, 7, 270
index_output = tscv.split(
    transactions,
    date_column='t_dat',
    train_period=train_period,
    val_period=val_period,
    stride=stride,
    show_progress=True,
)

# Time-based CV: score every fold, then combine the per-fold frames in one concat
# instead of re-copying the accumulated frame on every iteration.
# The leading empty frame keeps the three bookkeeping columns first (and present
# even if the splitter yields no folds).
fold_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    # Renumber validation rows from 0 on a fresh frame rather than mutating the slice in place.
    val_data = transactions.loc[val_index].reset_index(drop=True)
    # Date of the first validation row, recorded as the fold's validation start.
    start_val = val_data['t_dat'].iloc[0]
    # Train and evaluate on this fold; train_ALS returns the fold's score rows.
    fold_scores.append(
        model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    )

scores = pd.concat(fold_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_270.parquet')

Train period: 2018-09-20 - 2019-06-17 , val period 2019-06-17 - 2019-06-24 # train records 11933804 , # val records 560076
Train period: 2019-06-17 - 2020-03-13 , val period 2020-03-13 - 2020-03-20 # train records 11431468 , # val records 183518
{'train_period': 270, 'val_period': 7, 'stride': 270, 'start_val': datetime.date(2019, 6, 17), 'factors': 25, 'iterations': 3, 'regularization': 0.01, 'map12': 0.0023605584564438432}
{'train_period': 270, 'val_period': 7, 'stride': 270, 'start_val': datetime.date(2019, 6, 17), 'factors': 25, 'iterations': 10, 'regularization': 0.01, 'map12': 0.0031128026285045}
{'train_period': 270, 'val_period': 7, 'stride': 270, 'start_val': datetime.date(2019, 6, 17), 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'map12': 0.0031840248841572298}
{'train_period': 270, 'val_period': 7, 'stride': 270, 'start_val': datetime.date(2019, 6, 17), 'factors': 50, 'iterations': 3, 'regularization': 0.01, 'map12': 0.0028714283188170818}
{'train_period': 270, '

In [15]:
# Time-based split: 360-day training window, 7-day validation window, 360-day stride.
train_period, val_period, stride = 360, 7, 360
index_output = tscv.split(
    transactions,
    date_column='t_dat',
    train_period=train_period,
    val_period=val_period,
    stride=stride,
    show_progress=True,
)

# Time-based CV: score every fold, then combine the per-fold frames in one concat
# instead of re-copying the accumulated frame on every iteration.
# The leading empty frame keeps the three bookkeeping columns first (and present
# even if the splitter yields no folds).
fold_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    # Renumber validation rows from 0 on a fresh frame rather than mutating the slice in place.
    val_data = transactions.loc[val_index].reset_index(drop=True)
    # Date of the first validation row, recorded as the fold's validation start.
    start_val = val_data['t_dat'].iloc[0]
    # Train and evaluate on this fold; train_ALS returns the fold's score rows.
    fold_scores.append(
        model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    )

scores = pd.concat(fold_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_360.parquet')

Train period: 2018-09-20 - 2019-09-15 , val period 2019-09-15 - 2019-09-22 # train records 16626511 , # val records 229440
Train period: 2019-09-15 - 2020-09-09 , val period 2020-09-09 - 2020-09-16 # train records 14666261 , # val records 255241
{'train_period': 360, 'val_period': 7, 'stride': 360, 'start_val': datetime.date(2019, 9, 15), 'factors': 25, 'iterations': 3, 'regularization': 0.01, 'map12': 0.00318373785643781}
{'train_period': 360, 'val_period': 7, 'stride': 360, 'start_val': datetime.date(2019, 9, 15), 'factors': 25, 'iterations': 10, 'regularization': 0.01, 'map12': 0.003910377516036913}
{'train_period': 360, 'val_period': 7, 'stride': 360, 'start_val': datetime.date(2019, 9, 15), 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'map12': 0.003869535980415549}
{'train_period': 360, 'val_period': 7, 'stride': 360, 'start_val': datetime.date(2019, 9, 15), 'factors': 50, 'iterations': 3, 'regularization': 0.01, 'map12': 0.0034501277264461544}
{'train_period': 360, 'v

In [4]:
# Combine the per-window score files written by the CV cells into one frame.
# All files are read first and concatenated once, rather than growing the
# frame with a concat per iteration.
periods = [30,60,90,180,270,360]
total_scores = pd.concat(
    [pd.read_parquet(f"model/params/implicit_ALS_{period}.parquet") for period in periods],
    axis=0,
    ignore_index=True,
)

total_scores

Unnamed: 0,train_period,val_period,stride,start_val,factors,iterations,regularization,map12
0,30,7,30,2018-10-20,25.0,3.0,0.01,0.002594
1,30,7,30,2018-10-20,25.0,10.0,0.01,0.003268
2,30,7,30,2018-10-20,25.0,20.0,0.01,0.003290
3,30,7,30,2018-10-20,50.0,3.0,0.01,0.002737
4,30,7,30,2018-10-20,50.0,10.0,0.01,0.003610
...,...,...,...,...,...,...,...,...
619,360,7,360,2020-09-09,100.0,10.0,0.01,0.004300
620,360,7,360,2020-09-09,100.0,20.0,0.01,0.004220
621,360,7,360,2020-09-09,200.0,3.0,0.01,0.003846
622,360,7,360,2020-09-09,200.0,10.0,0.01,0.003826


In [5]:
# Rank the hyper-parameter combinations by their mean `map12` over all folds and windows.
# numeric_only is passed by keyword: the first positional parameter of GroupBy.mean is
# numeric_only, so the earlier positional 'map12' acted as a truthy flag, not as a
# column selector. The displayed result is unchanged.
total_scores.groupby(['factors','iterations','regularization']).mean(numeric_only=True).sort_values('map12',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_period,val_period,stride,map12
factors,iterations,regularization,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200.0,10.0,0.01,79.615385,7.0,79.615385,0.00453
200.0,20.0,0.01,79.615385,7.0,79.615385,0.004453
100.0,10.0,0.01,79.615385,7.0,79.615385,0.004218
100.0,20.0,0.01,79.615385,7.0,79.615385,0.004166
200.0,3.0,0.01,79.615385,7.0,79.615385,0.004094
50.0,10.0,0.01,79.615385,7.0,79.615385,0.00385
50.0,20.0,0.01,79.615385,7.0,79.615385,0.003828
100.0,3.0,0.01,79.615385,7.0,79.615385,0.003593
25.0,20.0,0.01,79.615385,7.0,79.615385,0.003431
25.0,10.0,0.01,79.615385,7.0,79.615385,0.003423


## 彙整

In [1]:

import pandas as pd
from dateutil.relativedelta import *
from model.TimeBasedCV import TimeBasedCV

# Load the transaction, customer and article tables stored under data/HM_parquet.
# Forward slashes are used (as in the other paths of this notebook) so the paths
# resolve on every OS; backslash-separated paths only work on Windows.
transactions = pd.read_parquet('data/HM_parquet/transactions_train.parquet')
customers = pd.read_parquet('data/HM_parquet/customers.parquet')
articles = pd.read_parquet('data/HM_parquet/articles.parquet')

# Project-local time-based cross-validation splitter, configured with freq='days'.
tscv = TimeBasedCV(freq='days')

In [9]:

# 測試將函數傳入函數
def test(model,train_data,val_data,train_period,stride):
    val_period = 7
    scores_one_month = pd.DataFrame(columns=["train_period","val_period","stride"])

    one_fold_scores = model(train_data, val_data, train_period, val_period, stride,start_val=0)
    scores_one_month = pd.concat([scores_one_month,one_fold_scores], axis=0 ,ignore_index=True)
    return scores_one_month

model = als()

# One-month smoke test, this time handing the training method to `test`.
train_one_month, val_one_parquet = (
    pd.read_parquet('data/train_one_month.parquet'),
    pd.read_parquet('data/val_one_month.parquet'),
)
train_one_month, val_one_parquet = (
    model.data_preprocess(train_one_month, customers, articles),
    model.data_preprocess(val_one_parquet, customers, articles),
)

# 30-day training window and 30-day stride.
scores_one_month = test(
    model.train_ALS,
    train_data=train_one_month,
    val_data=val_one_parquet,
    train_period=30,
    stride=30,
)
scores_one_month

{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 3, 'regularization': 0.01, 'map12': 0.003136429035660723}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 10, 'regularization': 0.01, 'map12': 0.004087349085210472}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'map12': 0.004186328448466547}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 50, 'iterations': 3, 'regularization': 0.01, 'map12': 0.0037242955926242703}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 50, 'iterations': 10, 'regularization': 0.01, 'map12': 0.004775751555119398}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 50, 'iterations': 20, 'regularization': 0.01, 'map12': 0.004815271571309318}
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 

Unnamed: 0,train_period,val_period,stride,start_val,factors,iterations,regularization,map12
0,30,7,30,0.0,25.0,3.0,0.01,0.003136
1,30,7,30,0.0,25.0,10.0,0.01,0.004087
2,30,7,30,0.0,25.0,20.0,0.01,0.004186
3,30,7,30,0.0,50.0,3.0,0.01,0.003724
4,30,7,30,0.0,50.0,10.0,0.01,0.004776
5,30,7,30,0.0,50.0,20.0,0.01,0.004815
6,30,7,30,0.0,100.0,3.0,0.01,0.004235
7,30,7,30,0.0,100.0,10.0,0.01,0.00526
8,30,7,30,0.0,100.0,20.0,0.01,0.005238
9,30,7,30,0.0,200.0,3.0,0.01,0.004667


In [2]:

def time_base_model(model, data, date_column, train_period, stride, val_period=7, splitter=None):
    """Run time-based cross-validation over ``data`` and collect every fold's scores.

    Parameters
    ----------
    model : callable
        Training function called once per fold as
        ``model(train_data, val_data, train_period, val_period, stride, start_val)``;
        must return a DataFrame of score rows.
    data : pandas.DataFrame
        Frame to split. Folds are sliced from this frame (not from a notebook global),
        so the indices produced by the splitter always match the rows being selected.
    date_column : str
        Name of the date column used for splitting and for the fold's start date.
    train_period, stride
        Window settings passed to the splitter and on to ``model``.
    val_period : int, default 7
        Validation window length (previously hard-coded to 7).
    splitter : optional
        Object exposing ``split(data, date_column=..., train_period=..., val_period=...,
        stride=..., show_progress=...)`` that yields ``(train_index, val_index)`` pairs.
        Defaults to the notebook-level ``tscv`` instance.

    Returns
    -------
    pandas.DataFrame
        All fold score rows stacked with a fresh index; ``train_period``,
        ``val_period`` and ``stride`` are the leading columns.
    """
    if splitter is None:
        splitter = tscv  # notebook-level TimeBasedCV instance

    index_output = splitter.split(
        data,
        date_column=date_column,
        train_period=train_period,
        val_period=val_period,
        stride=stride,
        show_progress=False,
    )

    # Collect per-fold frames and concatenate once at the end; the leading empty
    # frame keeps the three bookkeeping columns first (and present with zero folds).
    fold_scores = [pd.DataFrame(columns=["train_period", "val_period", "stride"])]

    for train_index, val_index in index_output:
        train_data = data.loc[train_index]
        # Renumber validation rows from 0 on a fresh frame instead of mutating a slice.
        val_data = data.loc[val_index].reset_index(drop=True)
        # Date of the first validation row, recorded as the fold's validation start.
        start_val = val_data[date_column].iloc[0]
        fold_scores.append(
            model(train_data, val_data, train_period, val_period, stride, start_val)
        )

    return pd.concat(fold_scores, axis=0, ignore_index=True)

In [4]:
# Run the time-based CV once per training-window length and store all scores together.
from model.ImplicitALS import ImplicitALS as als

model = als()

data = model.data_preprocess(transactions,customers,articles)
date_column='t_dat'
train_periods = [30,60,90,180,270,360]
# NOTE(review): the stride is fixed at 30 days for every window here, whereas the
# per-window cells earlier in the notebook used stride == train_period — confirm intent.
stride = 30

# Collect one score frame per training window and concatenate once at the end;
# the leading empty frame keeps the three bookkeeping columns first.
period_scores = [pd.DataFrame(columns=["train_period","val_period","stride"])]

for train_period in train_periods:
    # Pass the bound training method (time_base_model calls its `model` argument as a
    # function) and the preprocessed frame rather than the raw transactions.
    period_scores.append(
        time_base_model(model.train_ALS, data, date_column, train_period, stride)
    )

scores = pd.concat(period_scores, axis=0, ignore_index=True)

scores.to_parquet('model/params/implicit_ALS_total.parquet')

In [None]:
# NOTE(review): bare integer literal with no assignment — looks like leftover scratch input; this cell can probably be deleted.
12313544564646