In [12]:

import pandas as pd
from dateutil.relativedelta import *
from TimeBasedCV import TimeBasedCV


In [13]:
transactions = pd.read_parquet('../data/HM_parquet/transactions_train.parquet')
customers = pd.read_parquet('../data/HM_parquet/customers.parquet')
articles = pd.read_parquet('../data/HM_parquet/articles.parquet')

tscv = TimeBasedCV(freq='days')
    

# surprise.SVD

### 前置作業(安裝導入套件)

In [14]:
import pandas as pd
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp,SVD
from surprise import accuracy
from surprise.model_selection import cross_validate,GridSearchCV,train_test_split
from collections import defaultdict
import numpy as np
import ml_metrics as metrics

def get_top_n(predictions, n=12):
    """Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


def get_rating(df):
    rating = df[['customer_id','article_id','price']].groupby(['customer_id','article_id']).count().reset_index()
    rating.columns = ['customer_id','article_id','rating']
    return rating

def data_preprocess(data):
    data_rating = get_rating(data)
    reader = Reader(rating_scale=(1, 500))
    data_set = Dataset.load_from_df(data_rating[['customer_id','article_id','rating']], reader)
    return data_set

## 測試一個月的資料

In [15]:
# 測試一個月的資料
train_one_month = pd.read_parquet('../data/HM_parquet/train_one_month.parquet')
val_one_month = pd.read_parquet('../data/HM_parquet/val_one_month.parquet')

In [16]:
## 讀取評分資料為surprise可以訓練的格式
trainset = data_preprocess(train_one_month)
testset = data_preprocess(val_one_month)

In [17]:
## 模型訓練與驗證(svd)
scores = pd.DataFrame()
for factors in [25,50,100,150,200]:
    for iterations in [20,30,40,50]:
        for regularization in [0.01]:

            algo = SVD(n_factors = factors,
                       n_epochs=iterations,
                       reg_all=regularization,
                       random_state=42)

            # 訓練模型
            algo.fit(trainset.build_full_trainset())
            # step3 - testing(train_test_split way)
            
            ##### rmse #####
            testset2 = [testset.df.loc[i].to_list() for i in range(len(testset.df))]
            predictions = algo.test(testset2)
            rmse = accuracy.rmse(predictions)

            ## map@k testing需要產的資料
            val_one_month.loc[:,'rating']=0
            test_processed = Dataset.load_from_df(val_one_month[['customer_id','article_id','rating']], reader) 
            NA, test2 = train_test_split(test_processed, test_size=1.0)

            ##### map@k #####
            predictions_map = algo.test(test2)
            # est = [i.est for i in predictions_map] 

            ## ======= 消費者的預測清單 =======
            top_n = get_top_n(predictions_map, n=12)

            cust_pred_list = []
            for uid, user_ratings in top_n.items():
                cust_pred_tuple = (uid, [str(iid) for (iid, _) in user_ratings])
                cust_pred_list.append(cust_pred_tuple)
            
            # ======= 消費者的實際購買清單 =======
            val_one_month['article_id'] = val_one_month['article_id'].astype('str')
            test_uni = val_one_month.drop_duplicates(subset=['customer_id', 'article_id'], keep='first')
            buy_n = test_uni[['customer_id','article_id']].groupby('customer_id')['article_id'].apply(list).to_dict()

            cust_actual_list = []
            for uid, user_ratings in buy_n.items():
                cust_pred_tuple = (uid, [iid for iid in user_ratings])
                cust_actual_list.append(cust_pred_tuple)

            final_list = list(zip(cust_actual_list, cust_pred_list))
            


            #map@k計算 
            mapk_list = []
            for i in range(len(final_list)):
              map_k = metrics.mapk([final_list[i][0][1]],[final_list[i][1][1]],12)
              mapk_list.append(map_k)

            def Average(lst):
                return sum(lst) / len(lst)

            map_k = Average(mapk_list)

            newRow = {
                        # =====填寫參數名稱===============
                        'factors':factors, 
                        'iterations':iterations, 
                        'regularization':regularization, 
                        # ===============================
                        'rmse':rmse,
                        'map@k':map_k
                        }
            print(newRow)
            newDF = pd.DataFrame([newRow])
            scores = pd.concat([scores, newDF], axis=0 ,ignore_index=True)

scores

RMSE: 0.4548
{'factors': 25, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.4548226270871537, 'map@k': 0.0006934235506988925}
RMSE: 0.4567
{'factors': 25, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.45673652012935334, 'map@k': 0.0007755140243779781}
RMSE: 0.4587
{'factors': 25, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.4586807454781601, 'map@k': 0.0006906737139759364}
RMSE: 0.4607
{'factors': 25, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.4607378817159362, 'map@k': 0.0007664917950177387}
RMSE: 0.4554
{'factors': 50, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.45544503589409663, 'map@k': 0.0005892022967062667}
RMSE: 0.4573
{'factors': 50, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.45732918412556306, 'map@k': 0.0007298004467408034}


KeyboardInterrupt: 

In [None]:

train_period, val_period, stride = 30, 7, 30
scores_one_month = pd.DataFrame(columns=["train_period","val_period","stride"])

one_fold_scores = model.train_ALS(train_one_month, val_one_month, train_period, val_period, stride,start_val=0)
scores_one_month = pd.concat([scores_one_month,one_fold_scores], axis=0 ,ignore_index=True)

scores_one_month


In [None]:
scores_one_month

## 分別取不同時間段

In [None]:

import pandas as pd
from dateutil.relativedelta import *
from model.TimeBasedCV import TimeBasedCV
from model.ImplicitALS import ImplicitALS as als

model = als()


In [None]:
transactions = pd.read_parquet('data\\HM_parquet\\transactions_train.parquet')
customers = pd.read_parquet('data\\HM_parquet\\customers.parquet')
articles = pd.read_parquet('data\\HM_parquet\\articles.parquet')

tscv = TimeBasedCV(freq='days')
    

In [None]:
transactions = model.data_preprocess(transactions,customers,articles)

In [None]:
# 做 time based split 30天
train_period, val_period, stride = 30, 7, 30
index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, val_period=val_period, stride=stride,show_progress=True)

# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

scores.to_parquet('model/params/params_stride30/implicit_ALS_train30.parquet')

In [None]:
# 做 time based split 60天
train_period, val_period, stride = 60, 7, 30
index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, val_period=val_period, stride=stride,show_progress=True)

# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

# this is the average accuracy over all folds
# average_r2score = np.mean(scores)

scores.to_parquet('model/params/params_stride30/implicit_ALS_train60.parquet')

In [None]:
# 做 time based split 90天
train_period, val_period, stride = 90, 7, 30
index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, val_period=val_period, stride=stride,show_progress=True)

# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

# this is the average accuracy over all folds
# average_r2score = np.mean(scores)

scores.to_parquet('model/params/params_stride30/implicit_ALS_train90.parquet')

In [None]:
# 做 time based split 180天
train_period, val_period, stride = 180, 7, 30
index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, val_period=val_period, stride=stride,show_progress=True)

# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

# this is the average accuracy over all folds
# average_r2score = np.mean(scores)

scores.to_parquet('model/params/params_stride30/implicit_ALS_train180.parquet')

In [None]:
# 做 time based split 270天
train_period, val_period, stride = 270, 7, 30
index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, val_period=val_period, stride=stride,show_progress=True)

# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

# this is the average accuracy over all folds
# average_r2score = np.mean(scores)

scores.to_parquet('model/params/params_stride30/implicit_ALS_train270.parquet')

In [None]:
# 做 time based split 360天
train_period, val_period, stride = 360, 7, 30
index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, val_period=val_period, stride=stride,show_progress=True)

# 做 time based CV
scores = pd.DataFrame(columns=["train_period","val_period","stride"])

for train_index, val_index in index_output:
    train_data = transactions.loc[train_index]
    val_data = transactions.loc[val_index]
    # 取得val開始日期
    val_data.reset_index(inplace=True, drop=True)
    start_val = val_data['t_dat'][0]
    # 呼叫訓練模型的function
    one_fold_scores = model.train_ALS(train_data, val_data, train_period, val_period, stride, start_val)
    scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

# this is the average accuracy over all folds
# average_r2score = np.mean(scores)

scores.to_parquet('model/params/params_stride30/implicit_ALS_train360.parquet')

In [None]:
# 合併所有dataframe
periods = [30,60,90,180,270,360]
total_scores = pd.DataFrame()

for period in periods:
    perios_socres = pd.read_parquet(f"model/params/params_stride30/implicit_ALS_train{period}.parquet")
    total_scores = pd.concat([total_scores,perios_socres],axis=0,ignore_index=True)
    
total_scores

In [None]:
# 看最佳的參數組合
total_scores.groupby(['train_period','factors','iterations','regularization']).mean('map12').sort_values('map12',ascending=False)

## 彙整

In [None]:

import pandas as pd
from dateutil.relativedelta import *
from model.TimeBasedCV import TimeBasedCV

transactions = pd.read_parquet('data\\HM_parquet\\transactions_train.parquet')
customers = pd.read_parquet('data\\HM_parquet\\customers.parquet')
articles = pd.read_parquet('data\\HM_parquet\\articles.parquet')

tscv = TimeBasedCV(freq='days')

In [None]:

# 測試將函數傳入函數
def test(model,train_data,val_data,train_period,stride):
    val_period = 7
    scores_one_month = pd.DataFrame(columns=["train_period","val_period","stride"])

    one_fold_scores = model(train_data, val_data, train_period, val_period, stride,start_val=0)
    scores_one_month = pd.concat([scores_one_month,one_fold_scores], axis=0 ,ignore_index=True)
    return scores_one_month

model = als()

# 測試一個月的資料
# transactions = pd.read_parquet('data\\HM_parquet\\transactions_train.parquet')
train_one_month = pd.read_parquet('data/train_one_month.parquet')
val_one_parquet = pd.read_parquet('data/val_one_month.parquet')

train_one_month = model.data_preprocess(train_one_month,customers,articles)
val_one_parquet = model.data_preprocess(val_one_parquet,customers,articles)

scores_one_month = test(model.train_ALS,train_one_month,val_one_parquet,30,30)
scores_one_month

In [None]:

def time_base_model(model,data,date_column,train_period,stride):
    # time based split
    val_period=7
    index_output = tscv.split(data, date_column, train_period, val_period, stride, show_progress=False)

    # 做 time based CV
    scores = pd.DataFrame(columns=["train_period","val_period","stride"])

    for train_index, val_index in index_output:
        train_data = transactions.loc[train_index]
        val_data = transactions.loc[val_index]
        # 取得val開始日期
        val_data.reset_index(inplace=True, drop=True)
        start_val = val_data[date_column][0]
        # 呼叫訓練模型的function
        one_fold_scores = model(train_data, val_data, train_period, val_period, stride, start_val)
        scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)
        
    return scores

In [None]:
# 用迴圈跑不同時間段
from model.ImplicitALS import ImplicitALS as als

model = als()

data = model.data_preprocess(transactions,customers,articles)
date_column='t_dat'
train_periods = [30,60,90,180,270,360]
stride = 30

# 做 time based split
for train_period in train_periods:
    one_period_scores = time_base_model(model,transactions,date_column,train_period,stride)
    one_period_scores.to_parquet(f'model/params/params_stride30/implicit_ALS_train{train_period}.parquet')




In [None]:
666666666