# surprise.SVD

In [5]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp,SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict
import numpy as np
import ml_metrics as metrics

class surpriseSVD():
    def __init__(self):
        self = self

    def get_top_n(self, predictions, n=12):
        """Return the top-N recommendation for each user from a set of predictions.
        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.
        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        """

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    def get_set(self,df):
        reader = Reader(rating_scale=(1, 500))
        data_set = Dataset.load_from_df(df[['customer_id','article_id','rating']], reader)
        return data_set

    def get_rating_set(self,df):
        rating = df[['customer_id','article_id','price']].groupby(['customer_id','article_id']).count().reset_index()
        rating.columns = ['customer_id','article_id','rating']
        rating_set = self.get_set(rating)
        return rating_set


    def train_SVD(self, train_data, test_data, train_period, val_period, stride, start_val):

        ## 讀取評分資料為surprise可以訓練的格式
        trainset = self.get_rating_set(train_data)
        testset = self.get_rating_set(test_data)

        ## rmse 需要的資料
        testset2 = [testset.df.loc[i].to_list() for i in range(len(testset.df))]

        ## map@k testing 需要產的資料
        test_data.loc[:,'rating']=0
        test_processed = self.get_set(test_data)
        NA, test2 = train_test_split(test_processed, test_size=1.0)

        # ======= 消費者的實際購買清單 =======
        test_data['article_id'] = test_data['article_id'].astype('str')
        test_uni = test_data.drop_duplicates(subset=['customer_id', 'article_id'], keep='first')
        buy_n = test_uni[['customer_id','article_id']].groupby('customer_id')['article_id'].apply(list).to_dict()

        cust_actual_list = []
        for uid, user_ratings in buy_n.items():
            cust_pred_tuple = (uid, [iid for iid in user_ratings])
            cust_actual_list.append(cust_pred_tuple)

        # ======= 訓練 SVD 模型 =======
        scores = pd.DataFrame()
        for factors in [25,50,100,150,200]:
            for iterations in [20,30,40,50]:
        # for factors in [25]:
        #     for iterations in [20]:
                for regularization in [0.01]:

                    algo = SVD(n_factors = factors,
                            n_epochs=iterations,
                            reg_all=regularization,
                            random_state=42)

                    # 訓練模型
                    algo.fit(trainset.build_full_trainset())
                    # step3 - testing(train_test_split way)
                    
                    ##### rmse #####
                    predictions = algo.test(testset2)
                    rmse = accuracy.rmse(predictions)

                    ##### map@k #####
                    predictions_map = algo.test(test2)
                    # est = [i.est for i in predictions_map] 

                    ##  消費者的預測清單 
                    top_n = self.get_top_n(predictions=predictions_map, n=12)

                    cust_pred_list = []
                    for uid, user_ratings in top_n.items():
                        cust_pred_tuple = (uid, [str(iid) for (iid, _) in user_ratings])
                        cust_pred_list.append(cust_pred_tuple)

                    final_list = list(zip(cust_actual_list, cust_pred_list))

                    # map@k計算 
                    mapk_list = []
                    for i in range(len(final_list)):
                        map_k = metrics.mapk([final_list[i][0][1]],[final_list[i][1][1]],12)
                        mapk_list.append(map_k)

                        # def Average(lst):
                        #     return sum(lst) / len(lst)

                        # map_k = Average(mapk_list)

                    map_k = sum(mapk_list)/len(mapk_list)

                    newRow = {
                            'train_period':train_period, 
                            'val_period':val_period, 
                            'stride':stride, 
                            'start_val':start_val,
                            # =====填寫參數名稱===============
                            'factors':factors, 
                            'iterations':iterations, 
                            'regularization':regularization, 
                            # ===============================
                            'rmse':rmse,
                            'map@k':map_k
                            }
                    newDF = pd.DataFrame([newRow])
                    scores = pd.concat([scores, newDF], axis=0 ,ignore_index=True)
                    print(newRow)

        return scores

## 測試一個月的資料

In [3]:
# 測試一個月的資料
train_data = pd.read_parquet('../data/HM_parquet/train_one_month.parquet')
test_data = pd.read_parquet('../data/HM_parquet/val_one_month.parquet')

In [18]:

train_period, val_period, stride = 30, 7, 30
start_val = 0
scores_one_month = pd.DataFrame(columns=["train_period","val_period","stride"])

model = surpriseSVD()
one_fold_scores = model.train_SVD(train_data, test_data, train_period, val_period, stride,start_val=0)
# scores_one_month = pd.concat([scores_one_month,one_fold_scores], axis=0 ,ignore_index=True)


RMSE: 0.4437
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': 0, 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.4436772841441815, 'map@k': 0.0007862185238658419}


In [20]:
# one_fold_scores.to_parquet('../model/params/params_SVD_stride30/surprise_SVD_train_test.parquet')


## 分別取不同時間段

In [2]:

import pandas as pd
from dateutil.relativedelta import *
from TimeBasedCV import TimeBasedCV
from surpriseSVD import surpriseSVD as svd

model = svd()


In [5]:
transactions = pd.read_parquet('../data/HM_parquet/transactions_train.parquet')
# customers = pd.read_parquet('../data/HM_parquet/customers.parquet')
# articles = pd.read_parquet('../data/HM_parquet/articles.parquet')

tscv = TimeBasedCV(freq='days')
    

In [3]:
def time_split_model(train_period):
    # 做 time based split
    test_period, stride = 7, 30
    index_output = tscv.split(transactions, date_column='t_dat', train_period=train_period, test_period=test_period, stride=stride,show_progress=False)

    # 做 time based CV
    scores = pd.DataFrame(columns=["train_period","val_period","stride"])

    for train_index, val_index in index_output:
        train_data = transactions.loc[train_index]
        val_data = transactions.loc[val_index]
        # 取得val開始日期
        val_data.reset_index(inplace=True, drop=True)
        start_val = val_data['t_dat'][0]
        # 呼叫訓練模型的function
        one_fold_scores = model.train_SVD(train_data, val_data, train_period, test_period, stride, start_val)
        scores = pd.concat([scores,one_fold_scores], axis=0 ,ignore_index=True)

    scores.to_parquet(f'../model/params/params_SVD_stride30/surprise_SVD_train{train_period}.parquet')
    print(f"完成存檔: surprise_SVD_train{train_period}.parquet")

In [4]:
time_split_model(30)

RMSE: 0.4528
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.45276431259391275, 'map@k': 0.0006913308508440936}
RMSE: 0.4548
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.4548128052351651, 'map@k': 0.0006876213180254817}
RMSE: 0.4569
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.4568824916272082, 'map@k': 0.0006916635874972759}
RMSE: 0.4591
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.45908533838776755, 'map@k': 0.0006909250263884655}
RMSE: 0.4533
{'train_period': 30, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 

In [5]:
time_split_model(60)

RMSE: 0.4531
{'train_period': 60, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.4531398644334987, 'map@k': 0.0007722399090764102}
RMSE: 0.4565
{'train_period': 60, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.4565305517177666, 'map@k': 0.0007746446786606745}
RMSE: 0.4597
{'train_period': 60, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.45971192531975374, 'map@k': 0.0007819734085723183}
RMSE: 0.4629
{'train_period': 60, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), 'factors': 25, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.4629445644832729, 'map@k': 0.0007740209429489192}
RMSE: 0.4546
{'train_period': 60, 'val_period': 7, 'stride': 30, 'start_val': datetime.date(2020, 9, 15), '

In [None]:
time_split_model(90)

In [None]:
time_split_model(180)

In [None]:
time_split_model(270)

In [None]:
time_split_model(360)

In [None]:
# 合併所有dataframe
periods = [30,60,90,180,270,360]
total_scores = pd.DataFrame()

for period in periods:
    perios_socres = pd.read_parquet(f'model/params/params_SVD_stride30/surprise_SVD_train{period}.parquet')
    total_scores = pd.concat([total_scores,perios_socres],axis=0,ignore_index=True)
    
total_scores

In [None]:
# 看最佳的參數組合
total_scores.groupby(['train_period','factors','iterations','regularization']).mean('map12').sort_values('map12',ascending=False)