# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json
import numpy as np
import re 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from itertools import combinations


def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file; each line must hold one JSON document.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.
    """
    # The context manager closes the archive once the generator is exhausted
    # or closed; previously the handle was opened and never released.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are labelled 0, 1, 2, ... in the order they are read, and each
    record becomes one row (``orient='index'``).
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

## 載入資料

In [10]:
# Download the All_Beauty ratings CSV and the matching product-metadata archive.
# NOTE(review): the loading cell reads both files from ../content/ — confirm the
# downloaded files end up there (the recorded output shows wget was unavailable).
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

'wget' 不是內部或外部命令、可執行的程式或批次檔。
'wget' 不是內部或外部命令、可執行的程式或批次檔。


In [2]:
# Load the product metadata (gzip-compressed JSON lines) and the ratings CSV.
# The CSV is read without a header row, so the column names are supplied here.
metadata = getDF('../content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('../content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [3]:
# Convert the epoch-seconds review time into a datetime column used for date-based splits.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

# EDA

In [4]:
# Rows that are exact duplicates of another row (keep=False marks every copy)
df = ratings[ratings.duplicated(keep = False)]
print(df.shape)
# Drop the exact duplicate rows
ratings = ratings.drop_duplicates()
# ratings.columns

(17994, 5)


In [5]:
# The same user may still have rated the same item several times:
# count the rows per (reviewerID, asin) pair, largest counts first.
ratings.groupby(['reviewerID','asin'],as_index=False).agg(count=('reviewerID','count')).sort_values(['count'], ascending = False)

Unnamed: 0,reviewerID,asin,count
38281,A1EGCED01USBA9,B00W259T7G,7
5400,A1210QJT54O8T0,B00W259T7G,5
358767,AYWLGJPC5O7AQ,B00W259T7G,4
274461,A3VBXQKRM7A4JR,B00W259T7G,4
176411,A2UEIN7SIPZFRP,B00W259T7G,4
...,...,...,...
120625,A29ET8324U5H68,B00AV2YL98,1
120624,A29ESW6I6L1C11,B0051S94NA,1
120623,A29ESUU8CY73TF,B01CIVDSCA,1
120622,A29ES620PVEXCV,B014FXGIYO,1


In [6]:
# When a user rated the same product more than once, sort newest-first and
# keep only the most recent rating for each (reviewerID, asin) pair.
ratings = (ratings
     .sort_values("unixReviewTime", ascending=False)
     .groupby(['reviewerID', 'asin']).head(1)
)

In [7]:
# Summary statistics of the number of ratings (purchases) per customer
ratings['reviewerID'].value_counts().describe()

count    324038.000000
mean          1.115934
std           0.434325
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          24.000000
Name: reviewerID, dtype: float64

In [8]:
# Upper quantiles of the number of ratings per customer
ratings['reviewerID'].value_counts().quantile(q = [0.75,0.8,0.85,0.9,0.95,0.98, 0.99])

0.75    1.0
0.80    1.0
0.85    1.0
0.90    1.0
0.95    2.0
0.98    2.0
0.99    3.0
Name: reviewerID, dtype: float64

平均每位使用者的評價次數約為 1.12 次，Q3 與 PR90 皆為 1 次（PR99 才達到 3 次）；
因此若排除評價次數少於 3 次的使用者，留下的使用者不到 5%，訓練樣本會大幅減少。

In [9]:
# Preview the first rows of the product metadata
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [10]:
# Preview the first rows of the de-duplicated ratings (newest first after the sort above)
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
368861,B01GKCUEPM,AE9BNPJIYXPBN,3.0,1538438400,2018-10-02
368798,B01GJQXRLC,A3ORA680FWE8SW,5.0,1538352000,2018-10-01
368687,B01GIYE88G,A3RTH0S9XVBPVK,5.0,1538352000,2018-10-01
369982,B01GZWUSYY,A1M8YMS915ZK0B,1.0,1538265600,2018-09-30
370903,B01HC6G4D6,A1RAXJN2Q4V20X,4.0,1538265600,2018-09-30


## 資料整理

## 資料切分

In [11]:
# Default date boundaries for the train/test split:
# training covers [training_start_date, training_end_date),
# testing covers [training_end_date, testing_end_date] (see split_data).
training_start_date = '2018-01-01'
training_end_date = '2018-09-01'
testing_end_date = '2018-09-30'

def split_data(training_end_date, testing_end_date, training_start_date = None):
    """Split the notebook-level ``ratings`` frame into training and testing sets by date.

    Parameters
    ----------
    training_end_date : str or datetime-like
        Exclusive upper bound of the training window and inclusive lower bound
        of the testing window.
    testing_end_date : str or datetime-like
        Inclusive upper bound of the testing window.
    training_start_date : str or datetime-like, optional
        Inclusive lower bound of the training window. When ``None`` every
        rating before ``training_end_date`` is used for training.

    Returns
    -------
    ratings_trainings : pandas.DataFrame
        Rows of ``ratings`` that fall in the training window.
    ratings_testings : pandas.DataFrame
        Rows of ``ratings`` that fall in the testing window.
    ratings_testings_by_user : dict
        Maps each reviewerID in the testing window to the list of asins that
        reviewer rated there.
    users : list
        The reviewerIDs in ``ratings_testings_by_user``.
    """
    review_dates = ratings['DATE']

    # The testing window is the same whether or not a training start is given,
    # so it is computed once instead of being repeated in each branch.
    in_testing = (review_dates >= training_end_date) & (review_dates <= testing_end_date)

    in_training = review_dates < training_end_date
    if training_start_date is not None:
        in_training = in_training & (review_dates >= training_start_date)

    ratings_trainings = ratings[in_training]
    ratings_testings = ratings[in_testing]

    # Only the asin column is needed per reviewer, so only that column is
    # grouped (the other columns used to be aggregated into lists and dropped).
    ratings_testings_by_user = {
        reviewer: asins.tolist()
        for reviewer, asins in ratings_testings.groupby('reviewerID')['asin']
    }
    users = list(ratings_testings_by_user.keys())

    return ratings_trainings, ratings_testings, ratings_testings_by_user, users

# ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = training_start_date, training_end_date = training_end_date, testing_end_date = testing_end_date)


## 方法一：User-based Collaborative Filtering

## 資料清洗

In [12]:
# Inspect the number of distinct values per column.
# metadata.nunique(axis = 0, dropna = True) raises an error because some columns hold lists, so cast to str first.
print(metadata.astype(str).nunique())
# category / fit / tech2 / main_cat are empty (a single distinct value each)
# tech1 / date / feature carry little information or mostly repeated values
# Image links are not used in this analysis
metadata = metadata[['asin', 'description', 'title', 'also_buy', 'brand', 'rank', 'also_view', 'price', 'details']]
metadata = metadata.drop_duplicates('asin')

category               1
tech1                 11
description        13751
fit                    1
title              32300
also_buy            6482
tech2                  1
brand               7863
feature              230
rank               31949
also_view           7894
details            32329
main_cat               1
similar_item        1301
date                  20
price               3530
asin               32488
imageURL           15737
imageURLHighRes    15737
dtype: int64


In [13]:
# also_buy: explode the also_buy lists, drop missing values and count each (asin, also_buy) combination
df_also_buy = metadata[['asin', 'also_buy']].apply(pd.Series.explode).reset_index(drop=True).dropna().value_counts().reset_index(name = 'also_buy_counts')
# also_view: same processing for the also_view lists
df_also_view = metadata[['asin', 'also_view']].apply(pd.Series.explode).reset_index(drop=True).dropna().value_counts().reset_index(name = 'also_view_counts')

def rulebase_commander(training_data, user, k):
    """Build the rule-based candidate lists for one user.

    Reads the notebook-level ``df_also_buy`` and ``df_also_view`` tables.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Ratings with at least ``asin``, ``reviewerID`` and ``overall`` columns.
    user : hashable
        reviewerID to build candidates for.
    k : int
        Maximum length of the ``popularity_and_have_buy`` list.

    Returns
    -------
    tuple of four lists
        ``(also_buy, also_view, popularity_and_have_buy, popularity)`` where
        ``popularity`` holds up to 100 of the most-rated asins in
        ``training_data`` and the other three lists are restricted to asins in
        ``popularity``.
    """
    # Up to 100 most frequently rated products, most frequent first.
    asin_frequency = training_data["asin"].value_counts()
    popularity = asin_frequency.index.to_list()[0:100]
    popular_lookup = set(popularity)

    # Mean rating this user gave to each product, highest first.
    own_reviews = training_data[training_data['reviewerID'] == user]
    own_mean_rating = own_reviews.groupby(['asin']).agg({'overall': 'mean'}).reset_index()
    user_review_rank = own_mean_rating.sort_values(['overall'], ascending = False)

    # 1. Products the user rated highest that are also popular (recommended again).
    best_score = user_review_rank.overall.max()
    favourites = user_review_rank[user_review_rank['overall'] >= best_score]
    popularity_and_have_buy = [asin for asin in favourites.asin if asin in popular_lookup][0:k]

    rated_asins = user_review_rank.asin

    # 2. Popular products listed as "also bought" for anything the user rated.
    bought_together = df_also_buy[df_also_buy.asin.isin(rated_asins)]
    also_buy = [asin for asin in bought_together.also_buy if asin in popular_lookup]

    # 3. Popular products listed as "also viewed" for anything the user rated.
    viewed_together = df_also_view[df_also_view.asin.isin(rated_asins)]
    also_view = [asin for asin in viewed_together.also_view if asin in popular_lookup]

    return also_buy, also_view, popularity_and_have_buy, popularity

# also_buy, also_view, popularity_and_have_buy, popularity = rulebase_commander(ratings_trainings, users[0], k = 10)


## 結果評估

In [14]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    '''
    Score recommendations as (number of hits) / (number of test ratings).

    * ratings_testings_by_user: dict mapping reviewerID -> list of asins the
      user actually rated in the testing window
    * ratings_by_user: dict mapping reviewerID -> list of recommended asins
    * method: str, unused; kept for backward compatibility
    * returns score: float; 0.0 when there are no test ratings

    The denominator is derived from ``ratings_testings_by_user`` itself. It
    used to be ``len(ratings_testings)`` read from a notebook-level variable,
    which silently gave a wrong score whenever that global did not belong to
    the dict being scored. The two agree when the dict holds one asin per row
    of the test frame.
    '''
    truth = ratings_testings_by_user or {}
    predicted = ratings_by_user or {}

    relevant_total = sum(len(asins) for asins in truth.values())
    if relevant_total == 0:
        # Nothing to predict: report 0.0 instead of dividing by zero.
        return 0.0

    total = 0
    for user, asins in truth.items():
        if user in predicted:
            total += len(set(predicted[user]) & set(asins))

    score = total / relevant_total
    return score


In [15]:
def cf_user_based_recommender(training_data, users=[], add_cf_commender = True, add_rule_based_commender = True, k=10, user_rating_threshold = 0):
    """Recommend up to ``k`` items per user with user-based collaborative filtering.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Ratings with ``reviewerID``, ``asin`` and ``overall`` columns.
    users : list
        reviewerIDs to produce recommendations for (not modified).
    add_cf_commender : bool
        When True, build user-user cosine similarities and recommend items
        rated by the most similar users.
    add_rule_based_commender : bool
        When True, append the lists from ``rulebase_commander`` after the CF
        items and truncate the merged list to ``k``.
    k : int
        Number of recommendations per user.
    user_rating_threshold : int
        When non-zero, users with fewer ratings than this are excluded from
        the similarity computation.

    Returns
    -------
    dict
        Maps every entry of ``users`` to a list of recommended asins.
    """
    if add_cf_commender:
        # user -> {item: rating}; a later row for the same (user, item) pair
        # overwrites an earlier one.
        user_to_items = defaultdict(dict)
        for user, item, rating in zip(training_data['reviewerID'], training_data['asin'], training_data['overall']):
            user_to_items[user][item] = float(rating)
        print("total users before filtering: ", len(user_to_items))

        # Optionally drop users with too few ratings.
        if user_rating_threshold:
            for user in list(user_to_items):
                if len(user_to_items[user]) < user_rating_threshold:
                    del user_to_items[user]
        print("total users after filtering: ", len(user_to_items))

        # Transposed view (item -> {user: rating}) so that user pairs are only
        # generated for items both of them rated.
        item_to_users = defaultdict(dict)
        for user, items in user_to_items.items():
            for item, rating in items.items():
                item_to_users[item][user] = rating

        # pre_user_similarity[u][v] accumulates [sum(xy), sum(xx), sum(yy)]
        # over the items u and v both rated. The xx/yy slots are not swapped
        # for the reverse direction, which is harmless because only their
        # product is used below.
        pre_user_similarity = defaultdict(lambda: defaultdict(lambda: [0, 0, 0]))
        for user_ratings in item_to_users.values():
            if len(user_ratings) < 2:
                continue
            for user1, user2 in combinations(user_ratings.keys(), 2):
                xy = user_ratings[user1] * user_ratings[user2]
                xx = user_ratings[user1] ** 2
                yy = user_ratings[user2] ** 2
                for accumulator in (pre_user_similarity[user1][user2], pre_user_similarity[user2][user1]):
                    accumulator[0] += xy
                    accumulator[1] += xx
                    accumulator[2] += yy

        # Cosine similarity per neighbour, most similar first. list.sort is
        # stable, so ties keep their insertion order exactly as the previous
        # hand-written insertion loop did.
        user_similarity = {}
        for src_user, neighbours in pre_user_similarity.items():
            user_similarity_order = []
            for dst_user, (xy, xx, yy) in neighbours.items():
                div = (xx * yy) ** 0.5
                if div == 0:
                    continue
                similarity = xy / div
                if similarity < 0:
                    continue
                user_similarity_order.append((dst_user, similarity))
            user_similarity_order.sort(key=lambda pair: pair[1], reverse=True)
            user_similarity[src_user] = user_similarity_order

    recommendation = {}
    for user in users:
        recommended_items = []

        if add_cf_commender and user in user_similarity:
            recommended_items_set = set()
            user_have_rated = set(user_to_items[user])
            for sim_user, _ in user_similarity[user]:
                # Highest-rated items of the similar user come first. The sort
                # used to be ascending, which recommended the items the
                # neighbour liked least.
                items_from_sim_user = sorted(user_to_items[sim_user].items(), key=lambda pair: pair[1], reverse=True)
                for item, _ in items_from_sim_user:
                    # Skip items the user already rated or that are already queued.
                    if item not in user_have_rated and item not in recommended_items_set:
                        recommended_items.append(item)
                        recommended_items_set.add(item)
                    if len(recommended_items) >= k:
                        break
                else:
                    # Inner loop finished without reaching k: try the next neighbour.
                    continue
                break

        # Optionally extend with the rule-based candidates.
        if add_rule_based_commender:
            also_buy, also_view, popularity_and_have_buy, popularity = rulebase_commander(training_data, user, k=k)
            # Merge every candidate list in priority order and keep the top k.
            # NOTE(review): the merged list is not de-duplicated, so the same
            # asin can occupy more than one of the k slots — confirm intended.
            user_recommendations = recommended_items + popularity_and_have_buy + also_buy + also_view + popularity
            recommendation[user] = user_recommendations[0:k]
        else:
            recommendation[user] = recommended_items

    return recommendation

### 推薦系統 評分

In [18]:
# Rule-based recommendations only (collaborative filtering disabled)
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_user_based_recommender(ratings_trainings, users, add_cf_commender=False, add_rule_based_commender=True, k=10, user_rating_threshold = 3)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'rule based Method Score: {method}')
# User-based CF only (rule-based candidates disabled)
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_user_based_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=False, k=10, user_rating_threshold = 3)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf_user_based Method Score: {method}')
# User-based CF first, then the rule-based candidates
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_user_based_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=True, k=10, user_rating_threshold = 3)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf_user_based + rule based Method Score: {method}')

rule based Method Score: 0.13389830508474576
total users before filtering:  6912
total users after filtering:  4
cf_user_based Method Score: 0.0
total users before filtering:  6912
total users after filtering:  4
cf_user_based + rule based Method Score: 0.13389830508474576


# 方法二：Item-based collaborative filtering 

In [19]:
def cf_item_based_recommender(training_data = None, users=[], add_cf_commender = True, add_rule_based_commender = True, k = 10):
    """Recommend up to ``k`` items per user with item-based collaborative filtering.

    Parameters
    ----------
    training_data : pandas.DataFrame, optional
        Ratings with ``reviewerID``, ``asin`` and ``overall`` columns. When
        omitted, the notebook-level ``ratings_trainings`` is looked up at call
        time. The default used to be bound when the function was defined,
        which froze whatever frame existed then and failed on a fresh kernel.
    users : list
        reviewerIDs to produce recommendations for (not modified).
    add_cf_commender : bool
        When True, recommend the items most similar to those the user rated.
    add_rule_based_commender : bool
        When True, append the lists from ``rulebase_commander`` after the CF
        items and truncate the merged list to ``k``.
    k : int
        Number of recommendations per user.

    Returns
    -------
    dict
        Maps every entry of ``users`` to a list of recommended asins.
    """
    if training_data is None:
        training_data = ratings_trainings

    # item -> {user: rating}; a later row for the same pair overwrites an earlier one.
    item_to_users = defaultdict(dict)
    for user, item, rating in zip(training_data['reviewerID'], training_data['asin'], training_data['overall']):
        item_to_users[item][user] = float(rating)

    print("data converted")

    # Transposed view (user -> {item: rating}) so that item pairs are only
    # generated for items rated by the same user.
    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
        for user, rating in rating_users.items():
            user_to_items[user][item] = rating

    print("data inverted")

    # pre_item_similarity[i][j] accumulates [sum(xy), sum(xx), sum(yy)] over
    # the users who rated both items; only the product of the last two slots
    # is used, so sharing them between both directions is harmless.
    pre_item_similarity = defaultdict(lambda: defaultdict(lambda: [0, 0, 0]))
    for items in user_to_items.values():
        if len(items) < 2:
            continue
        for i1, i2 in combinations(items.keys(), 2):
            xy = items[i1] * items[i2]
            xx = items[i1] ** 2
            yy = items[i2] ** 2
            for accumulator in (pre_item_similarity[i1][i2], pre_item_similarity[i2][i1]):
                accumulator[0] += xy
                accumulator[1] += xx
                accumulator[2] += yy

    print("sim data prepared")

    # Cosine similarity per neighbour, most similar first. list.sort is
    # stable, so ties keep their insertion order exactly as the previous
    # hand-written insertion loop did.
    item_similarity = {}
    for src_item, neighbours in pre_item_similarity.items():
        item_similarity_order = []
        for dst_item, (xy, xx, yy) in neighbours.items():
            div = (xx * yy) ** 0.5
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            item_similarity_order.append((dst_item, similarity))
        item_similarity_order.sort(key=lambda pair: pair[1], reverse=True)
        item_similarity[src_item] = item_similarity_order

    recommendation = {}
    for user in users:
        items = []
        if add_cf_commender:
            items_set = set()
            # Walk the rated items in their stored (insertion) order. Iterating
            # a set of strings made the top-k depend on the interpreter's hash
            # seed. ``.get`` also avoids inserting unknown users into the
            # defaultdict.
            rated_in_order = list(user_to_items.get(user, {}))
            user_has_rated = set(rated_in_order)
            for item in rated_in_order:
                for sim_item, _ in item_similarity.get(item, ()):
                    # Skip items the user already rated or that are already queued.
                    if sim_item not in user_has_rated and sim_item not in items_set:
                        items.append(sim_item)
                        items_set.add(sim_item)
                    if len(items) >= k:
                        break
                else:
                    # Did not reach k with this item's neighbours: try the next rated item.
                    continue
                break

        # Optionally extend with the rule-based candidates.
        if add_rule_based_commender:
            also_buy, also_view, popularity_and_have_buy, popularity = rulebase_commander(training_data, user, k=k)
            # Merge every candidate list in priority order and keep the top k.
            # NOTE(review): the merged list is not de-duplicated, so the same
            # asin can occupy more than one of the k slots — confirm intended.
            user_recommendations = items + popularity_and_have_buy + also_buy + also_view + popularity
            recommendation[user] = user_recommendations[0:k]
        else:
            recommendation[user] = items

    return recommendation

In [20]:
# Rule-based recommendations only (collaborative filtering disabled)
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_item_based_recommender(ratings_trainings, users, add_cf_commender=False, add_rule_based_commender=True)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'rule based Method Score: {method}')
# Item-based CF only (rule-based candidates disabled)
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_item_based_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=False)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf-item based Method Score: {method}')
# Item-based CF first, then the rule-based candidates
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_item_based_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=True)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf-item based + rule based Method Score: {method}')

data converted
data inverted
sim data prepared
rule based Method Score: 0.13389830508474576
data converted
data inverted
sim data prepared
cf-item based Method Score: 0.0
data converted
data inverted
sim data prepared
cf-item based + rule based Method Score: 0.13389830508474576


# 方法三: 利用套件 surprise 實作 collaborative filtering

In [21]:
import time
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

In [22]:
def cf_surprise_recommender(training_data = None, users=[], add_cf_commender = True, add_rule_based_commender = True, k = 10, user_based=False, algo=KNNBasic):
    """Recommend up to ``k`` items per user using a Surprise KNN-style algorithm.

    Parameters
    ----------
    training_data : pandas.DataFrame, optional
        Ratings with ``reviewerID``, ``asin``, ``overall`` and ``DATE``
        columns. When omitted, the notebook-level ``ratings_trainings`` is
        looked up at call time. The default used to be bound when the function
        was defined, which froze whatever frame existed then.
    users : list
        reviewerIDs to produce recommendations for (not modified).
    add_cf_commender : bool
        When True, recommend neighbours of the items the user rated.
    add_rule_based_commender : bool
        When True, append the lists from ``rulebase_commander`` after the CF
        items and truncate the merged list to ``k``.
    k : int
        Number of neighbours requested per rated item and number of
        recommendations per user.
    user_based : bool
        Forwarded to Surprise as ``sim_options['user_based']``.
    algo : class
        Surprise algorithm class, instantiated with ``sim_options``.

    Returns
    -------
    dict
        Maps every entry of ``users`` to a list of recommended asins.
    """
    if training_data is None:
        training_data = ratings_trainings

    # Keep only the newest rating for each (reviewerID, asin) pair.
    training_data = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin'])
        .head(1)
    )
    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # False -> similarities are computed between items
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    recommendation = {}
    for user in users:
        recommend_item_list = []
        if add_cf_commender:
            # Walk the rated items in row order (newest first after the sort
            # above). Iterating a set of strings made the result depend on the
            # interpreter's hash seed.
            rated_in_order = training_data.loc[training_data['reviewerID'] == user]['asin'].to_list()
            items_user_rated = set(rated_in_order)
            recommend_item_set = set()
            for item in rated_in_order:
                # NOTE(review): the lookup below passes an inner *item* id to
                # get_neighbors, which presumably only matches the similarity
                # matrix when user_based=False — confirm before using
                # user_based=True.
                iid = algo_impl.trainset.to_inner_iid(item)
                recommend_items_iid = algo_impl.get_neighbors(iid, k)
                for sim_item_iid in recommend_items_iid:
                    item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                    if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                        recommend_item_list.append(item_raw_id)
                        recommend_item_set.add(item_raw_id)

                # Stop as soon as k candidates are collected, trimming any surplus.
                if len(recommend_item_list) >= k:
                    recommend_item_list = recommend_item_list[:k]
                    break

        # Optionally extend with the rule-based candidates.
        if add_rule_based_commender:
            also_buy, also_view, popularity_and_have_buy, popularity = rulebase_commander(training_data, user, k)
            # Merge every candidate list in priority order and keep the top k.
            # NOTE(review): the merged list is not de-duplicated, so the same
            # asin can occupy more than one of the k slots — confirm intended.
            user_recommendations = recommend_item_list + popularity_and_have_buy + also_buy + also_view + popularity
            recommendation[user] = user_recommendations[0:k]
        else:
            recommendation[user] = recommend_item_list

    return recommendation

In [26]:
# Rule-based recommendations only (collaborative filtering disabled)
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2017-09-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_surprise_recommender(ratings_trainings, users, add_cf_commender=False, add_rule_based_commender=True)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'rule based Method Score: {method}')
# Item-based CF (Surprise) only, reusing the split above
ratings_by_user = cf_surprise_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=False)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf-item based Method Score: {method}')
# Item-based CF (Surprise) first, then the rule-based candidates
ratings_by_user = cf_surprise_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=True)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf-item based + rule based Method Score: {method}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
rule based Method Score: 0.1053484602917342
Computing the cosine similarity matrix...
Done computing similarity matrix.
cf-item based Method Score: 0.0016207455429497568
Computing the cosine similarity matrix...
Done computing similarity matrix.
cf-item based + rule based Method Score: 0.1053484602917342


In [25]:
# Split into training/testing sets and check how many test-set users also appear in the training set
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2018-06-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
# Count the test-set reviewer IDs that are present in the training set
# (the value is neither assigned nor the cell's last expression, so it is not displayed)
np.isin(ratings_testings.reviewerID.unique(), ratings_trainings.reviewerID.unique()).sum()

# Candidate training end dates between 2018-01-01 and 2018-09-01
index = pd.date_range("2018-01-01", end = '2018-09-01', freq='1M')

testing_end_date = '2018-09-30'

# NOTE(review): this loop rebinds the notebook-level training_end_date (to the
# last Timestamp in `index`) and training_start_date; cells executed afterwards
# that read those names will see the new values.
for training_end_date in index:
    training_start_date = '2017-01-01'
    ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = training_start_date, training_end_date = training_end_date, testing_end_date = testing_end_date)
    # user_num = np.isin(ratings_testings.reviewerID.unique(), ratings_trainings.reviewerID.unique()).sum()
    # Number of distinct reviewers present in both the training and testing sets
    user_num = len(set(ratings_testings.reviewerID) & set(ratings_trainings.reviewerID))
    print(f'There are {user_num} during {training_end_date} - {testing_end_date} in training set.')


There are 468 during 2018-01-31 00:00:00 - 2018-09-30 in training set.
There are 395 during 2018-02-28 00:00:00 - 2018-09-30 in training set.
There are 336 during 2018-03-31 00:00:00 - 2018-09-30 in training set.
There are 279 during 2018-04-30 00:00:00 - 2018-09-30 in training set.
There are 189 during 2018-05-31 00:00:00 - 2018-09-30 in training set.
There are 111 during 2018-06-30 00:00:00 - 2018-09-30 in training set.
There are 53 during 2018-07-31 00:00:00 - 2018-09-30 in training set.
There are 21 during 2018-08-31 00:00:00 - 2018-09-30 in training set.


In [28]:
# Re-run the Surprise comparison with an earlier training cut-off, which
# lengthens the testing window (2018-03-01 up to testing_end_date).
training_start_date = '2017-09-01'
training_end_date = '2018-03-01'
# Rule-based recommendations only (collaborative filtering disabled)
ratings_trainings, ratings_testings, ratings_testings_by_user, users = split_data(training_start_date = '2017-09-01', training_end_date = training_end_date, testing_end_date = testing_end_date)
ratings_by_user = cf_surprise_recommender(ratings_trainings, users, add_cf_commender=False, add_rule_based_commender=True)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'rule based Method Score: {method}')
# Item-based CF (Surprise) only, reusing the split above
ratings_by_user = cf_surprise_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=False)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf-item based Method Score: {method}')
# Item-based CF (Surprise) first, then the rule-based candidates
ratings_by_user = cf_surprise_recommender(ratings_trainings, users, add_cf_commender=True, add_rule_based_commender=True)
method = evaluate(ratings_testings_by_user, ratings_by_user)
print(f'cf-item based + rule based Method Score: {method}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
rule based Method Score: 0.11490160227108695
Computing the cosine similarity matrix...
Done computing similarity matrix.
cf-item based Method Score: 0.0009142087282875427
Computing the cosine similarity matrix...
Done computing similarity matrix.
cf-item based + rule based Method Score: 0.11437232353365731


這邊可以發現，在拉長測試集時間範圍的情況下，item-based CF 確實有成功推薦到商品；但與單純使用 rule-based 推薦相比，由於先推薦了相似 item，反而排擠掉原本預測準確的熱門產品，使 recall 下降。