<a href="https://colab.research.google.com/github/milanazhang/data-course-sample/blob/main/sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

## 基礎建設

In [95]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [96]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-25 16:50:56--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.2’


2021-12-25 16:50:56 (65.3 MB/s) - ‘All_Beauty.csv.2’ saved [15499476/15499476]

--2021-12-25 16:50:57--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.2’


2021-12-25 16:50:57 (55.6 MB/s) - ‘meta_All_Beauty.json.gz.2’ saved [10329961/10329961]



In [97]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

In [98]:
#將資料裡空list換成np.NaN，統計每個欄位有多少空值
metadata.head()
metadata_fillna = metadata.mask(metadata.applymap(str).eq('[]'))
metadata_fillna = metadata_fillna.replace('',np.NaN)
metadata_fillna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32892 entries, 0 to 32891
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         0 non-null      float64
 1   tech1            10 non-null     object 
 2   description      15119 non-null  object 
 3   fit              0 non-null      float64
 4   title            32891 non-null  object 
 5   also_buy         6597 non-null   object 
 6   tech2            0 non-null      float64
 7   brand            17219 non-null  object 
 8   feature          269 non-null    object 
 9   rank             32515 non-null  object 
 10  also_view        8132 non-null   object 
 11  details          32892 non-null  object 
 12  main_cat         32892 non-null  object 
 13  similar_item     1304 non-null   object 
 14  date             19 non-null     object 
 15  price            11459 non-null  object 
 16  asin             32892 non-null  object 
 17  imageURL    

In [99]:
#移除資料裡有重複的部分，保留不重複的資料的index
metadata_fillna=metadata_fillna.astype('str')
metadata_fillna.drop_duplicates(inplace=True)
remain_index = metadata_fillna.index

In [100]:
import numpy as np
import re
#挑選需要欄位與資料與整理欄位內的特殊符號
metadata_df = metadata.iloc[list(remain_index)][["asin","also_buy","also_view","rank","similar_item","price","brand"]]
metadata_df = metadata_df.replace(r'^\s*$', np.NaN, regex = True)

In [101]:
#將rank分成排名與商品子集
metadata_df['rank'] = metadata_df['rank'].str.replace(',', '')
metadata_df['sub_cat'] = metadata_df['rank'].str.extract(r'(\d+) in (.*) \($')[1].str.replace('&amp;','&')
metadata_df['rank_num'] = pd.to_numeric(metadata_df['rank'].str.extract(r'(\d+) in (.*) \($')[0])
metadata_df = metadata_df.drop("rank",axis=1)

In [102]:
#統計商品子集有多少種及各種所佔的數量(不用main_cat的原因為，所有資料的main_cat都是一樣的)
rank_dict = {}
for i in metadata_df['sub_cat'].to_list():
  if rank_dict.get(i) != None:
    rank_dict[i] += 1
  else:
    rank_dict[i] = 1
for key in rank_dict.keys():
  print(key,rank_dict[key])

Beauty & Personal Care 32015
nan 402
Grocery & Gourmet Food 37
Health & Household 21
Sports & Outdoors 3
Toys & Games 3
Baby 2
Clothing Shoes & Jewelry 2
Home & Kitchen 1
Automotive 1
Tools & Home Improvement 1


In [103]:
metadata_df.head()

Unnamed: 0,asin,also_buy,also_view,similar_item,price,brand,sub_cat,rank_num
0,6546546450,[],[],,,idea village,Beauty & Personal Care,2938573.0
1,7178680776,"[B01E7LCSL6, B008X5RVME]",[],"class=""a-bordered a-horizontal-stripes a-spa...",$44.99,,Beauty & Personal Care,872854.0
2,7250468162,[],"[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]",,$28.76,No7,Beauty & Personal Care,956696.0
3,7367905066,[B0041PBXX8],[],,,,Beauty & Personal Care,1870258.0
4,7414204790,[],"[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...",,$12.15,Pirmal Healthcare,Beauty & Personal Care,67701.0


In [104]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [105]:
#資料依時間前後分為train與test
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

##創造特徵

In [106]:
#計算商品總聲量與正面聲量(正面定義為評分4以上)
volume_sum = {ratings["asin"]:ratings["reviewerID"] for ratings in ratings_trainings.groupby("asin").count().reset_index()[["asin","reviewerID"]].to_dict('records')}
good_ratings = ratings_trainings[ratings_trainings["overall"] >= 4]
volume_good = {ratings["asin"]:ratings["reviewerID"] for ratings in good_ratings.groupby("asin").count().reset_index()[["asin","reviewerID"]].to_dict('records')}

In [107]:
#計算商品回購間隔時間
from datetime import timedelta
date_sort = ratings_trainings.sort_values(['DATE'],ascending=True).groupby(["asin", "reviewerID"]).head(11)
date_sort['dur'] = date_sort.groupby(['asin', 'reviewerID'])["DATE"].apply(lambda x: x.diff(1))
repurchase = date_sort[date_sort["dur"].notna()][date_sort[date_sort["dur"].notna()]["dur"] != "0 days"]
repurchase["dur"] = repurchase["dur"].astype("str").str.extract(r'(\d+) (.*$)')[0].astype(int)
purchase_dur = {d["asin"]:d["dur"] for d in repurchase.groupby("asin").agg({"dur":pd.Series.median}).reset_index()[["asin","dur"]].to_dict('records')}

In [108]:
#計算每個商品的回購人數與總購買人數
repurchase_num = {d["asin"]:len(set(d["reviewerID"])) for d in repurchase[['reviewerID', 'asin']].groupby('asin').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')}
purchase_sum = {d["asin"]:len(set(d["reviewerID"])) for d in ratings_trainings[['reviewerID', 'asin']].groupby('asin').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')}

In [109]:
#將上述產生的商品特徵整併進metadata裡
metadata_df["volume_sum"] = [volume_sum[v] if v in volume_sum.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["volume_good"] = [volume_good[v] if v in volume_good.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["purchase_dur"] = [purchase_dur[v] if v in purchase_dur.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["repurchase_num"] = [repurchase_num[v] if v in repurchase_num.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["purchase_sum"] = [purchase_sum[v] if v in purchase_sum.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["good_review_rate"] = [g/s if s>0 else 0 for g,s in list(zip(metadata_df["volume_good"].to_list(),metadata_df["volume_sum"].to_list()))]
metadata_df["repurchase_rate"] = [n/s if s>0 else 0 for n,s in list(zip(metadata_df["repurchase_num"].to_list(),metadata_df["purchase_sum"].to_list()))]

In [110]:
#將商品依據排序規則(正面聲量比例->總聲量->銷售排名)排序，將每個商品做出推薦排名
sorted_meta = metadata_df[["asin","good_review_rate","volume_sum","rank_num"]].sort_values(by=["good_review_rate","volume_sum","rank_num"], ascending=[False,False,True])
sorted_meta = sorted_meta.reset_index().drop("index",axis=1)
product_weight = {}
for i in range(0,len(sorted_meta)):
  if product_weight.get(sorted_meta.iloc[i]["asin"]) == None:
    product_weight[sorted_meta.iloc[i]["asin"]] = i

In [111]:
metadata_df.head()

Unnamed: 0,asin,also_buy,also_view,similar_item,price,brand,sub_cat,rank_num,volume_sum,volume_good,purchase_dur,repurchase_num,purchase_sum,good_review_rate,repurchase_rate
0,6546546450,[],[],,,idea village,Beauty & Personal Care,2938573.0,2,1,0.0,0,2,0.5,0.0
1,7178680776,"[B01E7LCSL6, B008X5RVME]",[],"class=""a-bordered a-horizontal-stripes a-spa...",$44.99,,Beauty & Personal Care,872854.0,1,0,0.0,0,1,0.0,0.0
2,7250468162,[],"[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]",,$28.76,No7,Beauty & Personal Care,956696.0,1,1,0.0,0,1,1.0,0.0
3,7367905066,[B0041PBXX8],[],,,,Beauty & Personal Care,1870258.0,1,1,0.0,0,1,1.0,0.0
4,7414204790,[],"[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...",,$12.15,Pirmal Healthcare,Beauty & Personal Care,67701.0,15,13,0.0,0,15,0.866667,0.0


##製作推薦商品列表

In [112]:
#將also_buy與also_view欄位經排序過後的結果存入also_buy_sort與also_view_sort欄位
metadata_df["also_buy_dict"] = metadata_df["also_buy"].apply(lambda x: {k:product_weight[k] if len(x) != 0 and product_weight.get(k)!=None else 40000 for k in x})
metadata_df["also_view_dict"] = metadata_df["also_view"].apply(lambda x: {k:product_weight[k] if len(x) != 0 and product_weight.get(k)!=None else 40000 for k in x})
metadata_df["also_buy_sort"] = metadata_df["also_buy_dict"].apply(lambda x : [k for k, v in sorted(x.items(), key=lambda item: item[1]) if v!=40000])
metadata_df["also_view_sort"] = metadata_df["also_view_dict"].apply(lambda x : [k for k, v in sorted(x.items(), key=lambda item: item[1]) if v!=40000])
metadata_df.drop(["also_buy","also_view","also_buy_dict","also_view_dict"],axis=1,inplace = True)

In [113]:
#將similar_item從字串擷取出來，並經排序後存入similar_item_sort的欄位
metadata_df["similar_item"] = metadata_df["similar_item"].apply(lambda x : {i[1]:product_weight[i[1]] for i in re.findall(r'(/dp/)(.*)(/ref=)',x) if product_weight.get(i[1])!=None} if isinstance(x,str) else {})
metadata_df["similar_item_sort"] = metadata_df["similar_item"].apply(lambda x : [k for k, v in sorted(x.items(), key=lambda item: item[1]) if v!=40000])

In [114]:
#將同品牌的商品做成商品串
import warnings
warnings.filterwarnings("ignore")
metadata_df['brand'].fillna("notfound",inplace=True)
brand_dict = {d["brand"]:d["asin"] for d in metadata_df.groupby("brand").agg(list).reset_index()[['brand','asin']].to_dict("record")}

In [115]:
#將同品牌的商品排序存入same_brand_item_sort欄位
brand_weight_dict = {}
for i in brand_dict.keys():
  if brand_dict[i]!="notfound":
    tmp_dict = {}
    for k in brand_dict[i]:
      if product_weight.get(k) != None:
        tmp_dict[k] = product_weight[k]
    brand_weight_dict[i] = tmp_dict
brand_weight_sorted_dict = {k:[vk for vk,vv in sorted(v.items(),key=lambda item:item[1])] for k,v in brand_weight_dict.items()}

same_brand_item_list = []
for i in metadata_df["brand"].to_list():
  if brand_dict[i]!="notfound":
    same_brand_item_list.append(brand_weight_sorted_dict[i])
  else:
    same_brand_item_list.append({})

metadata_df["same_brand_item_sort"] = same_brand_item_list

In [116]:
#drop掉用不到的欄位
metadata_df.drop(["similar_item","brand","volume_good","repurchase_num","purchase_sum"],axis=1,inplace=True)
metadata_df.head()

Unnamed: 0,asin,price,sub_cat,rank_num,volume_sum,purchase_dur,good_review_rate,repurchase_rate,also_buy_sort,also_view_sort,similar_item_sort,same_brand_item_sort
0,6546546450,,Beauty & Personal Care,2938573.0,2,0.0,0.5,0.0,[],[],[],[6546546450]
1,7178680776,$44.99,Beauty & Personal Care,872854.0,1,0.0,0.0,0.0,[],[],[],"[B01BRJ8TF0, B01AD16KAO, B016QX70M0, B00KIYTQ8..."
2,7250468162,$28.76,Beauty & Personal Care,956696.0,1,0.0,1.0,0.0,[],[],[],"[B01FN9KPGG, B01698HYD2, 7250468162, B013L800JI]"
3,7367905066,,Beauty & Personal Care,1870258.0,1,0.0,1.0,0.0,[],[],[],"[B01BRJ8TF0, B01AD16KAO, B016QX70M0, B00KIYTQ8..."
4,7414204790,$12.15,Beauty & Personal Care,67701.0,15,0.0,0.866667,0.0,[],[],[],[7414204790]


## 產生推薦

In [117]:
#推薦最後程序使用銷售排名進行排序
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    metadata_dict = metadata_df[["asin","purchase_dur","repurchase_rate","rank_num", "volume_sum","good_review_rate","also_buy_sort","also_view_sort","similar_item_sort","same_brand_item_sort","sub_cat","rank_num"]].to_dict("record")
    metadata_dict = {d["asin"]:[d["purchase_dur"],d["repurchase_rate"],d["rank_num"], d["volume_sum"],d["good_review_rate"],d["also_buy_sort"],d["also_view_sort"],d["similar_item_sort"],d["same_brand_item_sort"],d["sub_cat"],d["rank_num"]] for d in metadata_dict}
    recommendations = {}
    for u in users:
      recommendations[u] = set()
      products_dict = {d["asin"]:d["overall"] for d in training_data[(training_data['reviewerID'] == u) & (training_data['overall'] >= 4)][["asin","overall"]].to_dict("record")}
      products_sort = {k:v for k,v in sorted(products_dict.items(),key=lambda item:item[1])}
      ##找出買過會再次回購的商品
      for key in products_sort.keys():
        last_buy_time = training_data[(training_data["reviewerID"] == u) & (training_data["asin"] == key)]["DATE"].max() 
        if metadata_dict[key][1]>0 and metadata_dict[key][0]>(training_data["DATE"].max()-last_buy_time).days:
          if len(recommendations[u]) == 10:
            break
          recommendations[u].add(key)
      ##還不夠K個，找also buy中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][5]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找also view中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][6]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找similar item中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][7]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找same brand中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][8]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找所有同種商品中銷量排序後的商品
      metadata_sell_sort = metadata_df.sort_values(['rank_num'],ascending=True).groupby("sub_cat").head(10)
      outside_break = False
      for key in products_sort.keys():
        sub_cat = metadata_dict[key][9]
        for item in metadata_sell_sort[metadata_sell_sort["sub_cat"] == sub_cat]["asin"].to_list():
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不構K個，代表此消費者在training data沒出現過或是同種商品沒到K個，故直接推薦子類別為美妝且最熱銷的產品
      for key in metadata_sell_sort[metadata_sell_sort["sub_cat"] == "Beauty & Personal Care"]["asin"].to_list():
        if len(recommendations[u]) == 10:
          break
        recommendations[u].add(key)
    return recommendations


ratings_by_user = recommender(ratings_trainings, users)

In [118]:
#推薦最後程序使用總聲量進行排序
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    metadata_dict = metadata_df[["asin","purchase_dur","repurchase_rate","rank_num", "volume_sum","good_review_rate","also_buy_sort","also_view_sort","similar_item_sort","same_brand_item_sort","sub_cat","rank_num"]].to_dict("record")
    metadata_dict = {d["asin"]:[d["purchase_dur"],d["repurchase_rate"],d["rank_num"], d["volume_sum"],d["good_review_rate"],d["also_buy_sort"],d["also_view_sort"],d["similar_item_sort"],d["same_brand_item_sort"],d["sub_cat"],d["rank_num"]] for d in metadata_dict}
    recommendations = {}
    for u in users:
      recommendations[u] = set()
      products_dict = {d["asin"]:d["overall"] for d in training_data[(training_data['reviewerID'] == u) & (training_data['overall'] >= 4)][["asin","overall"]].to_dict("record")}
      products_sort = {k:v for k,v in sorted(products_dict.items(),key=lambda item:item[1])}
      ##找出買過會再次回購的商品
      for key in products_sort.keys():
        last_buy_time = training_data[(training_data["reviewerID"] == u) & (training_data["asin"] == key)]["DATE"].max() 
        if metadata_dict[key][1]>0 and metadata_dict[key][0]>(training_data["DATE"].max()-last_buy_time).days:
          if len(recommendations[u]) == 10:
            break
          recommendations[u].add(key)
      ##還不夠K個，找also buy中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][5]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找also view中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][6]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找similar item中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][7]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找same brand中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][8]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找所有同種商品中總聲量排序後的商品
      metadata_sell_sort = metadata_df.sort_values(['volume_sum'],ascending=False).groupby("sub_cat").head(10)
      outside_break = False
      for key in products_sort.keys():
        sub_cat = metadata_dict[key][9]
        for item in metadata_sell_sort[metadata_sell_sort["sub_cat"] == sub_cat]["asin"].to_list():
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不構K個，代表此消費者在training data沒出現過或是同種商品沒到K個，故直接推薦子類別為美妝且最總聲量最高的產品
      for key in metadata_sell_sort[metadata_sell_sort["sub_cat"] == "Beauty & Personal Care"]["asin"].to_list():
        if len(recommendations[u]) == 10:
          break
        recommendations[u].add(key)
    return recommendations


ratings_by_user2 = recommender(ratings_trainings, users)

In [119]:
#將總聲量與正面聲量的計算縮現在一年內
volume_sum = {ratings["asin"]:ratings["reviewerID"] for ratings in ratings_trainings[ratings_trainings["DATE"]>'2017-08-31'].groupby("asin").count().reset_index()[["asin","reviewerID"]].to_dict('records')}
good_ratings = ratings_trainings[(ratings_trainings["overall"] >= 4) & (ratings_trainings["DATE"]>'2017-08-31')]
volume_good = {ratings["asin"]:ratings["reviewerID"] for ratings in good_ratings.groupby("asin").count().reset_index()[["asin","reviewerID"]].to_dict('records')}
metadata_df["volume_sum"] = [volume_sum[v] if v in volume_sum.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["volume_good"] = [volume_good[v] if v in volume_good.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["good_review_rate"] = [g/s if s>0 else 0 for g,s in list(zip(metadata_df["volume_good"].to_list(),metadata_df["volume_sum"].to_list()))]
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    metadata_dict = metadata_df[["asin","purchase_dur","repurchase_rate","rank_num", "volume_sum","good_review_rate","also_buy_sort","also_view_sort","similar_item_sort","same_brand_item_sort","sub_cat","rank_num"]].to_dict("record")
    metadata_dict = {d["asin"]:[d["purchase_dur"],d["repurchase_rate"],d["rank_num"], d["volume_sum"],d["good_review_rate"],d["also_buy_sort"],d["also_view_sort"],d["similar_item_sort"],d["same_brand_item_sort"],d["sub_cat"],d["rank_num"]] for d in metadata_dict}
    recommendations = {}
    for u in users:
      recommendations[u] = set()
      products_dict = {d["asin"]:d["overall"] for d in training_data[(training_data['reviewerID'] == u) & (training_data['overall'] >= 4)][["asin","overall"]].to_dict("record")}
      products_sort = {k:v for k,v in sorted(products_dict.items(),key=lambda item:item[1])}
      ##找出買過會再次回購的商品
      for key in products_sort.keys():
        last_buy_time = training_data[(training_data["reviewerID"] == u) & (training_data["asin"] == key)]["DATE"].max() 
        if metadata_dict[key][1]>0 and metadata_dict[key][0]>(training_data["DATE"].max()-last_buy_time).days:
          if len(recommendations[u]) == 10:
            break
          recommendations[u].add(key)
      ##還不夠K個，找also buy中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][5]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找also view中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][6]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找similar item中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][7]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找same brand中排序過後的商品
      outside_break = False
      for key in products_sort.keys():
        for item in metadata_dict[key][8]:
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不夠K個，找所有同種商品中總聲量排序後的商品
      metadata_sell_sort = metadata_df.sort_values(['volume_sum'],ascending=False).groupby("sub_cat").head(10)
      outside_break = False
      for key in products_sort.keys():
        sub_cat = metadata_dict[key][9]
        for item in metadata_sell_sort[metadata_sell_sort["sub_cat"] == sub_cat]["asin"].to_list():
          if len(recommendations[u]) == 10:
            outside_break = True
            break
          recommendations[u].add(item)
        if outside_break == True:
          break
      ##還不構K個，代表此消費者在training data沒出現過或是同種商品沒到K個，故直接推薦子類別為美妝且最總聲量最高的產品
      for key in metadata_sell_sort[metadata_sell_sort["sub_cat"] == "Beauty & Personal Care"]["asin"].to_list():
        if len(recommendations[u]) == 10:
          break
        recommendations[u].add(key)
    return recommendations


ratings_by_user3 = recommender(ratings_trainings, users)

## 結果評估

In [120]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

for n,r in [("使用銷售排名排序:",ratings_by_user),("使用總聲量排序:",ratings_by_user2),("使用總聲量排序&聲量使用一年內的評論:",ratings_by_user3)]:
  print(n,evaluate(ratings_testings_by_user, r))

使用銷售排名排序: 0.006779661016949152
使用總聲量排序: 0.0847457627118644
使用總聲量排序&聲量使用一年內的評論: 0.1
