# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file containing one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a line is not valid JSON.
    """
    # The context manager closes the file handle as soon as the generator is
    # exhausted or closed, instead of leaking it until garbage collection.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the order
    the records are read.
    """
    # enumerate() supplies the same running 0-based key the manual counter did.
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

## 載入資料

In [10]:
# Fetch the All_Beauty ratings CSV and the gzip-compressed product metadata.
# NOTE(review): relies on the `wget` executable being available on PATH.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

'wget' 不是內部或外部命令、可執行的程式或批次檔。
'wget' 不是內部或外部命令、可執行的程式或批次檔。


In [2]:
# Load the product metadata (gzip-compressed JSON lines) into a DataFrame.
metadata = getDF('../content/meta_All_Beauty.json.gz')
# The ratings CSV has no header row, so the column names are supplied explicitly.
ratings = pd.read_csv('../content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [3]:
# Preview the first rows of the product metadata.
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [4]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [5]:
# Add a DATE column by interpreting unixReviewTime as seconds since the Unix
# epoch; the train/test split below filters on this column.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [6]:
# Training set: every rating strictly before the test month.
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]

# Test set: all of September 2018. The upper bound is half-open because
# `<= '2018-09-30'` compares against 2018-09-30 00:00:00 and would silently
# drop any rating stamped later on the 30th. For timestamps that fall exactly
# on midnight the selected rows are unchanged.
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') &
    (ratings['DATE'] < '2018-10-01')
]

# Ground truth for evaluation: reviewerID -> list of asins rated in the test
# window (one list entry per test row). Only the `asin` column is aggregated,
# instead of list-aggregating every column and discarding the rest.
ratings_testings_by_user = (
    ratings_testings
    .groupby('reviewerID')['asin']
    .agg(list)
    .to_dict()
)
# Users that need recommendations: everyone who appears in the test window.
users = list(ratings_testings_by_user.keys())

In [7]:
# Keep only the columns needed to build the co-purchase / co-view lookups.
metadata = metadata[['asin', 'also_buy', 'also_view']]
k = 10
# Strategy: recommend from also_buy first, then from also_view; a user with no
# prior data falls back to the most popular products.

# One row per (asin, also_buy) pair together with how often that pair occurs.
# DataFrame.explode(column) repeats the scalar `asin` for every list element;
# the previous `apply(pd.Series.explode)` exploded each column independently
# and depended on aligning a unique index with a duplicated one, which can
# raise "cannot reindex on an axis with duplicate labels". Empty lists explode
# to NaN and are removed by dropna().
df_also_buy = (
    metadata[['asin', 'also_buy']]
    .explode('also_buy')
    .reset_index(drop=True)
    .dropna()
    .value_counts()
    .reset_index(name='also_buy_counts')
)
# Same construction for (asin, also_view) pairs.
df_also_view = (
    metadata[['asin', 'also_view']]
    .explode('also_view')
    .reset_index(drop=True)
    .dropna()
    .value_counts()
    .reset_index(name='also_view_counts')
)
# The 100 asins that occur most often in the training ratings.
popularity = ratings_trainings["asin"].value_counts().index.to_list()[0:100]


## 結果評估

In [9]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    '''
    Score a set of recommendations against the items users actually rated.

    * ratings_testings_by_user: dict mapping user -> list of items found in
      the test data (ground truth).
    * ratings_by_user: dict mapping user -> list of recommended items.
    * method: str, accepted for interface compatibility; not used.
    * returns score: float, the number of distinct recommended items per user
      that also appear in that user's ground truth, summed over users and
      divided by the total number of ground-truth entries. Returns 0.0 when
      there are no ground-truth entries.

    The denominator is derived from `ratings_testings_by_user` itself rather
    than from a module-level DataFrame, so the function only depends on its
    arguments. When the mapping holds one list entry per test row, this equals
    the number of test rows.
    '''
    ratings_testings_by_user = ratings_testings_by_user or {}
    ratings_by_user = ratings_by_user or {}

    total = 0
    n_test_entries = 0
    for user, actual_items in ratings_testings_by_user.items():
        n_test_entries += len(actual_items)
        if user in ratings_by_user:
            # Set intersection: a repeated item counts as a single hit.
            total += len(set(ratings_by_user[user]) & set(actual_items))

    if n_test_entries == 0:
        # Nothing to evaluate against; avoid ZeroDivisionError.
        return 0.0
    return total / n_test_entries


## 產生推薦
### 推薦系統1. 歷史購買次數Top K

In [8]:
def recommender_1(training_data, users=[], k=10):
    '''
    Popularity baseline: give every user the same top-k list.

    * training_data: dataframe with an "asin" column (the training ratings)
    * users: [] users that need recommendations
    * k: int number of items to recommend per user
    * returns: dict
      {
          user_1: [item_1, item_2, ...],
          user_2: [...], ...
      }
      Every user maps to the same list object holding the k asins that occur
      most often in training_data.
    '''
    most_frequent = training_data["asin"].value_counts().index.to_list()[:k]
    # fromkeys assigns the one shared list to every user, in input order.
    return dict.fromkeys(users, most_frequent)

### 推薦系統2（測試）：熱門前百大產品依序推薦

In [10]:
def recommender_2(training_data, users=[], k=10):
    '''
    Recommend items from the 100 most frequent asins that relate to each
    user's own rating history.

    * training_data: dataframe of training ratings; the columns "asin",
      "reviewerID" and "overall" are read
    * users: [] users that need recommendations
    * k: int maximum number of items per user
    * returns: dict
      {
          user_1: [item_1, item_2, ...],
          user_2: [...], ...
      }

    Reads the module-level frames df_also_buy and df_also_view.
    NOTE(review): the popularity fill-up is not applied here, so a user
    without matching history gets fewer than k items (possibly none), and the
    combined list is not de-duplicated before truncation.
    '''
    # The 100 asins that occur most often in the training data.
    best_sellers = training_data["asin"].value_counts().index.to_list()[0:100]

    def only_best_sellers(candidates):
        # Keep, in their original order, the candidates that are best sellers.
        return [asin for asin in candidates if asin in best_sellers]

    picks_by_user = {}
    for user in users:
        history = training_data[training_data['reviewerID'] == user]
        # Mean rating the user gave each item, highest first.
        mean_by_item = (
            history
            .groupby(['asin'])
            .agg({'overall': 'mean'})
            .reset_index()
            .sort_values(['overall'], ascending=False)
        )
        rated_items = mean_by_item['asin']

        # 1. Items the user rated highest that are also best sellers.
        top_mean = mean_by_item['overall'].max()
        favourites = mean_by_item[mean_by_item['overall'] >= top_mean]
        repeat_picks = only_best_sellers(favourites['asin'])[0:k]

        # 2. Best sellers listed as also_buy for anything the user rated.
        buy_matches = df_also_buy[df_also_buy['asin'].isin(rated_items)]
        bought_together = only_best_sellers(buy_matches['also_buy'])

        # 3. Best sellers listed as also_view for anything the user rated.
        view_matches = df_also_view[df_also_view['asin'].isin(rated_items)]
        viewed_together = only_best_sellers(view_matches['also_view'])

        # 4. Concatenate in priority order and truncate to k.
        ranked = repeat_picks + bought_together + viewed_together
        picks_by_user[user] = ranked[0:k]
    return picks_by_user

### 推薦系統3 : 隨機推薦Also_buy/ Also_view

In [21]:
def recommender_3(training_data, users=[], k=10):
    '''
    Recommend from each user's history-related items, then fill up with the
    100 most frequent asins.

    * training_data: dataframe of training ratings; the columns "asin",
      "reviewerID" and "overall" are read
    * users: [] users that need recommendations
    * k: int number of items per user
    * returns: dict
      {
          user_1: [item_1, item_2, ...],
          user_2: [...], ...
      }

    Reads the module-level frames df_also_buy and df_also_view.
    NOTE(review): the combined list is not de-duplicated before truncation,
    so one item can occupy more than one of the k slots.
    '''
    # The 100 asins that occur most often in the training data.
    best_sellers = training_data["asin"].value_counts().index.to_list()[0:100]

    picks_by_user = {}
    for user in users:
        history = training_data[training_data['reviewerID'] == user]
        # Mean rating the user gave each item, highest first.
        mean_by_item = (
            history
            .groupby(['asin'])
            .agg({'overall': 'mean'})
            .reset_index()
            .sort_values(['overall'], ascending=False)
        )
        rated_items = mean_by_item['asin']

        # 1. Items the user rated highest that are also best sellers.
        top_mean = mean_by_item['overall'].max()
        favourites = mean_by_item[mean_by_item['overall'] >= top_mean]
        repeat_picks = [asin for asin in favourites['asin'] if asin in best_sellers][0:k]

        # 2. Everything listed as also_buy for an item the user rated.
        buy_matches = df_also_buy[df_also_buy['asin'].isin(rated_items)]
        bought_together = list(buy_matches['also_buy'])

        # 3. Everything listed as also_view for an item the user rated.
        view_matches = df_also_view[df_also_view['asin'].isin(rated_items)]
        viewed_together = list(view_matches['also_view'])

        # 4. Append the best sellers so the list can reach k items.
        ranked = repeat_picks + bought_together + viewed_together + best_sellers
        picks_by_user[user] = ranked[0:k]
    return picks_by_user

### 推薦系統1 評分

In [12]:
# Score recommender_1 trained on the full training set (ratings_trainings).
ratings_by_user_1 = recommender_1(ratings_trainings, users)
method_1 = evaluate(ratings_testings_by_user, ratings_by_user_1)
print(f'Method1 Score: {method_1}')

Method1 Score: 0.08305084745762711


In [32]:
# 近三個月資料
train_start_date = "2018-06-01"
train_end_date = '2018-09-01'
train_3months = ratings_trainings[
    (ratings_trainings['DATE'] >= train_start_date) & 
    (ratings_trainings['DATE'] < train_end_date)
]
ratings_by_user_1 = recommender_1(train_3months, users)
method_1_time = evaluate(ratings_testings_by_user, ratings_by_user_1)
print(f'Method1 {train_start_date} - {train_end_date} Score: {method_1_time}')

Method1 2018-06-01 - 2018-09-01 Score: 0.13389830508474576


### 推薦系統2 評分

In [13]:
# Score recommender_2 trained on the full training set (ratings_trainings).
ratings_by_user_2 = recommender_2(ratings_trainings, users)
method_2 = evaluate(ratings_testings_by_user, ratings_by_user_2)
print(f'Method2 Score: {method_2}')

Method2 Score: 0.0


In [34]:
# Score recommender_2 trained on train_3months; that frame and the two date
# strings used in the message must already be defined by an earlier cell.
# NOTE: this overwrites the ratings_by_user_2 produced from the full training set.
ratings_by_user_2 = recommender_2(train_3months, users)
method_2_time = evaluate(ratings_testings_by_user, ratings_by_user_2)
print(f'Method2 {train_start_date} - {train_end_date} Score: {method_2_time}')

Method2 2018-06-01 - 2018-09-01 Score: 0.0


### 推薦系統3 評分

In [22]:
# Score recommender_3 trained on the full training set (ratings_trainings).
ratings_by_user_3 = recommender_3(ratings_trainings, users)
method_3 = evaluate(ratings_testings_by_user, ratings_by_user_3)
print(f'Method3 Score: {method_3}')

Method3 Score: 0.08305084745762711


In [35]:
# Score recommender_3 trained on train_3months; that frame and the two date
# strings used in the message must already be defined by an earlier cell.
# NOTE: this overwrites the ratings_by_user_3 produced from the full training set.
ratings_by_user_3 = recommender_3(train_3months, users)
method_3_time = evaluate(ratings_testings_by_user, ratings_by_user_3)
print(f'method_3 {train_start_date} - {train_end_date} Score: {method_3_time}')

method_3 2018-06-01 - 2018-09-01 Score: 0.13389830508474576


In [47]:
# Distinct category values found in the product metadata.
# NOTE(review): an earlier cell reassigns `metadata` to only
# ['asin', 'also_buy', 'also_view']; this cell needs the full metadata frame
# (reload it first), otherwise the 'category' lookup raises KeyError.
test = (
    metadata.loc[:, ["category"]]
    # The metadata preview shows `category` holding Python lists, which are
    # unhashable; convert them to tuples so drop_duplicates can compare rows.
    .assign(
        category=lambda frame: frame["category"].map(
            lambda value: tuple(value) if isinstance(value, list) else value
        )
    )
    # The chain is parenthesised: a method call starting on its own line
    # without an enclosing bracket is a SyntaxError.
    .drop_duplicates()
)