<a href="https://colab.research.google.com/github/milanazhang/data-course-sample/blob/main/sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-24 15:49:27--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-24 15:49:27 (72.4 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-24 15:49:27--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-24 15:49:27 (60.6 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

In [130]:
import numpy as np
import re
#挑選需要欄位與整理欄位內的特殊符號
metadata_df = metadata[["asin","also_buy","also_view","rank","main_cat","similar_item","price"]]
metadata_df = metadata_df.replace(r'^\s*$', np.nan, regex = True)
#metadata_df = metadata_df.mask(metadata_df.applymap(str).eq('[]'))

In [131]:
#將rank分成排名與商品子集
metadata_df['rank'] = metadata_df['rank'].str.replace(',', '')
metadata_df['sub_cat'] = metadata_df['rank'].str.extract(r'(\d+) in (.*) \($')[1].str.replace('&amp;','&')
metadata_df['rank_num'] = pd.to_numeric(metadata_df['rank'].str.extract(r'(\d+) in (.*) \($')[0])
metadata_df = metadata_df.drop("rank",axis=1)

In [15]:
#統計商品子集有多少種及各種占比
rank_dict = {}
for i in metadata_df['sub_cat'].to_list():
  if rank_dict.get(i) != None:
    rank_dict[i] += 1
  else:
    rank_dict[i] = 1
for key in rank_dict.keys():
  print(key,rank_dict[key])

Beauty & Personal Care 32380
nan 440
Grocery & Gourmet Food 38
Health & Household 21
Sports & Outdoors 3
Toys & Games 3
Baby 2
Clothing Shoes & Jewelry 2
Home & Kitchen 1
Automotive 1
Tools & Home Improvement 1


In [16]:
metadata_df.head()

Unnamed: 0,asin,also_buy,also_view,main_cat,similar_item,price,sub_cat,rank_num
0,6546546450,,,All Beauty,,,Beauty & Personal Care,2938573.0
1,7178680776,"[B01E7LCSL6, B008X5RVME]",,All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",$44.99,Beauty & Personal Care,872854.0
2,7250468162,,"[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]",All Beauty,,$28.76,Beauty & Personal Care,956696.0
3,7367905066,[B0041PBXX8],,All Beauty,,,Beauty & Personal Care,1870258.0
4,7414204790,,"[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...",All Beauty,,$12.15,Beauty & Personal Care,67701.0


In [18]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [19]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [33]:
#計算商品總聲量與正面聲量
volume_sum = {ratings["asin"]:ratings["reviewerID"] for ratings in ratings_trainings.groupby("asin").count().reset_index()[["asin","reviewerID"]].to_dict('records')}
good_ratings = ratings_trainings[ratings_trainings["overall"] >= 4]
volume_good = {ratings["asin"]:ratings["reviewerID"] for ratings in good_ratings.groupby("asin").count().reset_index()[["asin","reviewerID"]].to_dict('records')}

In [90]:
#計算商品回購間隔時間
from datetime import timedelta
date_sort = ratings_trainings.sort_values(['DATE'],ascending=True).groupby(["asin", "reviewerID"]).head(11)
date_sort['dur'] = date_sort.groupby(['asin', 'reviewerID'])["DATE"].apply(lambda x: x.diff(1))
repurchase = date_sort[date_sort["dur"].notna()][date_sort[date_sort["dur"].notna()]["dur"] != "0 days"]
repurchase["dur"] = repurchase["dur"].astype("str").str.extract(r'(\d+) (.*$)')[0].astype(int)
purchase_dur = {d["asin"]:d["dur"] for d in repurchase.groupby("asin").agg({"dur":pd.Series.median}).reset_index()[["asin","dur"]].to_dict('records')}

In [96]:
#計算每個商品的回購人數與總購買人數
repurchase_num = {d["asin"]:len(set(d["reviewerID"])) for d in repurchase[['reviewerID', 'asin']].groupby('asin').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')}
purchase_sum = {d["asin"]:len(set(d["reviewerID"])) for d in ratings_trainings[['reviewerID', 'asin']].groupby('asin').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')}

In [132]:
#將上述產生的商品特徵整併進metadata裡
metadata_df["volume_sum"] = [volume_sum[v] if v in volume_sum.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["volume_good"] = [volume_good[v] if v in volume_good.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["purchase_dur"] = [purchase_dur[v] if v in purchase_dur.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["repurchase_num"] = [repurchase_num[v] if v in repurchase_num.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["purchase_sum"] = [purchase_sum[v] if v in purchase_sum.keys() else 0 for v in metadata_df["asin"].to_list()]
metadata_df["good_review_rate"] = [g/s if s>0 else 0 for g,s in list(zip(metadata_df["volume_good"].to_list(),metadata_df["volume_sum"].to_list()))]
metadata_df["repurchase_rate"] = [n/s if s>0 else 0 for n,s in list(zip(metadata_df["repurchase_num"].to_list(),metadata_df["purchase_sum"].to_list()))]

In [133]:
sorted_meta = metadata_df[["asin","good_review_rate","volume_sum","rank_num"]].sort_values(by=["good_review_rate","volume_sum","rank_num"], ascending=[False,False,True])
sorted_meta = sorted_meta.reset_index().drop("index",axis=1)
product_weight = {}
for i in range(0,32892):
  if product_weight.get(sorted_meta.iloc[i]["asin"]) == None:
    product_weight[sorted_meta.iloc[i]["asin"]] = i

In [161]:
metadata_df["also_buy_dict"] = metadata_df["also_buy"].apply(lambda x: {k:product_weight[k] if len(x) != 0 and product_weight.get(k)!=None else 40000 for k in x})
metadata_df["also_view_dict"] = metadata_df["also_view"].apply(lambda x: {k:product_weight[k] if len(x) != 0 and product_weight.get(k)!=None else 40000 for k in x})
metadata_df["also_buy_sort"] = metadata_df["also_buy_dict"].apply(lambda x : [k if v!=40000 else continue for k, v in sorted(x.items(), key=lambda item: item[1])])
metadata_df["also_view_sort"] = metadata_df["also_view_dict"].apply(lambda x : [k if v!=40000 else continue for k, v in sorted(x.items(), key=lambda item: item[1])])

SyntaxError: ignored

In [159]:
metadata_df["also_view_dict"].to_list()

[{},
 {},
 {'B014MHXXM8': 40000, 'B01B8BR0NO': 40000, 'B01B8BR0O8': 40000},
 {},
 {'0335336876': 40000,
  '3254895630': 40000,
  '4453451555': 40000,
  '8974525828': 40000,
  'B00027D6SE': 40000,
  'B000T5MGF8': 40000,
  'B000UDC21C': 40000,
  'B000YJ2SLG': 40000,
  'B0017SWIU4': 40000,
  'B001V9SXXU': 40000,
  'B003TRV2V2': 40000,
  'B005631726': 40000,
  'B0064FFP96': 40000,
  'B007VL1D9S': 40000,
  'B00CAZAU62': 40000,
  'B00CE731B2': 40000,
  'B00EH99VY6': 40000,
  'B00EH9A0RI': 40000,
  'B00F2RUKEW': 40000,
  'B00G0EJYFW': 40000,
  'B00JXOTBPY': 40000,
  'B00KM2P7K4': 40000,
  'B00L7RLWK2': 40000,
  'B00MV2MO8G': 40000,
  'B00NR1YQHM': 40000,
  'B00NR1YQK4': 40000,
  'B00OCJ5PUU': 40000,
  'B00VMYKCL0': 40000,
  'B00X6ZNWG0': 40000,
  'B010E1TWGW': 40000,
  'B013BYNHX8': 40000,
  'B013BYQHM6': 40000,
  'B017MOR60S': 40000,
  'B0186U9736': 40000,
  'B019XHPUM8': 40000,
  'B01A0S5326': 40000,
  'B01BPCAWK4': 40000,
  'B01ES349CY': 40000,
  'B01GR475CI': 40000,
  'B01I45XWQQ': 40000,

## 產生推薦

In [None]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    Your Code
    '''
    return recommendations


ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{}

## 結果評估

In [None]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.0