# Sample Code

## 基礎建設

In [None]:
import pandas as pd
import gzip, json
import numpy as np
import re 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from itertools import combinations


def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file whose lines are each a JSON document.

    Yields:
        The value returned by ``json.loads`` for each line, in file order.

    The file is opened in a ``with`` block so the handle is closed when the
    generator is exhausted, closed, or garbage-collected, rather than being
    left open.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Args:
        path: File location forwarded unchanged to ``parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per record, indexed 0, 1, 2, ...
        in the order the records were produced.
    """
    # Key each record by its position so from_dict(orient='index') turns
    # every record into one row.
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

## 載入資料

In [None]:
# Download the ratings CSV and the product-metadata archive with the standard
# library instead of `!wget`: the recorded output of this cell shows that wget
# is not available on the machine it was last run on. Files are saved under
# ../content, which is where the loading cell reads them from.
import os
import urllib.request

DATA_DIR = '../content'
DATA_URLS = (
    'http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv',
    'http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz',
)

os.makedirs(DATA_DIR, exist_ok=True)
for url in DATA_URLS:
    target = os.path.join(DATA_DIR, os.path.basename(url))
    if os.path.exists(target):
        continue  # already downloaded; re-running the cell does nothing
    # Download to a temporary name first so an interrupted transfer is never
    # mistaken for a complete file on the next run.
    partial = target + '.part'
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, target)

'wget' 不是內部或外部命令、可執行的程式或批次檔。
'wget' 不是內部或外部命令、可執行的程式或批次檔。


In [None]:
# Product metadata: every record of the gzip-compressed JSON file becomes one row.
metadata = getDF('../content/meta_All_Beauty.json.gz')

# Ratings: the CSV is read as having no header row, so column names are supplied here.
ratings = pd.read_csv(
    '../content/All_Beauty.csv',
    header=None,
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)

In [None]:
# Add a DATE column: unixReviewTime interpreted as seconds since the Unix epoch
# and converted to pandas datetimes.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

# EDA

In [None]:
# 重複資料
df = ratings[ratings.duplicated(keep = False)]
print(df.shape)
# 刪除重複資料
ratings = ratings.drop_duplicates()
# ratings.columns

(17994, 5)


In [None]:
# 可能會有同個 user 對同個 item 的重複評分資料
# The same user may have rated the same item more than once: count the rows
# for every (reviewerID, asin) pair and list the most repeated pairs first.
(
    ratings
    .groupby(['reviewerID', 'asin'], as_index=False)
    .agg(count=('reviewerID', 'count'))
    .sort_values('count', ascending=False)
)

Unnamed: 0,reviewerID,asin,count
38281,A1EGCED01USBA9,B00W259T7G,7
5400,A1210QJT54O8T0,B00W259T7G,5
358767,AYWLGJPC5O7AQ,B00W259T7G,4
274461,A3VBXQKRM7A4JR,B00W259T7G,4
176411,A2UEIN7SIPZFRP,B00W259T7G,4
...,...,...,...
120625,A29ET8324U5H68,B00AV2YL98,1
120624,A29ESW6I6L1C11,B0051S94NA,1
120623,A29ESUU8CY73TF,B01CIVDSCA,1
120622,A29ES620PVEXCV,B014FXGIYO,1


In [None]:
# 針對評價同一個使用者評價同一個產品，依據時間先後把重複的資料過濾掉，如果重複的話，只留下最新的一筆評分。
# When the same user rated the same product several times, keep only the most
# recent rating: sort newest-first, then take the first row of each
# (reviewerID, asin) pair.
# kind='stable' makes the result reproducible: if several ratings in a pair
# share the same unixReviewTime, the one that appears first in the current
# frame is kept, instead of whichever the default unstable sort happens to
# place first.
ratings = (
    ratings
    .sort_values('unixReviewTime', ascending=False, kind='stable')
    .groupby(['reviewerID', 'asin'])
    .head(1)
)

In [None]:
# 客戶購買次數
# Purchase count per customer: number of rows per reviewerID, summarised with
# describe() (count, mean, std, min, quartiles, max).
ratings['reviewerID'].value_counts().describe()

count    324038.000000
mean          1.115934
std           0.434325
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          24.000000
Name: reviewerID, dtype: float64

In [None]:
# Upper quantiles of the rows-per-reviewerID distribution, to see how few
# reviewers have more than one or two ratings.
ratings['reviewerID'].value_counts().quantile(q = [0.75,0.8,0.85,0.9,0.95,0.98, 0.99])

0.75    1.0
0.80    1.0
0.85    1.0
0.90    1.0
0.95    2.0
0.98    2.0
0.99    3.0
Name: reviewerID, dtype: float64

每位使用者的平均評價次數約為 1.12；第 75 與第 90 百分位數都是 1 次，第 99 百分位數才達到 3 次。
因此若排除評價次數少於 3 次的使用者，保留下來的使用者不到 5%（依上方分位數約為 1%–2%），訓練樣本會大幅減少。

In [None]:
# Preview the first five metadata rows to see which columns are available.
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
