# Sample Code

In [1]:
# Environment setup: install a headless Java 8 runtime (presumably the JVM that
# PySpark runs on) and the PySpark package itself.
!apt-get -y install openjdk-8-jre-headless
!pip install pyspark

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openjdk-8-jre-headless is already the newest version (8u312-b07-0ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [2]:
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.functions import col, regexp_extract, regexp_replace, lit, when
import pyspark.sql.functions as func
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
from pyspark import StorageLevel

import sys
from datetime import datetime, timedelta
from functools import reduce
from itertools import chain

In [3]:
# Build (or reuse) the Spark session shared by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("cf")
    # Session option: ignore corrupt input files while reading.
    .config("spark.sql.files.ignoreCorruptFiles", "true")
    # Session time zone used for timestamp handling.
    .config("spark.sql.session.timeZone", "Asia/Taipei")
    .getOrCreate()
)

## 基礎建設

In [4]:
import pandas as pd
import gzip, json
import re

def parse(path):
    """Lazily yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file holding one JSON document per line.

    Yields:
        The result of ``json.loads`` for each line, in file order.
    """
    # The context manager closes the file handle when the generator is
    # exhausted or closed early; previously the handle was never closed.
    with gzip.open(path, 'rb') as lines:
        for line in lines:
            yield json.loads(line)

def getDF(path):
    """Load a gzip-compressed JSON-lines file into a pandas DataFrame.

    Every record yielded by ``parse(path)`` becomes one row, indexed by its
    zero-based position in the file.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

## 載入資料

In [5]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-10 09:04:11--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-10 09:04:12 (17.7 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-10 09:04:13--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-10 09:04:13 (14.2 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [6]:
# Product metadata: read with Spark's JSON reader (no explicit schema is supplied).
metadata = spark.read.json("file:////content/meta_All_Beauty.json.gz")

# Ratings: a header-less CSV read with pandas; the column names are supplied here.
# Alternatives that were tried earlier and are no longer used: a Spark CSV reader
# with an explicit all-string schema for the ratings, and the pandas-based getDF()
# loader for the metadata.
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

In [7]:
# Flatten the raw metadata into the columns used downstream:
#   * if_<field> flags are 0 when the field is empty ('' / size 0 / length 0, or null for
#     the `details` keys) and 1 otherwise;
#   * price has ',' and '$' removed and is cast to double;
#   * the free-text rank is split on ' in ' into a numeric rank and a category label;
#   * selected keys of the nested `details` struct become top-level columns.
# NOTE(review): the title/feature/description/image/tech1 flags only test for empty
# values; how a missing (null) value is flagged depends on Spark's null semantics for
# `==`, `size` and `length` — confirm nulls are not being counted as "present".
# NOTE(review): this cell rebinds `metadata`, so re-running it requires re-running the
# loading cell first.
metadata = metadata.select(
                col("asin")
                , col("title")
                , func.when(col("title") == '', 0).otherwise(1).alias("if_title")
                , col("feature")
                , func.when(func.size("feature") == 0, 0).otherwise(1).alias("if_feature")
                , col("description")
                , func.when(func.size("description") == 0, 0).otherwise(1).alias("if_description")
                # Remove thousands separators and the currency sign before the numeric cast.
                , func.translate(col("price"), ',$', '').cast(DoubleType()).alias("price")
                , col("imageURL"), col("imageURLHighRes")
                , func.when(func.size("imageURL") == 0, 0).otherwise(1).alias("if_imageURL")
                , func.when(func.size("imageURLHighRes") == 0, 0).otherwise(1).alias("if_imageURLHighRes")
                , col("also_buy"), col("also_view")
                # Text before ' in ' is the rank number (commas removed, cast to int) ...
                , func.translate(func.split(col("rank"), ' in ')[0], ',', '').cast(IntegerType()).alias("rank_sub_category")
                # ... and the text after it is the category label, with the characters ()"]; removed.
                , func.translate(func.split(col("rank"), ' in ')[1], '()"];', '').alias("sub_category")
                , col("brand")
                , col("tech1")
                , func.when(func.length("tech1") == 0, 0).otherwise(1).alias("if_tech1")
                , col("similar_item"), col("date")
                # These two `details` keys contain literal newlines and indentation
                # (presumably exactly as they occur in the source data).
                , col("details.\n    Item Weight: \n    ").alias("item_weight")
                , col("details.\n    Product Dimensions: \n    ").alias("product_dimensions")
                , col("details.Batteries").alias("batteries")
                , func.when(col("details.Batteries").isNull(), 0).otherwise(1).alias("if_batteries")
                , col("details.Discontinued by manufacturer:").alias("discountedByManufacturer")
                , func.when(col("details.Discontinued by manufacturer:").isNull(), 0).otherwise(1).alias("if_discountedByManufacturer")
                , col("details.Domestic Shipping: ").alias("domestic_shipping")
                , col("details.International Shipping: ").alias("international_shipping")
                , col("details.Item model number:").alias("item_model_no")
                , col("details.Shipping Advisory:").alias("shipping_advisory")
                , col("details.Shipping Weight:").alias("shipping_weight")
                )

In [8]:
# Print summary statistics of the flattened metadata as a sanity check.
metadata.describe().show()

+-------+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+------------------+-----------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+------------------------+---------------------------+--------------------+----------------------+-------------+--------------------+---------------+
|summary|               asin|               title|            if_title|         if_feature|     if_description|             price|       if_imageURL|if_imageURLHighRes|rank_sub_category| sub_category|   brand|               tech1|            if_tech1|        similar_item|                date|item_weight|  product_dimensions|           batteries|        if_batteries|discountedByManufacturer|if_discountedByManufacturer|   domestic_shipping|international_shipping|item_model_no|   s

In [9]:
# Append three normalised columns to every existing column:
#   subcategory        - sub_category with known spelling variants merged; null -> 'no Category'
#   brand_preprocessed - brand with blank / placeholder values and null -> 'no Category'
#   price_range        - price bucketed into half-open [low, high) bands; anything else -> 'abnormal'
metadata = metadata.select(
    col("*"),
    when(
        col("sub_category").isin(
            'Beauty & Personal Care ',
            'Beauty & Personal Care See Top 100',
            'Beauty & Personal Care See top 100',
            'Beauty &amp Personal Care ',
            'Beauty &amp Personal Care',
        ),
        'Beauty & Personal Care',
    )
    .when(
        col("sub_category").isin(
            'Grocery & Gourmet Food ',
            'Grocery & Gourmet Food See Top 100',
            'Grocery &amp Gourmet Food ',
        ),
        'Grocery & Gourmet Food',
    )
    .when(
        col("sub_category").isin(
            'Health & Household ',
            'Health &amp Household ',
        ),
        'Health & Household',
    )
    .when(col("sub_category").isNull(), 'no Category')
    .otherwise(col("sub_category"))
    .alias("subcategory"),
    # Placeholder brand strings seen in the data are treated like a missing brand.
    when(
        col("brand").isin('', '*', '-', '....', '.......', '.........'),
        'no Category',
    )
    .when(col("brand").isNull(), 'no Category')
    .otherwise(col("brand"))
    .alias("brand_preprocessed"),
    # A null price, or one outside [0.01, 2150.0), falls through to 'abnormal'.
    when((col("price") >= 0.01) & (col("price") < 8.99), 'first')
    .when((col("price") >= 8.99) & (col("price") < 15.99), 'second')
    .when((col("price") >= 15.99) & (col("price") < 29.99), 'avg')
    .when((col("price") >= 29.99) & (col("price") < 200.00), 'fourth')
    .when((col("price") >= 200.00) & (col("price") < 2150.0), 'luxury')
    .otherwise('abnormal')
    .alias("price_range"),
)

In [10]:
# score_if:      sum of the eight if_* presence flags, accumulated over every metadata
#                row that shares the same asin (window partitioned by asin).
# score_rank:    100 / log10(rank_sub_category).
# product_score: score_if + score_rank.
metadata_scored = (
    metadata
    .withColumn(
        'score_if',
        func.sum(
            reduce(
                lambda running_total, flag: running_total + flag,
                [
                    col(flag_name)
                    for flag_name in (
                        "if_title",
                        "if_feature",
                        "if_description",
                        "if_imageURL",
                        "if_imageURLHighRes",
                        "if_tech1",
                        "if_batteries",
                        "if_discountedByManufacturer",
                    )
                ],
            )
        ).over(Window.partitionBy("asin")),
    )
    .withColumn('score_rank', 100 / func.log10(col("rank_sub_category")))
    .withColumn('product_score', col("score_if") + col("score_rank"))
)

In [11]:
# Print summary statistics of the scored metadata, including the new score columns.
metadata_scored.describe().show()

+-------+--------------------+--------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+-----------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+------------------------+---------------------------+--------------------+----------------------+-------------+--------------------+---------------+-----------+------------------+-----------+------------------+------------------+------------------+
|summary|                asin|               title|            if_title|         if_feature|     if_description|             price|        if_imageURL| if_imageURLHighRes|rank_sub_category| sub_category|   brand|               tech1|            if_tech1|        similar_item|                date|item_weight|  product_dimensions|           batteries|        if_batteries|discounte

In [12]:
# Keep only the columns carried forward from the scored metadata, then print their
# summary statistics (up to 10 rows, without truncating long values).
metadata_selected = metadata_scored.select("asin", "title", "description", "brand_preprocessed", "rank_sub_category", "subcategory", "price", "price_range", "score_rank", "score_if")
metadata_selected.summary().show(10, False)

+-------+--------------------+---------------------------------------------------------------------+------------------+-----------------+-----------+------------------+-----------+------------------+-----------------+
|summary|asin                |title                                                                |brand_preprocessed|rank_sub_category|subcategory|price             |price_range|score_rank        |score_if         |
+-------+--------------------+---------------------------------------------------------------------+------------------+-----------------+-----------+------------------+-----------+------------------+-----------------+
|count  |32892               |32892                                                                |32892             |32452            |32892      |11270             |32892      |32452             |32892            |
|mean   |8.525224324631579E9 |658469.21                                                            |Infinity          |1288189.8

In [13]:
# Convert the selected Spark columns into a pandas DataFrame.
metadata_pd = metadata_selected.toPandas()

In [14]:
# Default the scores of products that have none: score_rank -> 99, score_if -> 1.
# One frame-level fillna replaces the chained `df[col].fillna(..., inplace=True)`
# calls, which act on an intermediate Series and are not guaranteed to write back
# into the DataFrame (pandas warns about this pattern; under copy-on-write it
# leaves the frame unchanged).
metadata_pd = metadata_pd.fillna({'score_rank': 99, 'score_if': 1})

In [15]:
# Convert the epoch-seconds review time into a datetime column used for date filtering.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [16]:
# Train only on the three months before the test month (author's rationale: beauty
# products are seasonal) and test on September 2018.
# Both windows are half-open [start, end). The previous test bound `<= '2018-09-30'`
# compared against midnight of 30 September and would have dropped any review
# stamped later that day.
ratings_trainings = ratings[
    (ratings['DATE'] >= '2018-06-01') & (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & (ratings['DATE'] < '2018-10-01')
]

# Ground truth for evaluation: reviewerID -> list of asins reviewed in the test window.
ratings_testings_by_user = (
    ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
)
# Users to recommend for: everyone who appears in the test window.
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [19]:
# Installing `surprise` also pulls in `scikit-surprise` (both appear in the install log below).
!pip install surprise
import time
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

def recommender(training_data, users=[], k=10, user_based=False, algo=KNNBasic, min_k=1):
    """Recommend up to ``k`` unseen items per user from item neighbours.

    A Surprise neighbourhood model with cosine similarity is fitted on the most
    recent rating of every (reviewerID, asin) pair. For each requested user the
    neighbours of the items they rated are collected, skipping items they
    already rated and duplicates, until ``k`` items have been gathered.

    Args:
        training_data: pandas DataFrame with at least the columns
            ``reviewerID``, ``asin``, ``overall`` and ``DATE``.
        users: iterable of reviewerIDs to produce recommendations for.
        k: number of neighbours requested per rated item, and the maximum
            number of recommendations returned per user.
        user_based: forwarded to Surprise as ``sim_options['user_based']``.
            NOTE(review): the neighbour lookup below always passes *item*
            inner ids, so only ``user_based=False`` looks meaningful here;
            confirm before calling with ``True``.
        algo: Surprise algorithm class that accepts ``sim_options`` and
            ``min_k`` and provides ``get_neighbors`` (e.g. ``KNNBasic``).
        min_k: forwarded to the algorithm constructor. It used to be ignored:
            a hard-coded ``'min_k': 3`` was placed in ``sim_options``, which
            is not a similarity option in Surprise.

    Returns:
        dict mapping every requested user to a list of at most ``k`` asins.
        Users with no rating in ``training_data`` map to an empty list.
    """
    # Keep only the latest rating per (user, item) and the three columns that
    # Surprise expects, in user / item / rating order.
    latest_ratings = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )[['reviewerID', 'asin', 'overall']]

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(latest_ratings, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based,  # False: compute similarities between items
    }
    algo_impl = algo(min_k=min_k, sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    # Build the user -> rated items lookup once instead of filtering the whole
    # DataFrame for every user. Lists keep the most-recent-first row order, so
    # the result no longer depends on the arbitrary iteration order of a set.
    items_by_user = (
        latest_ratings.groupby('reviewerID')['asin'].agg(list).to_dict()
    )

    recommendation = {}
    for user in users:
        rated_in_order = items_by_user.get(user, [])
        rated = set(rated_in_order)
        picks = []
        already_picked = set()
        for item in rated_in_order:
            iid = trainset.to_inner_iid(item)
            for neighbor_iid in algo_impl.get_neighbors(iid, k):
                neighbor_raw_id = trainset.to_raw_iid(neighbor_iid)
                if neighbor_raw_id not in rated and neighbor_raw_id not in already_picked:
                    picks.append(neighbor_raw_id)
                    already_picked.add(neighbor_raw_id)

            # Stop visiting further rated items once enough candidates exist.
            if len(picks) >= k:
                break
        recommendation[user] = picks[:k]

    return recommendation

# Collaborative-filtering recommendations for every user in the test window.
ratings_by_user = recommender(ratings_trainings, users)
# Display the resulting user -> recommended asins mapping.
ratings_by_user

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 18.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619409 sha256=f2d60d7f4632818b2f12792e6f7bdef3a6bdd5e2144fe27782b4f3e8dc9d6c68
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': [],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': [],
 'A174YOBOSW9WDN': [],
 'A1786SKRAJXH86': [],
 'A17K2BUZ20WD2': [],
 'A17LYRFV645L0V': [],
 'A18LNGVXDZBTUR': [],
 'A19503XX7GU6J2': [],
 'A19HVHRZDYFEOP': [],
 'A19JM38B861BO

In [20]:
def recommender_rule(training_data, users=[], k=10):
    """Rule-based baseline: recommend the ``k`` most-rated products to every user.

    Args:
        training_data: DataFrame with the training ratings (data before
            2018-09-01). Only its ``asin`` column is used.
        users: users who need recommendations.
        k: number of products to recommend to each user.

    Returns:
        dict of recommendations, shaped like
        ``{user_1: [item_1, item_2, ...], user_2: [...], ...}``.
        Every user receives the same items, ordered from most to least rated.
    """
    # Popularity does not depend on the user, so rank the products once
    # instead of rebuilding the table on every loop iteration.
    most_rated = training_data['asin'].value_counts().index[:k].tolist()

    # Give each user an independent list so that editing one user's
    # recommendations cannot affect another's.
    return {user: list(most_rated) for user in users}

# Popularity-based recommendations for every user in the test window.
ratings_by_user = recommender_rule(ratings_trainings, users)
# Display the resulting user -> recommended asins mapping.
ratings_by_user

{'A100XQFWKQ30O2': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A103T1QOGFCSEH': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A106UKKSJ2KXPF': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A10A7GV4D5A11V': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A1119JJ37ZLB8R': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A113UOOLBSZN52': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01A

## 結果評估

In [21]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    """Score recommendations as hits divided by the number of test ratings.

    Args:
        ratings_testings_by_user: dict mapping user -> list of products that
            were actually purchased (data from 2018-09-01 onward).
        ratings_by_user: dict mapping user -> list of recommended products
            learned from the training data.
        method: str; currently unused.

    Returns:
        float: the number of distinct recommended products that appear in the
        user's test list, summed over users, divided by the total number of
        entries in the test lists. Returns 0.0 when there are no test entries.
    """
    hits = 0
    n_test_ratings = 0
    for user, actual_items in ratings_testings_by_user.items():
        # Count the denominator from the argument itself; it used to be read
        # from the notebook-level `ratings_testings` DataFrame, which made the
        # result depend on hidden global state.
        n_test_ratings += len(actual_items)
        if user in ratings_by_user:
            hits += len(set(ratings_by_user[user]) & set(actual_items))

    # Avoid ZeroDivisionError on an empty test set.
    if n_test_ratings == 0:
        return 0.0
    return hits / n_test_ratings

# Generate recommendations with both approaches for the same set of users.
ratings_cf_surprise = recommender(ratings_trainings, users)
rating_rule = recommender_rule(ratings_trainings, users)

# Score each set of recommendations against the test-window data.
score_cf_surprise = evaluate(ratings_testings_by_user, ratings_cf_surprise)
score_rule = evaluate(ratings_testings_by_user, rating_rule)

print(f'score_cf_surprise: \n{round(score_cf_surprise, 4)}')
print(f'score_rule: \n{round(score_rule, 4)}')

# Scores noted from earlier experiments, kept for comparison:
# random 0.003389830508474576
# rule-based 0.013559322033898305
# rule based + content based 0.098
# rule based + content (extracting only a single word) 0.13389830508474576
# score_cf_item: 0.0
# score_cf_user: 0.0
# score_rule: 0.1339

Computing the cosine similarity matrix...
Done computing similarity matrix.
score_cf_surprise: 
0.0
score_rule: 
0.1339
