# Sample Code

In [1]:
!apt-get -y install openjdk-8-jre-headless
!pip install pyspark

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  libnss-mdns fonts-dejavu-extra fonts-ipafont-gothic fonts-ipafont-mincho
  fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  openjdk-8-jre-headless
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 28.2 MB of archives.
After this operation, 104 MB of additional disk space will be used.
Ign:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre-headless amd64 8u292-b10-0ubuntu1~18.04
Err:1 http://security.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre-headless amd64 8u292-b10-0ubuntu1~18.04
  404  Not Found [IP: 91.189.88.142 80]
E: Failed to fetch http://security.ubuntu.com/ubuntu/pool/universe/o/openjdk-8/openjdk-8-jre-headless_8u292-b10-0ubuntu1~18.04_amd64.deb  404  Not Found [IP: 91.189.88.142 80]
E: Unable to fetch some archives, maybe run apt-get update o

In [2]:
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.functions import col, regexp_extract, regexp_replace, lit, when
import pyspark.sql.functions as func
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
from pyspark import StorageLevel

import sys
from datetime import datetime, timedelta
from functools import reduce
from itertools import chain

In [3]:
spark = SparkSession.builder \
    .appName("content-based") \
    .config("spark.sql.files.ignoreCorruptFiles", "true") \
    .config("spark.sql.session.timeZone", "Asia/Taipei") \
    .getOrCreate()

## 基礎建設

In [4]:
import pandas as pd
import gzip, json
import re

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [5]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-04 09:11:17--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-04 09:11:17 (73.6 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-04 09:11:17--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-04 09:11:18 (61.1 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [6]:
# schema_rating = StructType([
#                     StructField("asin", StringType())
#                       , StructField("reviewerID", StringType())
#                       , StructField("overall", StringType())
#                       , StructField("unixReviewTime", StringType())
#                       ])

metadata = spark.read.json("file:////content/meta_All_Beauty.json.gz")
# ratings = spark.read.format("csv").option("header", True).schema(schema_rating).load("file:///content/All_Beauty.csv")

# metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)
# metadata.head()
# ratings.head()

## 資料整理

In [7]:
metadata = metadata.select(
                col("asin")
                , col("title")
                , func.when(col("title") == '', 0).otherwise(1).alias("if_title")
                , col("feature")
                , func.when(func.size("feature") == 0, 0).otherwise(1).alias("if_feature")
                , col("description")
                , func.when(func.size("description") == 0, 0).otherwise(1).alias("if_description")
                , func.translate(col("price"), ',$', '').cast(DoubleType()).alias("price")
                , col("imageURL"), col("imageURLHighRes")
                , func.when(func.size("imageURL") == 0, 0).otherwise(1).alias("if_imageURL")
                , func.when(func.size("imageURLHighRes") == 0, 0).otherwise(1).alias("if_imageURLHighRes")
                , col("also_buy"), col("also_view")
                , func.translate(func.split(col("rank"), ' in ')[0], ',', '').cast(IntegerType()).alias("rank_sub_category")
                , func.translate(func.split(col("rank"), ' in ')[1], '()"];', '').alias("sub_category")
                , col("brand")
                , col("tech1")
                , func.when(func.length("tech1") == 0, 0).otherwise(1).alias("if_tech1")
                , col("similar_item"), col("date")
                , col("details.\n    Item Weight: \n    ").alias("item_weight")
                , col("details.\n    Product Dimensions: \n    ").alias("product_dimensions")
                , col("details.Batteries").alias("batteries")
                , func.when(col("details.Batteries").isNull(), 0).otherwise(1).alias("if_batteries")
                , col("details.Discontinued by manufacturer:").alias("discountedByManufacturer")
                , func.when(col("details.Discontinued by manufacturer:").isNull(), 0).otherwise(1).alias("if_discountedByManufacturer")
                , col("details.Domestic Shipping: ").alias("domestic_shipping")
                , col("details.International Shipping: ").alias("international_shipping")
                , col("details.Item model number:").alias("item_model_no")
                , col("details.Shipping Advisory:").alias("shipping_advisory")
                , col("details.Shipping Weight:").alias("shipping_weight")
                )

In [8]:
metadata.describe().show()

+-------+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+------------------+-----------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+------------------------+---------------------------+--------------------+----------------------+-------------+--------------------+---------------+
|summary|               asin|               title|            if_title|         if_feature|     if_description|             price|       if_imageURL|if_imageURLHighRes|rank_sub_category| sub_category|   brand|               tech1|            if_tech1|        similar_item|                date|item_weight|  product_dimensions|           batteries|        if_batteries|discountedByManufacturer|if_discountedByManufacturer|   domestic_shipping|international_shipping|item_model_no|   s

In [9]:
metadata = metadata.select(col("*"), when(col("sub_category") == 'Beauty & Personal Care ', 'Beauty & Personal Care')
                  .when(col("sub_category") == 'Beauty & Personal Care See Top 100', 'Beauty & Personal Care')
                  .when(col("sub_category") == 'Beauty & Personal Care See top 100', 'Beauty & Personal Care')
                  .when(col("sub_category") == 'Beauty &amp Personal Care ', 'Beauty & Personal Care')
                  .when(col("sub_category") == 'Beauty &amp Personal Care', 'Beauty & Personal Care')
                  .when(col("sub_category") == 'Grocery & Gourmet Food ', 'Grocery & Gourmet Food')
                  .when(col("sub_category") == 'Grocery & Gourmet Food See Top 100', 'Grocery & Gourmet Food')
                  .when(col("sub_category") == 'Grocery &amp Gourmet Food ', 'Grocery & Gourmet Food')
                  .when(col("sub_category") == 'Health & Household ', 'Health & Household')
                  .when(col("sub_category") == 'Health &amp Household ', 'Health & Household')
                  .when(col("sub_category").isNull(), 'no Category')
                  .otherwise(col("sub_category")).alias("subcategory")
                  , when(col("brand") == '', 'no Category')
                  .when(col("brand") == '*', 'no Category')
                  .when(col("brand") == '-', 'no Category')
                  .when(col("brand") == '....', 'no Category')
                  .when(col("brand") == '.......', 'no Category')
                  .when(col("brand") == '.........', 'no Category')
                  .when(col("brand").isNull(), 'no Category')
                  .otherwise(col("brand")).alias("brand_preprocessed")
                  , when((col("price") >= 0.01) & (col("price") < 8.99), 'first')
                  .when((col("price") >= 8.99) & (col("price") < 15.99), 'second')
                  .when((col("price") >= 15.99) & (col("price") < 29.99), 'avg')
                  .when((col("price") >= 29.99) & (col("price") < 200.00), 'fourth')
                  .when((col("price") >= 200.00) & (col("price") < 2150.0), 'luxury')
                  .otherwise('abnormal').alias("price_range"))

In [10]:
# if the info on below variables are not null, then sum as 1
# log10 rank by sub_category and multiply 100 as score
metadata_scored = metadata.withColumn('score_if', func.sum(col("if_title")+col("if_feature")+col("if_description")+col("if_imageURL") \
                                      +col("if_imageURLHighRes")+col("if_tech1")+col("if_batteries") \
                                      +col("if_discountedByManufacturer")).over(Window.partitionBy("asin"))) \
                          .withColumn('score_rank', (100 / func.log10(col("rank_sub_category"))).alias("score_rank")) \
                          .withColumn('product_score', col("score_if") + col("score_rank"))

In [11]:
metadata_scored.describe().show()

+-------+--------------------+--------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+-----------------+-------------+--------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+------------------------+---------------------------+--------------------+----------------------+-------------+--------------------+---------------+-----------+------------------+-----------+------------------+------------------+------------------+
|summary|                asin|               title|            if_title|         if_feature|     if_description|             price|        if_imageURL| if_imageURLHighRes|rank_sub_category| sub_category|   brand|               tech1|            if_tech1|        similar_item|                date|item_weight|  product_dimensions|           batteries|        if_batteries|discounte

In [12]:
# metadata.stat.corr("rank_sub_category", "score_if")
# metadata_scored.summary().show(10, False)
metadata_selected = metadata_scored.select("asin", "title", "description", "brand_preprocessed", "rank_sub_category", "subcategory", "price", "price_range", "score_rank", "score_if")
metadata_selected.summary().show(10, False)

+-------+--------------------+---------------------------------------------------------------------+------------------+-----------------+-----------+------------------+-----------+------------------+-----------------+
|summary|asin                |title                                                                |brand_preprocessed|rank_sub_category|subcategory|price             |price_range|score_rank        |score_if         |
+-------+--------------------+---------------------------------------------------------------------+------------------+-----------------+-----------+------------------+-----------+------------------+-----------------+
|count  |32892               |32892                                                                |32892             |32452            |32892      |11270             |32892      |32452             |32892            |
|mean   |8.525224324631579E9 |658469.21                                                            |Infinity          |1288189.8

In [13]:
metadata_pd = metadata_selected.toPandas()

In [14]:
metadata_pd['score_rank'].fillna(99, inplace=True)
metadata_pd['score_if'].fillna(1, inplace=True)

In [15]:
metadata_pd.head(2)

Unnamed: 0,asin,title,description,brand_preprocessed,rank_sub_category,subcategory,price,price_range,score_rank,score_if
0,6546546450,Loud 'N Clear&trade; Personal Sound Amplifier,[Loud 'N Clear Personal Sound Amplifier allows...,idea village,2938573.0,Beauty & Personal Care,,abnormal,15.460404,2
1,7178680776,No7 Lift &amp; Luminate Triple Action Serum 50...,[No7 Lift & Luminate Triple Action Serum 50ml ...,no Category,872854.0,Beauty & Personal Care,44.99,fourth,16.832349,2


In [16]:
# replace [] on description by ''
metadata_pd['description'] = metadata_pd['description'].apply(lambda x: ' '.join(x))
# combine description, title, brand and subcategory (from rank)
metadata_pd['unstructured'] = metadata_pd['title'] + ' ' + metadata_pd['brand_preprocessed'] + ' ' + metadata_pd['description'] + ' ' + metadata_pd['subcategory']
# # remove special characters besides numbers
# metadata_pd['unstructured'] = metadata_pd['unstructured'].apply(lambda x: re.sub(r"(?<!\d)[\W](?!\d)", ' ', str(x)))
# # to lowercase
# metadata_pd['unstructured'] = metadata_pd['unstructured'].str.lower()

In [17]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
# from nltk.corpus import words #text = words.words()

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    
    return text
    
metadata_pd['unstructured'] = metadata_pd['unstructured'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
metadata_pd.describe()

Unnamed: 0,rank_sub_category,price,score_rank,score_if
count,32452.0,11270.0,32892.0,32892.0
mean,1288190.0,28.100209,17.876667,2.554512
std,815171.4,54.107515,9.53666,1.255595
min,35.0,0.01,14.326696,1.0
25%,677521.8,8.99,16.023486,2.0
50%,1133461.0,15.99,16.531682,3.0
75%,1751104.0,29.99,17.184865,3.0
max,9549407.0,2149.0,99.0,12.0


In [19]:
metadata_pd.sample()

Unnamed: 0,asin,title,description,brand_preprocessed,rank_sub_category,subcategory,price,price_range,score_rank,score_if,unstructured
28589,B016BD42JY,Lookatool&reg; Newly Sauna Slimming Belt Waist...,,no Category,905714.0,Beauty & Personal Care,,abnormal,16.786998,3,lookatoolreg newly sauna slimming belt waist w...


In [20]:
metadata_pd = metadata_pd.drop(columns=['title', 'description', 'rank_sub_category', 'price'])
metadata_pd.head(5)

Unnamed: 0,asin,brand_preprocessed,subcategory,price_range,score_rank,score_if,unstructured
0,6546546450,idea village,Beauty & Personal Care,abnormal,15.460404,2,loud n cleartrade personal sound amplifier ide...
1,7178680776,no Category,Beauty & Personal Care,fourth,16.832349,2,no7 lift amp luminate triple action serum 50ml...
2,7250468162,No7,Beauty & Personal Care,avg,16.720244,2,no7 stay perfect foundation cool vanilla no7 n...
3,7367905066,no Category,Beauty & Personal Care,abnormal,15.944128,3,wella koleston perfect hair colour 44 44 mediu...
4,7414204790,Pirmal Healthcare,Beauty & Personal Care,second,20.701383,4,lacto calamine skin balance oil control 120 ml...


In [21]:
metadata_pd.info()
metadata_pd.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32892 entries, 0 to 32891
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   asin                32892 non-null  object 
 1   brand_preprocessed  32892 non-null  object 
 2   subcategory         32892 non-null  object 
 3   price_range         32892 non-null  object 
 4   score_rank          32892 non-null  float64
 5   score_if            32892 non-null  int64  
 6   unstructured        32892 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.8+ MB


asin                  32488
brand_preprocessed     7858
subcategory              11
price_range               6
score_rank            31878
score_if                  9
unstructured          32335
dtype: int64

In [22]:
categorical_cols = ['brand_preprocessed', 'subcategory', 'price_range', 'score_if'] #score_rank
metadata_one_hot = pd.get_dummies(metadata_pd, columns = categorical_cols)

In [23]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [24]:
# only extract past 3 months data for training because beauty product is seasonal
ratings_trainings = ratings[
    (ratings['DATE'] >= '2018-06-01') & (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [25]:
# product_popularity
rating_product_popularity = pd.DataFrame(ratings_trainings.asin.value_counts())
rating_product_popularity.reset_index(inplace=True)
rating_product_popularity.columns = ['asin', 'count']
# avg review
ratings_avg_overall = ratings_trainings.groupby('asin', as_index = False)['overall'].mean()
# join above rating summary
rating_asin = pd.merge(rating_product_popularity,ratings_avg_overall,on='asin',how='inner')

In [26]:
rating_asin.info()
rating_asin.nunique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2475 entries, 0 to 2474
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   asin     2475 non-null   object 
 1   count    2475 non-null   int64  
 2   overall  2475 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 77.3+ KB


asin       2475
count        46
overall     136
dtype: int64

In [27]:
ratings_metadata_one_hot = pd.merge(rating_asin,metadata_one_hot,on='asin',how='inner')

In [28]:
ratings_metadata = pd.merge(rating_asin,metadata_pd,on='asin',how='inner')

In [29]:
# rule-based variables
# ratings_metadata['brand_loyalty'] = ratings_metadata.groupby(["reviewerID", "brand_filled"])["asin"].transform('count')
# ratings_metadata['brand_loyalty'].fillna(1, inplace=True)
# ratings_metadata['subcategory_loyalty'] = ratings_metadata.groupby(["reviewerID", "subcategory"])["asin"].transform('count')
# ratings_metadata['subcategory_loyalty'].fillna(1, inplace=True)
# ratings_metadata['avg_brandLoyalty'] = ratings_metadata.groupby(["asin"])["brand_loyalty"].transform('mean')
# ratings_metadata.head(2)

In [30]:
ratings_metadata.head(2)

Unnamed: 0,asin,count,overall,brand_preprocessed,subcategory,price_range,score_rank,score_if,unstructured
0,B01DKQAXC0,294,3.860544,no Category,Beauty & Personal Care,abnormal,19.288956,3,bali secrets natural deodorant organic amp veg...
1,B00W259T7G,166,4.283133,Pre de Provence,Beauty & Personal Care,first,19.543907,5,pre de provence artisanal french soap bar enri...


In [31]:
ratings_metadata_one_hot = ratings_metadata_one_hot.drop(columns=['count', 'overall', 'score_rank', 'unstructured'])

In [32]:
ratings_metadata_one_hot.sample()

Unnamed: 0,asin,brand_preprocessed_#Flashmob,brand_preprocessed_#Healthy,brand_preprocessed_#R5,brand_preprocessed_(3 Pack) NYC Ultra Moist Lipwear - Violet Shine ...,brand_preprocessed_(Black Opal),brand_preprocessed_(L'Oreal Paris),brand_preprocessed_(Wonderfulbreast),brand_preprocessed_-417,brand_preprocessed_1 eye products,brand_preprocessed_1.4 oz Dab-On,brand_preprocessed_10 Stars,"brand_preprocessed_100% AUTHENTIC Mfg. only byTrue Tape, LLC.",brand_preprocessed_100% Pure,brand_preprocessed_111SKIN NAC Y2,brand_preprocessed_12 Beverage,brand_preprocessed_12 Capsules,brand_preprocessed_1820 (Omega Visage),brand_preprocessed_1907,brand_preprocessed_1979 Collection,brand_preprocessed_1stopsalon,brand_preprocessed_2 Pack Sexy Hair Soy Renewal Crme Oil 4.2 Ounce,brand_preprocessed_2013newestseller,brand_preprocessed_2014 Bayer HealthCare LLC,brand_preprocessed_21 Bridal Accessories,brand_preprocessed_21 drops,brand_preprocessed_21Trans-Dermal,brand_preprocessed_247Skins,brand_preprocessed_2N,brand_preprocessed_2W International,brand_preprocessed_3 Way Poncho,brand_preprocessed_3 Wetter Taft,brand_preprocessed_316steel,brand_preprocessed_32.0 oz Can,brand_preprocessed_350buy,brand_preprocessed_3Gen,brand_preprocessed_3M,brand_preprocessed_40 Carrots,brand_preprocessed_47krate,brand_preprocessed_4TopTime,...,brand_preprocessed_warmcos,brand_preprocessed_we,brand_preprocessed_wet 'n wild,brand_preprocessed_wet n wild,brand_preprocessed_willatram,brand_preprocessed_willy go wild,brand_preprocessed_wing,brand_preprocessed_yi wu zi ping wigs Co.LTD,brand_preprocessed_yiliusu,brand_preprocessed_younique,brand_preprocessed_youthwaters,brand_preprocessed_yoyomax,brand_preprocessed_zila,brand_preprocessed_zonman,subcategory_Automotive,subcategory_Baby,subcategory_Beauty & Personal Care,"subcategory_Clothing, Shoes & Jewelry",subcategory_Grocery & Gourmet Food,subcategory_Health & Household,subcategory_Home & Kitchen,subcategory_Sports & Outdoors,subcategory_Tools & Home Improvement,subcategory_Toys & Games,subcategory_no Category,price_range_abnormal,price_range_avg,price_range_first,price_range_fourth,price_range_luxury,price_range_second,score_if_1,score_if_2,score_if_3,score_if_4,score_if_5,score_if_6,score_if_8,score_if_10,score_if_12
1370,B00H4HCHRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


**EDA**

In [33]:
# from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer

Token Frequency Distribution Removing Stop Words

In [34]:
def unusual_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    df0 = pd.DataFrame(words_freq, columns = ['unstructured' , 'count'])
    text_vocab = set(w.lower() for w in df0['unstructured'] if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    unusual = sorted(unusual)
    # words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    # words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return unusual[:n]

In [35]:
# find out unusual words
from nltk.corpus import words
nltk.download('words')

vec = CountVectorizer().fit(ratings_metadata['unstructured'])
bag_of_words = vec.transform(ratings_metadata['unstructured'])
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
df0 = pd.DataFrame(words_freq, columns = ['unstructured' , 'count'])
text_vocab = set(w.lower() for w in df0['unstructured'] if w.isalpha())
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
unusual = text_vocab - english_vocab
unusual = sorted(unusual)
print(unusual)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [36]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(ratings_metadata['unstructured'], 20)
df1 = pd.DataFrame(common_words, columns = ['unstructured' , 'count'])
df1.groupby('unstructured').sum()['count'].sort_values() #.iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 20 words in product description before removing stop words')


unstructured
face         271
size         278
black        301
amp          319
nail         338
cream        339
pack         384
color        412
body         438
natural      454
use          463
oil          513
oz           577
category     816
hair         955
br          1198
skin        1227
personal    2488
beauty      2606
care        2669
Name: count, dtype: int64

In [37]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(ratings_metadata['unstructured'], 20)
df2 = pd.DataFrame(common_words, columns = ['unstructured' , 'count'])
df2.groupby('unstructured').sum()['count'].sort_values()

unstructured
face         271
size         278
black        301
amp          319
nail         338
cream        339
pack         384
color        412
body         438
natural      454
use          463
oil          513
oz           577
category     816
hair         955
br          1198
skin        1227
personal    2488
beauty      2606
care        2669
Name: count, dtype: int64

Bigrams Frequency Distribution Removing Stop Words

In [38]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(ratings_metadata['unstructured'], 20)
df3 = pd.DataFrame(common_words, columns = ['unstructured' , 'count'])
df3.groupby('unstructured').sum()['count'].sort_values(ascending=False)

unstructured
personal care      2464
beauty personal    2458
category beauty     594
br br               189
fl oz               117
shea butter          81
oz pack              76
nail art             74
high quality         64
pack category        61
skin types           57
nail polish          51
br beauty            50
skin care            49
aloe vera            47
brand new            43
top coat             42
tea tree             42
dry skin             41
made usa             41
Name: count, dtype: int64

In [39]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(ratings_metadata['unstructured'], 20)
df4 = pd.DataFrame(common_words, columns = ['unstructured' , 'count'])
df4.groupby('unstructured').sum()['count'].sort_values(ascending=False)

unstructured
personal care      2464
beauty personal    2458
category beauty     594
br br               190
fl oz               117
shea butter          81
oz pack              76
nail art             74
high quality         64
pack category        61
skin types           57
nail polish          51
br beauty            50
skin care            49
aloe vera            47
brand new            43
tea tree             42
dry skin             41
essential oil        40
essential oils       39
Name: count, dtype: int64

Trigrams Frequency Distribution Removing Stop Words

In [40]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(ratings_metadata['unstructured'], 20)
df5 = pd.DataFrame(common_words, columns = ['unstructured' , 'count'])
df5.groupby('unstructured').sum()['count'].sort_values(ascending=False)

unstructured
beauty personal care                    2458
category beauty personal                 594
br beauty personal                        49
oz beauty personal                        35
pack category beauty                      32
pack beauty personal                      30
skin beauty personal                      26
mmediaamazoncom images aplusmedia         24
images aplusmedia mg                      24
packed safely bubble                      23
retail box packed                         23
box packed safely                         23
usa beauty personal                       23
oz pack category                          22
imagesnasslimagesamazoncom images 01      22
tea tree oil                              21
oz category beauty                        19
package beauty personal                   18
butter cocoa butter                       18
brno retail box                           18
Name: count, dtype: int64

In [41]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(ratings_metadata['unstructured'], 20)
df6 = pd.DataFrame(common_words, columns = ['unstructured' , 'count'])
df6.groupby('unstructured').sum()['count'].sort_values(ascending=False)

unstructured
beauty personal care                    2458
category beauty personal                 594
br beauty personal                        49
oz beauty personal                        35
pack category beauty                      32
pack beauty personal                      30
skin beauty personal                      26
usa beauty personal                       26
mmediaamazoncom images aplusmedia         24
images aplusmedia mg                      24
packed safely bubble                      23
retail box packed                         23
box packed safely                         23
oz pack category                          22
imagesnasslimagesamazoncom images 01      22
tea tree oil                              21
oz category beauty                        19
package beauty personal                   18
brno retail box                           18
br br br                                  18
Name: count, dtype: int64

## 產生推薦

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 計算商品用標題所表示的 tfidf 矩陣
# ngram_range=(1, 3) -> ngram_range=(1, 1)
df = ratings_metadata.drop_duplicates('unstructured')
for unusual_words in unusual:
  df['unstructured'] = df['unstructured'].replace(unusual_words, '')
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['unstructured'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [52]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['unstructured'])
vectorizer.get_feature_names_out()

array(['00', '000', '00020', ..., 'zwzcyz', 'zzzrt', 'zzzsafter'],
      dtype=object)

In [48]:
# 計算商品間的相似程度
# cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(df.index,index = df['unstructured'])

In [49]:
# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

In [50]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    ratings_trainings = training_data
    for user in users:

        # content based
        content_list = recommend_items(ratings_metadata[ratings_metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['asin'].tolist(), k)
        if content_list:
            recommendations[user] = content_list
        else:
            # rule based
            recommendations[user] = ratings_metadata.asin[:k].tolist()

    # recommendations = {user: recommend_items(ratings_metadata[ratings_metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['unstructured'].tolist(), k) for user in users}
    
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A103T1QOGFCSEH': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A106UKKSJ2KXPF': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A10A7GV4D5A11V': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A1119JJ37ZLB8R': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B006IB5T4W',
  'B00JVU3K9I',
  'B01CJNZKZK',
  'B0168SXRR0'],
 'A113UOOLBSZN52': ['B01DKQAXC0',
  'B00W259T7G',
  'B012Z7IHHI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01A

## 結果評估

In [51]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user) 

# random 0.003389830508474576
# rule-based 0.013559322033898305
# rule based + content based 0.098
# rule based + content (只抽取一個單字) 0.13389830508474576
# 觀察到title vs title + description+ ...的結果一樣

0.13389830508474576

In [45]:
# reference https://towardsdatascience.com/building-a-content-based-recommender-system-for-hotels-in-seattle-d724f0a32070

In [46]:
def recommender_rule(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    ruled-based
    '''
    ratings_trainings = training_data
    product_df = ratings_trainings[['asin', 'product_score', 'product_popularity', 'avg_review']].drop_duplicates()
    avg_brandLoyalty_df = ratings_trainings[['asin', 'avg_brandLoyalty']].drop_duplicates()

    for user in users:
      # choose top 2 brands for user
      brand_list = []
      if len(ratings_trainings[ratings_trainings['reviewerID'] == user]["brand_loyalty"]) > 0:
        index = ratings_trainings[ratings_trainings['reviewerID'] == user]["brand_loyalty"].nlargest(2).index
      else:
        index = avg_brandLoyalty_df["avg_brandLoyalty"].nlargest(5).index

      for id in index:
        brand_list.append(ratings_trainings['brand_filled'][id])
      brand_list = list(dict.fromkeys(brand_list))

      product_list = []
      for brand in brand_list:
        prod_list = ratings_trainings[ratings_trainings['brand_filled'] == brand]['asin'].tolist()
        prod_list = list(dict.fromkeys(prod_list))
        product_list.append(prod_list)
      product_list = list(chain(*product_list))

      # product_score / avg_review / product_popularity
      top25_by_productScore = product_df[product_df['asin'].isin(product_list)].nlargest(25, "product_score")
      top15_by_avgReview = top25_by_productScore.nlargest(15, "avg_review")
      if len(top15_by_avgReview.asin) >= 10:
        # bottom = top15_by_avgReview.nsmallest(5, "product_popularity")['asin'].tolist()
        top = top15_by_avgReview.nlargest(10, "product_popularity")['asin'].tolist()
        recommendation = top
      else:
        sample = len(top15_by_avgReview.asin)
        random_sample = 10 - len(top15_by_avgReview.asin)
        bottom = top15_by_avgReview.nsmallest(sample, "product_popularity")['asin'].tolist()
        top = product_df['asin'].sample(n=random_sample).tolist()
        recommendation = bottom + top
      recommendations[user] = recommendation

    return recommendations
