<a href="https://colab.research.google.com/github/phwangktw/data-course-sample/blob/main/Session2_Content_based_Recommendation_Algorithm_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Session2: Content-based Recommendation Algorithm

## Step1. Load data

In [1]:
import pandas as pd
import numpy as np
import gzip, json
from os.path import exists
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime
sns.set_style("whitegrid")

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(nltk.corpus.stopwords.words('english'))
def content_filter(text):
    # stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if (w.lower() not in stop_words) & (w.isalnum()) ]
    return content

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Step2. Download data

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)



--2022-01-02 06:39:57--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.3’


2022-01-02 06:39:58 (22.2 MB/s) - ‘All_Beauty.csv.3’ saved [15499476/15499476]

--2022-01-02 06:39:58--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.3’


2022-01-02 06:39:59 (18.7 MB/s) - ‘meta_All_Beauty.json.gz.3’ saved [10329961/10329961]



## Step3. Parsing data

### Step3-1: Convert time format

In [3]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

### Step3-2: Data preprocessing


*   Dropout the duplicated rows
*   Fill the blanks with `nan`
*   Parsing the `description` column for generating `rank_num` and `rank_category`
*   Regex expression for searching specific key words

In [4]:
##Cleaning data (cited from: https://github.com/yuchiahung/data-course-sample/blob/main/hw1_Ana.ipynb)
##Peaking data firstly
metadata_clean = metadata.loc[metadata.astype(str).drop_duplicates().index]
metadata_clean.replace('', np.nan, inplace = True)


# clean column `rank` -> Parsing out to RankNum + RankCategory
metadata_clean['rank'] = metadata_clean['rank'].str.replace('&amp;', '&')
metadata_clean['rank'].fillna('0', inplace = True)
metadata_clean['rank_category'] = [re.search('in (.*) \(', r).group(1) if r != '0' else None for r in metadata_clean['rank']]
metadata_clean['rank_num'] = [re.search('(.*) in .*', r).group(1) if r != '0' else None for r in metadata_clean['rank']]
metadata_clean['rank_num'] = metadata_clean['rank_num'].str.replace(',', '').astype(float)

# excluding category != 'Beauty & Personal Care'
metadata_clean = metadata_clean[metadata_clean.rank_category == 'Beauty & Personal Care']

# convert `price` to float
metadata_clean['price'].fillna('0', inplace = True)
metadata_clean['price'] = [re.search('\$(.*)', p).group(1) if re.search('\$(.*)', p) != None else None for p in metadata_clean['price']]
metadata_clean['price'] = metadata_clean['price'].str.replace(',', '').astype(float)

# drop useless columns
metadata_clean.drop(
    ['category', 'tech1', 'fit', 'tech2', 'date', 'similar_item', 'feature', 'main_cat', 'rank'], 
    axis = 1, 
    inplace = True
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Find out top X frequency of the words

In [5]:
metadata_titleOnly = metadata_clean[['asin','title']].reset_index().drop(['index'], axis=1)
metadata_titleOnly['title'] = metadata_titleOnly['title'].str.replace('&amp;', '')
wholeContent = ''
for i, a in enumerate(metadata_titleOnly.title):
    wholeContent += a + ' '

##TODO: filter out the numerical words (Ex. 8, 24...)
tokens = nltk.word_tokenize(wholeContent)
contentToken_filtered = content_filter(tokens)
rawBgs = nltk.ngrams(contentToken_filtered,1)
fdist = nltk.FreqDist(rawBgs)

keywordlist ={}
for k, v in fdist.items():                  
    keywordlist[k[0]] = v

#Check only
CountsDF = pd.DataFrame.from_dict(keywordlist, orient='index',columns=['Counts'])
#Control size of the keywords
filterList = CountsDF.reset_index().rename(columns={'index': 'word'}).sort_values('Counts',ascending=False).head(20000).word.tolist()
CountsDF.describe()
thrhold = CountsDF.Counts.quantile(0.9)

## 123

In [6]:
#Text Parsing
#Step1: Parsing title to KeyWords ONLY
keyCounts = 0
metadata_titleOnly['titleNew'] = ""
for i, a in enumerate(metadata_titleOnly.title):
    a_Raw = a.split()
    resultwords  = [word for word in a_Raw if word.lower() in filterList]
    result = ' '.join(resultwords)    
    if (len(result) != 0):
        metadata_titleOnly.loc[i, 'titleNew'] = result
        keyCounts += 1

keyWordCoverage = keyCounts/metadata_titleOnly.shape[0]
print(f'Key words coverage: {round(keyWordCoverage, 4)}')

Key words coverage: 0.9828


In [7]:
# 計算商品用標題所表示的 tfidf 矩陣
#df_test = metadata_titleOnly.drop_duplicates('titleNew')
#If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
#Only applies if analyzer == 'word'.
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(metadata_titleOnly['titleNew'])

# 計算商品間的相似程度
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(metadata_titleOnly.index,index = metadata_titleOnly['asin'])

# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (metadata_titleOnly['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦 (KNN average the purchase history)
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

### Step3-3: Split time frame for testing and validation purpose


*   Visualize the distribution of sales on the time axis

In [8]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [9]:
def recommender_content1(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
       
    for user in users:
        content_recom = []
        rule_recom = []
        k1 = 5
        ### UserID convert to purchase history
        ## Content-based 
        ### Ensure the user has existing purchase (comment) history
        existHistory = metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['asin'].tolist()
        if len(existHistory) > 0:
            content_recom = recommend_items(existHistory,k1)
        
        # Popular products (recommend `k_left` products)
        k_left = k - len(content_recom)
        ## Best seller (by rating data) & highest rating products (recommend `k` product)
        products_rating = training_data[training_data.DATE >= '2017-09-01'].groupby('asin')[['overall']].agg(['mean', 'count'])
        products_rating.columns = products_rating.columns.droplevel(0)
        rule_recom = products_rating.sort_values(by = ['count', 'mean'], ascending = False).index.tolist()[:k_left]
        
        
        # concat all the item lists (k2 by rank, k3 by rating, others by sales)
        user_recom = content_recom + rule_recom
        recommendations[user] = user_recom

    return recommendations

In [10]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

rcListRule1 = recommender_content1(ratings_trainings, users)
# Evaluation scores
print(f'Rule1: \n{round(evaluate(ratings_testings_by_user, rcListRule1), 4)}')


Rule1: 
0.1051


In [11]:
userWithHistory = []
for user in users:
    existHistory = metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['asin'].tolist()
    if len(existHistory) != 0:
        userWithHistory.append(user)
userWithHistoryAns = {}
for i in range(len(userWithHistory)):
  userWithHistoryAns[userWithHistory[i]] = ratings_testings_by_user[userWithHistory[i]]

In [12]:
rcListRule2 = recommender_content1(ratings_trainings, userWithHistory)
# Evaluation scores
print(f'Rule1: \n{round(evaluate(userWithHistoryAns, rcListRule2), 4)}')

Rule1: 
0.0102
