# Part 03. 추천시스템

# 🔍 라이브러리

In [1]:
#데이터셋 처리
import pandas as pd 
import numpy as np
import pyarrow.parquet as pq

#시각화
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc 
rc('font', family='AppleGothic') 			
plt.rcParams['axes.unicode_minus'] = False 

#시간 데이터 처리
from datetime import datetime, timedelta

#진행률 확인
from tqdm.notebook import tqdm # 진행률 확인
import warnings # 경고 무시
warnings.simplefilter("ignore")

# `- 전처리 부분 전처리 part로 옮겼습니다!`

---

# 🔍 추천시스템 모델링

## 1. 전처리된 데이터 로드

In [77]:
# parquet 변환된 데이터셋 불러오기
df = pd.read_parquet('../e-commerce/input/2019-Oct_new.parquet')
df.head(3)

Unnamed: 0,event_time,event_type,product_id,category_code,brand,price,user_id,user_session,event_date,event_month,event_day,day_of_week,event_hour,event_week
0,2019-10-01 06:00:00,view,3900821,appliances.environment.water_heater,aqua,33.200001,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2019-10-01,10,1,Tuesday,6,40
1,2019-10-01 06:00:01,view,1307067,computers.notebook,lenovo,251.740005,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2019-10-01,10,1,Tuesday,6,40
2,2019-10-01 06:00:04,view,1004237,electronics.smartphone,apple,1081.97998,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2019-10-01,10,1,Tuesday,6,40


## 2. Baseline Model
### 📌 통계기반 모델: event가 가장 많은 상품 상위 20개 일괄적으로 추천하는 모델

#### ✅ category code 변환
- `Vectorization`을 위해 category code를 `'.'`으로 word 단위가 구분되는 문자열로 변환

#### 📌 데이터 필터링 및 train/test 데이터 분할

In [78]:
# 필요한 컬럼만 불러오기
df = df[['product_id','category_code','brand','user_id','event_week']]

# 카테고리 코드와 브랜드 type 변경 (필요한가)
df['category_code'] = df['category_code'].astype(object)
df['brand'] = df['brand'].astype(object)

# 카테고리 코드 구분자 '.' 를 공백으로 변경 -> TF IDF 구분 하려면 이렇게 해야 한다고 하는데
# 온점으로 그대로 둬도 괜찮은지 확인 해보자
df['category_code'] = df['category_code'].apply(lambda x: x.replace('.',' '))

# 카테고리에 브랜드 추가해서 성능에 변화가 있는지 확인 해보자 
# df['category_code'] = df['category_code']+df['brand']

# # 카운트 많은 5000개 만 추려냄 - 시간 오래 걸리니까 임시로
tmp = df.groupby('user_id').size().sort_values(ascending=False).head(5000).index
df = df[df.user_id.isin(tmp)].reset_index(drop=True)

# # 오래걸려서 샘플링 진행 (10프로)
df = df.sample(frac=0.5, replace = True, random_state = 1)

# train / test 분할
train = df[df.event_week<44]
test = df[df.event_week==44]

train = train[['user_id','product_id']].sort_values(by='user_id').reset_index(drop=True)
test = test[['user_id','product_id']].sort_values(by='user_id').reset_index(drop=True)

--------

In [79]:
# 회귀분석에서 y, y_pred 중 y의 역할 -> test 데이터로 평가할 때 필요
# y테이블 생성
from ipywidgets import FloatProgress

sol = test.groupby(['user_id'])['product_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()): 
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])

  0%|          | 0/3943 [00:00<?, ?it/s]

In [80]:
# 추천
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
rec_df

Unnamed: 0,user_id
0,430276841
1,460216566
2,463020196
3,470651295
4,479159590
...,...
4966,564442389
4967,564458377
4968,564492792
4969,564514155


In [81]:
# 판매 상위 20개에 대해서 추천
tmp = df.groupby('product_id').size().nlargest(20).index
popular_rec_model = np.array(tmp)
popular_rec_model

array([1004767, 1005160, 1004856, 1005115, 1004741, 1004833, 1004249,
       1004870, 1005239, 1004739, 1002544, 1004873, 1004785, 1004836,
       1005161, 1005031, 1005159, 1004777, 1002524, 1004957], dtype=int64)

In [82]:
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    for rec in popular_rec_model[0:20]: 
        rec_list.append(rec)
    total_rec_list[user] = rec_list

  0%|          | 0/4971 [00:00<?, ?it/s]

In [83]:
# 평가지표를 직접 계산하는 클래스 생성: https://www.kaggle.com/code/chocozzz/04-goodbooks-10k-contents-based-model 의 코드 참고
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=20):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [84]:
# top 20에 대해 보여주기
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=20)
evaluate_func._evaluate()

MAP@20: 0.03470936329312459
NDCG@20: 0.08250075125861263
EntDiv@20: 2.995732273553991


----

## 3-1. 성능 개선 모델: TF-IDF를 사용한 Content-Based Model

In [50]:
# parquet 변환된 데이터셋 불러오기
df = pd.read_parquet('../e-commerce/input/2019-Oct_new.parquet')

In [51]:
# 필요한 컬럼만 불러오기
df = df[['product_id','category_code','brand','user_id','event_week']]

# 카테고리 코드와 브랜드 type 변경 (필요한가)
df['category_code'] = df['category_code'].astype(object)
df['brand'] = df['brand'].astype(object)

# 카테고리 코드 구분자 '.' 를 공백으로 변경 -> TF IDF 구분 하려면 이렇게 해야 한다고 하는데
# 온점으로 그대로 둬도 괜찮은지 확인 해보자
df['category_code'] = df['category_code'].apply(lambda x: x.replace('.',' '))

# 카테고리에 브랜드 추가해서 성능에 변화가 있는지 확인 해보자 
# df['category_code'] = df['category_code']+df['brand']

# # 카운트 많은 5000개 만 추려냄 - 시간 오래 걸리니까 임시로
tmp = df.groupby('user_id').size().sort_values(ascending=False).head(5000).index
df = df[df.user_id.isin(tmp)].reset_index(drop=True)

# # 오래걸려서 샘플링 진행 (50프로)
df = df.sample(frac=0.5, replace = True, random_state = 1)

# train / test 분할
train = df[df.event_week<44]
test = df[df.event_week==44]

train = train[['user_id','product_id']].sort_values(by='user_id').reset_index(drop=True)
test = test[['user_id','product_id']].sort_values(by='user_id').reset_index(drop=True)

In [52]:
# product 정보가 담긴 df 생성
# 레퍼런스의 books와 같은 데이터 프레임
tmp_df = df[['product_id','category_code']]
tmp_df = tmp_df.drop_duplicates()
tmp_df = tmp_df.sort_values(by='product_id').set_index('product_id')
tmp_df

Unnamed: 0_level_0,category_code
product_id,Unnamed: 1_level_1
1000978,electronics smartphone
1001588,electronics smartphone
1002042,electronics smartphone
1002062,electronics smartphone
1002098,electronics smartphone
...,...
60500001,construction tools welding
60500002,construction tools welding
60500003,construction tools welding
60500004,construction tools welding


In [53]:
# 카운트(뷰,카트,퍼체이스 무관) 컬럼 생성
tmp_counts = df.groupby('product_id').size()

tmp_df['hit_count'] = tmp_counts
tmp_df = tmp_df.reset_index()
tmp_df

Unnamed: 0,product_id,category_code,hit_count
0,1000978,electronics smartphone,173
1,1001588,electronics smartphone,147
2,1002042,electronics smartphone,9
3,1002062,electronics smartphone,61
4,1002098,electronics smartphone,135
...,...,...,...
35219,60500001,construction tools welding,18
35220,60500002,construction tools welding,15
35221,60500003,construction tools welding,1
35222,60500004,construction tools welding,2


In [54]:
tmp_df = tmp_df.sort_values(by='hit_count', ascending=False).reset_index(drop=True)
tmp_df

Unnamed: 0,product_id,category_code,hit_count
0,1004767,electronics smartphone,6388
1,1005160,electronics smartphone,6089
2,1004856,electronics smartphone,5817
3,1005115,electronics smartphone,4780
4,1004741,electronics smartphone,4555
...,...,...,...
35219,11600245,computers desktop,1
35220,28716053,apparel shoes,1
35221,28716051,apparel shoes,1
35222,21407070,electronics clocks,1


In [55]:
# popular_rec_model 추천할거 없으면
popular_rec_model = tmp_df.sort_values(by='hit_count', ascending=False)['product_id'].values[0:20]
popular_rec_model
# 20번까지

array([1004767, 1005160, 1004856, 1005115, 1004741, 1004833, 1004249,
       1004870, 1005239, 1004739, 1002544, 1004873, 1004785, 1004836,
       1005161, 1005031, 1005159, 1004777, 1002524, 1004957], dtype=int64)

In [56]:
# 비교대상 만들기
sol = test.groupby(['user_id'])['product_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()): 
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])

  0%|          | 0/3943 [00:00<?, ?it/s]

In [57]:
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
rec_df

Unnamed: 0,user_id
0,430276841
1,460216566
2,463020196
3,470651295
4,479159590
...,...
4966,564442389
4967,564458377
4968,564492792
4969,564514155


In [58]:
# tfidf 벡터화
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tmp_df['category_code'])
print(tfidf_matrix.shape)

(35224, 150)


In [59]:
# 유사도 매트릭스 생성 - 코사인 유사도
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

(35224, 35224)

In [60]:
# category_code와 id를 매핑할 dictionary를 생성해줍니다. 
product2id = {}
for i, c in enumerate(tmp_df['product_id']): product2id[i] = c

# id와 product title를 매핑할 dictionary를 생성해줍니다. 
id2product = {}
for i, c in product2id.items(): id2product[c] = i
    
# product_id와 title를 매핑할 dictionary를 생성해줍니다.
productid2product = {}
for i, j in zip(tmp_df['product_id'].values, tmp_df['product_id'].values):
    productid2product[i] = j

In [61]:
tmp_df['product_id'].head()

0    1004767
1    1005160
2    1004856
3    1005115
4    1004741
Name: product_id, dtype: int64

In [62]:
idx = id2product[1004767]  
sim_scores = [(product2id[i], c) for i, c in enumerate(cosine_matrix[idx]) if i != idx] 
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
sim_scores[0:10]

[(1005160, 1.0),
 (1004856, 1.0),
 (1005115, 1.0),
 (1004741, 1.0),
 (1004833, 1.0),
 (1004249, 1.0),
 (1004870, 1.0),
 (1005239, 1.0),
 (1004739, 1.0),
 (1002544, 1.0)]

In [63]:
train = pd.merge(train, tmp_df[['product_id', 'category_code']], how='left', on='product_id')
train.head()

Unnamed: 0,user_id,product_id,category_code
0,430276841,6000229,auto accessories alarm
1,430276841,5801376,electronics audio subwoofer
2,430276841,6000227,auto accessories alarm
3,430276841,5701084,auto accessories player
4,430276841,6000167,auto accessories alarm


- 예제

In [64]:
# 0. 학습셋에서 제목이 있는 경우에 대해서만 진행
tf_train = train[train['product_id'].notnull()].reset_index(drop=True)
tf_train['idx2product_id'] = tf_train['product_id'].apply(lambda x: id2product[x])
tf_train.head()

Unnamed: 0,user_id,product_id,category_code,idx2product_id
0,430276841,6000229,auto accessories alarm,980
1,430276841,5801376,electronics audio subwoofer,3395
2,430276841,6000227,auto accessories alarm,499
3,430276841,5701084,auto accessories player,1245
4,430276841,6000167,auto accessories alarm,1265


In [65]:
idx2product_id2product = {}
for i, j in zip(tf_train['idx2product_id'].values, tf_train['product_id'].values):
    idx2product_id2product[i] = j

In [66]:
# 1. 각 유저별로 관심 가진 제품의 목록을 수집 
user = 430276841
read_list = tf_train.groupby(['user_id'])['idx2product_id'].agg({'unique'}).reset_index()
seen = read_list[read_list['user_id'] == user]['unique'].values[0]
seen

array([  980,  3395,   499,  1245,  1265,  8893,  4594,   840,  2650,
        3190,   999,   316,  5016,   349, 10937,  1981, 23507,  9566,
        1074,  4360,  2674,   706,  1185,  3957,  1165,  5833,  2680,
        1715,   709, 13648,  4074,   858,  5034,   846,  6112,   350,
        4776, 17659,  5782,  6791,  1227,  2196,  1790,  8825, 18194,
         746,  4448,   515,  4999,  6633,  5091,  1088,  2582,  8290,
        5990,  6331,   859,  1662, 24686, 18398,   392, 11506,  3226,
        1351,  2945,  5103, 23511,  2242,  5153,  3532,  6854,   524,
        1109,  1418,  5336,  1040,  2384,  2603,  3446,  6225,  1505,
       11111,  5796,  4472,  4756,  3726,  2773,  3876,  2388,   640,
        4336,  2017, 12496,  7340,  7321, 13382,  2205,  8340,  8224,
       15770, 10574,  2570,  4640,  1283,  3979, 23494,   723,  3972,
        3508,  3635,  3036, 19428], dtype=int64)

In [67]:
# 2. 읽은 제품과 유사한 제품 추출 
## 733번째 책과 다른 책들간의 유사도 
cosine_matrix[3224]

array([0., 0., 0., ..., 0., 0., 0.])

In [68]:
# 2. 관심가진 제품과 유사한 제품 추출 
total_cosine_sim = np.zeros(len(product2id))
for product_ in seen: 
    
    # 3. 모든 제품에 대해서 유사도를 더한 값을 계산 
    # 343번째 제품과 248의 유사도가 모두 결합된 유사도
    total_cosine_sim += cosine_matrix[product_]

In [69]:
total_cosine_sim

array([5.09981627, 5.09981627, 5.09981627, ..., 0.        , 6.35638293,
       0.        ])

In [70]:
# 4. 3에서 유사도가 가장 높은 순서대로 추출
sim_scores = [(i, c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출 
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True) # 유사도가 높은 순서대로 정렬 
sim_scores[0:20]

[(679, 63.28009086431688),
 (689, 63.28009086431688),
 (867, 63.28009086431688),
 (915, 63.28009086431688),
 (968, 63.28009086431688),
 (1062, 63.28009086431688),
 (1140, 63.28009086431688),
 (1153, 63.28009086431688),
 (1186, 63.28009086431688),
 (1208, 63.28009086431688),
 (1252, 63.28009086431688),
 (1253, 63.28009086431688),
 (1331, 63.28009086431688),
 (1409, 63.28009086431688),
 (1443, 63.28009086431688),
 (1553, 63.28009086431688),
 (1554, 63.28009086431688),
 (1621, 63.28009086431688),
 (1718, 63.28009086431688),
 (1749, 63.28009086431688)]

In [71]:
product2id[356]

1802037

In [72]:
tf_train['user_id'].unique()

array([430276841, 460216566, 463020196, ..., 564492792, 564514155,
       564680336])

In [73]:
tf_train.head(3)

Unnamed: 0,user_id,product_id,category_code,idx2product_id
0,430276841,6000229,auto accessories alarm,980
1,430276841,5801376,electronics audio subwoofer,3395
2,430276841,6000227,auto accessories alarm,499


In [74]:
# 학습 진행 

## 전체 데이터에 대해서 진행 
total_rec_list = {}

#첫번째 리스트 
# train user_id로 묶어서 
read_list1 = train.groupby(['user_id'])['product_id'].agg({'unique'}).reset_index()
read_list2 = tf_train.groupby(['user_id'])['idx2product_id'].agg({'unique'}).reset_index()

# user_id 순환
for user in tqdm(train['user_id'].unique()):
    rec_list = []
        
    # 만약 TF-IDF 소속의 추천대상이라면 Contents 기반의 추천 
    if user in tf_train['user_id'].unique():
        # 1. 각 유저별로 구매한 상품 목록을 수집 
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        # 2. 구매한 상품과 유사한 상품 추출 
        total_cosine_sim = np.zeros(len(product2id))
        for product_ in seen: 
            # 3. 모든 상품에 대해서 유사도를 더한 값을 계산 
            # 343번째 상품과 248의 유사도가 모두 결합된 유사도
            total_cosine_sim += cosine_matrix[product_]
            
        # 4. 3에서 유사도가 가장 높은 순서대로 추출
        sim_scores = [(productid2product[product2id[i]], c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출 
        recs = sorted(sim_scores, key = lambda x: x[1], reverse=True)[0:300] # 유사도가 높은 순서대로 정렬 
        for rec in recs: 
            if rec[0] not in seen:
                rec_list.append(rec[0])   
        
    # 그렇지 않으면 인기도 기반의 추천 
    else: 
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:400]:
            if rec[0] not in seen:
                rec_list.append(list(rec[0]))
                
    total_rec_list[user] = rec_list[0:20]

  0%|          | 0/4971 [00:00<?, ?it/s]

In [75]:
# 평가함수
import six # 파이썬 2, 3 호관성 라이브러리
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        # n : 개수, ap : average precision
        n, ap = 0.0, 0.0
        
        for u, seen in six.iteritems(self.gt): # six.iteritems는 .items와 같다.
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))

        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [76]:
#평가
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=20)
evaluate_func._evaluate()

MAP@20: 0.017148624592693736
NDCG@20: 0.04917596785249336
EntDiv@20: 6.029382092278079


-------

## 3-2. 성능 개선 모델: `Word2Vec`을 사용한 Content-Based Model
- https://www.kaggle.com/code/chocozzz/00-word2vec-1
- https://www.kaggle.com/code/chocozzz/04-goodbooks-10k-contents-based-model

In [None]:
train = train[['product_id','category_code','user_id']].sort_values(by='user_id').reset_index(drop=True)
test = test[['product_id','category_code','user_id']].sort_values(by='user_id').reset_index(drop=True)
train.shape, test.shape

((41030, 3), (8491, 3))

In [None]:
# product id: int -> str로 변환
train['product_id'] = train['product_id'].astype(str)
test['product_id'] = test['product_id'].astype(str)

-------

In [None]:
# userid 별 product id 구매 목록을 생성 
agg = train.groupby(['user_id'])['product_id'].agg({'unique'})
agg.head(20)

Unnamed: 0_level_0,unique
user_id,Unnamed: 1_level_1
430276841,"[5700402, 5700384, 5700576, 5801701, 6000094, ..."
460216566,"[1201512, 1201297, 1005216, 1307477, 1307489, ..."
463020196,"[1701552, 30900830, 30901117, 1700880, 1701490..."
486578007,"[7005328, 7004649, 7003167, 7005329, 7002775, ..."
489062934,"[1801623, 1004838, 1004659, 1004342, 1005017, ..."
499386220,"[1801929, 1800931, 1801551, 1801976, 1200617, ..."
506000960,"[1307279, 1005238, 1004078, 1305996, 1306651, ..."
506009066,"[1003992, 1004820, 1005062, 1004767, 1004814, ..."
509166157,"[1005032, 1004957, 1005069, 1004503, 1004500, ..."
510335574,"[1306954, 1307355, 1005160, 1307451, 1004957, ..."


In [None]:
# 유저별 구매 목록을 sentence에 넣기
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(user_sentence))

In [None]:
# Word2vec의 학습 진행 -> parameter reference: https://hoonzi-text.tistory.com/2
from gensim.models import Word2Vec

embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, sg=1)

In [None]:
# product_id가 2701024인 상품과 유사한? 아이텀 top 20 출력해보기
embedding_model.wv.most_similar(positive=['2701024'], topn=20)

[('14300312', 0.7671674489974976),
 ('1005262', 0.7537975311279297),
 ('3100267', 0.7170731425285339),
 ('1305851', 0.6903489828109741),
 ('17900077', 0.6858954429626465),
 ('2701460', 0.6819285154342651),
 ('1305976', 0.6705312728881836),
 ('4600560', 0.6687020659446716),
 ('28714725', 0.6658309698104858),
 ('1004988', 0.6626853346824646),
 ('12301124', 0.6544561386108398),
 ('1303049', 0.6449756622314453),
 ('5800957', 0.6413411498069763),
 ('1801983', 0.6351058483123779),
 ('6200080', 0.6282398700714111),
 ('1003609', 0.6261202096939087),
 ('3701399', 0.6239267587661743),
 ('1306761', 0.6209348440170288),
 ('6500779', 0.6208581924438477),
 ('1801792', 0.6175357103347778)]

In [None]:
# 전체 상품에 대해 진행
from ipywidgets import FloatProgress

total_rec_list = {}
purchase_list1 = agg.reset_index()

purchase_list = train.groupby(['user_id'])['product_id'].agg({'unique'}).reset_index()
for user in train['user_id'].unique():
    rec_list = []     
    purchase = purchase_list1[purchase_list1['user_id'] == user]['unique'].values[0]
    word2vec_dict = {}
    for product in purchase: 
        for i in embedding_model.wv.most_similar(positive=[product], topn=20):
            if i[0] not in purchase: 
                if i[0] not in word2vec_dict.keys(): 
                    word2vec_dict[i[0]] = i[1]
                else:
                    word2vec_dict[i[0]] += i[1]
                
    rec_list = list(dict(sorted(word2vec_dict.items(), key = lambda x: x[1], reverse=True)).keys())
    total_rec_list[user] = rec_list[0:20]

In [None]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt)
evaluate_func._evaluate()

MAP@20: 0.0
NDCG@20: 0.0
EntDiv@20: 5.7364575748113715


#### ✅ category_code를 통한 유사도 계산

In [None]:
agg = train.groupby(['product_id'])['category_code'].agg({'unique'}).reset_index()
agg.head()

Unnamed: 0,product_id,unique
0,1000978,[electronics smartphone]
1,1001588,[electronics smartphone]
2,1002042,[electronics smartphone]
3,1002062,[electronics smartphone]
4,1002098,[electronics smartphone]


In [None]:
# 카테고리 간의 유사도 계산 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(user_sentence))

In [None]:
from gensim.models import doc2vec

doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    vector_size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [None]:
from collections import namedtuple

# 학습하기 위해 CategorizedDocument라는 클래스에 category_code와 product_id 넣기
CategorizedDocument = namedtuple('CategorizedDocument',  ['words', 'tags'])
categorized_train_docs = [CategorizedDocument(c, [d]) for c, d in agg[['unique', 'product_id']].values]

categorized_train_docs
# reference: https://velog.io/@jewon119/Python-%EC%8B%AC%ED%99%94-NamedTuple#4-namedtuple-%ED%99%9C%EC%9A%A9-%EC%98%88%EC%8B%9C

[CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1000978']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1001588']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002042']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002062']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002098']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002099']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002100']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002101']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002102']),
 CategorizedDocument(words=array(['electronics smartphone'], dtype=object), tags=['1002225']),
 CategorizedDocument(words=array(['electronics sma

In [None]:
doc_vectorizer.build_vocab(categorized_train_docs)      # 학습을 진행할 사전 맏들기
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [None]:
# 벡터 문서 학습
from time import time
start = time()

for epoch in tqdm(range(3)):
    doc_vectorizer.train(categorized_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

end = time()
print("During Time: {}".format(end-start))

  0%|          | 0/3 [00:00<?, ?it/s]

During Time: 3.8909950256347656


In [None]:
doc_vectorizer.docvecs.most_similar('1001588', topn=20)     # product_id = '1001588'인 상품을 볼 때 관련성 높은 20개 상품 추천해주기

[('1004489', 0.38649502396583557),
 ('3701096', 0.3798837661743164),
 ('1004932', 0.37513405084609985),
 ('1701281', 0.37443262338638306),
 ('4804418', 0.36896592378616333),
 ('1003769', 0.34862399101257324),
 ('28719199', 0.33914196491241455),
 ('5800646', 0.3313061594963074),
 ('11600011', 0.3307732939720154),
 ('4804205', 0.32918450236320496),
 ('1004830', 0.3271191120147705),
 ('1005175', 0.3241901695728302),
 ('2800659', 0.3232332766056061),
 ('28719264', 0.31995317339897156),
 ('1004250', 0.3171292543411255),
 ('28719493', 0.316822350025177),
 ('28716184', 0.31601765751838684),
 ('4803014', 0.3155464231967926),
 ('28719776', 0.31500279903411865),
 ('4804423', 0.31188496947288513)]

In [None]:
# 전체 상품에 대해서 진행 
total_rec_list = {}

purchase_list1 = train.groupby(['user_id'])['product_id'].agg({'unique'}).reset_index()
# read_list2 = train[train['type'] == '1'].groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()

for user in train['user_id'].unique():
    rec_list = []
    bought = purchase_list1[purchase_list1['user_id'] == user]['unique'].values[0]
    doc2vec_dict = {}
    for product in bought: 
        for i in doc_vectorizer.docvecs.most_similar(positive=[product], topn=20): 
            if i[0] not in doc2vec_dict.keys(): 
                doc2vec_dict[i[0]] = i[1]
            else:
                doc2vec_dict[i[0]] += i[1]

    rec_list = list(dict(sorted(doc2vec_dict.items(), key = lambda x: x[1], reverse=True)).keys())
    # else:
    #     seen = purchase_list1[purchase_list1['user_id'] == user]['unique'].values[0]
    #     for rec in popular_rec_model[0:300]:
    #         if rec not in seen:
    #             rec_list.append(rec)

    total_rec_list[user] = rec_list[0:20]

In [None]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=20)
evaluate_func._evaluate()

MAP@20: 0.001588630171801049
NDCG@20: 0.005573220462657566
EntDiv@20: 7.981385547798531
