<font color = "#CC3D3D"><p>
# CF Based Product Recommender Systems

- 추천시스템은 통계적 기법과 지식탐사기술을 사용하여 고객 개개인의 취향에 가장 부합하는 상품 또는 서비스를 추천해주는 시스템으로서, 고객들의 구매 편의를 도모하고 교차판매 및 매출 증대에 초점을 맞춘 시스템이다. 
- 현재까지 추천시스템을 구현하기 위한 다양한 기법들이 개발되어 왔는데, 이 중에서 **협업필터링(Collaborative Filtering)**이 가장 성공적인 추천기법으로 알려져 있으며 Amazon, Netflix, Spotify 등 수많은 기업들이 협업필터링을 통해 고객에게 추천서비스를 제공하고 있다.
- 협업필터링 추천시스템은 상품을 추천하고자 하는 고객과 취향이 유사한 고객들의 의견을 반영하여 추천 대상 고객이 아직 구매하지 않은 상품에 대한 선호도를 예측한 후 선호도가 높을 것으로 예측되는 상품을 추천해주는 시스템이다. 일반적으로 협업필터링 기반 추천 프로세스는 `1) 입력 데이터 구성`, `2) 유사 집단 탐색`, `3) 추천 상품 결정` 단계로 구성된다.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

# 코사인 유사도 계산을 위한 메소드
from sklearn.metrics.pairwise import cosine_similarity

#### 데이터 읽기
*출처 : 롯데멤버스, L.pay|L.POINT, 제6회 L.POINT Big Data Competition*

In [2]:
# Load the customer, product and transaction tables.
cs = pd.read_csv('L사_고객정보.csv')
gd = pd.read_csv('L사_상품정보.csv')
tr = pd.read_csv('L사_거래정보.csv')

# Per the original note, pd_c has a different dtype in the product and
# transaction tables, so the product-side code is cast to a string and
# zero-padded to 4 characters to make the join keys comparable.
gd['pd_c'] = gd['pd_c'].astype(str).str.zfill(4)

# pd.merge(tr, cs) is given no `on`, so pandas joins on the column names the
# two frames share; the product table is then joined explicitly on pd_c.
df = pd.merge(tr, cs).merge(gd, on='pd_c')

# de_dt is read as an integer (per the original note), so it is parsed via its
# string form. pd.to_datetime is used because the unit-less
# astype('datetime64') cast raises TypeError on pandas >= 2.0.
df['de_dt'] = pd.to_datetime(df['de_dt'].astype(str))

df.head()

Unnamed: 0,clnt_id,trans_id,trans_seq,biz_unit,pd_c,de_dt,de_tm,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm1,clac_nm2,clac_nm3
0,21922,104999,3,A03,182,2019-09-20,12:41,10900,1,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups
1,39423,105124,10,A03,182,2019-09-20,17:26,21800,2,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups
2,39423,89469,2,A03,182,2019-09-01,03:32,25800,2,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups
3,39423,88436,1,A03,182,2019-08-30,17:15,25800,2,F,50,Chilled Foods,Chilled Instant Foods,Chilled Soups
4,18362,50872,3,A03,182,2019-07-15,09:46,31600,4,F,40,Chilled Foods,Chilled Instant Foods,Chilled Soups


## 추천 프로세스
<img align='left' src='http://drive.google.com/uc?export=view&id=10QS0xBx21NahiKdlstDoh0gkQRyrC2vR'>

#### 1단계: 입력데이터 구성 (Ratings Matrix 만들기)

In [3]:
# Pivot df into the ratings matrix: one row per clnt_id, one column per
# clac_nm3 value. A cell is 1 when that customer/product pair occurs in df;
# pairs that never occur (NaN after pivoting) are filled with 0.
ratings_matrix = df.pivot_table(
    values='pd_c',
    index='clnt_id',
    columns='clac_nm3',
    aggfunc=lambda codes: int(len(codes) > 0),
    fill_value=0,
)

print(ratings_matrix.shape)
ratings_matrix

(11270, 1662)


clac_nm3,Unnamed: 1_level_0,Accessory Bags,Accident Prevention Equipment,Adhesive Tapes,Adhesives,Adult's Bed Covers and Skirts,Adult's Diapers,Adult's Disposable Briefes,Adults' Bed Fillings,Adults' Bedding Sets,...,Yoga / Pilates Clothing,Yoga Mats / Exercise Mats,Yoghurts / Chunggukjang Makers,Yogurt Drinks,Young Pumpkins,Young Radishes,Yuzus / Quinces,Zipper Poly Bags / Plastic Bags,kelp,life Vests / Safety Accessories
clnt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 2단계: 유사집단 탐색
- 사용자 간 유사도 산출

In [4]:
# Number of most-similar neighbours to keep per customer
# (a free parameter chosen by the analyst).
K = 50

In [5]:
# cosine_similarity() compares rows, and the rows of ratings_matrix are
# customers, so the result is a customer-by-customer similarity matrix.
user_sim = cosine_similarity(ratings_matrix, ratings_matrix)

# Deliberately overwrite the diagonal (self-similarity) with 2, which is larger
# than the cosine maximum of 1, so that each customer ranks first among its own
# neighbours in the next step.
# This is done on the raw NumPy array *before* wrapping it in a DataFrame:
# writing through DataFrame.values is unreliable because that array can be
# read-only (pandas Copy-on-Write) or a copy.
np.fill_diagonal(user_sim, 2)

# Label both axes with the customer IDs taken from ratings_matrix; rows and
# columns therefore carry the same labels.
user_sim = pd.DataFrame(
    user_sim,
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)

print(user_sim.shape)
user_sim

(11270, 11270)


clnt_id,2,9,12,20,23,24,29,38,40,41,...,72333,72340,72342,72356,72361,72373,72400,72410,72423,72424
clnt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2.000000,0.060634,0.0,0.064550,0.111803,0.094491,0.059761,0.040825,0.0,0.109109,...,0.171499,0.041380,0.0,0.000000,0.0,0.000000,0.000000,0.127000,0.0,0.099015
9,0.060634,2.000000,0.0,0.093934,0.162698,0.229175,0.202920,0.257438,0.0,0.211702,...,0.207973,0.220796,0.0,0.074848,0.0,0.080845,0.000000,0.277218,0.0,0.168102
12,0.000000,0.000000,2.0,0.000000,0.000000,0.000000,0.000000,0.115470,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000
20,0.064550,0.093934,0.0,2.000000,0.173205,0.146385,0.216025,0.252982,0.0,0.028172,...,0.177123,0.192318,0.0,0.079682,0.0,0.043033,0.000000,0.098374,0.0,0.153393
23,0.111803,0.162698,0.0,0.173205,2.000000,0.084515,0.187083,0.310376,0.0,0.048795,...,0.191741,0.129541,0.0,0.034503,0.0,0.074536,0.000000,0.113592,0.0,0.199263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72373,0.000000,0.080845,0.0,0.043033,0.074536,0.062994,0.000000,0.054433,0.0,0.072739,...,0.057166,0.082761,0.0,0.102869,0.0,2.000000,0.117851,0.042333,0.0,0.066010
72400,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.057735,0.0,0.000000,...,0.000000,0.000000,0.0,0.109109,0.0,0.117851,2.000000,0.000000,0.0,0.000000
72410,0.127000,0.277218,0.0,0.098374,0.113592,0.192006,0.212512,0.207390,0.0,0.193996,...,0.261364,0.147148,0.0,0.000000,0.0,0.042333,0.000000,2.000000,0.0,0.226348
72423,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,2.0,0.000000


<font color='red'><p>
특정 고객 추천과정 살펴보기

In [6]:
# The nine customers most similar to customer 2, best match first.
# Position 0 of the ranking is skipped: it is customer 2 itself, whose
# self-similarity was set to 2 on the diagonal of user_sim.
similar_to_2 = user_sim.loc[2].sort_values(ascending=False)
similar_to_2.iloc[1:10]

clnt_id
44452    0.612372
8442     0.500000
27375    0.365148
57512    0.353553
59881    0.353553
41393    0.353553
4301     0.353553
40071    0.353553
39281    0.353553
Name: 2, dtype: float64

- 유사 집단 생성

In [7]:
def _nearest_ids(similarities, k):
    # Rank all customers by similarity (highest first) and keep the first
    # k + 1 labels. The extra slot is intentional: the diagonal of user_sim is
    # 2, so position 0 is the customer itself, followed by its k neighbours.
    ranked = similarities.sort_values(ascending=False)
    return ranked.index[:k + 1]


# apply() walks user_sim column by column; transposing the result gives one
# row per customer with K + 1 columns (self first, then the K neighbours).
knn = user_sim.apply(_nearest_ids, args=(K,)).T
knn

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
clnt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,44452,8442,27375,57512,59881,41393,4301,40071,39281,...,71623,29119,70512,58962,52829,10483,45510,42572,22953,41668
9,9,29631,37074,45251,64183,42863,17366,22277,68575,67812,...,17920,70174,8308,68394,19956,63530,45318,68611,52111,10800
12,12,395,64406,58558,49148,25451,53656,51950,31111,10554,...,69037,70440,5526,57681,68903,33282,37489,43556,33286,50745
20,20,25433,46182,55002,30018,62659,62230,9667,56664,31866,...,48480,23993,14132,45092,3985,59894,26619,61908,41763,63440
23,23,1316,62234,70144,11299,46832,64182,18829,66940,62257,...,11618,26685,49938,9744,32364,6052,10148,65773,20747,41763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72373,72373,45612,21059,35546,54738,24580,54853,41983,42644,71448,...,43985,41021,26673,60965,61000,46752,37327,42509,9427,56519
72400,72400,37255,17320,18251,48320,35659,63583,43617,16276,35615,...,70433,4545,22713,21171,36433,19758,14297,26925,31036,66526
72410,72410,13995,45171,36518,8302,10808,52338,58764,53780,58044,...,61379,31006,69730,18866,47909,64152,14131,50284,58394,70730
72423,72423,37388,45654,50510,40277,14178,1734,185,39254,21632,...,11322,7009,43030,19142,50600,35904,46024,58141,70826,67721


#### 3단계: 추천 상품 결정 (Top-N 상품추천)

In [8]:
# Number of products to recommend per customer
# (a free parameter chosen by the analyst).
N = 10

In [9]:
# Distinct clac_nm3 values per customer. This is used later to keep products
# a customer has already bought out of that customer's recommendations.
by_customer = df.groupby('clnt_id')
purchased_list = by_customer['clac_nm3'].unique()
purchased_list

clnt_id
2        [Ramens, Canned Vegetable Foods, Coffee Drinks...
9        [Functional Milk, Cookies, Young Pumpkins, Ban...
12                                               [Peaches]
20       [Onions, Domestic Beefs - Rounds, Seasoned Sli...
23       [Chilled Noodles, Onions, Domestic Beefs - Rou...
                               ...                        
72373    [Chilled Noodles, Functional Milk, Seasoned Be...
72400    [Toothpaste, Hair Shampoos, Processed Meats fo...
72410    [Functional Milk, Bananas, Fish Cakes, Chicken...
72423                    [Beauty Supplements, Cell Phones]
72424    [Chilled Noodles, Onions, Cookies, Packged Kim...
Name: clac_nm3, Length: 11271, dtype: object

In [10]:
def top_n(x, n, purchases=None, ratings=None):
    """Recommend the n products bought most often by a customer's neighbours.

    Products the customer has already bought are excluded.

    Args:
        x: One row of ``knn`` (a Series, or any sequence): the first element is
            the customer id and the remaining elements are the neighbour ids.
            This is why the neighbour table was built with K + 1 columns.
        n: Maximum number of products to return.
        purchases: Mapping from customer id to the products already bought
            (a Series indexed by customer id, or a dict). Defaults to the
            notebook-level ``purchased_list``.
        ratings: Customer-by-product matrix (DataFrame indexed by customer id).
            Defaults to the notebook-level ``ratings_matrix``.

    Returns:
        A list of at most n product labels, most purchased first.
    """
    # Fall back to the notebook-level tables so that the existing call
    # knn.apply(top_n, args=(N,), axis=1) keeps working unchanged.
    if purchases is None:
        purchases = purchased_list
    if ratings is None:
        ratings = ratings_matrix

    # Unpack by position explicitly instead of relying on x[0] / x[1:], whose
    # label-vs-position meaning on a Series depends on the index type.
    ids = list(x)
    if not ids:
        return []
    customer_id, neighbor_ids = ids[0], ids[1:]

    # A set gives O(1) membership tests; a customer without any purchase
    # record simply has nothing to exclude instead of raising IndexError.
    already_bought = set(purchases.get(customer_id, ()))

    # Sum the neighbours' rows to get a purchase count per product, then rank
    # products from most to least purchased. Neighbour ids that are not in the
    # matrix are ignored by filter().
    popularity = ratings.filter(items=neighbor_ids, axis=0).sum()
    candidates = popularity.sort_values(ascending=False).index.to_list()

    return [item for item in candidates if item not in already_bought][:n]

# Call top_n once per row of knn (axis=1), i.e. once per customer, asking for
# N products each time. Each result is a list of product labels.
per_customer = knn.apply(top_n, axis=1, args=(N,))

# Move the customer id out of the index into its own column and give the
# column of recommendation lists a readable name.
recommend_list = per_customer.reset_index()
recommend_list = recommend_list.rename(columns={0: 'recommend_items'})
recommend_list

Unnamed: 0,clnt_id,recommend_items
0,2,"[Chicken Eggs, Fresh Milk, General Snacks, Coo..."
1,9,"[Chicken Eggs, Corn Snacks, Spoon Type Yogurts..."
2,12,"[Bibim Ramens, Chicken Eggs, Water, Spoon Type..."
3,20,"[Tofu, Ramens, Soybean Sprouts, Young Pumpkins..."
4,23,"[Tofu, Jumbo Spring Onions, General Snacks, Ra..."
...,...,...
11265,72373,"[Peaches, Oyster Mushrooms, Fruit Juices, Bibi..."
11266,72400,"[General Dishwashing Liquids, Toothbrushes, Wo..."
11267,72410,"[Fresh Milk, Tofu, Ramens, Sausages, Bibim Ram..."
11268,72423,"[General Snacks, Ramens, Tofu, Chicken Eggs, B..."


<font color='red'><p>
특정 고객 추천과정 살펴보기

In [11]:
# Show, side by side, what customer 2 has already bought and what is
# recommended to them. The two columns may differ in length; concat pads the
# shorter one with NaN.
bought_by_2 = pd.DataFrame({'purchased': purchased_list.filter(items=[2]).iloc[0]})
picked_for_2 = pd.DataFrame(
    {'recommend': recommend_list.query('clnt_id == 2').recommend_items.iloc[0]}
)
pd.concat([bought_by_2, picked_for_2], axis=1, ignore_index=False)

Unnamed: 0,purchased,recommend
0,Ramens,Chicken Eggs
1,Canned Vegetable Foods,Fresh Milk
2,Coffee Drinks,General Snacks
3,Fried Tofu,Cookies
4,Crab Sticks,Tofu
5,Cream and Condensed milk,Functional Milk
6,Men's T-shirts,Bananas
7,Infant / Toddlers' T-shirts / Tops,Jumbo Spring Onions
8,,Spoon Type Yogurts
9,,Bibim Ramens


### 고려사항
- RFM의 각 세그먼트를 유사집단(kNN)이라고 간주하고, 각 세그먼트에 속하는 고객들에게 같은 상품을 추천하면 어떨까? 
- 단, 고객마다 기 구매한 상품은 제외하고. 

<font color = "#CC3D3D"><p>
# End