In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

%matplotlib inline
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 18
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False

# 0.Data Load
---

In [2]:
# Purchase records; the product table is loaded further down, where it is first needed.
PRCH_CSV_PATH = "../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv"
# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
df_prch = pd.read_csv(PRCH_CSV_PATH, low_memory=False)

In [3]:
# Preview the first rows of the purchase table to check its columns.
df_prch.head()

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,20210101,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,20210101,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,20210101,10,19000.0,1


In [4]:
# df_prod.head()

# 1.Explore Association Rules
---

In [5]:
# Share of purchase rows per product code (pd_c), largest first; keep the top 100.
df_prch["pd_c"].value_counts(normalize=True).head(100)

PD0232    0.021702
PD1484    0.020217
PD1156    0.018318
PD0403    0.014099
PD0228    0.012940
            ...   
PD1699    0.002495
PD0408    0.002462
PD0358    0.002431
PD0115    0.002387
PD1290    0.002362
Name: pd_c, Length: 100, dtype: float64

In [6]:
# Gather the product codes of every receipt (rct_no) into one list per receipt.
# Repeated codes are kept when a product appears on several rows of the same receipt.
pd_c_by_receipt = df_prch.groupby("rct_no")["pd_c"]
product_list_per_order = pd_c_by_receipt.apply(list)
product_list_per_order

rct_no
A01000001113                                             [PD0290]
A01000002265                                             [PD1369]
A01000003148                                     [PD0290, PD0290]
A01000004946                                             [PD0290]
A01000005297                                             [PD1692]
                                      ...                        
E06052115831                                             [PD0507]
E06052116037                             [PD0507, PD0507, PD0507]
E06052117103                                             [PD0294]
E06052118403             [PD0507, PD0507, PD0507, PD0507, PD0507]
E06052119978    [PD0507, PD0507, PD0507, PD0507, PD0507, PD050...
Name: pd_c, Length: 1266589, dtype: object

In [7]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the per-receipt product lists: one row per receipt, one boolean
# column per distinct product code (labels taken from encoder.columns_).
# Rows follow the order of product_list_per_order; the rct_no index is not carried over.
encoder = TransactionEncoder()
one_hot_df = pd.DataFrame(
    encoder.fit_transform(product_list_per_order),
    columns=encoder.columns_,
)
one_hot_df.head()

Unnamed: 0,PD0001,PD0002,PD0003,PD0004,PD0005,PD0006,PD0007,PD0008,PD0009,PD0010,...,PD1924,PD1925,PD1926,PD1927,PD1928,PD1929,PD1930,PD1931,PD1932,PD1933
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Import only the two functions used here: a star import hides where names come
# from and can silently shadow variables already defined in the notebook.
from mlxtend.frequent_patterns import apriori, association_rules

# Mining thresholds, named so they are easy to find and tune.
MIN_SUPPORT = .005     # minimum support an itemset needs to count as frequent
MIN_CONFIDENCE = .1    # minimum confidence a rule needs to be kept

# Frequent itemsets over the one-hot receipt matrix. use_colnames is left at its
# default, so items are reported as integer column positions of one_hot_df; a
# later cell decodes those positions into product names.
frequent_item_df = apriori(one_hot_df, min_support=MIN_SUPPORT)

# Derive association rules from the frequent itemsets, filtered by confidence.
result = association_rules(frequent_item_df, metric="confidence", min_threshold=MIN_CONFIDENCE)
result.head()

In [None]:
# Show the 10 rules with the highest confidence.
result.sort_values(by="confidence", ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
16,(229),(231),0.023037,0.053507,0.009581,0.415896,7.772775,0.008348,1.620418
27,(403),(402),0.017012,0.036948,0.006578,0.38669,10.465762,0.00595,1.570252
52,(1389),(1386),0.019023,0.039369,0.006563,0.344982,8.762847,0.005814,1.466573
48,(1332),(1329),0.02735,0.031791,0.008933,0.326636,10.274516,0.008064,1.437869
36,(1152),(1155),0.023465,0.059569,0.007369,0.314031,5.271749,0.005971,1.370953
9,(227),(231),0.029954,0.053507,0.009397,0.313706,5.862926,0.007794,1.379136
47,(1329),(1332),0.031791,0.02735,0.008933,0.281006,10.274516,0.008064,1.352794
8,(229),(227),0.023037,0.029954,0.005662,0.245802,8.205842,0.004972,1.286194
46,(1485),(1155),0.040494,0.059569,0.00987,0.243736,4.09169,0.007458,1.243523
5,(217),(231),0.024686,0.053507,0.00579,0.234528,4.38316,0.004469,1.236484


In [None]:
# Product table, used below to look up product names (pd_nm) by product code (pd_c).
PROD_CSV_PATH = "../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv"
df_prod = pd.read_csv(PROD_CSV_PATH)

In [None]:
# Count missing values in each column of the product table.
df_prod.isna().sum()

pd_c            0
pd_nm           0
clac_hlv_nm     0
clac_mcls_nm    0
dtype: int64

In [None]:
# Replace the itemsets in `result` with human-readable product names.
#
# Lookup table pd_c -> pd_nm, built once instead of scanning df_prod for every
# rule. On duplicate product codes the first row wins, which matches the
# previous `.values[0]` behaviour.
pd_name_by_code = df_prod.drop_duplicates("pd_c").set_index("pd_c")["pd_nm"].to_dict()


def _itemset_to_names(itemset):
    """Return the product names of all items in ``itemset`` as one string.

    Parameters
    ----------
    itemset : frozenset or str
        Itemset from the rules table. Items may be integer column positions of
        ``one_hot_df`` or product codes. A plain string is treated as an
        already-decoded value and returned unchanged, so re-running this cell
        is harmless.

    Returns
    -------
    str
        Product names joined with ", ". A single-item itemset yields just that
        product's name. A code missing from the product table is shown as the
        raw code instead of raising.
    """
    if isinstance(itemset, str):
        return itemset
    names = []
    # Sort so multi-item itemsets always render in the same order.
    for item in sorted(itemset):
        code = item if isinstance(item, str) else one_hot_df.columns[item]
        names.append(str(pd_name_by_code.get(code, code)))
    return ", ".join(names)


result["antecedents"] = result["antecedents"].apply(_itemset_to_names)
result["consequents"] = result["consequents"].apply(_itemset_to_names)

In [None]:
# Top 10 rules by confidence again, now with antecedents/consequents shown as product names.
result.sort_values(by="confidence", ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
16,옥수수스낵,일반스낵,0.023037,0.053507,0.009581,0.415896,7.772775,0.008348,1.620418
27,비빔봉지라면,국물봉지라면,0.017012,0.036948,0.006578,0.38669,10.465762,0.00595,1.570252
52,콩나물,두부류,0.019023,0.039369,0.006563,0.344982,8.762847,0.005814,1.466573
48,일반소주,국산맥주,0.02735,0.031791,0.008933,0.326636,10.274516,0.008064,1.437869
36,일반요구르트,일반우유,0.023465,0.059569,0.007369,0.314031,5.271749,0.005971,1.370953
9,감자스낵,일반스낵,0.029954,0.053507,0.009397,0.313706,5.862926,0.007794,1.379136
47,국산맥주,일반소주,0.031791,0.02735,0.008933,0.281006,10.274516,0.008064,1.352794
8,옥수수스낵,감자스낵,0.023037,0.029954,0.005662,0.245802,8.205842,0.004972,1.286194
46,일반계란,일반우유,0.040494,0.059569,0.00987,0.243736,4.09169,0.007458,1.243523
5,쿠키,일반스낵,0.024686,0.053507,0.00579,0.234528,4.38316,0.004469,1.236484


In [None]:
# result[["antecedents","consequents","support","confidence"]].sort_values(by="confidence", ascending=False).to_csv("연관규칙_Total.csv",index=False, encoding="euc-kr")

**연관 규칙 탐색 결과**

- 도메인 관점에서 유사도가 높은 제품 간 신뢰도가 높게 측정된 경향
- 연관성이 없더라도 필수 품목(ex: 종량제 봉투)의 구매에 연관성이 부여되기도 함
- 이는 다량의 데이터를 통해 규칙을 탐색하기에 유통사를 이용하는 전체 고객의 특성적인 부분을 보여주나, 당연하게 여겨지는 결과도 도출됨
- 더 나아가 연관 규칙 분석은 하나의 장바구니에 함께 담기는 상품 간의 관계만을 다루므로, 고객이 향후 구매할 품목을 예측하기 위해서는 다른 아이디어가 필요함

**이후 연관 규칙 적용** (가능한 방법들에 대한 기술)

1. 윈도우 분할을 통해 90일 간 점진적인 장바구니 변화 양상을 추적한다.
2. 제휴사 별로 분할하여 연관 규칙을 도출한다. (제휴사가 NaN인 경우의 분리)
3. 지역 별로 분할하여 연관 규칙을 도출한다.
4. 상품 유사도를 측정하고 유사도가 높지 않은 상품을 추천하는 방향으로 설계한다.
    - 이미 구매한 내역을 바탕으로 한 분석이므로, 유사한 상품을 추천하는 것은 추천에 대한 소비자의 흥미를 떨어뜨릴 수 있는 요소임
    - 따라서 상품 유사도를 측정하여 유사하지 않은 상품군을 추천하는 것에 대한 효과성이 있을 것으로 판단됨