In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

%matplotlib inline
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 18
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False

# 0.Data Load
---

In [2]:
df_prch = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv", low_memory=False)
df_prod = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv")

In [3]:
df_prch.head()

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,20210101,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,20210101,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,20210101,10,19000.0,1


In [4]:
df_prod.head()

Unnamed: 0,pd_c,pd_nm,clac_hlv_nm,clac_mcls_nm
0,PD0001,소파,가구,거실가구
1,PD0002,스툴/리빙의자,가구,거실가구
2,PD0003,탁자,가구,거실가구
3,PD0004,장식장/진열장,가구,거실가구
4,PD0005,기타가구,가구,기타가구


# 1.논리 설계 구현
---

## 1.1.장바구니 연관성이 높지만, 구매하지 않은 품목 추천
---

1. 90일 window 데이터 기반 연관 규칙 도출 (해당 과정에서 상품 유사도를 측정하여 적용하면 좋을 듯)
2. 고객 한 명당 90일 구매 데이터의 value_counts 측정
3. 아래 사항 case 처리
    - value_counts가 높은 순으로 연관 규칙 적용 (1달 이내 구매할 만큼 구매가 빈번한 상품)
    - 최근 구매 순으로 연관 규칙 적용 (가장 최근 구매가 다음 구매에 영향을 미친다는 논리 적용)
    > - 유사도가 특정치 이상이거나,  충분히 빈발한 구매 품목에 대해 굳이 추천하지 않도록<br>
    > - 90일 구매 이내에 이미 구매한 연관 규칙의 경우는 pass
4. 연관 규칙에 의해 도출된 품목 3가지 투입
5. `MAP@K` 지표를 통해 평가

### 1.1.1. 90일 윈도우 연관 규칙 도출
---

In [5]:
# -- 1. 90일 윈도우 기반 연관 규칙
## -- 1.1. 시간 순으로 90일 윈도우 생성하는 함수
df_prch["de_dt"]=pd.to_datetime(df_prch["de_dt"].astype(str))

from datetime import datetime, timedelta

def windows90(df, start, window=None):
    start = datetime.strptime(start, "%Y-%m-%d")
    # end = datetime.strptime(end, "%Y-%m-%d")
    if window is None:
        window=[date.strftime("%Y-%m-%d") for date in pd.date_range(start=start, periods=90)]
    else:
        window=window[1:]+[window[-1]+timedelta(days=1)]
    return df.loc[df["de_dt"].isin(window)]

windows90(df_prch, "2021-01-01")

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,2021-01-01,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,2021-01-01,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
...,...,...,...,...,...,...,...,...,...,...
4339259,M206715364,E06023586868,2,A06,,PD0981,2021-03-31,23,110000.0,1
4339260,M585664062,E06023587628,2,A06,,PD0620,2021-03-31,23,4450.0,5
4339261,M206715364,E06023588335,2,A06,,PD0981,2021-03-31,23,110000.0,1
4339262,M206715364,E06023588335,2,A06,,PD0981,2021-03-31,23,110000.0,1


In [6]:
## -- 1.2. 연관 규칙 도출 (상품 유사도 측정 X)
df_first90 = windows90(df_prch, "2021-01-01")
product_list_per_order = df_first90.groupby("rct_no")["pd_c"].apply(list)
product_list_per_order

rct_no
A01000001113            [PD0290]
A01000002265            [PD1369]
A01000003148    [PD0290, PD0290]
A01000004946            [PD0290]
A01000005297            [PD1692]
                      ...       
E06023585457    [PD0981, PD0981]
E06023586868            [PD0981]
E06023587628            [PD0620]
E06023588335    [PD0981, PD0981]
E06023589803            [PD0613]
Name: pd_c, Length: 316024, dtype: object

In [7]:
from mlxtend.preprocessing import TransactionEncoder

encoder = TransactionEncoder()
one_hot_df = encoder.fit(product_list_per_order).transform(product_list_per_order)
one_hot_df = pd.DataFrame(one_hot_df, columns=encoder.columns_)
one_hot_df.head()

Unnamed: 0,PD0001,PD0002,PD0003,PD0004,PD0005,PD0006,PD0007,PD0008,PD0009,PD0010,...,PD1924,PD1925,PD1926,PD1927,PD1928,PD1929,PD1930,PD1931,PD1932,PD1933
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
from mlxtend.frequent_patterns import *

frequent_item_df = apriori(one_hot_df, min_support=.003)
result = association_rules(frequent_item_df, metric="confidence", min_threshold=.1)
result

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(166),(171),0.015667,0.039918,0.003481,0.222177,5.565864,0.002855,1.234320
1,(166),(1101),0.015667,0.070520,0.003361,0.214502,3.041722,0.002256,1.183301
2,(177),(171),0.015549,0.039918,0.003107,0.199837,5.006211,0.002487,1.199859
3,(171),(189),0.039918,0.032846,0.005667,0.141974,4.322461,0.004356,1.127185
4,(189),(171),0.032846,0.039918,0.005667,0.172543,4.322461,0.004356,1.160281
...,...,...,...,...,...,...,...,...,...
260,"(1427, 390)",(1101),0.008503,0.070520,0.003038,0.357276,5.066307,0.002438,1.446157
261,"(1101, 390)",(1427),0.011322,0.048765,0.003038,0.268306,5.501994,0.002486,1.300045
262,"(1330, 1427)",(1101),0.010759,0.070520,0.003427,0.318529,4.516869,0.002668,1.363933
263,"(1330, 1101)",(1427),0.010746,0.048765,0.003427,0.318905,6.539582,0.002903,1.396625


In [9]:
result.sort_values(by="confidence", ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
145,(391),(390),0.015299,0.044408,0.006493,0.424405,9.556953,0.005814,1.660182
81,(222),(224),0.023312,0.055964,0.009822,0.421338,7.528726,0.008517,1.631413
169,(398),(1101),0.011306,0.07052,0.004357,0.38539,5.464984,0.00356,1.51231
236,(1369),(1330),0.012942,0.045035,0.004784,0.369682,8.208856,0.004202,1.515054
260,"(1427, 390)",(1101),0.008503,0.07052,0.003038,0.357276,5.066307,0.002438,1.446157
232,(1333),(1330),0.023213,0.045035,0.00824,0.354962,7.881988,0.007194,1.480479
178,(1098),(1101),0.030403,0.07052,0.010585,0.348147,4.936863,0.008441,1.425905
57,(211),(224),0.01179,0.055964,0.003791,0.321524,5.74519,0.003131,1.391407
263,"(1330, 1101)",(1427),0.010746,0.048765,0.003427,0.318905,6.539582,0.002903,1.396625
262,"(1330, 1427)",(1101),0.010759,0.07052,0.003427,0.318529,4.516869,0.002668,1.363933


In [10]:
result["antecedents"] = result["antecedents"].apply(lambda x: df_prod.loc[df_prod["pd_c"]==one_hot_df.columns[list(x)[0]]]["pd_nm"].values[0])
result["consequents"] = result["consequents"].apply(lambda x: df_prod.loc[df_prod["pd_c"]==one_hot_df.columns[list(x)[0]]]["pd_nm"].values[0])

In [11]:
result.sort_values(by="confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
145,비빔봉지라면,국물봉지라면,0.015299,0.044408,0.006493,0.424405,9.556953,0.005814,1.660182
81,옥수수스낵,일반스낵,0.023312,0.055964,0.009822,0.421338,7.528726,0.008517,1.631413
169,일반시리얼,일반우유,0.011306,0.070520,0.004357,0.385390,5.464984,0.003560,1.512310
236,애호박,두부류,0.012942,0.045035,0.004784,0.369682,8.208856,0.004202,1.515054
260,일반계란,일반우유,0.008503,0.070520,0.003038,0.357276,5.066307,0.002438,1.446157
...,...,...,...,...,...,...,...,...,...
12,딸기,일반요구르트,0.039918,0.030403,0.004120,0.103210,3.394773,0.002906,1.081187
182,일반요구르트,생수,0.030403,0.029128,0.003133,0.103039,3.537516,0.002247,1.082402
5,딸기,감자스낵,0.039918,0.036377,0.004053,0.101546,2.791484,0.002601,1.072534
181,일반요구르트,치즈,0.030403,0.022182,0.003060,0.100645,4.537279,0.002386,1.087244


### 1.1.2. 고객 한 명당 90일 구매 데이터 value_count
---

In [12]:
df_first90.head()

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,2021-01-01,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,2021-01-01,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1


In [13]:
df_first90.loc[df_first90["cust"]=="M430112881"]

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,2021-01-01,10,15000.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
5,M430112881,A01000005297,1,A01,A010039,PD1692,2021-01-01,10,9900.0,1
...,...,...,...,...,...,...,...,...,...,...
3272601,M430112881,A03105026211,1,A03,A030185,PD0405,2021-03-15,17,2690.0,1
3283418,M430112881,A03089415807,1,A03,A030161,PD1649,2021-03-19,17,23900.0,2
3296452,M430112881,A03105048553,1,A03,A030185,PD1430,2021-03-24,17,4490.0,1
3296453,M430112881,A03105048553,1,A03,A030185,PD1392,2021-03-24,17,1990.0,1


In [14]:
# -- 품목마다 가격이 다르면 다른 품목으로 볼 수 있으므로 다른 단위가격 정리
# df_prch["price_per_prod"]=df_prch["buy_am"]/df_prch["buy_ct"]
# df_prch.groupby(by=["pd_c","price_per_prod"], as_index=False)["rct_no"].count().sort_values(by="rct_no", ascending=False)

In [15]:
product_list_per_cust = df_first90.groupby("cust")["pd_c"].apply(list)
product_list_per_cust["M000034966"]

['PD0777',
 'PD0777',
 'PD0796',
 'PD0816',
 'PD0630',
 'PD0630',
 'PD0616',
 'PD0630',
 'PD0616',
 'PD0649',
 'PD0116',
 'PD0952',
 'PD0952',
 'PD1161',
 'PD1330']

 -> 일단 나중에,,

# 2.함수화 작업 수행
---

In [16]:
# -- N일 윈도우 형성
df_prch["de_dt"]=pd.to_datetime(df_prch["de_dt"].astype(str))

from datetime import datetime, timedelta

def make_windows(df, start, col_nm, window=90):
    start = datetime.strptime(start, "%Y-%m-%d")
    # end = datetime.strptime(end, "%Y-%m-%d")
    window=[date.strftime("%Y-%m-%d") for date in pd.date_range(start=start, periods=window)]
    # window=window[1:]+[window[-1]+timedelta(days=1)]
    return df.loc[df[col_nm].isin(window)]

make_windows(df_prch, "2021-01-01", "de_dt", 30)

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,2021-01-01,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,2021-01-01,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,2021-01-01,10,19000.0,1
...,...,...,...,...,...,...,...,...,...,...
4315205,M505778499,E06006404287,2,A06,,PD0082,2021-01-30,23,34500.0,1
4315206,M788680851,E06006405265,2,A06,,PD0238,2021-01-30,23,11980.0,1
4315207,M097157467,E06006406764,2,A06,,PD1925,2021-01-30,23,11800.0,2
4315208,M410156270,E06006408006,2,A06,,PD0082,2021-01-30,23,19900.0,1


In [17]:
# -- 연관 규칙 도출
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

def make_ASR(df, order, product, min_support=.003, min_confidence=.1):
    
    product_list_per_order = df.groupby(order)[product].apply(list)
    encoder = TransactionEncoder()
    one_hot_df = encoder.fit(product_list_per_order).transform(product_list_per_order)
    one_hot_df = pd.DataFrame(one_hot_df, columns=encoder.columns_)
    frequent_item_df = apriori(one_hot_df, min_support=min_support)
    result = association_rules(frequent_item_df, metric="confidence", min_threshold=min_confidence)
    result = result[["antecedents", "consequents", "support", "confidence"]].sort_values(by="confidence", ascending=False).reset_index(drop=True)
    result["antecedents"] = result["antecedents"].apply(lambda x: df.loc[df["pd_c"]==one_hot_df.columns[list(x)[0]]]["pd_c"].values[0])
    result["consequents"] = result["consequents"].apply(lambda x: df.loc[df["pd_c"]==one_hot_df.columns[list(x)[0]]]["pd_c"].values[0])
    
    return result

In [18]:
make_ASR(df_first90, "rct_no", "pd_c")

Unnamed: 0,antecedents,consequents,support,confidence
0,PD0404,PD0403,0.006493,0.424405
1,PD0230,PD0232,0.009822,0.421338
2,PD0411,PD1156,0.004357,0.385390
3,PD1426,PD1387,0.004784,0.369682
4,PD1486,PD1156,0.003038,0.357276
...,...,...,...,...
260,PD0177,PD1153,0.004120,0.103210
261,PD1153,PD1173,0.003133,0.103039
262,PD0177,PD0228,0.004053,0.101546
263,PD1153,PD1160,0.003060,0.100645
