## library load

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm
import copy
from scipy.spatial.distance import cdist

%matplotlib inline
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 15
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False
plt.style.use("ggplot")

pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)

## data load

In [2]:
## rnch load
# df_cust = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
# df_pdde = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv")
# df_cop_u = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")
# df_pd_clac = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv")

In [3]:
# ## osy load
# df_cust = pd.read_csv("../../LPOINT_BIG_COMP_01_DEMO.csv")
# df_pdde = pd.read_csv("../../LPOINT_BIG_COMP_02_PDDE.csv")
# df_cop_u = pd.read_csv("../../LPOINT_BIG_COMP_03_COP_U.csv")
# df_pd_clac = pd.read_csv("../../LPOINT_BIG_COMP_04_PD_CLAC.csv")

In [4]:
# choongs load
df_cust = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
df_pdde = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv")
df_cop_u = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")
df_pd_clac = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


## Function

In [5]:
#DBSCAN을 통해 이상치 제거
def delete_outlier_DBSCAN(df_pdde, df_pd_clac):
    df_pdde_pd_clac = df_pdde.merge(df_pd_clac, how='left', on='pd_c')
    df_pt = pd.pivot_table(data=df_pdde_pd_clac,
               values='buy_am',
               index='cust',
               columns='clac_hlv_nm',
               aggfunc='sum',
               fill_value=0)

    df_pt.reset_index(inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_pt.iloc[:,1:])
    df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
    df_spt_2 =  df_spt.reset_index()

    dbscan = DBSCAN(eps = 0.1, min_samples = 30, metric = "euclidean")
    dbscan_labels = dbscan.fit_predict(df_spt_2.iloc[:, 1:])
    df_spt_2['outlier'] = dbscan_labels

    df_spt_not_outlier_cust = df_spt_2[df_spt_2['outlier']==0]['cust']
    
    return df_spt_not_outlier_cust



# 누락 데이터 삭제
def check_on_off(df_main, df, key="cust", col_nm="chnl_dv"):
    df_new = pd.DataFrame(df.groupby(key)[col_nm].apply(lambda x:list(set(x))))
    return df_main.merge(df_new, how="left", on=key).dropna(subset=["chnl_dv"])



# on/off 분류
def split_on_off(df, col_nm="chnl_dv"):
    df[col_nm] = df[col_nm].apply(lambda x: x[0] if len(x)==1 else 0)
    df_off = df.loc[df[col_nm]==1] ##off
    df_on = df.loc[df[col_nm]==2] ##on
    df_onf = df.loc[df[col_nm]==0] ##onf

    df_not_off = pd.concat([df_on, df_onf], axis=0, ignore_index=True)


    return df_off, df_not_off #off, not_off

def classification_buy_am(x, std_points):
    if x <= std_points[0]:
        return "D"
    elif std_points[0] < x <= std_points[1]:
        return "C"
    elif std_points[1] < x <= std_points[2]:
        return "B"
    else:
        return "A"

def split_buy_amount_by_4(df_main, df_sub, key='cust', col_nm='buy_am'):
    df_new = df_sub.groupby([key], as_index=False)[col_nm].sum()
    df_new2 = df_main.merge(df_new, on='cust', how='left')
    df_new2["am_class"] = df_new2[col_nm].apply(classification_buy_am, std_points=np.quantile(df_new2[col_nm], [.25, .5, .75]))
    

    df_A = df_new2[df_new2['am_class']=='A']
    df_B = df_new2[df_new2['am_class']=='B']
    df_C = df_new2[df_new2['am_class']=='C']
    df_D = df_new2[df_new2['am_class']=='D']
    
    return df_A, df_B, df_C, df_D
    

def domain_clustering_ver2(df_main, df_sub, df_sub2):

    not_outlier_cust = delete_outlier_DBSCAN(df_sub, df_sub2)

    df_main = df_main.loc[df_main['cust'].isin(not_outlier_cust.values.tolist())]

    df = check_on_off(df_main, df_sub)

    df_off, df_not_off= split_on_off(df)

    
    df_off_A, df_off_B, df_off_C, df_off_D  = split_buy_amount_by_4(df_off, df_sub)
    df_not_off_A, df_not_off_B, df_not_off_C, df_not_off_D  = split_buy_amount_by_4(df_not_off, df_sub)

    df_off_A['cluster'] = 0
    df_off_B['cluster'] = 1
    df_off_C['cluster'] = 2
    df_off_D['cluster'] = 3
    df_not_off_A['cluster'] = 4
    df_not_off_B['cluster'] = 5
    df_not_off_C['cluster'] = 6
    df_not_off_D['cluster'] = 7

    df_off_A_label = df_off_A[['cust', 'cluster']]
    df_off_B_label = df_off_B[['cust', 'cluster']]
    df_off_C_label = df_off_C[['cust', 'cluster']]
    df_off_D_label = df_off_D[['cust', 'cluster']]
    df_not_off_A_label = df_not_off_A[['cust', 'cluster']]
    df_not_off_B_label = df_not_off_B[['cust', 'cluster']]
    df_not_off_C_label = df_not_off_C[['cust', 'cluster']]
    df_not_off_D_label = df_not_off_D[['cust', 'cluster']]
    

        
    return df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label

## Result of 8 clusters

In [6]:
df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label = domain_clustering_ver2(df_cust, df_pdde, df_pd_clac) 

result example

In [7]:
df_off_A_label

Unnamed: 0,cust,cluster
3,M000261625,0
4,M000350564,0
7,M000508243,0
24,M001694463,0
25,M001697472,0
...,...,...
16498,M998346579,0
16508,M999213998,0
16509,M999227380,0
16511,M999340261,0


* A > B > C > D (buy amount 순)

|군집 번호|채널   |금액 등급|고객 수|
|:------:|:-----:|:------:|:-----:|
|0       |Offline|A       |4129   |
|1       |Offline|B       |4129   |
|2       |Offline|C       |4129   |
|3       |Offline|D       |4130   |
|4       |Online |A       |2132   |
|5       |Online |B       |2131   |
|6       |Online |C       |2131   |
|7       |Online |D       |2132   |

#  ====== osy clustering ======

**결과물은 pd.DataFrame**

|cust|cluster|
|:--:|:-----:|
|M0000000|0|
|M0000001|1|
|M0000002|3|

In [8]:
### 아래 함수에서 각각의 데이터프레임 넣어서 각각에 대한 군집 뽑아내서 붙이려고 하나로 합치고 시작했슴당

df_list = [df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label]
df_dom_clustered = pd.concat(df_list, ignore_index=True)

In [9]:
df_main = pd.merge(df_pdde, df_pd_clac, how='left', on='pd_c')

In [10]:
def merging(df1, df2):
    return pd.merge(df1, df2, how='left', on='cust')


def pivoting(df_main, df): 
    columns_default = list(df_main['clac_hlv_nm'].unique())
    df_res = pd.DataFrame(columns=columns_default)
    df_pt = pd.pivot_table(data=df,
                           values='buy_am',
                           index='cust',
                           columns='clac_hlv_nm',
                           aggfunc='sum',
                           fill_value=0)
    df_res = pd.concat([df_res, df_pt], ignore_index=False)
    df_res.fillna(0, inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_res)
    df_spt = pd.DataFrame(scaler.transform(df_res), index=df_pt.index, columns=df_res.columns)
    return df_spt


def get_inertia(df, k):
    Kmeans = KMeans(n_clusters=k, random_state=200)
    Kmeans.fit(df)
    return Kmeans.inertia_

def get_clusters_k(df):
    k_range = range(2, 20)
    k_result = None

    for k in k_range:
        minus = get_inertia(df, k) - get_inertia(df, k + 1)
        if k == 2:
            minus_before = minus
            minus_rate_max = 0
        else:
            minus_rate = minus_before - minus
            if minus_rate > minus_rate_max:
                k_result = k
                minus_rate_max = minus_rate
            minus_before = minus

    return k_result


def clustering(df_main, df_dom_clustered):   ### 여기서 돌릴 때, 초기 데이터이면 fit만 하고, column명들 저장. 초기 데이터 아니면 다음으로 넘어가서 transform만
    df = merging(df_main, df_dom_clustered)
    df_result = pd.DataFrame()
    db_clustered_k = len(df_dom_clustered['cluster'].unique())
    for i in range(db_clustered_k):
        df_ = df[df['cluster']==i]
        df_pt = pivoting(df_main, df_)
        k = get_clusters_k(df_pt)
        Kmeans_ = KMeans(n_clusters=k, random_state=200)
        Kmeans_.fit(df_pt)
        cluster = Kmeans_.predict(df_pt)
        df_pt['buy_am_cluster'] = cluster
        df_result = pd.concat([df_result, df_pt])
        # break
    df_result.fillna(0, inplace=True)
    df_result.reset_index(drop=False)
    df_result = df_result.merge(df_dom_clustered, how="left", on="cust")
    df_clustered_final = df_result.loc[:, ['cust', 'buy_am_cluster', 'cluster']]
    
    # 두 개로 나누어져있는 군집 하나의 열로 합치는 과정 필요합니다,,! 이거까지 하고 잘 자신이 없
    
    return df_clustered_final

In [11]:
df_osy = clustering(df_main, df_dom_clustered)



In [12]:
df_osy

Unnamed: 0,cust,buy_am_cluster,cluster
0,M000261625,0,0
1,M000350564,0,0
2,M000508243,0,0
3,M001694463,0,0
4,M001697472,0,0
...,...,...,...
25038,M996376807,8,7
25039,M997082506,6,7
25040,M998129365,9,7
25041,M998600186,8,7


In [13]:
df_osy.to_csv('../../result.csv', index=False)

## 저 두 칼럼 합쳐서 하나의 칼럼으로 만들어야하는데,, 도무지 어떻게 합쳐야할지,, for문 하나하나 다 돌리는 것 밖에는 생각이 나지 않습니다,,! 도와주세여 ㅎㅎ

# Find Neighbor

In [14]:
df_pdde_pd_clac = df_pdde.merge(df_pd_clac, how='left', on='pd_c')
df_pt = pd.pivot_table(data=df_pdde_pd_clac,
            values='buy_am',
            index='cust',
            columns='clac_hlv_nm',
            aggfunc='sum',
            fill_value=0)

df_pt.reset_index(inplace=True)

scaler = MinMaxScaler()
scaler.fit(df_pt.iloc[:,1:])
df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
df_spt_2 =  df_spt.reset_index()
df_spt_2

clac_hlv_nm,cust,가구,건강식품,건강용품,건해산물,...,테넌트/음식점,패션잡화,퍼스널케어,헬스/피트니스,화장품/뷰티케어
0,M000034966,0.0,0.000000,0.000000,0.000813,...,0.000000,0.000000,0.000000,0.000000,0.000000
1,M000136117,0.0,0.000000,0.000000,0.000000,...,0.003625,0.013671,0.002099,0.015225,0.000888
2,M000201112,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000
3,M000225114,0.0,0.000000,0.000000,0.000000,...,0.012579,0.000914,0.000000,0.000000,0.001373
4,M000261625,0.0,0.000000,0.000000,0.000000,...,0.044248,0.000076,0.018588,0.000000,0.012273
...,...,...,...,...,...,...,...,...,...,...,...
26912,M999599111,0.0,0.001076,0.000000,0.001166,...,0.000000,0.000000,0.000000,0.000000,0.000000
26913,M999673157,0.0,0.000000,0.000000,0.000000,...,0.000873,0.004577,0.000000,0.000000,0.000493
26914,M999770689,0.0,0.006508,0.001428,0.000000,...,0.009315,0.000078,0.000000,0.000000,0.000000
26915,M999849895,0.0,0.000000,0.000246,0.000000,...,0.008162,0.000114,0.000000,0.000000,0.000000


In [15]:
df_0_log = df_spt_2.loc[df_spt_2['cust'].isin(df_off_A_label['cust'].values.tolist())]
df_0_log

clac_hlv_nm,cust,가구,건강식품,건강용품,건해산물,...,테넌트/음식점,패션잡화,퍼스널케어,헬스/피트니스,화장품/뷰티케어
4,M000261625,0.000000,0.000000,0.0,0.000000,...,0.044248,0.000076,0.018588,0.0,0.012273
5,M000350564,0.000307,0.017852,0.0,0.000000,...,0.001342,0.008809,0.000000,0.0,0.002372
9,M000508243,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000867,0.001175,0.0,0.000038
36,M001694463,0.000675,0.000000,0.0,0.006057,...,0.037817,0.000000,0.001681,0.0,0.000019
38,M001697472,0.000000,0.000000,0.0,0.000000,...,0.000000,0.006398,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
26881,M998346579,0.000000,0.000000,0.0,0.000582,...,0.015264,0.000000,0.000000,0.0,0.000000
26903,M999213998,0.000000,0.000000,0.0,0.007301,...,0.002953,0.000133,0.004143,0.0,0.001237
26904,M999227380,0.000000,0.000000,0.0,0.002686,...,0.018580,0.000979,0.000000,0.0,0.000077
26907,M999340261,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000


In [16]:
from scipy.spatial.distance import cdist

dist_map = cdist(df_0_log.iloc[:,1:], df_0_log.iloc[:,1:], metric='cosine')
dist_map_df = pd.DataFrame(dist_map, index=df_0_log["cust"], columns=df_0_log["cust"])
dist_map_df

temp_dict = dict()
for i in dist_map_df.index.tolist():
   temp_dict[i] = dist_map_df.loc[i,:].sort_values()[1:4].index.tolist()

df_cust_near = pd.DataFrame(temp_dict)
df_cust_near_result = df_cust_near.T
df_cust_near_result

Unnamed: 0,0,1,2
M000261625,M377275195,M241834190,M955329376
M000350564,M908934722,M409274346,M216947560
M000508243,M103616477,M557010626,M941739760
M001694463,M368322777,M565688203,M009541134
M001697472,M151347977,M721943730,M456856307
...,...,...,...
M998346579,M157272450,M416491629,M342338249
M999213998,M362833541,M210687594,M354931617
M999227380,M591198612,M516251891,M117163228
M999340261,M705698429,M820229067,M773895513


In [17]:
def code_to_name(df :pd.DataFrame, col_name :str, df_pdc :pd.DataFrame):
    df_result = copy.deepcopy(df)
    matching_series = df_pdc.set_index("pd_c")
    df_result[col_name] = df[col_name].apply(lambda x: matching_series.loc[x, "pd_nm"])
    return df_result

In [19]:
final_result_df = pd.DataFrame()
recommend_dict = {}
except_list = []
for cust_num in tqdm(df_cust_near_result.index.tolist()):


    # sample은 이웃의 모든 구매 목록 리스트
    sample = df_pdde.loc[df_pdde["cust"].isin(df_cust_near_result.loc[cust_num,:].tolist())]['pd_c'].value_counts()

    # sample_result는 이웃의 모든 구매 목록 리스트와 Target(M000261625)의 구매 목록 리스트를 비교해 Target이 구매하지 않은 품목을 찾음
    sample_result = set(sample.index)-set(df_pdde.loc[df_pdde['cust']==cust_num]['pd_c'].values)

    # df_smp
    df_smp = pd.DataFrame()
    df_smp["neighbor_list"] = list(sample_result)
    df_smp["neighbor_buy_am"] = list(map(lambda x: sample[x] ,list(sample_result)))
    df_smp_nm = code_to_name(df_smp.sort_values("neighbor_buy_am", ascending=False), "neighbor_list", df_pd_clac)
    

    try:
        # sample_dff = pd.DataFrame(df_smp_nm, "neighbor_list", df_pd_clac)[0:3]['neighbor_list'].T
        sample_dff = df_smp_nm.iloc[:3,0].values.tolist()
        recommend_dict[cust_num] = sample_dff
    except:
        except_list.append((cust_num,len(df_smp_nm.iloc[:,0].values)))
        continue

final_result_df = pd.DataFrame(recommend_dict).T
    # sample_dff.columns = [0,1,2]
    # sample_dff.index = [cust_num]
    # final_result_df = pd.concat([final_result_df,sample_dff], axis=0, ignore_index=True)

# final_result_df

  0%|          | 0/4129 [00:00<?, ?it/s]

ValueError: All arrays must be of the same length

In [29]:
recommend_dict

{'M000261625': ['여성티셔츠/탑', '기타남성의류세트', '임대매출'],
 'M000350564': ['일반빵', '기타남성의류세트', '기타건과일'],
 'M000508243': ['기타여성의류아우터', '기타국산과일류', '기타남성의류세트'],
 'M001694463': ['푸드코트한식', '기타남성의류세트', '빵/케이크'],
 'M001697472': ['국산담배', '수입맥주', '국물용기라면'],
 'M001729158': ['종량제봉투', '생활잡화균일가', '일반스낵'],
 'M001826589': ['레드와인', '리큐르', '생수'],
 'M002516882': ['기타남성의류세트', '푸드코트한식', '남성슬립온'],
 'M003641896': ['커피/음료', '기타남성의류세트', '남성트레이닝복'],
 'M003842970': ['커피/음료', '스타킹', '즉석어묵'],
 'M004292419': ['남성런닝/트레이닝화', '푸드코트한식', '임대매출'],
 'M004599697': ['수입담배', '전자담배용리필', '기타패션잡화'],
 'M004743281': ['골프필드용품', '푸드코트컨세션', '여성골프티셔츠/탑'],
 'M005190060': ['기타파티/팬시용품', '냉동밥', '국산돼지삼겹살'],
 'M005297673': ['디저트', '요가/필라테스복', '푸드코트컨세션'],
 'M005484498': ['생활잡화균일가', '생리대', '수박'],
 'M005515605': ['기타피트니스기구', '스포츠댄스의류/용품', '남성일반양말'],
 'M005930833': ['메이크업브러쉬', '여성블라우스', '여성티셔츠/탑'],
 'M006103201': ['일반스낵', '기타여성의류아우터', '마시는요구르트'],
 'M006216667': ['고양이간식', '베이커리', '장바구니'],
 'M006276409': ['바초콜릿', '기타파티/팬시용품', '푸드코트컨세션'],
 'M006658403': ['디

In [25]:
final_result_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in recommend_dict.items() ]))
final_result_df.T

  final_result_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in recommend_dict.items() ]))


Unnamed: 0,0,1,2
M000261625,여성티셔츠/탑,기타남성의류세트,임대매출
M000350564,일반빵,기타남성의류세트,기타건과일
M000508243,기타여성의류아우터,기타국산과일류,기타남성의류세트
M001694463,푸드코트한식,기타남성의류세트,빵/케이크
M001697472,국산담배,수입맥주,국물용기라면
...,...,...,...
M998346579,국산맥주,수입맥주,여성티셔츠/탑
M999213998,이유식,고추,냉장국탕류
M999227380,일반스낵,기타파티/팬시용품,기타아웃도어/레저용품
M999340261,기타컴퓨터액세서리,키보드,저장장치


In [None]:
#함수화

##2만여명 고객 품목별 구매금액 Scale

def pivot_table_for_recommed(df_pd, df_pdc):

    df_pdde_pd_clac = df_pd.merge(df_pdc, how='left', on='pd_c')
    df_pt = pd.pivot_table(data=df_pdde_pd_clac,
                values='buy_am',
                index='cust',
                columns='clac_hlv_nm',
                aggfunc='sum',
                fill_value=0)

    df_pt.reset_index(inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_pt.iloc[:,1:])
    df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
    df_spt_2 =  df_spt.reset_index()
    return df_spt_2

fit_scaler()


##클러스터 별로 나누는 함수 정의 필요.


df_0_log = df_spt_2.loc[df_spt_2['cust'].isin(df_off_A_label['cust'].values.tolist())]
df_0_log


##가까운 3명 뽑기 위한 dist map생성 df<-클러스터별로 나누는 함수가 들어간다.
def make_dist_map_pick_near_3(df):
    dist_map = cdist(df.iloc[:,1:], df.iloc[:,1:], metric='cosine')
    dist_map_df = pd.DataFrame(dist_map, index=df["cust"], columns=df["cust"])
    return dist_map_df

    temp_dict = dict()
    for i in dist_map_df.index.tolist():
        temp_dict[i] = dist_map_df.loc[i,:].sort_values()[1:4].index.tolist()

    df_cust_near = pd.DataFrame(temp_dict)
    df_cust_near_result = df_cust_near.T

    return df_cust_near_result


def code_to_name(df :pd.DataFrame, col_name :str, df_pdc :pd.DataFrame):
    df_result = copy.deepcopy(df)
    matching_series = df_pdc.set_index("pd_c")
    df_result[col_name] = df[col_name].apply(lambda x: matching_series.loc[x, "pd_nm"])
    return df_result



def make_cust_recommend_item(df, cust_list):
    final_result_df = pd.DataFrame()
    recommend_dict = {}
    except_list = []
    for cust_num in tqdm(cust_list):
    # sample은 이웃의 모든 구매 목록 리스트
        sample = df_pdde.loc[df_pdde["cust"].isin(df_cust_near_result.loc[cust_num,:].tolist())]['pd_c'].value_counts()

    # sample_result는 이웃의 모든 구매 목록 리스트와 Target(M000261625)의 구매 목록 리스트를 비교해 Target이 구매하지 않은 품목을 찾음
        sample_result = set(sample.index)-set(df_pdde.loc[df_pdde['cust']==cust_num]['pd_c'].values)

    # df_smp
        df_smp = pd.DataFrame()
        df_smp["neighbor_list"] = list(sample_result)
        df_smp["neighbor_buy_am"] = list(map(lambda x: sample[x] ,list(sample_result)))
        df_smp_nm = code_to_name(df_smp.sort_values("neighbor_buy_am", ascending=False), "neighbor_list", df_pd_clac)
    
        try:
            # sample_dff = pd.DataFrame(df_smp_nm, "neighbor_list", df_pd_clac)[0:3]['neighbor_list'].T
            sample_dff = df_smp_nm.iloc[:3,0].values.tolist()
            recommend_dict[cust_num] = sample_dff
        except:
            except_list.append((cust_num,len(df_smp_nm.iloc[:,0].values)))
            continue

    final_result_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in recommend_dict.items() ]))
    final_df = final_result_df.T
    # sample_dff.columns = [0,1,2]
    # sample_dff.index = [cust_num]
    # final_result_df = pd.concat([final_result_df,sample_dff], axis=0, ignore_index=True)

    return final_df, except_list

In [None]:
def recommend_3_item():
    pivot_table_for_recommed(df_pdde, df_pd_clac)
    make_dist_map_pick_near_3(df)
    code_to_name(df :pd.DataFrame, col_name :str, df_pdc :pd.DataFrame)
    make_cust_recommend_item(df, cust_list)

# Association Rules about neighbors

연관 규칙 도출 Flow 함수화

In [31]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import *

def make_association_rule(df, order="rct_no", prod_code="pd_c", min_support=.003, min_confidence=.1):
    # 장바구니 별 상품 리스트
    product_list_per_order = df.groupby(order)[prod_code].apply(list)
    
    # 연관 규칙을 위한 encoding
    encoder = TransactionEncoder()
    one_hot_df = encoder.fit_transform(product_list_per_order)
    one_hot_df = pd.DataFrame(one_hot_df, columns=encoder.columns_)
    
    # 연관 규칙 도출
    frequent_item_df = apriori(one_hot_df, min_support=min_support)
    result = association_rules(frequent_item_df, metric="confidence", min_threshold=min_confidence)
    
    # 정리후 리턴
    result = result[["antecedents", "consequents", "support", "confidence"]].sort_values(by="confidence", ascending=False).reset_index(drop=True)
    result["antecedents"] = result["antecedents"].apply(lambda x: df.loc[df[prod_code]==one_hot_df.columns[list(x)[0]]][prod_code].values[0])
    result["consequents"] = result["consequents"].apply(lambda x: df.loc[df[prod_code]==one_hot_df.columns[list(x)[0]]][prod_code].values[0])

    return result

In [32]:
df_cust_near_result

Unnamed: 0,0,1,2
M000261625,M377275195,M241834190,M955329376
M000350564,M908934722,M409274346,M216947560
M000508243,M103616477,M557010626,M941739760
M001694463,M368322777,M565688203,M009541134
M001697472,M151347977,M721943730,M456856307
...,...,...,...
M998346579,M157272450,M416491629,M342338249
M999213998,M362833541,M210687594,M354931617
M999227380,M591198612,M516251891,M117163228
M999340261,M705698429,M820229067,M773895513
