In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

%matplotlib inline
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 15
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False
plt.style.use("ggplot")

pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)

## 0.Data load

In [2]:
# choongs load
df_cust = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
df_pdde = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv")
df_cop_u = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")
df_pd_clac = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# ## osy load
# df_cust = pd.read_csv("../../LPOINT_BIG_COMP_01_DEMO.csv")
# df_pdde = pd.read_csv("../../LPOINT_BIG_COMP_02_PDDE.csv")
# df_cop_u = pd.read_csv("../../LPOINT_BIG_COMP_03_COP_U.csv")
# df_pd_clac = pd.read_csv("../../LPOINT_BIG_COMP_04_PD_CLAC.csv")

In [4]:
# # choongs load
# df_cust = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
# df_pdde = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv")
# df_cop_u = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")
# df_pd_clac = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv")

## 1.Domain Clustering Function

In [5]:
#DBSCAN을 통해 이상치 제거
def delete_outlier_DBSCAN(df_pdde, df_pd_clac):
    df_pdde_pd_clac = df_pdde.merge(df_pd_clac, how='left', on='pd_c')
    df_pt = pd.pivot_table(data=df_pdde_pd_clac,
               values='buy_am',
               index='cust',
               columns='clac_hlv_nm',
               aggfunc='sum',
               fill_value=0)

    df_pt.reset_index(inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_pt.iloc[:,1:])
    df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
    df_spt_2 =  df_spt.reset_index()

    dbscan = DBSCAN(eps = 0.1, min_samples = 30, metric = "euclidean")
    dbscan_labels = dbscan.fit_predict(df_spt_2.iloc[:, 1:])
    df_spt_2['outlier'] = dbscan_labels

    df_spt_not_outlier_cust = df_spt_2[df_spt_2['outlier']==0]['cust']
    
    return df_spt_not_outlier_cust



# 누락 데이터 삭제
def check_on_off(df_main, df, key="cust", col_nm="chnl_dv"):
    df_new = pd.DataFrame(df.groupby(key)[col_nm].apply(lambda x:list(set(x))))
    return df_main.merge(df_new, how="left", on=key).dropna(subset=["chnl_dv"])



# on/off 분류
def split_on_off(df, col_nm="chnl_dv"):
    df[col_nm] = df[col_nm].apply(lambda x: x[0] if len(x)==1 else 0)
    df_off = df.loc[df[col_nm]==1] ##off
    df_on = df.loc[df[col_nm]==2] ##on
    df_onf = df.loc[df[col_nm]==0] ##onf

    df_not_off = pd.concat([df_on, df_onf], axis=0, ignore_index=True)


    return df_off, df_not_off #off, not_off

def classification_buy_am(x, std_points):
    if x <= std_points[0]:
        return "D"
    elif std_points[0] < x <= std_points[1]:
        return "C"
    elif std_points[1] < x <= std_points[2]:
        return "B"
    else:
        return "A"

def split_buy_amount_by_4(df_main, df_sub, key='cust', col_nm='buy_am'):
    df_new = df_sub.groupby([key], as_index=False)[col_nm].sum()
    df_new2 = df_main.merge(df_new, on='cust', how='left')
    df_new2["am_class"] = df_new2[col_nm].apply(classification_buy_am, std_points=np.quantile(df_new2[col_nm], [.25, .5, .75]))
    

    df_A = df_new2[df_new2['am_class']=='A']
    df_B = df_new2[df_new2['am_class']=='B']
    df_C = df_new2[df_new2['am_class']=='C']
    df_D = df_new2[df_new2['am_class']=='D']
    
    return df_A, df_B, df_C, df_D
    

def domain_clustering_ver2(df_main, df_sub, df_sub2):

    not_outlier_cust = delete_outlier_DBSCAN(df_sub, df_sub2)

    df_main = df_main.loc[df_main['cust'].isin(not_outlier_cust.values.tolist())]

    df = check_on_off(df_main, df_sub)

    df_off, df_not_off= split_on_off(df)

    
    df_off_A, df_off_B, df_off_C, df_off_D  = split_buy_amount_by_4(df_off, df_sub)
    df_not_off_A, df_not_off_B, df_not_off_C, df_not_off_D  = split_buy_amount_by_4(df_not_off, df_sub)

    df_off_A['cluster'] = 0
    df_off_B['cluster'] = 1
    df_off_C['cluster'] = 2
    df_off_D['cluster'] = 3
    df_not_off_A['cluster'] = 4
    df_not_off_B['cluster'] = 5
    df_not_off_C['cluster'] = 6
    df_not_off_D['cluster'] = 7

    df_off_A_label = df_off_A[['cust', 'cluster']]
    df_off_B_label = df_off_B[['cust', 'cluster']]
    df_off_C_label = df_off_C[['cust', 'cluster']]
    df_off_D_label = df_off_D[['cust', 'cluster']]
    df_not_off_A_label = df_not_off_A[['cust', 'cluster']]
    df_not_off_B_label = df_not_off_B[['cust', 'cluster']]
    df_not_off_C_label = df_not_off_C[['cust', 'cluster']]
    df_not_off_D_label = df_not_off_D[['cust', 'cluster']]
    

        
    return df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label

In [6]:
df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label = domain_clustering_ver2(df_cust, df_pdde, df_pd_clac) 

## 2.K means Clustering Function

In [8]:
### 아래 함수에서 각각의 데이터프레임 넣어서 각각에 대한 군집 뽑아내서 붙이려고 하나로 합치고 시작했슴당

df_list = [df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label]
df_dom_clustered = pd.concat(df_list, ignore_index=True)

In [9]:
df_main = pd.merge(df_pdde, df_pd_clac, how='left', on='pd_c')

In [10]:
def merging(df1, df2):
    return pd.merge(df1, df2, how='left', on='cust')


# def pivoting(df_main, df):
def pivoting(df_main, df):
    columns_default = list(df_main['clac_hlv_nm'].unique())
    df_res = pd.DataFrame(columns=columns_default)
    df_pt = pd.pivot_table(data=df,
                           values='buy_am',
                           index='cust',
                           columns='clac_hlv_nm',
                           aggfunc='sum',
                           fill_value=0)
    df_res = pd.concat([df_res, df_pt], ignore_index=False, axis=0)
    df_res.fillna(0, inplace=True)

    # scaler = MinMaxScaler()
    # scaler.fit(df_res)
    # df_spt = pd.DataFrame(scaler.transform(df_res), index=df_pt.indeㄴx, columns=df_res.columns)
    # return df_spt
    return df_res


### 추가한 부분 ###
def fit_scaler(df):
    scaler = MinMaxScaler()
    scaler.fit(df)
    return scaler

def transform_scaler(df, scaler):
    return pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
###################

def get_inertia(df, k):
    Kmeans = KMeans(n_clusters=k, random_state=200)
    Kmeans.fit(df)
    return Kmeans.inertia_

def get_clusters_k(df):
    k_range = range(2, 20)
    k_result = None

    for k in k_range:
        minus = get_inertia(df, k) - get_inertia(df, k + 1)
        if k == 2:
            minus_before = minus
            minus_rate_max = 0
        else:
            minus_rate = minus_before - minus
            if minus_rate > minus_rate_max:
                k_result = k
                minus_rate_max = minus_rate
            minus_before = minus

    return k_result


def clustering(df_main, df_dom_clustered):   ### 여기서 돌릴 때, 초기 데이터이면 fit만 하고, column명들 저장. 초기 데이터 아니면 다음으로 넘어가서 transform만
    df = merging(df_main, df_dom_clustered)
    
    ### 전체 스케일링 먼저 ###
    df_total_pt = pivoting(df, df)
    mms_scaler = fit_scaler(df_total_pt)
    ###
    
    df_result = pd.DataFrame()
    db_clustered_k = df_dom_clustered['cluster'].nunique()
    for i in range(db_clustered_k):
        df_ = df[df['cluster']==i]
        df_pt = pivoting(df_main, df_)
        
        ### 추가 라인 ###
        k = get_clusters_k(transform_scaler(df_pt, mms_scaler))
        ###
        
        # k = get_clusters_k(df_pt)
        Kmeans_ = KMeans(n_clusters=k, random_state=200)
        Kmeans_.fit(df_pt)
        cluster = Kmeans_.predict(df_pt)
        df_pt['buy_am_cluster'] = cluster
        df_result = pd.concat([df_result, df_pt])
        # break
    df_result.fillna(0, inplace=True)
    # df_result.reset_index(drop=False)
    df_result.reset_index(drop=False, inplace=True)
    df_result.rename({"index":"cust"}, axis=1, inplace=True)
    df_result = df_result.merge(df_dom_clustered, how="left", on="cust")
    df_clustered_final = df_result.loc[:, ['cust', 'buy_am_cluster', 'cluster']]
    
    return df_clustered_final

In [None]:
# df_total_pt = pivoting(df_main)
# mms_scaler = fit_scaler(df_total_pt)
# transform_scaler(df_total_pt, mms_scaler)
df_osy = clustering(df_main, df_dom_clustered)

In [14]:
comb_cluster = df_osy.iloc[:,1].astype(str)+df_osy.iloc[:,2].astype(str)
df_osy["comb_cluster"] = comb_cluster.apply(lambda x: comb_cluster.unique().tolist().index(x))
df_osy

Unnamed: 0,cust,buy_am_cluster,cluster,comb_cluster
0,M000261625,0,0,0
1,M000350564,4,0,1
2,M000508243,0,0,0
3,M001694463,0,0,0
4,M001697472,0,0,0
...,...,...,...,...
25038,M996376807,1,7,26
25039,M997082506,2,7,28
25040,M998129365,0,7,27
25041,M998600186,0,7,27


## 3.Recommending Function

In [15]:
#함수화
import copy
from tqdm.notebook import tqdm

##2만여명 고객 품목별 구매금액 Scale

def pivot_table_for_recommed(df_pd, df_pdc):
    df_norm = df_pd.loc[df_pdde["cust"].isin(delete_outlier_DBSCAN(df_pd, df_pdc).values.tolist())]
    df_pdde_pd_clac = df_norm.merge(df_pdc, how='left', on='pd_c')
    df_pt = pd.pivot_table(data=df_pdde_pd_clac,
                values='buy_am',
                index='cust',
                columns='clac_hlv_nm',
                aggfunc='sum',
                fill_value=0)

    df_pt.reset_index(inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_pt.iloc[:,1:])
    df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
    df_spt_2 =  df_spt.reset_index()
    return df_spt_2


##클러스터 별로 나누는 함수 정의 필요.




##가까운 3명 뽑기 위한 dist map생성 df<-클러스터별로 나누는 함수가 들어간다.
def make_dist_map_pick_near_3(df):
    dist_map = cdist(df.iloc[:,1:], df.iloc[:,1:], metric='cosine')
    dist_map_df = pd.DataFrame(dist_map, index=df["cust"], columns=df["cust"])

    temp_dict = dict()
    for i in dist_map_df.index.tolist():
        temp_dict[i] = dist_map_df.loc[i,:].sort_values()[1:4].index.tolist()

    df_cust_near = pd.DataFrame(temp_dict)
    df_cust_near_result = df_cust_near.T

    return df_cust_near_result


def code_to_name(df :pd.DataFrame, col_name :str, df_pdc :pd.DataFrame):
    df_result = copy.deepcopy(df)
    matching_series = df_pdc.set_index("pd_c")
    df_result[col_name] = df[col_name].apply(lambda x: matching_series.loc[x, "pd_nm"])
    return df_result



def make_cust_recommend_item(df, df_pd, df_pdc):
    
    recommend_dict = {}
    except_list = []

    for cust_num in tqdm(df.index):
    # sample은 이웃의 모든 구매 목록 리스트
        sample = df_pd.loc[df_pd["cust"].isin(df.loc[cust_num,:].tolist())]['pd_c'].value_counts()

    # sample_result는 이웃의 모든 구매 목록 리스트와 Target(M000261625)의 구매 목록 리스트를 비교해 Target이 구매하지 않은 품목을 찾음
        sample_result = set(sample.index)-set(df_pd.loc[df_pdde['cust']==cust_num]['pd_c'].values)

    # df_smp
        df_smp = pd.DataFrame()
        df_smp["neighbor_list"] = list(sample_result)
        df_smp["neighbor_buy_am"] = list(map(lambda x: sample[x] ,list(sample_result)))
        df_smp_nm = code_to_name(df_smp.sort_values("neighbor_buy_am", ascending=False), "neighbor_list", df_pdc)
    
        try:
            # sample_dff = pd.DataFrame(df_smp_nm, "neighbor_list", df_pd_clac)[0:3]['neighbor_list'].T
            sample_dff = df_smp_nm.iloc[:3,0].values.tolist()
            recommend_dict[cust_num] = sample_dff
        except:
            except_list.append((cust_num,len(df_smp_nm.iloc[:,0].values)))
            continue

    final_result_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in recommend_dict.items() ]))
    final_df = final_result_df.T
    # sample_dff.columns = [0,1,2]
    # sample_dff.index = [cust_num]
    # final_result_df = pd.concat([final_result_df,sample_dff], axis=0, ignore_index=True)

    return final_df, except_list

In [21]:
df_normal_dist = pivot_table_for_recommed(df_pdde, df_pd_clac)
df_normal_dist

clac_hlv_nm,cust,가구,건강식품,건강용품,건해산물,...,테넌트/음식점,패션잡화,퍼스널케어,헬스/피트니스,화장품/뷰티케어
0,M000034966,0.000000,0.000000,0.000000,0.005936,...,0.000000,0.000000,0.000000,0.0,0.000000
1,M000201112,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000
2,M000225114,0.000000,0.000000,0.000000,0.000000,...,0.046184,0.004713,0.000000,0.0,0.018674
3,M000261625,0.000000,0.000000,0.000000,0.000000,...,0.162458,0.000392,0.136348,0.0,0.166908
4,M000350564,0.002474,0.080532,0.000000,0.000000,...,0.004929,0.045439,0.000000,0.0,0.032260
...,...,...,...,...,...,...,...,...,...,...,...
25038,M999599111,0.000000,0.004852,0.000000,0.008511,...,0.000000,0.000000,0.000000,0.0,0.000000
25039,M999673157,0.000000,0.000000,0.000000,0.000000,...,0.003204,0.023606,0.000000,0.0,0.006706
25040,M999770689,0.000000,0.029356,0.007665,0.000000,...,0.034202,0.000402,0.000000,0.0,0.000000
25041,M999849895,0.000000,0.000000,0.001319,0.000000,...,0.029968,0.000590,0.000000,0.0,0.000000


In [None]:
df_normal_dist = pivot_table_for_recommed(df_pdde, df_pd_clac)

cluster_recommend_dict = dict()
cluster_exception = dict()

for clust_no in df_osy["comb_cluster"].unique():
    ## -- 클러스터 1개 당 df 정의
    clust_list = df_osy.loc[df_osy["comb_cluster"]==clust_no]["cust"].values.tolist()
    df_similar = df_normal_dist.loc[df_normal_dist["cust"].isin(clust_list)]
    cluster_recommend_dict[clust_no], cluster_exception[clust_no] = make_cust_recommend_item(make_dist_map_pick_near_3(df_similar),df_pdde, df_pd_clac)

# Association Rules about neighbors

연관 규칙 도출 Flow 함수화

In [29]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import *

def make_association_rule(df, order="rct_no", prod_code="pd_c", min_support=.003, min_confidence=.1):
    # 장바구니 별 상품 리스트
    product_list_per_order = df.groupby(order)[prod_code].apply(list)
    
    # 연관 규칙을 위한 encoding
    encoder = TransactionEncoder()
    one_hot_df = encoder.fit_transform(product_list_per_order)
    one_hot_df = pd.DataFrame(one_hot_df, columns=encoder.columns_)
    
    # 연관 규칙 도출
    frequent_item_df = apriori(one_hot_df, min_support=min_support)
    result = association_rules(frequent_item_df, metric="confidence", min_threshold=min_confidence)
    
    # 정리후 리턴
    result = result[["antecedents", "consequents", "support", "confidence"]].sort_values(by="confidence", ascending=False).reset_index(drop=True)
    result["antecedents"] = result["antecedents"].apply(lambda x: df.loc[df[prod_code]==one_hot_df.columns[list(x)[0]]][prod_code].values[0])
    result["consequents"] = result["consequents"].apply(lambda x: df.loc[df[prod_code]==one_hot_df.columns[list(x)[0]]][prod_code].values[0])

    return result