## library load

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN

%matplotlib inline
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 15
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False
plt.style.use("ggplot")

pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)

## data load

In [2]:
df_cust = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
df_pdde = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv")
df_cop_u = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")
df_pd_clac = pd.read_csv("../../LPOINT_BIG_COMP/LPOINT_BIG_COMP_04_PD_CLAC.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


## Function

In [3]:
#DBSCAN을 통해 이상치 제거
def delete_outlier_DBSCAN(df_pdde, df_pd_clac):
    df_pdde_pd_clac = df_pdde.merge(df_pd_clac, how='left', on='pd_c')
    df_pt = pd.pivot_table(data=df_pdde_pd_clac,
               values='buy_am',
               index='cust',
               columns='clac_hlv_nm',
               aggfunc='sum',
               fill_value=0)

    df_pt.reset_index(inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(df_pt.iloc[:,1:])
    df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
    df_spt_2 =  df_spt.reset_index()

    dbscan = DBSCAN(eps = 0.1, min_samples = 30, metric = "euclidean")
    dbscan_labels = dbscan.fit_predict(df_spt_2.iloc[:, 1:])
    df_spt_2['outlier'] = dbscan_labels

    df_spt_not_outlier_cust = df_spt_2[df_spt_2['outlier']==0]['cust']
    
    return df_spt_not_outlier_cust



# 누락 데이터 삭제
def check_on_off(df_main, df, key="cust", col_nm="chnl_dv"):
    df_new = pd.DataFrame(df.groupby(key)[col_nm].apply(lambda x:list(set(x))))
    return df_main.merge(df_new, how="left", on=key).dropna(subset=["chnl_dv"])



# on/off 분류
def split_on_off(df, col_nm="chnl_dv"):
    df[col_nm] = df[col_nm].apply(lambda x: x[0] if len(x)==1 else 0)
    df_off = df.loc[df[col_nm]==1] ##off
    df_on = df.loc[df[col_nm]==2] ##on
    df_onf = df.loc[df[col_nm]==0] ##onf

    df_not_off = pd.concat([df_on, df_onf], axis=0, ignore_index=True)


    return df_off, df_not_off #off, not_off

def classification_buy_am(x, std_points):
    if x <= std_points[0]:
        return "D"
    elif std_points[0] < x <= std_points[1]:
        return "C"
    elif std_points[1] < x <= std_points[2]:
        return "B"
    else:
        return "A"

def split_buy_amount_by_4(df_main, df_sub, key='cust', col_nm='buy_am'):
    df_new = df_sub.groupby([key], as_index=False)[col_nm].sum()
    df_new2 = df_main.merge(df_new, on='cust', how='left')
    df_new2["am_class"] = df_new2[col_nm].apply(classification_buy_am, std_points=np.quantile(df_new2[col_nm], [.25, .5, .75]))
    

    df_A = df_new2[df_new2['am_class']=='A']
    df_B = df_new2[df_new2['am_class']=='B']
    df_C = df_new2[df_new2['am_class']=='C']
    df_D = df_new2[df_new2['am_class']=='D']
    
    return df_A, df_B, df_C, df_D
    

def domain_clustering_ver2(df_main, df_sub, df_sub2):

    not_outlier_cust = delete_outlier_DBSCAN(df_sub, df_sub2)

    df_main = df_main.loc[df_main['cust'].isin(not_outlier_cust.values.tolist())]

    df = check_on_off(df_main, df_sub)

    df_off, df_not_off= split_on_off(df)

    
    df_off_A, df_off_B, df_off_C, df_off_D  = split_buy_amount_by_4(df_off, df_sub)
    df_not_off_A, df_not_off_B, df_not_off_C, df_not_off_D  = split_buy_amount_by_4(df_not_off, df_sub)

    df_off_A['cluster'] = 0
    df_off_B['cluster'] = 1
    df_off_C['cluster'] = 2
    df_off_D['cluster'] = 3
    df_not_off_A['cluster'] = 4
    df_not_off_B['cluster'] = 5
    df_not_off_C['cluster'] = 6
    df_not_off_D['cluster'] = 7

    df_off_A_label = df_off_A[['cust', 'cluster']]
    df_off_B_label = df_off_B[['cust', 'cluster']]
    df_off_C_label = df_off_C[['cust', 'cluster']]
    df_off_D_label = df_off_D[['cust', 'cluster']]
    df_not_off_A_label = df_not_off_A[['cust', 'cluster']]
    df_not_off_B_label = df_not_off_B[['cust', 'cluster']]
    df_not_off_C_label = df_not_off_C[['cust', 'cluster']]
    df_not_off_D_label = df_not_off_D[['cust', 'cluster']]
    

        
    return df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label

## Result of 8 clusters

In [4]:
df_off_A_label, df_off_B_label, df_off_C_label, df_off_D_label, df_not_off_A_label, df_not_off_B_label, df_not_off_C_label, df_not_off_D_label = domain_clustering_ver2(df_cust, df_pdde, df_pd_clac) 

result example

In [5]:
df_off_A_label

Unnamed: 0,cust,cluster
3,M000261625,0
4,M000350564,0
7,M000508243,0
24,M001694463,0
25,M001697472,0
...,...,...
16498,M998346579,0
16508,M999213998,0
16509,M999227380,0
16511,M999340261,0


1. 0번 클러스터 OFF A (4129명)
2. 1번 클러스터 OFF B (4129명)
3. 2번 클러스터 OFF C (4129명)
4. 3번 클러스터 OFF D (4130명)
5. 4번 클러스터 NOT_OFF A (2132명)
6. 5번 클러스터 NOT_OFF B (2131명)
7. 6번 클러스터 NOT_OFF C (2131명)
8. 7번 클러스터 NOT_OFF D (2132명)

A > B > C > D (buy amount 순)

In [6]:
df_pdde_pd_clac = df_pdde.merge(df_pd_clac, how='left', on='pd_c')
df_pt = pd.pivot_table(data=df_pdde_pd_clac,
            values='buy_am',
            index='cust',
            columns='clac_hlv_nm',
            aggfunc='sum',
            fill_value=0)

df_pt.reset_index(inplace=True)

scaler = MinMaxScaler()
scaler.fit(df_pt.iloc[:,1:])
df_spt = pd.DataFrame(scaler.transform(df_pt.iloc[:,1:]), index=df_pt["cust"], columns=df_pt.columns[1:])
df_spt_2 =  df_spt.reset_index()
df_spt_2

clac_hlv_nm,cust,가구,건강식품,건강용품,건해산물,...,테넌트/음식점,패션잡화,퍼스널케어,헬스/피트니스,화장품/뷰티케어
0,M000034966,0.0,0.000000,0.000000,0.000813,...,0.000000,0.000000,0.000000,0.000000,0.000000
1,M000136117,0.0,0.000000,0.000000,0.000000,...,0.003625,0.013671,0.002099,0.015225,0.000888
2,M000201112,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000
3,M000225114,0.0,0.000000,0.000000,0.000000,...,0.012579,0.000914,0.000000,0.000000,0.001373
4,M000261625,0.0,0.000000,0.000000,0.000000,...,0.044248,0.000076,0.018588,0.000000,0.012273
...,...,...,...,...,...,...,...,...,...,...,...
26912,M999599111,0.0,0.001076,0.000000,0.001166,...,0.000000,0.000000,0.000000,0.000000,0.000000
26913,M999673157,0.0,0.000000,0.000000,0.000000,...,0.000873,0.004577,0.000000,0.000000,0.000493
26914,M999770689,0.0,0.006508,0.001428,0.000000,...,0.009315,0.000078,0.000000,0.000000,0.000000
26915,M999849895,0.0,0.000000,0.000246,0.000000,...,0.008162,0.000114,0.000000,0.000000,0.000000


In [8]:
df_0_log = df_spt_2.loc[df_spt_2['cust'].isin(df_off_A_label['cust'].values.tolist())]
df_0_log

clac_hlv_nm,cust,가구,건강식품,건강용품,건해산물,...,테넌트/음식점,패션잡화,퍼스널케어,헬스/피트니스,화장품/뷰티케어
4,M000261625,0.000000,0.000000,0.0,0.000000,...,0.044248,0.000076,0.018588,0.0,0.012273
5,M000350564,0.000307,0.017852,0.0,0.000000,...,0.001342,0.008809,0.000000,0.0,0.002372
9,M000508243,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000867,0.001175,0.0,0.000038
36,M001694463,0.000675,0.000000,0.0,0.006057,...,0.037817,0.000000,0.001681,0.0,0.000019
38,M001697472,0.000000,0.000000,0.0,0.000000,...,0.000000,0.006398,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
26881,M998346579,0.000000,0.000000,0.0,0.000582,...,0.015264,0.000000,0.000000,0.0,0.000000
26903,M999213998,0.000000,0.000000,0.0,0.007301,...,0.002953,0.000133,0.004143,0.0,0.001237
26904,M999227380,0.000000,0.000000,0.0,0.002686,...,0.018580,0.000979,0.000000,0.0,0.000077
26907,M999340261,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000


In [25]:
from scipy.spatial.distance import cdist

dist_map = cdist(df_0_log.iloc[:,1:], df_0_log.iloc[:,1:], metric='cosine')
dist_map_df = pd.DataFrame(dist_map, index=df_0_log["cust"], columns=df_0_log["cust"])
dist_map_df

temp_dict = dict()
for i in dist_map_df.index.tolist():
   temp_dict[i] = dist_map_df.loc[i,:].sort_values()[1:4].index.tolist()

df_cust_near = pd.DataFrame(temp_dict)
df_cust_near_result = df_cust_near.T
df_cust_near_result

Unnamed: 0,0,1,2
M000261625,M377275195,M241834190,M955329376
M000350564,M908934722,M409274346,M216947560
M000508243,M103616477,M557010626,M941739760
M001694463,M368322777,M565688203,M009541134
M001697472,M151347977,M721943730,M456856307
...,...,...,...
M998346579,M157272450,M416491629,M342338249
M999213998,M362833541,M210687594,M354931617
M999227380,M591198612,M516251891,M117163228
M999340261,M705698429,M820229067,M773895513


In [26]:
df_cust_near_result

Unnamed: 0,0,1,2
M000261625,M377275195,M241834190,M955329376
M000350564,M908934722,M409274346,M216947560
M000508243,M103616477,M557010626,M941739760
M001694463,M368322777,M565688203,M009541134
M001697472,M151347977,M721943730,M456856307
...,...,...,...
M998346579,M157272450,M416491629,M342338249
M999213998,M362833541,M210687594,M354931617
M999227380,M591198612,M516251891,M117163228
M999340261,M705698429,M820229067,M773895513


In [34]:
set(df_pdde.loc[df_pdde['cust']=="M000261625"]['pd_c'].values)

{'PD0232',
 'PD0415',
 'PD0419',
 'PD0640',
 'PD0777',
 'PD0873',
 'PD0951',
 'PD0965',
 'PD1255',
 'PD1340',
 'PD1683',
 'PD1684',
 'PD1685',
 'PD1689',
 'PD1690',
 'PD1692',
 'PD1694',
 'PD1824',
 'PD1827',
 'PD1839',
 'PD1840',
 'PD1888',
 'PD1889',
 'PD1893',
 'PD1894',
 'PD1918',
 'PD1919',
 'PD1922',
 'PD1923',
 'PD1924',
 'PD1926',
 'PD1928',
 'PD1929'}

In [36]:
sample = df_pdde.loc[df_pdde["cust"].isin(df_cust_near_result.loc["M000261625",:].tolist())]['pd_c'].value_counts()

In [39]:
sample_result = set(sample.index)-set(df_pdde.loc[df_pdde['cust']=="M000261625"]['pd_c'].values)
list(map(lambda x: sample[x] ,list(sample_result)))

<map at 0x1ee064e31f0>

In [None]:
df_cust_near_result

In [21]:
df_pdde

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,20210101,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,20210101,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
3,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,20210101,10,19000.0,1
...,...,...,...,...,...,...,...,...,...,...
4381738,M533286446,E06052119978,2,A06,,PD0507,20211231,23,50000.0,1
4381739,M533286446,E06052119978,2,A06,,PD0507,20211231,23,50000.0,1
4381740,M533286446,E06052119978,2,A06,,PD0507,20211231,23,50000.0,1
4381741,M533286446,E06052119978,2,A06,,PD0507,20211231,23,50000.0,1


In [68]:
df_cop_u

Unnamed: 0,cust,rct_no,cop_c,br_c,chnl_dv,de_dt,vst_dt,de_hr,buy_am
0,M839993508,21102612B015763935,B01,B010012,1,20211026,20211026,12,60000
1,M839993508,21110610B014219744,B01,B010012,1,20211106,20211106,10,17100
2,M839993508,21021112B013419710,B01,B010012,1,20210211,20210211,12,136500
3,M839993508,21092010B012637545,B01,B010012,1,20210920,20210920,10,34200
4,M839993508,21101009D015920171,D01,D010614,1,20211010,20211010,9,2500
...,...,...,...,...,...,...,...,...,...
248299,M058650684,21111614C021426818,C02,C020002,1,20211116,20211116,14,2000
248300,M058650684,21111619C023223432,C02,C020002,1,20211116,20211116,19,26900
248301,M014154595,21121209C015324520,C01,C010087,2,20211212,20211212,9,2000
248302,M510878172,21112719C013369102,C01,C010007,2,20211127,20211127,19,6000


In [69]:
df_pdde[df_pdde['cust']=='M533286446']['pd_c']

51632      PD0291
133025     PD0288
133056     PD0288
192963     PD0288
192964     PD0288
            ...  
4381738    PD0507
4381739    PD0507
4381740    PD0507
4381741    PD0507
4381742    PD0507
Name: pd_c, Length: 260, dtype: object