In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

%matplotlib inline
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 18
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False
plt.style.use("ggplot")

# 0.Data Load
---

In [2]:
df_pdde = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv", low_memory=False)
df_cust = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
df_affi = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")

In [3]:
df_affi

Unnamed: 0,cust,rct_no,cop_c,br_c,chnl_dv,de_dt,vst_dt,de_hr,buy_am
0,M839993508,21102612B015763935,B01,B010012,1,20211026,20211026,12,60000
1,M839993508,21110610B014219744,B01,B010012,1,20211106,20211106,10,17100
2,M839993508,21021112B013419710,B01,B010012,1,20210211,20210211,12,136500
3,M839993508,21092010B012637545,B01,B010012,1,20210920,20210920,10,34200
4,M839993508,21101009D015920171,D01,D010614,1,20211010,20211010,9,2500
...,...,...,...,...,...,...,...,...,...
248299,M058650684,21111614C021426818,C02,C020002,1,20211116,20211116,14,2000
248300,M058650684,21111619C023223432,C02,C020002,1,20211116,20211116,19,26900
248301,M014154595,21121209C015324520,C01,C010087,2,20211212,20211212,9,2000
248302,M510878172,21112719C013369102,C01,C010007,2,20211127,20211127,19,6000


# 1.필요 함수 미리 정의
---

## 1.1.Cust Table Preprocessing
---

Cust Table 이진화

In [4]:
import copy

def cust_encoding(df, opt_gender=True, opt_etc=True):
    df_result = copy.deepcopy(df)
    
    # -- 1. gender
    if opt_gender:
        df_result["ma_fem_dv"] = df_result["ma_fem_dv"].apply(lambda x: 0 if x=="남성" else 1)
    
    # -- 2. ETC
    if opt_etc:
        target = ["ages", "zon_hlv"]
        df_result = pd.concat([df_result[["cust", "ma_fem_dv"]], pd.get_dummies(df_result[target], drop_first=True)], axis=1)
    
    return df_result

In [5]:
df_check = cust_encoding(df_cust)
df_check

Unnamed: 0,cust,ma_fem_dv,ages_30대,ages_40대,ages_50대,ages_60대,ages_70대,zon_hlv_Z02,zon_hlv_Z03,zon_hlv_Z04,...,zon_hlv_Z08,zon_hlv_Z09,zon_hlv_Z10,zon_hlv_Z11,zon_hlv_Z12,zon_hlv_Z13,zon_hlv_Z14,zon_hlv_Z15,zon_hlv_Z16,zon_hlv_Z17
0,M000034966,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M000059535,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,M000136117,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,M000201112,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,M000225114,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29908,M999708287,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
29909,M999770689,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29910,M999849895,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
29911,M999926092,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


## 1.2.PDDE Table Prerocessing
---

채널 정보 처리

In [6]:
# 1. 채널 이진화 (on, off, on&off)
def check_on_off(x):
    if len(x)>1:
        return "onf"
    else:
        if x[0]==1:
            return "off"
        else:
            return "on"

def binary_chnl(df_base, df):
    df_chnl_3c = df.groupby("cust")["chnl_dv"].apply(lambda x: list(set(x))).apply(check_on_off).reset_index()
    df_ohe = pd.concat([df_chnl_3c["cust"], pd.get_dummies(df_chnl_3c["chnl_dv"], drop_first=True)], axis=1)
    df_ohe = df_ohe.merge(df_base, how="left", on="cust")
    return df_ohe

# 2. chnl aggregate
def chnl_aggregate(df_base, df):
    df_chnl_count = pd.pivot_table(data=df.drop_duplicates(subset=["cust", "rct_no"]),
                index="cust",
                columns="chnl_dv",
                values="rct_no",
                aggfunc="count",
                fill_value=0).reset_index()
    df_chnl_count.rename({1:"online_이용건수", 2:"offline_이용건수"}, axis=1, inplace=True)
    return df_base.merge(df_chnl_count, on="cust", how="left")
    # return df_chnl_count

In [7]:
df_check = chnl_aggregate(df_check, df_pdde)
df_check

Unnamed: 0,cust,ma_fem_dv,ages_30대,ages_40대,ages_50대,ages_60대,ages_70대,zon_hlv_Z02,zon_hlv_Z03,zon_hlv_Z04,...,zon_hlv_Z10,zon_hlv_Z11,zon_hlv_Z12,zon_hlv_Z13,zon_hlv_Z14,zon_hlv_Z15,zon_hlv_Z16,zon_hlv_Z17,online_이용건수,offline_이용건수
0,M000034966,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12.0,0.0
1,M000059535,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,,
2,M000136117,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,64.0,6.0
3,M000201112,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,5.0,0.0
4,M000225114,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,78.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29908,M999708287,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,,
29909,M999770689,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,117.0,5.0
29910,M999849895,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,34.0,0.0
29911,M999926092,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,


구매건수 및 구매금액

In [8]:
def purchase_count(df_base, df):
    df_result = df.drop_duplicates(subset=["cust", "rct_no"]).groupby("cust")["buy_am"].aggregate(["sum","count"]).reset_index()
    df_result.rename({"sum":"구매금액", "count":"구매건수"}, axis=1, inplace=True)
    return pd.merge(df_base, df_result, on="cust", how="left")

In [9]:
df_check = purchase_count(df_check, df_pdde)

날짜 변환 함수

In [10]:
from datetime import datetime
def make_ts_column(df, date="de_dt", hour="de_hr", drop=False):
    
    if df[date].dtypes=="str" and df[hour].dtypes=="int64":
        df["de_dthr"]=pd.to_datetime(df[date]+":"+df[hour].apply(str), format="%Y-%m-%d:%H")
    elif df[date].dtypes=="<M8[ns]" and df[hour].dtypes=="int64":
        df["de_dthr"]=pd.to_datetime(df[date].apply(lambda x: datetime.strftime(x, format="%Y-%m-%d"))+":"+df[hour].apply(str),format="%Y-%m-%d:%H")
    elif df[date].dtypes=="int64" and df[hour].dtypes=="int64":
        df["de_dthr"]=pd.to_datetime(df[date].apply(str)+":"+df[hour].apply(str),format="%Y%m%d:%H")
    else:
        # assert df[date].dtypes!="str" or df[date].dtypes!="<M8[ns]", "date must be 'str' or '<M8[ns]' type"
        # assert df[hour].dtypes!="int", "hour must be 'int' type"
        raise TypeError("Check args type -> date must be 'str' or '<M8[ns]' type. hour must be 'int' type.") 
    
    if drop:
        return df.drop(["de_dt", "de_hr"], axis=1)
    else:
        return df

최근성

In [11]:
import pandas as pd
def recency(value, cur_date = pd.to_datetime('2022-01-01')):
    y, m, d = str(value)[:4], str(value)[4:6], str(value)[6:]
    diff = (cur_date-pd.to_datetime('{}-{}-{}'.format(y, m, d))).days
    return diff

def make_recency(df_base, df):
    cols = ["cust", "de_dt"]
    df_result = df.sort_values(by="de_dt").drop_duplicates(subset=["cust"],keep="last")[cols]
    df_result["최근성"] = df["de_dt"].apply(recency)
    return pd.merge(df_base, df_result[["cust", "최근성"]], on="cust", how="left",)
    # return df_result

In [12]:
df_check = make_recency(df_check, df_pdde)

더미화 후 결합 함수

In [13]:
def encode_concat(df, col_name, key, merge_target_df=None, drop=False, pre_fix=None):
    if pre_fix is None:
            dummy_df = pd.get_dummies(df[col_name], drop_first=drop, prefix=col_name, prefix_sep="_")
    else:
        dummy_df = pd.get_dummies(df[col_name], drop_first=drop, prefix=pre_fix, prefix_sep="_")
    if merge_target_df is None:
        return pd.concat([df[key], dummy_df], axis=1)
    else:
        return merge_target_df.merge(pd.concat([df[key], dummy_df], axis=1), on=key, how="left")

# 2.군집화
---

In [14]:
df_check_drop = df_check.dropna()
df_check_drop.iloc[:,-3:].astype(int)
df_check_drop.iloc[:,-3:] = df_check_drop.iloc[:,-3:].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_check_drop.iloc[:,-3:] = df_check_drop.iloc[:,-3:].astype(int)


In [16]:
df_check_drop["구매금액"]=df_check_drop["구매금액"].apply(np.log10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_check_drop["구매금액"]=df_check_drop["구매금액"].apply(np.log10)


In [17]:
df_check_drop

Unnamed: 0,cust,ma_fem_dv,ages_30대,ages_40대,ages_50대,ages_60대,ages_70대,zon_hlv_Z02,zon_hlv_Z03,zon_hlv_Z04,...,zon_hlv_Z13,zon_hlv_Z14,zon_hlv_Z15,zon_hlv_Z16,zon_hlv_Z17,online_이용건수,offline_이용건수,구매금액,구매건수,최근성
0,M000034966,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,12.0,0.0,5.530225,12,9
2,M000136117,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,64.0,6.0,7.404100,70,2
3,M000201112,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,5.0,0.0,3.886491,5,34
4,M000225114,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,78.0,0.0,6.074648,78,1
5,M000261625,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,48.0,0.0,6.707008,48,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29906,M999599111,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,6.0,2.0,6.085319,8,48
29907,M999673157,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,17.0,0.0,6.443235,17,14
29909,M999770689,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,117.0,5.0,5.771889,122,1
29910,M999849895,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,34.0,0.0,5.924476,34,31


In [25]:
from sklearn.cluster import AgglomerativeClustering as AGC

cluster = AGC(n_clusters = 7, affinity="cosine", linkage="average")
# cluster.fit_predict(df_check_drop.iloc[:,23:])
cluster.fit_predict(df_check_drop.iloc[:,1:])

array([1, 1, 5, ..., 1, 1, 1], dtype=int64)

In [26]:
df_clust_res = pd.DataFrame()
df_clust_res["cust"] = df_check_drop["cust"]
df_clust_res["cluster"] = cluster.labels_
# df_clust_res["cluster"].value_counts().apply(lambda x: x*100/len(df_clust_res["cluster"]))
df_clust_res["cluster"].value_counts()

1    16914
5     8568
0     1187
6      125
2       95
3       25
4        3
Name: cluster, dtype: int64

In [28]:
df_clust_res

Unnamed: 0,cust,cluster
0,M000034966,5
2,M000136117,0
3,M000201112,7
4,M000225114,0
5,M000261625,2
...,...,...
29906,M999599111,7
29907,M999673157,5
29909,M999770689,0
29910,M999849895,5


In [30]:
cluster_df = copy.deepcopy(df_check_drop)
cluster_df["cluster"] = df_clust_res["cluster"]
cluster_df.groupby(['cluster'])[["online_이용건수","offline_이용건수","최근성"]].mean()

Unnamed: 0_level_0,online_이용건수,offline_이용건수,최근성
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,78.727289,2.421561,5.039332
1,17.86631,13.465241,38.294118
2,35.201538,1.843383,14.972885
3,27.013043,56.778261,6.073913
4,13.116279,29.697674,22.294574
5,28.466984,0.695238,28.787937
6,2.5,35.546875,33.09375
7,4.972808,1.524885,155.55323
8,4.448052,64.084416,10.441558
9,25.531915,17.234043,16.882979
