In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

%matplotlib inline
# Apply the style sheet FIRST. plt.style.use() overwrites every rcParam the
# sheet defines, and the ggplot sheet defines its own font.size, so the
# overrides below would be silently reset if the style were applied last.
plt.style.use("ggplot")

# Notebook-specific overrides on top of the ggplot style.
plt.rcParams["font.family"] = "Malgun Gothic"  # font with Hangul glyphs for Korean labels
plt.rcParams["font.size"] = 18
plt.rcParams["figure.figsize"] = (16,9)
plt.rcParams["axes.unicode_minus"] = False  # draw minus signs as ASCII hyphens

# 0.Data Load
---

In [2]:
# Load the three competition CSV files from the sibling LPOINT_BIG_COMP directory.
# low_memory=False turns off chunked parsing so that each column's dtype is
# inferred from the whole file instead of chunk by chunk.
df_pdde = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_02_PDDE.csv", low_memory=False)
# presumably the customer demographics table (file suffix "DEMO") - TODO confirm
df_cust = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_01_DEMO.csv")
# presumably the affiliate usage table (file suffix "COP_U") - TODO confirm
df_affi = pd.read_csv("../LPOINT_BIG_COMP/LPOINT_BIG_COMP_03_COP_U.csv")

In [3]:
df_affi

Unnamed: 0,cust,rct_no,cop_c,br_c,chnl_dv,de_dt,vst_dt,de_hr,buy_am
0,M839993508,21102612B015763935,B01,B010012,1,20211026,20211026,12,60000
1,M839993508,21110610B014219744,B01,B010012,1,20211106,20211106,10,17100
2,M839993508,21021112B013419710,B01,B010012,1,20210211,20210211,12,136500
3,M839993508,21092010B012637545,B01,B010012,1,20210920,20210920,10,34200
4,M839993508,21101009D015920171,D01,D010614,1,20211010,20211010,9,2500
...,...,...,...,...,...,...,...,...,...
248299,M058650684,21111614C021426818,C02,C020002,1,20211116,20211116,14,2000
248300,M058650684,21111619C023223432,C02,C020002,1,20211116,20211116,19,26900
248301,M014154595,21121209C015324520,C01,C010087,2,20211212,20211212,9,2000
248302,M510878172,21112719C013369102,C01,C010007,2,20211127,20211127,19,6000


# 1.필요 함수 미리 정의
---

## 1.1.Cust Table Preprocessing
---

Cust Table 이진화

In [4]:
import copy

def cust_encoding(df, opt_gender=True, opt_etc=True):
    """Return an encoded copy of the customer table; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer table. Needs a ``ma_fem_dv`` column when ``opt_gender`` is
        set, and ``cust``, ``ma_fem_dv``, ``ages`` and ``zon_hlv`` columns
        when ``opt_etc`` is set.
    opt_gender : bool, default True
        Replace ``ma_fem_dv`` with 0 where the value equals "남성" and with 1
        for every other value.
    opt_etc : bool, default True
        One-hot encode ``ages`` and ``zon_hlv`` (first level dropped) and keep
        only ``cust``, ``ma_fem_dv`` and the generated dummy columns.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    encoded = df.copy(deep=True)

    # -- 1. gender: anything that is not "남성" (missing values included) maps to 1
    if opt_gender:
        encoded["ma_fem_dv"] = encoded["ma_fem_dv"].apply(lambda g: 1 if g != "남성" else 0)

    # -- 2. remaining categorical columns
    if not opt_etc:
        return encoded

    dummies = pd.get_dummies(encoded[["ages", "zon_hlv"]], drop_first=True)
    id_cols = encoded[["cust", "ma_fem_dv"]]
    return pd.concat([id_cols, dummies], axis=1)

## 1.2.PDDE Table Preprocessing
---

채널 정보 처리

In [None]:
# 1. Channel flag per customer (on, off, on&off)
def check_on_off(x):
    """Classify a customer's list of distinct channel codes.

    Returns "onf" when the list holds more than one code, "off" when its
    single code equals 1, and "on" for any other single code.
    """
    if len(x) > 1:
        return "onf"
    return "off" if x[0] == 1 else "on"

def binary_chnl(df_base, df):
    """One-hot encode each customer's channel class and attach ``df_base`` to it.

    Every ``cust`` in ``df`` is reduced to its distinct ``chnl_dv`` values,
    classified with ``check_on_off`` and dummy-encoded with the first level
    dropped. The columns of ``df_base`` are then left-joined onto that
    per-customer frame on ``cust``.

    NOTE(review): the encoded frame is the left side of the merge, so
    customers that appear only in ``df_base`` are absent from the result.
    ``chnl_aggregate`` merges in the opposite direction - confirm which one
    is intended.
    """
    channels_per_cust = df.groupby("cust")["chnl_dv"].apply(lambda codes: list(set(codes)))
    df_chnl_3c = channels_per_cust.apply(check_on_off).reset_index()

    flags = pd.get_dummies(df_chnl_3c["chnl_dv"], drop_first=True)
    df_ohe = pd.concat([df_chnl_3c["cust"], flags], axis=1)
    return df_ohe.merge(df_base, how="left", on="cust")

# 2. chnl aggregate
def chnl_aggregate(df_base, df):
    """Attach per-channel receipt counts to the base customer table.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Table with a ``cust`` column; all of its rows are kept.
    df : pandas.DataFrame
        Transaction rows with ``cust``, ``rct_no`` and ``chnl_dv`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df_base`` plus one column per distinct ``chnl_dv`` value (the column
        label is that raw value) holding the number of distinct receipts the
        customer has on that channel. Customers without any row in ``df`` get
        NaN in the count columns because of the left join.
    """
    # A receipt can span several rows, so count each (cust, rct_no) pair once.
    receipts = df.drop_duplicates(subset=["cust", "rct_no"])
    df_chnl_count = pd.pivot_table(data=receipts,
                index="cust",
                columns="chnl_dv",
                values="rct_no",
                aggfunc="count",
                fill_value=0).reset_index()
    # Drop the leftover "chnl_dv" label on the columns axis.
    df_chnl_count.columns.name = None
    # Merge the aggregated counts, not the raw transaction rows.
    return df_base.merge(df_chnl_count, on="cust", how="left")

구매건수 및 구매금액

In [None]:
def purchase_count(df_base, df):
    """Placeholder for the per-customer purchase count / amount features.

    The cell originally held only the signature, which makes the cell fail
    to parse. Until the feature logic is written, the function raises so
    that an accidental call is noticed immediately.

    Raises
    ------
    NotImplementedError
        Always.
    """
    # TODO: implement the aggregation and merge it onto df_base.
    raise NotImplementedError("purchase_count is not implemented yet")

In [10]:
df_pdde.drop_duplicates(subset=["cust", "rct_no"])

Unnamed: 0,cust,rct_no,chnl_dv,cop_c,br_c,pd_c,de_dt,de_hr,buy_am,buy_ct
0,M430112881,A01000001113,1,A01,A010039,PD0290,20210101,10,15000.0,1
1,M646853852,A01000002265,1,A01,A010025,PD1369,20210101,10,79700.0,1
2,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000.0,1
4,M430112881,A01000004946,1,A01,A010039,PD0290,20210101,10,19000.0,1
5,M430112881,A01000005297,1,A01,A010039,PD1692,20210101,10,9900.0,1
...,...,...,...,...,...,...,...,...,...,...
4381723,M816318679,E06052115831,2,A06,,PD0507,20211231,22,50000.0,1
4381724,M816318679,E06052116037,2,A06,,PD0507,20211231,22,10000.0,1
4381727,M182645944,E06052117103,2,A06,,PD0294,20211231,23,289000.0,1
4381728,M533286446,E06052118403,2,A06,,PD0507,20211231,23,200000.0,1


더미화 후 결합 함수

In [None]:
def encode_concat(df, col_name, key, merge_target_df=None, drop=False, pre_fix=None):
    """Dummy-encode ``df[col_name]`` and return it next to the ``key`` column(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``col_name`` and ``key``.
    col_name : label or list of labels
        Column(s) to one-hot encode.
    key : label or list of labels
        Column(s) kept beside the dummies and used as the merge key.
    merge_target_df : pandas.DataFrame, optional
        When given, the encoded frame is left-joined onto it on ``key`` and
        the merged frame is returned.
    drop : bool, default False
        Passed to ``pd.get_dummies`` as ``drop_first``.
    pre_fix : str, optional
        Prefix for the dummy column names; ``col_name`` is used when omitted.

    Returns
    -------
    pandas.DataFrame
        ``key`` plus the dummy columns, or ``merge_target_df`` with those
        columns joined on.
    """
    label = col_name if pre_fix is None else pre_fix
    dummy_df = pd.get_dummies(df[col_name], drop_first=drop, prefix=label, prefix_sep="_")
    encoded = pd.concat([df[key], dummy_df], axis=1)

    if merge_target_df is not None:
        return merge_target_df.merge(encoded, on=key, how="left")
    return encoded