In [2]:
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import Imputer, RobustScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.metrics import roc_curve, classification_report


In [3]:
# Data loading
mem_data   = pd.read_csv('data/mem_data.csv', encoding='CP949')
mem_trans  = pd.read_csv('data/mem_transaction.csv', encoding='CP949')
store_info = pd.read_csv('data/store_info.csv', encoding='CP949')

#### mem_data.info()

- RangeIndex: 10764 entries, 0 to 10763
- Data columns (total 15 columns):
- MEM_ID         10764 non-null int32
- M_STORE_ID     10764 non-null int32
- GENDER         10764 non-null object : UNKNOWN 존재
- BIRTH_DT       5997 non-null object  : null 존재
- BIRTH_SL       10764 non-null object : ['S', 'L']
- ZIP_CD         10764 non-null object :  '-' 존재
- RGST_DT        10764 non-null object : 2006-02-05 16:56:58, 2006-02-05 23:19:22.280000000
- VISIT_CNT      10764 non-null int32  : [8.000e+00, 2.200e+01, 1.100e+01, 3.000e+00, 2.000e+00, 1.800e+01, 5.000e+00,
- SALES_AMT      10764 non-null int32   : [ 337000., 1438500.,  309300., ...,   85500.,  176200.,  385800.]
- LAST_VST_DT    10764 non-null object : 2007-01-19 21:46:12.827000000', '2007-10-18 19:57:45.503000000',...,
- USABLE_PNT     10764 non-null int32  : [6740., 1732., 6372., ..., 8608., 3453., 5954.]
- USED_PNT       10764 non-null int32  : [0.000000e+00, 1.200000e+04, 2.300000e+04, 3.600000e+03,
- ACC_PNT        10764 non-null int32  : [ 6740., 13732.,  6372., ...,  5926.,  1710.,  7870.]
- USABLE_INIT    10764 non-null int32  : [ 3202., 17762.,  1856., ...,  5552.,  3382.,  2530.]
- SMS            10764 non-null object : ['Y', 'N']
- dtypes: int32(8), object(7)
- memory usage: 925.1+ KB


- 자료형 변환
 - mem_data['MEM_ID']      = mem_data.MEM_ID.astype(int)
 - mem_data['M_STORE_ID']  = mem_data.M_STORE_ID.astype(int)
 - mem_data['VISIT_CNT']  = mem_data.SALES_AMT.astype(int)
 - mem_data['SALES_AMT']  = mem_data.SALES_AMT.astype(int)
 - mem_data['USABLE_PNT']  = mem_data.USABLE_PNT.astype(int)
 - mem_data['USED_PNT']    = mem_data.USED_PNT.astype(int)
 - mem_data['ACC_PNT']    = mem_data.ACC_PNT.astype(int)
 - mem_data['USABLE_INIT'] = mem_data.USABLE_INIT.astype(int)

In [4]:
# PK : MEM_ID 
#      M_STORE_ID
# mem_data.MEM_ID.nunique()
# mem_data.head()
# mem_data.GENDER.unique() -> UNKNOWN 존재
# mem_data.BIRTH_DT.unique() -> nan
# mem_data.BIRTH_SL.unique() -> ['S', 'L']
# mem_data.ZIP_CD.unique() -> '-'
# mem_data.RGST_DT.unique() -> 2006-02-05 16:56:58, 2006-02-05 23:19:22.280000000
# mem_data.SMS.unique() -> ['Y', 'N']
# mem_data.USABLE_INIT.unique() -> [ 3202., 17762.,  1856., ...,  5552.,  3382.,  2530.]
# mem_data.ACC_PNT.unique() -> [ 6740., 13732.,  6372., ...,  5926.,  1710.,  7870.]
# mem_data.USED_PNT.unique() -> [0.000000e+00, 1.200000e+04, 2.300000e+04, 3.600000e+03,
# mem_data.USABLE_PNT.unique() -> [6740., 1732., 6372., ..., 8608., 3453., 5954.]
# mem_data.LAST_VST_DT.unique() -> ['2007-01-19 21:46:12.827000000', '2007-10-18 19:57:45.503000000',...,
# mem_data.SALES_AMT.unique() -> [337000., 1438500.,  309300., ...,   85500.,  176200.,  385800.]
# mem_data.VISIT_CNT.unique() -> [8.000e+00, 2.200e+01, 1.100e+01, 3.000e+00, 2.000e+00,,...,

# mem_data['MEM_ID']      = mem_data.MEM_ID.astype(int)
mem_data['M_STORE_ID']  = mem_data.M_STORE_ID.astype(int)
mem_data['VISIT_CNT']  = mem_data.SALES_AMT.astype(int)
mem_data['SALES_AMT']  = mem_data.SALES_AMT.astype(int)
mem_data['USABLE_PNT']  = mem_data.USABLE_PNT.astype(int)
mem_data['USED_PNT']    = mem_data.USED_PNT.astype(int)
mem_data['ACC_PNT']    = mem_data.ACC_PNT.astype(int)
mem_data['USABLE_INIT'] = mem_data.USABLE_INIT.astype(int)

mem_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10764 entries, 0 to 10763
Data columns (total 15 columns):
MEM_ID         10764 non-null int64
M_STORE_ID     10764 non-null int32
GENDER         10764 non-null object
BIRTH_DT       5997 non-null object
BIRTH_SL       10764 non-null object
ZIP_CD         10764 non-null object
RGST_DT        10764 non-null object
VISIT_CNT      10764 non-null int32
SALES_AMT      10764 non-null int32
LAST_VST_DT    10764 non-null object
USABLE_PNT     10764 non-null int32
USED_PNT       10764 non-null int32
ACC_PNT        10764 non-null int32
USABLE_INIT    10764 non-null int32
SMS            10764 non-null object
dtypes: int32(7), int64(1), object(7)
memory usage: 967.2+ KB


In [5]:
mem_data

Unnamed: 0,MEM_ID,M_STORE_ID,GENDER,BIRTH_DT,BIRTH_SL,ZIP_CD,RGST_DT,VISIT_CNT,SALES_AMT,LAST_VST_DT,USABLE_PNT,USED_PNT,ACC_PNT,USABLE_INIT,SMS
0,1134945,1084,M,,S,-,2006-02-04 11:51:08.233000000,337000,337000,2007-01-19 21:46:12.827000000,6740,0,6740,3202,Y
1,38458,539,F,1973-07-11,S,420-721,2006-02-04 11:51:08.233000000,1438500,1438500,2007-10-18 19:57:45.503000000,1732,12000,13732,17762,Y
2,7009,1113,F,1973-04-06,S,-,2006-02-04 11:51:08.233000000,309300,309300,2007-05-30 15:21:10.863000000,6372,0,6372,1856,Y
3,91791,2273,UNKNOWN,1968-09-05,S,487-820,2006-02-04 11:51:08.233000000,170400,170400,2007-10-16 15:55:52.483000000,3408,0,3408,236,Y
4,1374842,1300,M,,S,-,2006-02-04 11:51:08.233000000,110300,110300,2007-03-25 12:49:30.170000000,2206,0,2206,1522,Y
5,719600,130,UNKNOWN,1977-01-23,L,-,2006-02-04 11:51:08.233000000,48700,48700,2007-07-19 22:04:01.567000000,974,0,974,132,Y
6,1383660,1024,M,,S,-,2006-02-05 16:56:58,33500,33500,2006-09-09 14:55:21.687000000,670,0,670,110,Y
7,911717,10792,M,,S,-,2006-02-04 11:51:08.233000000,227400,227400,2007-10-21 17:16:03.257000000,4548,0,4548,842,Y
8,172362,799,UNKNOWN,,S,-,2006-02-04 11:51:08.233000000,58000,58000,2006-07-31 15:13:01.703000000,1226,0,1226,512,Y
9,272744,997,UNKNOWN,1978-11-04,L,-,2006-02-04 11:51:08.233000000,1590100,1590100,2007-10-31 14:34:27.783000000,10134,23000,33134,8052,Y


#### mem_trans.info()

- RangeIndex: 60049 entries, 0 to 60048
- Data columns (total 7 columns):
- STORE_ID    60049 non-null int64
- SELL_DT     60049 non-null int64
- MEMP_STY    60049 non-null object : ['O', 'M']
- MEM_ID      60049 non-null int64
- MEMP_DT     60049 non-null object : '2006-02-27 13:57:44.750000000', '2006-03-03 15:43:56.987000000',
- SELL_AMT    60049 non-null int64
- MEMP_TP     60049 non-null object : ['A']
- dtypes: int64(4), object(3)
- memory usage: 3.2+ MB


- 자료형 변환 int64 -> int32
 - mem_trans['STORE_ID'] = mem_trans.STORE_ID.astype(int)
 - mem_trans['MEM_ID']   = mem_trans.MEM_ID.astype(int)
 - mem_data['SELL_AMT']  = mem_data.USABLE_INIT.astype(int) -> mem_trans['SELL_AMT'].max() = 100000000

In [6]:
mem_trans

Unnamed: 0,STORE_ID,SELL_DT,MEMP_STY,MEM_ID,MEMP_DT,SELL_AMT,MEMP_TP
0,125,20060227,O,1225434,2006-02-27 13:57:44.750000000,15500,A
1,125,20060303,O,1181938,2006-03-03 15:43:56.987000000,22600,A
2,125,20060322,O,6093,2006-03-22 18:08:45.563000000,11000,A
3,125,20060403,O,1398202,2006-04-03 13:36:27.040000000,14400,A
4,125,20060418,O,1225434,2006-04-18 16:04:39.583000000,6600,A
5,125,20060421,O,1398202,2006-04-21 17:13:51.600000000,6600,A
6,125,20060430,O,1371363,2006-04-30 13:11:19.377000000,8800,A
7,125,20060430,O,1398580,2006-04-30 18:12:56.877000000,21000,A
8,125,20060502,O,468502,2006-05-02 12:52:03.883000000,3300,A
9,125,20060506,O,1225263,2006-05-06 15:55:33.723000000,14300,A


In [4]:
mem_trans['STORE_ID'] = mem_trans.STORE_ID.astype(int)
# mem_trans['MEM_ID']   = mem_trans.MEM_ID.astype(int)
mem_trans['SELL_AMT'] = mem_trans.SELL_AMT.astype(int) 
mem_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60049 entries, 0 to 60048
Data columns (total 7 columns):
STORE_ID    60049 non-null int32
SELL_DT     60049 non-null int64
MEMP_STY    60049 non-null object
MEM_ID      60049 non-null int64
MEMP_DT     60049 non-null object
SELL_AMT    60049 non-null int32
MEMP_TP     60049 non-null object
dtypes: int32(2), int64(2), object(3)
memory usage: 2.7+ MB


#### store_info.info()

- RangeIndex: 329 entries, 0 to 328
- Data columns (total 2 columns):
- STORE_ID        329 non-null int64
- STORE_REGION    329 non-null int64 -> [ 2, 11, 16, 13,  3,  1,  0,  8,  4, 15, 10,  6,  7, 14,  9, 12,  5]
- dtypes: int64(2)
- memory usage: 5.2 KB


- 자료형 변환 int64 -> int32
 - store_info['STORE_ID'] = store_info.STORE_ID.astype(int)
 - store_info['STORE_REGION'] = store_info.STORE_REGION.astype(int)

In [5]:
store_info['STORE_ID'] = store_info.STORE_ID.astype(int)
store_info['STORE_REGION'] = store_info.STORE_REGION.astype(int)
store_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 2 columns):
STORE_ID        329 non-null int32
STORE_REGION    329 non-null int32
dtypes: int32(2)
memory usage: 2.7 KB


In [6]:
# 컬럼명 변경
mem_data.rename(columns={"M_STORE_ID":"STORE_ID"}, inplace = True) 
mem_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10764 entries, 0 to 10763
Data columns (total 15 columns):
MEM_ID         10764 non-null int64
STORE_ID       10764 non-null int32
GENDER         10764 non-null object
BIRTH_DT       5997 non-null object
BIRTH_SL       10764 non-null object
ZIP_CD         10764 non-null object
RGST_DT        10764 non-null object
VISIT_CNT      10764 non-null int32
SALES_AMT      10764 non-null int32
LAST_VST_DT    10764 non-null object
USABLE_PNT     10764 non-null int32
USED_PNT       10764 non-null int32
ACC_PNT        10764 non-null int32
USABLE_INIT    10764 non-null int32
SMS            10764 non-null object
dtypes: int32(7), int64(1), object(7)
memory usage: 967.2+ KB


In [7]:
# STORE_ID, MEM_ID 데이터 수를 확인함. -> mem_trans을 기준으로 모든 데이터를 Gathering하기로 함.
print("STORE_ID data:{}   trans:{}   store:{}".format(mem_data.STORE_ID.nunique(), mem_trans.STORE_ID.nunique(), store_info.STORE_ID.nunique()))
print("MEM_ID   data:{} trans:{} store:{}".format(mem_data.MEM_ID.nunique(), mem_trans.MEM_ID.nunique(), 0))

STORE_ID data:367   trans:695   store:329
MEM_ID   data:10764 trans:10764 store:0


In [8]:
# mem_trans.STORE_ID.nunique()     -> 695   MEM_ID : 10764
# store_info.M_STORE_ID.nunique()  -> 329   MEM_ID : 10764
# mem_data.M_STORE_ID.nunique()    -> 695  
# mem_trans.groupby(['MEM_ID','STORE_ID'])['MEMP_DT'].count() # 21924
# mem_data.groupby(['MEM_ID','STORE_ID'])['RGST_DT'].count()  # 10764

In [9]:
# 특별광역시도구군
mem_data['특별광역시도구군']   = mem_data['ZIP_CD'].apply(lambda x: 0 if x == '-' else (x[0:1] + x[2:3]))

In [10]:
mem_data['BIRTH_DT'].fillna('1900-01-01',inplace=True)

In [11]:
mem_data['나이년도'] = mem_data['BIRTH_DT'].apply(lambda x: x[0:4])

In [12]:
mem_data.head(3)

Unnamed: 0,MEM_ID,STORE_ID,GENDER,BIRTH_DT,BIRTH_SL,ZIP_CD,RGST_DT,VISIT_CNT,SALES_AMT,LAST_VST_DT,USABLE_PNT,USED_PNT,ACC_PNT,USABLE_INIT,SMS,특별광역시도구군,나이년도
0,1134945,1084,M,1900-01-01,S,-,2006-02-04 11:51:08.233000000,337000,337000,2007-01-19 21:46:12.827000000,6740,0,6740,3202,Y,0,1900
1,38458,539,F,1973-07-11,S,420-721,2006-02-04 11:51:08.233000000,1438500,1438500,2007-10-18 19:57:45.503000000,1732,12000,13732,17762,Y,40,1973
2,7009,1113,F,1973-04-06,S,-,2006-02-04 11:51:08.233000000,309300,309300,2007-05-30 15:21:10.863000000,6372,0,6372,1856,Y,0,1973


In [13]:
def agePeriod(age, flg):
    hh = ''
    x = -1
    if flg == 'AGE':
        try :
            x = 2018 - int(age) + 1
        except ValueError :
            x = -1
            # print(age)

        if age in ('916.', '405.', '913.', '706.',
                  '1900', '1209', '2019', '2063', '2066', 
                   '2065', '2056', '2054', '2067', '2060', 
                   '2064', '9751'):
            x = -1

        if x < 0:
            hh = '나이미상'
        elif x >= 0 and x <= 5:
            hh = '유아'
        elif x >= 6 and x <= 9:
            hh = '아동'
        elif x >= 10 and x <= 19:
            hh = '10대'
        elif x >= 20 and x <= 29:
            hh = '20대'
        elif x >= 30 and x < 39:
            hh = '30대'
        elif x >= 40 and x <= 49:
            hh = '40대'
        elif x >= 50 and x <= 59:
            hh = '50대'
        elif x >= 60 and x <= 69:
            hh = '60대'
        elif x >= 70 and x <= 79:
            hh = '70대'
        else:
            hh = '노년'
        
        return hh
    else:
        if age in ('916.', '405.', '913.', '706.',
                  '1900', '1209', '2019', '2063', '2066', 
                   '2065', '2056', '2054', '2067', '2060', 
                   '2064', '9751'):
            return '1983'
        else:
            return age
        
def agePeriodW(age, flg):
    hh = ''
    x = -1
    if flg == 'AGE':
        try :
            x = 2018 - int(age) + 1
        except ValueError :
            x = -1
            # print(age)

        if age in ('916.', '405.', '913.', '706.',
                  '1900', '1209', '2019', '2063', '2066', 
                   '2065', '2056', '2054', '2067', '2060', 
                   '2064', '9751'):
            x = -1

        if x < 0:
            hh = '0'
        elif x >= 0 and x <= 5:
            hh = '5'
        elif x >= 6 and x <= 9:
            hh = '9'
        elif x >= 10 and x <= 19:
            hh = '10'
        elif x >= 20 and x <= 29:
            hh = '20'
        elif x >= 30 and x < 39:
            hh = '30'
        elif x >= 40 and x <= 49:
            hh = '40'
        elif x >= 50 and x <= 59:
            hh = '50'
        elif x >= 60 and x <= 69:
            hh = '60'
        elif x >= 70 and x <= 79:
            hh = '70'
        else:
            hh = '80'
        
        return hh
    else:
        if age in ('916.', '405.', '913.', '706.',
                  '1900', '1209', '2019', '2063', '2066', 
                   '2065', '2056', '2054', '2067', '2060', 
                   '2064', '9751','2056'):
            return '1983'
        else:
            return age

mem_data['연령대']   = mem_data['나이년도'].apply(lambda x: agePeriod(x, 'AGE'))
mem_data['연령대NO'] = mem_data['나이년도'].apply(lambda x: agePeriodW(x, 'AGE'))

In [14]:
# 최고 많은 분포를 형성하고 있는 30대 후반으로 결측치를 보정함.
# dfAll['연령대'] = dfAll['나이년도'].apply(lambda x: agePeriod(x))
mem_data['연령대'].unique()

array(['나이미상', '40대', '50대', '노년', '30대', '20대', '10대', '60대', '70대',
       '유아'], dtype=object)

In [15]:
mem_data['연령대'] = mem_data['연령대'].apply(lambda x: x if x != '나이미상' else '30대')

In [17]:
# dfAll['나이년도'] = dfAll['나이년도'].apply(lambda x: x if x != '1900' else '1983')
#dfAll['나이년도'] = dfAll['나이년도'].apply(lambda x: agePeriod(x, 'YYYY'))

In [18]:
mem_data['BIRTH_DT'] = mem_data['BIRTH_DT'].apply(lambda x: x if x != '1900-01-01' else '1983-01-01')

In [19]:
mem_data['나이년도'].unique() #-> 916., 405., 1209, 2019, 2063, 2066, 2065, 2056, 2054, 2067, 2060, 2064, 9751, 913. 706.

# mem_data['STORE_REGION'].fillna(0, inplace=True)
# mem_data['STORE_REGION'] = dfAll.STORE_REGION.astype(int) 
# mem_data.groupby(["STORE_REGION"])['특별광역시도구군'].count()  # -> 22727

array(['1900', '1973', '1968', '1977', '1978', '1980', '1960', '1989',
       '1979', '1990', '1985', '1969', '1984', '1987', '1986', '2003',
       '1971', '1976', '1982', '1983', '1975', '1966', '1974', '1963',
       '1965', '1951', '1992', '2004', '1981', '1959', '1956', '1950',
       '1998', '1970', '1993', '1972', '1988', '1958', '2001', '1962',
       '1967', '1948', '1991', '1961', '1955', '1957', '1964', '1954',
       '2005', '2063', '2014', '2019', '1953', '1952', '1997', '2006',
       '2000', '1947', '1209', '405.', '916.', '2002', '2007', '1943',
       '2054', '706.', '2066', '1919', '2065', '2056', '2067', '2060',
       '2008', '1949', '1946', '1944', '2064', '9751', '1937', '1995',
       '1688', '1750', '913.'], dtype=object)

In [20]:
def updateStoreRegion(x, y):
    if x == 0 and y != 0:
        return y
    else:
        return x
        
# mem_data['STORE_REGION'] = mem_data.apply(lambda x: updateStoreRegion(x.특별광역시도구군, x.STORE_REGION), axis=1) 

In [21]:
# 시간대 : 새벽[0-559], 아침[600-1129], 점심[1130-1429], 오후[1230-1729], 저녁[1530-2129],심야[2130-2359]
def timePeriod(tt, mm):
    hh = ''
    x = int((str(tt) +''+ str(mm)))
    if x >= 100 and x <= 559:
        hh = '새벽'
    elif x >= 0 and x <= 59:
        hh = '심야'
    elif x >= 600 and x <= 1129:
        hh = '아침'
    elif x >= 1130 and x <= 1429:
        hh = '점심'
    elif x >= 1230 and x < 1729:
        hh = '오후'
    elif x >= 1530 and x <= 2159:
        hh = '저녁'
    elif x >= 2200 and x <= 2359:
        hh = '심야'
    else:
        print(tt, mm)
        return 'None'
    return hh

def monthSeason(x):
    dd = {'01':'겨울','02':'겨울','03':'겨울','04':'봄','05':'봄',
          '06':'여름', '07':'여름', '08':'여름', '09':'가을', 
          '10':'가을', '11':'가을', '12':'겨울'
         }
    return dd.get(x)


In [23]:
'2006-02-27 13:57:44.750000000'[11:13]
#del dfMembers 

'13'

In [24]:
# mem_trans.SELL_DT.isnull().sum()

mem_trans['구매시간대'] = mem_trans['MEMP_DT'].apply(lambda x: timePeriod(x[11:13],x[14:16]))

# sales_date를 datetime 형식으로 변환한 후 계절 Feature 생성
mem_trans['구매계절'] = mem_trans['MEMP_DT'].apply(lambda x: monthSeason(str(x)[5:7]))

# '2000-06-25 00:00:00'[0:4] -> 년
mem_trans['구매년도'] = pd.to_numeric(mem_trans['SELL_DT'].apply(lambda x: str(x)[0:4]), errors='coerce').astype(np.int32)

# '2000-06-25 00:00:00'[5:7] -> 월
mem_trans['구매월'] = pd.to_numeric(mem_trans['SELL_DT'].apply(lambda x: str(x)[4:6]), errors='coerce').astype(np.int32)

# '2000-06-25 00:00:00'[5:7] -> 월
mem_trans['구매일자'] = pd.to_numeric(mem_trans['SELL_DT'].apply(lambda x: str(x)[6:8]), errors='coerce').astype(np.int32)

In [25]:
mem_trans.구매시간대.unique() ## mem_trans.head(5)

array(['점심', '오후', '저녁', '아침', '심야', '새벽'], dtype=object)

In [26]:
# 구매시간대별 구입액
df_ = mem_trans.groupby(['MEM_ID','구매시간대'])['SELL_AMT'].agg(sum)
dfAmtTimePeriod = df_.reset_index()
dfAmtTimePeriod = pd.pivot_table(dfAmtTimePeriod, values='SELL_AMT', index=['MEM_ID'], columns='구매시간대', fill_value=0).reset_index()
del df_
gc.collect()
dfAmtTimePeriod.columns = ['MEM_ID','새벽구입액','심야구입액','아침구입액','오후구입액','저녁구입액','점심구입액']
print('dfAmtTimePeriod shape : {}'.format(dfAmtTimePeriod.shape))
dfAmtTimePeriod.head(3)

dfAmtTimePeriod shape : (10764, 7)


Unnamed: 0,MEM_ID,새벽구입액,심야구입액,아침구입액,오후구입액,저녁구입액,점심구입액
0,957,0,1500,0,0,6000,0
1,1054,0,0,0,0,30200,160400
2,1067,0,0,0,0,7700,0


In [27]:
# 구매시간대별 구입건수
df_ = mem_trans.groupby(['MEM_ID','구매시간대'])['SELL_DT'].count()
dfAmtTimePeriodCnt = df_.reset_index()
dfAmtTimePeriodCnt.columns = ['MEM_ID','구매시간대','구입건수']
dfAmtTimePeriodCnt = pd.pivot_table(dfAmtTimePeriodCnt, values='구입건수', index=['MEM_ID'], columns='구매시간대', fill_value=0).reset_index()
del df_
gc.collect()
dfAmtTimePeriodCnt.columns = ['MEM_ID','새벽구입건수','심야구입건수','아침구입건수','오후구입건수','저녁구입건수','점심구입건수']
print('dfAmtTimePeriodCnt shape : {}'.format(dfAmtTimePeriodCnt.shape))
dfAmtTimePeriodCnt.head(3)

dfAmtTimePeriodCnt shape : (10764, 7)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수
0,957,0,1,0,0,2,0
1,1054,0,0,0,0,2,3
2,1067,0,0,0,0,1,0


In [28]:
# 구매시간대별 구입액 + 구매시간대별 구입건수
dfMembers = pd.merge(dfAmtTimePeriodCnt, dfAmtTimePeriod, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfAmtTimePeriodCnt
del dfAmtTimePeriod
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 13)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,오후구입액,저녁구입액,점심구입액
0,957,0,1,0,0,2,0,0,1500,0,0,6000,0
1,1054,0,0,0,0,2,3,0,0,0,0,30200,160400
2,1067,0,0,0,0,1,0,0,0,0,0,7700,0


In [29]:
def dayName(x):
    dd = {'0':'월','1':'화','2':'수','3':'목','4':'금','5':'토','6':'일'}    
    return dd.get(x)

# 훈련 데이터 셋에 sales_date를 datetime 형식으로 변환한 후 요일 Feature 생성
mem_trans['요일NO'] = mem_trans['SELL_DT'].apply(lambda x: pd.to_datetime(str(x), format='%Y-%m-%d %H:%M:%S').dayofweek)

# 훈련 데이터 셋에 sales_date를 datetime 형식으로 변환한 후 요일명 Feature 생성
mem_trans['요일NM'] = mem_trans['요일NO'].apply(lambda x: dayName(str(x)))
mem_trans.head(3)

Unnamed: 0,STORE_ID,SELL_DT,MEMP_STY,MEM_ID,MEMP_DT,SELL_AMT,MEMP_TP,구매시간대,구매계절,구매년도,구매월,구매일자,요일NO,요일NM
0,125,20060227,O,1225434,2006-02-27 13:57:44.750000000,15500,A,점심,겨울,2006,2,27,0,월
1,125,20060303,O,1181938,2006-03-03 15:43:56.987000000,22600,A,오후,겨울,2006,3,3,4,금
2,125,20060322,O,6093,2006-03-22 18:08:45.563000000,11000,A,저녁,겨울,2006,3,22,2,수


In [30]:
# 구매계절별 구입액
df_ = mem_trans.groupby(['MEM_ID','구매계절'])['SELL_AMT'].agg(sum)
dfAmtSeason = df_.reset_index()
dfAmtSeason = pd.pivot_table(dfAmtSeason, values='SELL_AMT', index=['MEM_ID'], columns='구매계절', fill_value=0).reset_index()
del df_
gc.collect()
dfAmtSeason.columns = ['MEM_ID','가을구입액','겨울구입액','봄구입액','여름구입액']
print('dfAmtTimePeriod shape : {}'.format(dfAmtSeason.shape))
dfAmtSeason.head(3)

dfAmtTimePeriod shape : (10764, 5)


Unnamed: 0,MEM_ID,가을구입액,겨울구입액,봄구입액,여름구입액
0,957,1500,0,0,6000
1,1054,32400,15400,142800,0
2,1067,7700,0,0,0


In [31]:
# 구매계절별 구입건수
df_ = mem_trans.groupby(['MEM_ID','구매계절'])['SELL_DT'].count()
dfAmtSeasonCnt = df_.reset_index()
dfAmtSeasonCnt.columns = ['MEM_ID','구매계절','구입건수']
dfAmtSeasonCnt = pd.pivot_table(dfAmtSeasonCnt, values='구입건수', index=['MEM_ID'], columns='구매계절', fill_value=0).reset_index()
del df_
gc.collect()
dfAmtSeasonCnt.columns = ['MEM_ID','가을구입건수','겨울구입건수','봄구입건수','여름구입건수']
print('dfAmtSeasonCnt shape : {}'.format(dfAmtSeasonCnt.shape))
dfAmtSeasonCnt.head(3)

dfAmtSeasonCnt shape : (10764, 5)


Unnamed: 0,MEM_ID,가을구입건수,겨울구입건수,봄구입건수,여름구입건수
0,957,1,0,0,2
1,1054,2,1,2,0
2,1067,1,0,0,0


In [32]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 
dfMembers = pd.merge(dfMembers, dfAmtSeason, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfAmtSeason
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 17)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,오후구입액,저녁구입액,점심구입액,가을구입액,겨울구입액,봄구입액,여름구입액
0,957,0,1,0,0,2,0,0,1500,0,0,6000,0,1500,0,0,6000
1,1054,0,0,0,0,2,3,0,0,0,0,30200,160400,32400,15400,142800,0
2,1067,0,0,0,0,1,0,0,0,0,0,7700,0,7700,0,0,0


In [33]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수
dfMembers = pd.merge(dfMembers, dfAmtSeasonCnt, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfAmtSeasonCnt
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 21)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,저녁구입액,점심구입액,가을구입액,겨울구입액,봄구입액,여름구입액,가을구입건수,겨울구입건수,봄구입건수,여름구입건수
0,957,0,1,0,0,2,0,0,1500,0,...,6000,0,1500,0,0,6000,1,0,0,2
1,1054,0,0,0,0,2,3,0,0,0,...,30200,160400,32400,15400,142800,0,2,1,2,0
2,1067,0,0,0,0,1,0,0,0,0,...,7700,0,7700,0,0,0,1,0,0,0


In [34]:
# 매장별 구입액
df_ = mem_trans.groupby(['MEM_ID','STORE_ID'])['SELL_AMT'].agg(sum)
dfAmtBySTORE = df_.reset_index()
del df_
gc.collect()
dfAmtBySTORE = pd.pivot_table(dfAmtBySTORE, values='SELL_AMT', index=['MEM_ID'], columns='STORE_ID', fill_value=0).reset_index()
print("dfAmtByPcrenm.shape {}".format(dfAmtBySTORE.shape))
dfAmtBySTORE.head(3)

dfAmtByPcrenm.shape (10764, 696)


STORE_ID,MEM_ID,125,126,128,129,130,133,136,140,141,...,30022,30029,30031,30037,30088,30100,30101,30102,99953,1022001
0,957,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1054,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1067,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# 매장별 구입건수
df_ = mem_trans.groupby(['MEM_ID','STORE_ID'])['SELL_DT'].count()
dfAmtBySTORECnt = df_.reset_index()
del df_
gc.collect()
dfAmtBySTORECnt.columns = ['MEM_ID','STORE_ID','구입건수']

dfAmtBySTORECnt = pd.pivot_table(dfAmtBySTORECnt, values='구입건수', index=['MEM_ID'], columns='STORE_ID', fill_value=0).reset_index()
print("dfAmtBySTORE.shape {}".format(dfAmtBySTORECnt.shape))
dfAmtBySTORECnt.head(3)

dfAmtBySTORE.shape (10764, 696)


STORE_ID,MEM_ID,125,126,128,129,130,133,136,140,141,...,30022,30029,30031,30037,30088,30100,30101,30102,99953,1022001
0,957,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1054,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1067,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액
dfMembers = pd.merge(dfMembers, dfAmtBySTORE, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfAmtBySTORE
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 716)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,30022,30029,30031,30037,30088,30100,30101,30102,99953,1022001
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0,0,0,0,0,0
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수
dfMembers = pd.merge(dfMembers, dfAmtBySTORECnt, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfAmtBySTORECnt
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 1411)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,30022_y,30029_y,30031_y,30037_y,30088_y,30100_y,30101_y,30102_y,99953_y,1022001_y
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0,0,0,0,0,0
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# 주매장별 매출액 및 건수
df_ = mem_trans.groupby(['MEM_ID','STORE_ID'])['SELL_AMT'].agg(['sum', 'count'])
dfStoreAmts = df_.reset_index()
dfStoreAmts.columns = ['MEM_ID', '주매장', '주매장구입액', '주매장구입수']
dfStoreAmts.head(3)
dfStoreInfo = dfStoreAmts.loc[dfStoreAmts.groupby(['MEM_ID'])['주매장구입수'].idxmax(),]
del df_
del dfStoreAmts
gc.collect()
print("dfStoreInfo.shape {}, mem_trans.custid.nunique() {}".format(dfStoreInfo.shape, mem_trans.MEM_ID.nunique()))
dfStoreInfo.head(3)

dfStoreInfo.shape (10764, 4), mem_trans.custid.nunique() 10764


Unnamed: 0,MEM_ID,주매장,주매장구입액,주매장구입수
0,957,554,1500,1
3,1054,550,142800,2
6,1067,1027,7700,1


In [39]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보 
dfMembers = pd.merge(dfMembers, dfStoreInfo, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfStoreInfo
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 1414)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,30037_y,30088_y,30100_y,30101_y,30102_y,99953_y,1022001_y,주매장,주매장구입액,주매장구입수
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0,0,0,554,1500,1
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0,0,0,550,142800,2
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1027,7700,1


In [40]:
# 매장구매건수
ff = mem_trans.groupby('MEM_ID')['STORE_ID'].agg([('매장구매건수', lambda x: x.value_counts().index[0])]).reset_index()
dfMainConer = pd.get_dummies(ff, columns=['매장구매건수'])
del ff
dfMainConer.head(3)

Unnamed: 0,MEM_ID,매장구매건수_125,매장구매건수_126,매장구매건수_128,매장구매건수_129,매장구매건수_130,매장구매건수_133,매장구매건수_136,매장구매건수_140,매장구매건수_141,...,매장구매건수_30011,매장구매건수_30017,매장구매건수_30021,매장구매건수_30022,매장구매건수_30029,매장구매건수_30031,매장구매건수_30037,매장구매건수_30088,매장구매건수_30101,매장구매건수_30102
0,957,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1054,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1067,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보+ 매장구매건수
dfMembers = pd.merge(dfMembers, dfMainConer, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfMainConer
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 2038)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,매장구매건수_30011,매장구매건수_30017,매장구매건수_30021,매장구매건수_30022,매장구매건수_30029,매장구매건수_30031,매장구매건수_30037,매장구매건수_30088,매장구매건수_30101,매장구매건수_30102
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0,0,0,0,0,0
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
def fw(x):
    k = x
    if k <= 4 :
        return('주중_방문')
    else :
        return('주말_방문')    
    
df = mem_trans.copy()
df = df.drop_duplicates(['MEM_ID','SELL_DT'])

df['WEEK'] = df.요일NO.apply(fw)
df = pd.pivot_table(df, index='MEM_ID', columns='WEEK', values='SELL_AMT', 
                   aggfunc=np.size, fill_value=0).reset_index()
df['주말방문비율'] = ((df.iloc[:,1] / (df.iloc[:,1]+df.iloc[:,2]))*100).apply(round, args=(1,))
dfWeekday = df.copy().iloc[:,[0,-1]]
dfWeekday.head(3)

WEEK,MEM_ID,주말방문비율
0,957,0.0
1,1054,0.0
2,1067,100.0


In [43]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보+ 매장구매건수 + 주말방문비율
dfMembers = pd.merge(dfMembers, dfWeekday, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfWeekday
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 2039)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,매장구매건수_30017,매장구매건수_30021,매장구매건수_30022,매장구매건수_30029,매장구매건수_30031,매장구매건수_30037,매장구매건수_30088,매장구매건수_30101,매장구매건수_30102,주말방문비율
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0,0,0,0,0,0.0
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100.0


In [44]:
mem_data['GENDER'].unique()

array(['M', 'F', 'UNKNOWN'], dtype=object)

In [45]:
def menwomen(x):
    # print(x)
    if x=='UNKNOWN':
        return '2'
    elif x=='F':
        return '0'
    elif x=='M':
        return '1' 
    
mem_data['GENDER'] = mem_data['GENDER'].apply(lambda x: menwomen(x))

mem_data['GENDER'] = mem_data.GENDER.astype(int)

In [46]:
mem_data.GENDER.unique()

array([1, 0, 2], dtype=int64)

In [47]:
def setyyyymmdd(y, m, d):
    return y + m + d

mem_data['최근방문일자'] = mem_data['LAST_VST_DT'].apply(lambda x: setyyyymmdd(str(x)[0:4], str(x)[5:7], str(x)[8:10]))
mem_data['최근방문일자'] = mem_data.최근방문일자.astype(int)

In [48]:
# 매장 최근방문일
df_ = mem_data.groupby(['MEM_ID','STORE_ID'])['최근방문일자'].agg(['max'])
dfStoreIncome = df_.reset_index()
dfStoreIncome.columns = ['MEM_ID', '매장', '최근방문일']
# dfStoreIncome.head(3)
dfStoreIncome.groupby(['MEM_ID'])['최근방문일'].idxmax()
dfStoreGo = dfStoreIncome.loc[dfStoreIncome.groupby(['MEM_ID'])['최근방문일'].idxmax(),]
del df_
del dfStoreIncome
gc.collect()
print("dfStoreGo.shape {}, mem_data.MEM_ID.nunique() {}".format(dfStoreGo.shape, mem_data.MEM_ID.nunique()))
dfStoreGo.head(3)

dfStoreGo.shape (10764, 3), mem_data.MEM_ID.nunique() 10764


Unnamed: 0,MEM_ID,매장,최근방문일
0,957,543,20070830
1,1054,550,20071016
2,1067,550,20060916


In [49]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보+ 매장구매건수 + 주말방문비율 + 최근방문일
dfMembers = pd.merge(dfMembers, dfStoreGo, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfStoreGo
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 2041)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,매장구매건수_30022,매장구매건수_30029,매장구매건수_30031,매장구매건수_30037,매장구매건수_30088,매장구매건수_30101,매장구매건수_30102,주말방문비율,매장,최근방문일
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0,0,0,0.0,543,20070830
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0,0,0,0.0,550,20071016
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,100.0,550,20060916


In [50]:
# 주매장 포인트 정보
df_ = mem_data.groupby(['MEM_ID','STORE_ID'])['USED_PNT','USABLE_PNT'].agg(max)
dfConnerPoints = df_.reset_index()
dfConnerPoints.columns = ['MEM_ID', '주매장', '주매장사용포인트', '주매장보유포인트']
dfConnerPoints = dfConnerPoints.loc[dfConnerPoints.groupby(['MEM_ID'])['주매장보유포인트'].idxmax(),]
del df_
gc.collect()
print("dfConnerPoints.shape {}, mem_data.MEM_ID.nunique() {}".format(dfConnerPoints.shape, mem_data.MEM_ID.nunique()))
dfConnerPoints.head(3)

dfConnerPoints.shape (10764, 4), mem_data.MEM_ID.nunique() 10764


Unnamed: 0,MEM_ID,주매장,주매장사용포인트,주매장보유포인트
0,957,543,0,1778
1,1054,550,0,29158
2,1067,550,0,4172


In [51]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보+ 매장구매건수 + 주말방문비율 + 최근방문일 + 주매장 포인트 정보
dfMembers = pd.merge(dfMembers, dfConnerPoints, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfConnerPoints
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 2044)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,매장구매건수_30037,매장구매건수_30088,매장구매건수_30101,매장구매건수_30102,주말방문비율,매장,최근방문일,주매장_y,주매장사용포인트,주매장보유포인트
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0,0,0.0,543,20070830,543,0,1778
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0,0,0.0,550,20071016,550,0,29158
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,0,0,100.0,550,20060916,550,0,4172


In [52]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보+ 매장구매건수 + 주말방문비율 + 최근방문일 + 주매장 포인트 정보 + 연령대
df_ = mem_data.groupby(['MEM_ID','연령대NO'])['SMS'].agg(max)
dfAges = df_.reset_index()
dfAges.columns = ['MEM_ID', '연령대', 'SMS']

dfMembers = pd.merge(dfMembers, dfAges, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfAges
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))

dfMembers.head(3)

dfMembers shape : (10764, 2046)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,매장구매건수_30101,매장구매건수_30102,주말방문비율,매장,최근방문일,주매장_y,주매장사용포인트,주매장보유포인트,연령대,SMS
0,957,0,1,0,0,2,0,0,1500,0,...,0,0,0.0,543,20070830,543,0,1778,0,Y
1,1054,0,0,0,0,2,3,0,0,0,...,0,0,0.0,550,20071016,550,0,29158,50,N
2,1067,0,0,0,0,1,0,0,0,0,...,0,0,100.0,550,20060916,550,0,4172,0,Y


In [53]:
dfMembers['SMS'] = dfMembers.SMS.apply(lambda x: 1 if x=='Y' else 0)

In [54]:
# 구매시간대별 구입액 + 구매시간대별 구입건수 + 구매계절별 구입액 + 구매계절별 구입건수 + 매장별 구입액 +
# 매장별 구입건수 + 주매장구입정보+ 매장구매건수 + 주말방문비율 + 최근방문일 + 주매장 포인트 정보 + 연령대 +
# GENDER
df_ = mem_data.groupby(['MEM_ID'])['GENDER'].agg(max)
dfGenders = df_.reset_index()
dfGenders.columns = ['MEM_ID', 'GENDER']
dfMembers = pd.merge(dfMembers, dfGenders, left_on=['MEM_ID'], right_on=['MEM_ID'], how='left')
del dfGenders
gc.collect()
print('dfMembers shape : {}'.format(dfMembers.shape))
dfMembers.head(3)

dfMembers shape : (10764, 2047)


Unnamed: 0,MEM_ID,새벽구입건수,심야구입건수,아침구입건수,오후구입건수,저녁구입건수,점심구입건수,새벽구입액,심야구입액,아침구입액,...,매장구매건수_30102,주말방문비율,매장,최근방문일,주매장_y,주매장사용포인트,주매장보유포인트,연령대,SMS,GENDER
0,957,0,1,0,0,2,0,0,1500,0,...,0,0.0,543,20070830,543,0,1778,0,1,1
1,1054,0,0,0,0,2,3,0,0,0,...,0,0.0,550,20071016,550,0,29158,50,0,2
2,1067,0,0,0,0,1,0,0,0,0,...,0,100.0,550,20060916,550,0,4172,0,1,1


In [55]:
label   = 'GENDER'
testSize= 0.25

dfTest  = dfMembers.query("GENDER==2")
dfTrain = dfMembers.query("GENDER!=2")
mdf_y   = dfTrain['GENDER']
mdf_X   = dfTrain.drop(['MEM_ID','GENDER'], axis=1)
ndf_X   = dfTest.drop(['MEM_ID','GENDER'], axis=1)
# del dfTrain
# del dfTest
X_train, X_test, y_train, y_test = train_test_split(mdf_X, mdf_y, test_size=testSize, random_state=0)
print("ndf_X:{}, mdf_X:{}, mdf_y:{}, X_train:{}, X_test:{}".format(ndf_X.shape, mdf_X.shape, mdf_y.shape, X_train.shape, X_test.shape))

ndf_X:(3230, 2045), mdf_X:(7534, 2045), mdf_y:(7534,), X_train:(5650, 2045), X_test:(1884, 2045)


In [860]:
# dfMembers.isnull().sum()

In [56]:
scR = RobustScaler(copy=True, quantile_range=(10.0, 90.0), with_centering=True, with_scaling=True)

X_train_std     = scR.fit_transform(X_train)
X_test_std      = scR.transform(X_test)

print(X_train_std.shape, X_test_std.shape)

(5650, 2045) (1884, 2045)


In [None]:
eval_set = [(X_train_std, y_train), (X_test_std, y_test)]
xgb_model = XGBClassifier(**{'n_estimators':800, 
                              'gamma':0.1, 
                              'reg_alpha':0.87, 
                              'reg_lambda':0.015,
                              'learning_rate':0.03,
                              'max_depth':6, 
                              'objective':'binary:logistic',
                              'subsample':0.75},random_state=0,n_jobs=-1)

xgb_score = xgb_model.fit(X_train_std, y_train, eval_metric=["error", "logloss"], 
                        eval_set=eval_set, early_stopping_rounds=10, 
                        verbose=False).score(X_test_std, y_test)
y_xgb = xgb_model.predict(X_test_std)

print("best_model -----------------------------{}".format(xgb_score))
print(classification_report(y_test, y_xgb))

In [None]:
IDtest = dfTest.MEM_ID.reset_index();
X_new  = scR.transform(ndf_X)

pred = xgb_model.predict_proba(X_new)[:,1]

fname = 'submission.csv'
submissions = pd.concat([IDtest, pd.Series(pred, name="GENDER")] ,axis=1)

# submissions.set_index("MEM_ID")

submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

In [None]:
IDtest.shape

In [None]:
pred.size

In [None]:
ret = pd.read_csv(fname)
ret.drop(["index"], axis=1, inplace=True)
ret

In [None]:
ret.to_csv('submission1130.csv', index=False)
print("'{}' is ready to submit." .format(fname))

### END