In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(601)

import sklearn
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import LabelEncoder

# 2. 데이터 전처리
## Data Cleansing & Pre-Processing

In [3]:
data = pd.read_csv('data/201901-202003.csv')

In [4]:
data.head()
data.tail()

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4
1,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8
2,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6
3,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5
4,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3


Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
24697787,202003,충북,충주시,휴양콘도 운영업,충북,충주시,30s,1,2,3,43300,4
24697788,202003,충북,충주시,휴양콘도 운영업,충북,충주시,40s,1,3,3,35000,3
24697789,202003,충북,충주시,휴양콘도 운영업,충북,충주시,50s,1,4,4,188000,6
24697790,202003,충북,충주시,휴양콘도 운영업,충북,충주시,50s,2,4,4,99000,6
24697791,202003,충북,충주시,휴양콘도 운영업,충북,충주시,60s,1,5,3,194000,3


In [5]:
data = data.fillna('')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24697792 entries, 0 to 24697791
Data columns (total 12 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   REG_YYMM      int64 
 1   CARD_SIDO_NM  object
 2   CARD_CCG_NM   object
 3   STD_CLSS_NM   object
 4   HOM_SIDO_NM   object
 5   HOM_CCG_NM    object
 6   AGE           object
 7   SEX_CTGO_CD   int64 
 8   FLC           int64 
 9   CSTMR_CNT     int64 
 10  AMT           int64 
 11  CNT           int64 
dtypes: int64(6), object(6)
memory usage: 2.2+ GB


In [6]:
data['REG_YYMM'].value_counts()

201908    1880296
201905    1772237
201910    1751005
201909    1734914
201907    1721353
201906    1712696
201912    1674381
202001    1672889
201903    1668675
201904    1655655
201911    1652905
201901    1607032
201902    1594266
202002    1338980
202003    1260508
Name: REG_YYMM, dtype: int64

In [6]:
# 201908 ~ 202002 -> train
# 202003          -> test

train = data[data['REG_YYMM'] != 202003]
test = data[data['REG_YYMM'] == 202003]

### 날짜 처리 YYYYMM -> YYYY + MM

In [7]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [8]:
# all

data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [9]:
# train
train['year'] = train['REG_YYMM'].apply(lambda x: grap_year(x))
train['month'] = train['REG_YYMM'].apply(lambda x: grap_month(x))
train = train.drop(['REG_YYMM'], axis=1)

In [10]:
# test
test['year'] = test['REG_YYMM'].apply(lambda x: grap_year(x))
test['month'] = test['REG_YYMM'].apply(lambda x: grap_month(x))
test = test.drop(['REG_YYMM'], axis=1)

In [11]:
train.tail()
test.tail()

Unnamed: 0,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,year,month
23437279,충북,충주시,휴양콘도 운영업,충북,충주시,40s,1,3,3,62000,3,2020,2
23437280,충북,충주시,휴양콘도 운영업,충북,충주시,40s,2,3,3,172000,4,2020,2
23437281,충북,충주시,휴양콘도 운영업,충북,충주시,50s,1,4,12,399000,12,2020,2
23437282,충북,충주시,휴양콘도 운영업,충북,충주시,50s,2,4,13,504100,16,2020,2
23437283,충북,충주시,휴양콘도 운영업,충북,충주시,60s,1,5,6,142000,6,2020,2


Unnamed: 0,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,year,month
24697787,충북,충주시,휴양콘도 운영업,충북,충주시,30s,1,2,3,43300,4,2020,3
24697788,충북,충주시,휴양콘도 운영업,충북,충주시,40s,1,3,3,35000,3,2020,3
24697789,충북,충주시,휴양콘도 운영업,충북,충주시,50s,1,4,4,188000,6,2020,3
24697790,충북,충주시,휴양콘도 운영업,충북,충주시,50s,2,4,4,99000,6,2020,3
24697791,충북,충주시,휴양콘도 운영업,충북,충주시,60s,1,5,3,194000,3,2020,3


### 데이터 정제

In [12]:
# all
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1) # 카드이용지역_시군구 (가맹점 주소 기준) # 거주지역_시군구 (고객 집주소 기준)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = df.groupby(columns).sum().reset_index(drop=False)

In [13]:
# train
df_train = train.copy()
df_train = df_train.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1) # 카드이용지역_시군구 (가맹점 주소 기준) # 거주지역_시군구 (고객 집주소 기준)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df_train = df_train.groupby(columns).sum().reset_index(drop=False)

In [14]:
# test
df_test = test.copy()
df_test = df_test.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1) # 카드이용지역_시군구 (가맹점 주소 기준) # 거주지역_시군구 (고객 집주소 기준)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df_test = df_test.groupby(columns).sum().reset_index(drop=False)

### 인코딩

In [15]:
# all
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [16]:
# train
dtypes = df_train.dtypes
encoders = {}
for column in df_train.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df_train[column])
        encoders[column] = encoder
        
df_num_train = df_train.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num_train[column] = encoder.transform(df_train[column])

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [17]:
# test
encoders = {}
for column in df_test.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df_test[column])
        encoders[column] = encoder
        
df_num_test = df_test.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num_test[column] = encoder.transform(df_test[column])

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

# 3. 탐색적 자료분석
## Exploratory Data Analysis

In [19]:
# 입력하세요.


# 4. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling

In [20]:
df_num_train
df_num_test

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2019,1,4,311200,4
1,0,0,0,1,1,1,2019,2,3,605000,3
2,0,0,0,1,1,1,2019,6,3,139000,3
3,0,0,0,1,1,1,2019,8,3,27500,3
4,0,0,0,1,1,1,2019,9,3,395500,3
...,...,...,...,...,...,...,...,...,...,...,...
1002743,16,40,16,6,2,5,2019,3,3,148000,4
1002744,16,40,16,6,2,5,2019,5,5,329800,7
1002745,16,40,16,6,2,5,2019,10,7,557800,7
1002746,16,40,16,6,2,5,2019,12,3,247800,3


Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2020,3,3,427510,2
1,0,0,0,2,1,2,2020,3,38,5622890,37
2,0,0,0,2,2,2,2020,3,30,2296125,38
3,0,0,0,3,1,2,2020,3,3,257000,4
4,0,0,0,3,1,3,2020,3,51,6490800,48
...,...,...,...,...,...,...,...,...,...,...,...
54641,16,40,16,3,1,4,2020,3,9,385700,10
54642,16,40,16,4,1,4,2020,3,10,790350,19
54643,16,40,16,4,2,4,2020,3,4,99000,6
54644,16,40,16,5,1,5,2020,3,6,228800,6


### feature, target 설정

In [18]:
# all
all_features = df_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
all_target = np.log1p(df_num['AMT'])

In [19]:
# train
train_features = df_num_train.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(df_num_train['AMT'])

In [20]:
# test
test_features = df_num_test.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
test_target = np.log1p(df_num_test['AMT'])

# 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [24]:
# train test 

In [21]:
# 훈련
# model = ExtraTreesRegressor(n_jobs=-1, random_state=0)
from lightgbm import LGBMRegressor
model = LGBMRegressor()

In [22]:
model.fit(train_features, train_target)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [23]:
y_pred = model.predict(test_features)

In [24]:
def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle

In [25]:
rmsle(test_target, y_pred)

0.12373836102812845

In [26]:
# all
model = LGBMRegressor()
model.fit(all_features, all_target)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# 6. 결과 및 결언
## Conclusion & Discussion

In [27]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=all_features.columns)

In [28]:
# 예측
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [29]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [30]:
# 제출 파일 만들기
submission = pd.read_csv('data/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission.to_csv('20200702_3.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,583565000.0
1,202004,강원,골프장 운영업,1188377000.0
2,202004,강원,과실 및 채소 소매업,584760800.0
3,202004,강원,관광 민예품 및 선물용품 소매업,168711600.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,140435400.0
