In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(601)

import sklearn
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [2]:
data = pd.read_csv('data/201901-202003.csv')

In [None]:
# 날짜 처리

data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [None]:
data

In [3]:
# 데이터 정제
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'   , 'AGE', 'SEX_CTGO_CD', 'FLC', 'HOM_SIDO_NM','CSTMR_CNT','CNT'], axis=1)
df

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,201901,강원,건강보조식품 소매업,311200
1,201901,강원,건강보조식품 소매업,1374500
2,201901,강원,건강보조식품 소매업,818700
3,201901,강원,건강보조식품 소매업,1717000
4,201901,강원,건강보조식품 소매업,1047300
...,...,...,...,...
24697787,202003,충북,휴양콘도 운영업,43300
24697788,202003,충북,휴양콘도 운영업,35000
24697789,202003,충북,휴양콘도 운영업,188000
24697790,202003,충북,휴양콘도 운영업,99000


In [None]:
columns = ['REG_YYMM','CARD_SIDO_NM', 'STD_CLSS_NM']
df = df.groupby(columns).sum().reset_index(drop=False)
df

In [None]:
pivot = df.pivot_table(['AMT'], index=['CARD_SIDO_NM','STD_CLSS_NM'], columns='REG_YYMM')
pivot

In [20]:
cond1 = df03.CARD_SIDO_NM == '강원'
cond2 = df03.STD_CLSS_NM == '건강보조식품 소매업'
cond3 = df03.REG_YYMM == 202003
df03[cond1 & cond2 & cond3]

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,REG_YYMM,AMT
0,강원,건강보조식품 소매업,202003.0,1185914.0


In [None]:
pivot.to_csv('pivot.csv', encoding='utf-8-sig')

In [None]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [None]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
x = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
y = np.log1p(train_num['AMT'])

In [None]:
k = int(len(x)*0.9)

In [None]:
x_train = x[:k]
y_train = y[:k]
x_val = x[k:]
y_val = y[k:]

In [None]:
import lightgbm as lgb

In [None]:
train_ds = lgb.Dataset(x_train, label=y_train)
val_ds = lgb.Dataset(x_val, label=y_val)

In [None]:
params = {
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [None]:
model = lgb.train(params,
                  train_ds,
                  1000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

In [None]:
# 예측 템플릿 만들기
REG_YYMMs = [202004, 202007]
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
#HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
#AGEs          = df_num['AGE'].unique()
#SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
#FLCs          = df_num['FLC'].unique()
#years         = [2020]
#months        = [4, 7]

temp = []
for REG_YYMM in REG_YYMMs:
    for CARD_SIDO_NM in CARD_SIDO_NMs:
        for STD_CLSS_NM in STD_CLSS_NMs:
            temp.append([REG_YYMM, CARD_SIDO_NM, STD_CLSS_NM]) # , AGE, SEX_CTGO_CD, FLC
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=x.columns)

In [None]:
temp

In [None]:
# 예측
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [None]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [6]:
#df02 = df[df.REG_YYMM == 202002]
dfmean = df[df.REG_YYMM == 202003]
dfmean
#dfmean = pd.concat([df02, df03])
#dfmean

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
23437284,202003,강원,건강보조식품 소매업,2529000
23437285,202003,강원,건강보조식품 소매업,1133650
23437286,202003,강원,건강보조식품 소매업,570800
23437287,202003,강원,건강보조식품 소매업,4192828
23437288,202003,강원,건강보조식품 소매업,3358900
...,...,...,...,...
24697787,202003,충북,휴양콘도 운영업,43300
24697788,202003,충북,휴양콘도 운영업,35000
24697789,202003,충북,휴양콘도 운영업,188000
24697790,202003,충북,휴양콘도 운영업,99000


In [8]:
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM']
df03 = dfmean.groupby(columns).sum().reset_index(drop=False)
df03

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,REG_YYMM,AMT
0,강원,건강보조식품 소매업,202003.0,1.185914e+06
1,강원,골프장 운영업,202003.0,2.036172e+06
2,강원,과실 및 채소 소매업,202003.0,7.105835e+05
3,강원,관광 민예품 및 선물용품 소매업,202003.0,1.040414e+05
4,강원,그외 기타 스포츠시설 운영업,202003.0,5.187500e+05
...,...,...,...,...
604,충북,피자 햄버거 샌드위치 및 유사 음식점업,202003.0,9.033278e+05
605,충북,한식 음식점업,202003.0,1.827617e+06
606,충북,호텔업,202003.0,6.099420e+05
607,충북,화장품 및 방향제 소매업,202003.0,1.004406e+06


In [9]:
df04 = df03.copy()
df07 = df03.copy()
df04

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,REG_YYMM,AMT
0,강원,건강보조식품 소매업,202003.0,1.185914e+06
1,강원,골프장 운영업,202003.0,2.036172e+06
2,강원,과실 및 채소 소매업,202003.0,7.105835e+05
3,강원,관광 민예품 및 선물용품 소매업,202003.0,1.040414e+05
4,강원,그외 기타 스포츠시설 운영업,202003.0,5.187500e+05
...,...,...,...,...
604,충북,피자 햄버거 샌드위치 및 유사 음식점업,202003.0,9.033278e+05
605,충북,한식 음식점업,202003.0,1.827617e+06
606,충북,호텔업,202003.0,6.099420e+05
607,충북,화장품 및 방향제 소매업,202003.0,1.004406e+06


In [10]:
df04['REG_YYMM'] = 202004
df07['REG_YYMM'] = 202007
df04

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,REG_YYMM,AMT
0,강원,건강보조식품 소매업,202004,1.185914e+06
1,강원,골프장 운영업,202004,2.036172e+06
2,강원,과실 및 채소 소매업,202004,7.105835e+05
3,강원,관광 민예품 및 선물용품 소매업,202004,1.040414e+05
4,강원,그외 기타 스포츠시설 운영업,202004,5.187500e+05
...,...,...,...,...
604,충북,피자 햄버거 샌드위치 및 유사 음식점업,202004,9.033278e+05
605,충북,한식 음식점업,202004,1.827617e+06
606,충북,호텔업,202004,6.099420e+05
607,충북,화장품 및 방향제 소매업,202004,1.004406e+06


In [None]:
df07['REG_YYMM']

In [11]:
dfall = pd.concat([df04, df07])
dfall

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,REG_YYMM,AMT
0,강원,건강보조식품 소매업,202004,1.185914e+06
1,강원,골프장 운영업,202004,2.036172e+06
2,강원,과실 및 채소 소매업,202004,7.105835e+05
3,강원,관광 민예품 및 선물용품 소매업,202004,1.040414e+05
4,강원,그외 기타 스포츠시설 운영업,202004,5.187500e+05
...,...,...,...,...
604,충북,피자 햄버거 샌드위치 및 유사 음식점업,202007,9.033278e+05
605,충북,한식 음식점업,202007,1.827617e+06
606,충북,호텔업,202007,6.099420e+05
607,충북,화장품 및 방향제 소매업,202007,1.004406e+06


In [12]:
submission = pd.read_csv('data/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)

In [13]:
submission = submission.merge(dfall, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'

In [14]:
idx = submission[submission.AMT.isnull() == True].index
idx

Int64Index([   4,   14,   15,   30,   35,   45,   55,   71,   96,  110,
            ...
            1326, 1342, 1347, 1357, 1365, 1367, 1368, 1376, 1381, 1383],
           dtype='int64', name='id', length=176)

In [15]:
submission = submission.fillna(1)

In [16]:
# 제출 파일 만들기
submission.to_csv('20200709_1.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,1185914.0
1,202004,강원,골프장 운영업,2036172.0
2,202004,강원,과실 및 채소 소매업,710583.5
3,202004,강원,관광 민예품 및 선물용품 소매업,104041.4
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,1.0
