In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(601)

import sklearn
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [25]:
# 날짜 처리
data = pd.read_csv('data/201901-202003.csv')

In [26]:
data.drop(['CARD_CCG_NM', 'HOM_CCG_NM'   , 'AGE', 'SEX_CTGO_CD', 'FLC','CSTMR_CNT'], axis=1, inplace=True)

In [27]:
data

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AMT,CNT
0,201901,강원,건강보조식품 소매업,강원,311200,4
1,201901,강원,건강보조식품 소매업,강원,1374500,8
2,201901,강원,건강보조식품 소매업,강원,818700,6
3,201901,강원,건강보조식품 소매업,강원,1717000,5
4,201901,강원,건강보조식품 소매업,강원,1047300,3
...,...,...,...,...,...,...
24697787,202003,충북,휴양콘도 운영업,충북,43300,4
24697788,202003,충북,휴양콘도 운영업,충북,35000,3
24697789,202003,충북,휴양콘도 운영업,충북,188000,6
24697790,202003,충북,휴양콘도 운영업,충북,99000,6


In [None]:
data[pd.isnull(data)].sum()

In [28]:
def get_outlier(df=None, column=None, weight=1.5):
    # fraud에 해당하는 column 데이터만 추출, 1/4 분위와 3/4 분위 지점을 np.percentile로 구함. 
    
    fraud = df[column]
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    
    # IQR을 구하고, IQR에 1.5를 곱하여 최대값과 최소값 지점 구함. 
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    
    # 최대값 보다 크거나, 최소값 보다 작은 값을 아웃라이어로 설정하고 DataFrame index 반환. 
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index
    
    return outlier_index

In [30]:
outlier_index = get_outlier(df=data, column='AMT', weight=1.5)
print('이상치 데이터 인덱스:', outlier_index)
data.drop(outlier_index, axis=0, inplace=True)

이상치 데이터 인덱스: Int64Index([       6,        7,        9,       13,       16,       18,
                  20,       34,       35,       38,
            ...
            24697759, 24697761, 24697773, 24697778, 24697779, 24697780,
            24697781, 24697782, 24697783, 24697785],
           dtype='int64', length=3426705)


In [31]:
data

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AMT,CNT
0,201901,강원,건강보조식품 소매업,강원,311200,4
1,201901,강원,건강보조식품 소매업,강원,1374500,8
2,201901,강원,건강보조식품 소매업,강원,818700,6
3,201901,강원,건강보조식품 소매업,강원,1717000,5
4,201901,강원,건강보조식품 소매업,강원,1047300,3
...,...,...,...,...,...,...
24697787,202003,충북,휴양콘도 운영업,충북,43300,4
24697788,202003,충북,휴양콘도 운영업,충북,35000,3
24697789,202003,충북,휴양콘도 운영업,충북,188000,6
24697790,202003,충북,휴양콘도 운영업,충북,99000,6


In [32]:
#data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [6]:
data

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AMT,CNT,year,month
0,강원,건강보조식품 소매업,강원,311200,4,2019,1
1,강원,건강보조식품 소매업,강원,1374500,8,2019,1
2,강원,건강보조식품 소매업,강원,818700,6,2019,1
3,강원,건강보조식품 소매업,강원,1717000,5,2019,1
4,강원,건강보조식품 소매업,강원,1047300,3,2019,1
...,...,...,...,...,...,...,...
24697787,충북,휴양콘도 운영업,충북,43300,4,2020,3
24697788,충북,휴양콘도 운영업,충북,35000,3,2020,3
24697789,충북,휴양콘도 운영업,충북,188000,6,2020,3
24697790,충북,휴양콘도 운영업,충북,99000,6,2020,3


In [None]:
# 데이터 정제
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'   , 'AGE', 'SEX_CTGO_CD', 'FLC'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'year', 'month']


In [None]:
df = data.copy()
columns = ['REG_YYMM','CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM']


In [33]:
df = data.copy()
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'year', 'month']
df = df.groupby(columns).sum().reset_index(drop=False)

In [34]:
df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year,month,AMT,CNT
0,강원,건강보조식품 소매업,강원,2019,1,49682755,480
1,강원,건강보조식품 소매업,강원,2019,2,51207871,459
2,강원,건강보조식품 소매업,강원,2019,3,35459319,474
3,강원,건강보조식품 소매업,강원,2019,4,43317508,526
4,강원,건강보조식품 소매업,강원,2019,5,50876872,507
...,...,...,...,...,...,...,...
106209,충북,휴양콘도 운영업,충북,2019,11,17555420,496
106210,충북,휴양콘도 운영업,충북,2019,12,21765500,550
106211,충북,휴양콘도 운영업,충북,2020,1,29644576,788
106212,충북,휴양콘도 운영업,충북,2020,2,8025000,287


In [35]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [21]:
df_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year,month,AMT,CNT
0,0,0,0,2019,1,147831746,1105
1,0,0,0,2019,2,146166571,1150
2,0,0,0,2019,3,109439025,1020
3,0,0,0,2019,4,105208473,989
4,0,0,0,2019,5,146659738,1191
...,...,...,...,...,...,...,...
106381,16,40,16,2019,11,19681520,535
106382,16,40,16,2019,12,23808100,612
106383,16,40,16,2020,1,34335376,903
106384,16,40,16,2020,2,10002800,315


In [None]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
x = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
y = np.log1p(train_num['AMT'])

In [36]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
x = train_num.drop(['AMT', 'CNT'], axis=1)
y = np.log1p(train_num['AMT'])

In [37]:
k = int(len(x)*0.9)

In [38]:
x_train = x[:k]
y_train = y[:k]
x_val = x[k:]
y_val = y[k:]

In [39]:
import lightgbm as lgb

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [40]:
train_ds = lgb.Dataset(x_train, label=y_train)
val_ds = lgb.Dataset(x_val, label=y_val)

In [41]:
params = {
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [42]:
model = lgb.train(params,
                  train_ds,
                  1000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.67273
[200]	valid_0's rmse: 1.47194
[300]	valid_0's rmse: 1.35072
[400]	valid_0's rmse: 1.27774
[500]	valid_0's rmse: 1.21895
[600]	valid_0's rmse: 1.16501
[700]	valid_0's rmse: 1.11031
[800]	valid_0's rmse: 1.07503
[900]	valid_0's rmse: 1.04312
[1000]	valid_0's rmse: 1.01164
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1.01164


In [43]:
# 예측 템플릿 만들기
#REG_YYMMs     = [202004, 202007]
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
#AGEs          = df_num['AGE'].unique()
#SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
#FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
#for REG_YYMM in REG_YYMMs:
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            #for AGE in AGEs:
                #for SEX_CTGO_CD in SEX_CTGO_CDs:
                    #for FLC in FLCs:
            for year in years:
                for month in months:
                    temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, year, month]) # , AGE, SEX_CTGO_CD, FLC, 
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=x.columns)

In [44]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year,month
0,0,0,0,2020,4
1,0,0,0,2020,7
2,0,0,1,2020,4
3,0,0,1,2020,7
4,0,0,2,2020,4
...,...,...,...,...,...
23693,16,30,10,2020,7
23694,16,30,12,2020,4
23695,16,30,12,2020,7
23696,16,30,14,2020,4


In [45]:
# 예측
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [46]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,0,0,2.967448e+07
1,202004,0,1,2.629636e+08
2,202004,0,2,1.779094e+08
3,202004,0,3,1.253341e+07
4,202004,0,4,9.133140e+05
...,...,...,...,...
1389,202007,16,36,9.733619e+07
1390,202007,16,37,2.595307e+09
1391,202007,16,38,2.484390e+07
1392,202007,16,39,2.948701e+07


In [47]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [48]:
temp

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,건강보조식품 소매업,2.967448e+07
1,202004,강원,골프장 운영업,2.629636e+08
2,202004,강원,과실 및 채소 소매업,1.779094e+08
3,202004,강원,관광 민예품 및 선물용품 소매업,1.253341e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,9.133140e+05
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,9.733619e+07
1390,202007,충북,한식 음식점업,2.595307e+09
1391,202007,충북,호텔업,2.484390e+07
1392,202007,충북,화장품 및 방향제 소매업,2.948701e+07


In [49]:
# 제출 파일 만들기
submission = pd.read_csv('data/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission.to_csv('20200707_1.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,29674482.0
1,202004,강원,골프장 운영업,262963557.0
2,202004,강원,과실 및 채소 소매업,177909425.0
3,202004,강원,관광 민예품 및 선물용품 소매업,12533414.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,913314.0
