In [2]:
import itertools

import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder

## Data Cleansing & Pre-Processing

In [3]:
def grap_year(data):
    """Return the year part of a ``YYYYMM`` value as an ``int``.

    The value is stringified first, so both integers (``201901``) and strings
    (``"201901"``) are accepted; the first four characters are parsed.
    """
    yyyymm = str(data)
    year_digits = yyyymm[:4]
    return int(year_digits)

def grap_month(data):
    """Return the month part of a ``YYYYMM`` value as an ``int``.

    The value is stringified first; everything after the fourth character is
    parsed, so ``201901`` yields ``1`` and ``"202012"`` yields ``12``.
    """
    yyyymm = str(data)
    month_digits = yyyymm[4:]
    return int(month_digits)

In [4]:
# Macro-economic side tables; later cells read their 'Unnamed: 0' date column
# together with 'KOR_gdp' (world_gdp) and 'Value' (trade_amount).
world_gdp, trade_amount = (
    pd.read_csv(csv_path)
    for csv_path in ("data/world_gdp.csv", "data/Trade in goods and services forecast.csv")
)

In [5]:
# Load the card-usage table, blank out missing values, and split the YYYYMM
# period key (REG_YYMM) into separate integer `year` and `month` columns.
data = (
    pd.read_csv('../../Data/Jeju/201901-202003.csv')
    .fillna('')
    .assign(
        year=lambda frame: frame['REG_YYMM'].apply(grap_year),
        month=lambda frame: frame['REG_YYMM'].apply(grap_month),
    )
)

In [6]:
# Parse the YYYYMM period key into a datetime column.
data['REG_YYMM'] = pd.to_datetime(data['REG_YYMM'], format='%Y%m')

In [7]:
# Both macro tables keep their date in the unnamed first CSV column; parse it
# so that `.year` / `.month` can be read from each entry later on.
for macro_table in (world_gdp, trade_amount):
    macro_table['Unnamed: 0'] = pd.to_datetime(macro_table['Unnamed: 0'])

In [8]:
# The period is now carried by `year` / `month`, so the original key is dropped.
data = data.drop(columns=['REG_YYMM'])

In [9]:
# Data cleaning: discard the two *_CCG_NM columns (drop() already returns a
# new frame, so `data` itself is left untouched).
df = data.drop(columns=['CARD_CCG_NM', 'HOM_CCG_NM'])

# Collapse to one row per category combination; the remaining measure columns
# are summed so that AMT can be predicted per group.
columns = [
    'CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
    'AGE', 'SEX_CTGO_CD', 'FLC',
    'year', 'month',
]
df = df.groupby(columns).sum().reset_index()

In [10]:
# Encoding: fit one LabelEncoder per object-typed column and keep it in
# `encoders` so the integer codes can be mapped back to labels later.
dtypes = df.dtypes
encoders = {}
for column, dtype in dtypes.items():
    if str(dtype) != 'object':
        continue
    encoders[column] = LabelEncoder().fit(df[column])

# Numeric copy of the frame: every encoded column is replaced by its codes.
df_num = df.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(df[column])

In [11]:
# Feature / target setup.
# frac=1 keeps every row and only shuffles the order; the seed makes it repeatable.
train_num = df_num.sample(frac=1, random_state=0)
# The three summed measures are removed from the model inputs.
train_features = train_num.drop(columns=['CSTMR_CNT', 'AMT', 'CNT'])
# The regression target is log(1 + AMT).
train_target = train_num['AMT'].pipe(np.log1p)

In [12]:
# Attach the two monthly macro indicators to every training row.
# The columns start as float zeros: the values written below are floats, and
# filling an integer column with them through .loc relies on implicit upcasting
# that recent pandas versions deprecate.
train_features['kor_gdp'] = 0.0
train_features['trade_amount'] = 0.0

# Rows whose (year, month) has no entry in a macro table keep the 0.0 default.
for date, gdp_value in zip(world_gdp['Unnamed: 0'], world_gdp['KOR_gdp']):
    same_period = (train_features['year'] == date.year) & (train_features['month'] == date.month)
    train_features.loc[same_period, 'kor_gdp'] = gdp_value

for date, trade_value in zip(trade_amount['Unnamed: 0'], trade_amount['Value']):
    same_period = (train_features['year'] == date.year) & (train_features['month'] == date.month)
    train_features.loc[same_period, 'trade_amount'] = trade_value
    

In [13]:
# Put trade_amount on a log(1 + x) scale.
# NOTE: re-running this cell applies the transform a second time.
train_features['trade_amount'] = np.log1p(train_features['trade_amount'])

## Model Tuning & Evaluation

In [14]:
# Build the prediction template: every combination of the encoded category
# values seen in df_num, crossed with the target periods (2020-04 and 2020-07).
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

# itertools.product varies the right-most factor fastest, which is the same
# row order an explicit nest of for-loops (outermost factor first) produces.
temp = np.array(list(itertools.product(
    CARD_SIDO_NMs, STD_CLSS_NMs, HOM_SIDO_NMs, AGEs,
    SEX_CTGO_CDs, FLCs, years, months,
)))

In [15]:
# Wrap the raw combination array in a DataFrame; the column names follow the
# order in which the factors were combined.
template_columns = [
    'CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM',
    'AGE', 'SEX_CTGO_CD', 'FLC',
    'year', 'month',
]
temp = pd.DataFrame(temp, columns=template_columns)

In [16]:
# Attach the same two macro indicators to the prediction template.
# The columns start as float zeros: the values written below are floats, and
# filling an integer column with them through .loc relies on implicit upcasting
# that recent pandas versions deprecate.
temp['kor_gdp'] = 0.0
temp['trade_amount'] = 0.0

# Rows whose (year, month) has no entry in a macro table keep the 0.0 default.
for date, gdp_value in zip(world_gdp['Unnamed: 0'], world_gdp['KOR_gdp']):
    same_period = (temp['year'] == date.year) & (temp['month'] == date.month)
    temp.loc[same_period, 'kor_gdp'] = gdp_value

# trade_amount is stored on the log(1 + x) scale, matching the training features.
for date, trade_value in zip(trade_amount['Unnamed: 0'], trade_amount['Value']):
    same_period = (temp['year'] == date.year) & (temp['month'] == date.month)
    temp.loc[same_period, 'trade_amount'] = np.log1p(trade_value)
    

In [17]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 30% of the rows for validation.
# `shuffle` is only an on/off switch; the seed goes in `random_state`, which is
# what makes the split (and therefore the reported metrics) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_features, train_target, test_size=0.3, shuffle=True, random_state=101
)

d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# NOTE(review): `silent` is presumably superseded by `verbosity` in newer
# XGBoost releases - confirm against the installed version before changing it.
params = {}
params['eta'] = 0.02
params['objective'] = 'reg:tweedie'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

# The last watchlist entry ('test') is the one early stopping monitors.
watchlist = [(d_train, 'train'), (d_test, 'test')]

# Up to 10000 boosting rounds; stop once test MAE has not improved for 800 rounds.
model = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=800, verbose_eval=10)

# Score the hold-out set with the DMatrix built above (labels are ignored by predict).
preds = model.predict(d_test)

# y_test is log1p(AMT), so this RMSE is measured on the log scale.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("RMSE: %f" % (rmse))

[0]	train-mae:13.7376	test-mae:13.7452
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 800 rounds.
[10]	train-mae:13.5077	test-mae:13.5152
[20]	train-mae:13.1904	test-mae:13.1979
[30]	train-mae:12.7653	test-mae:12.7727
[40]	train-mae:12.2162	test-mae:12.2237
[50]	train-mae:11.537	test-mae:11.5445
[60]	train-mae:10.7359	test-mae:10.7433
[70]	train-mae:9.83654	test-mae:9.84406
[80]	train-mae:8.87464	test-mae:8.88208
[90]	train-mae:7.89077	test-mae:7.89826
[100]	train-mae:6.92358	test-mae:6.93109
[110]	train-mae:6.00435	test-mae:6.01182
[120]	train-mae:5.15573	test-mae:5.1631
[130]	train-mae:4.39538	test-mae:4.40296
[140]	train-mae:3.74248	test-mae:3.75045
[150]	train-mae:3.21029	test-mae:3.21829
[160]	train-mae:2.79899	test-mae:2.80627
[170]	train-mae:2.49424	test-mae:2.50083
[180]	train-mae:2.27613	test-mae:2.28203
[190]	train-mae:2.12267	test-mae:2.12804
[200]	train-mae:2.01584	test-mae:2.0207
[210]	train

[1960]	train-mae:1.43773	test-mae:1.44076
[1970]	train-mae:1.43697	test-mae:1.43999
[1980]	train-mae:1.43573	test-mae:1.43875
[1990]	train-mae:1.43496	test-mae:1.43799
[2000]	train-mae:1.43429	test-mae:1.43732
[2010]	train-mae:1.43328	test-mae:1.43632
[2020]	train-mae:1.43246	test-mae:1.43551
[2030]	train-mae:1.43145	test-mae:1.43451
[2040]	train-mae:1.42965	test-mae:1.43274
[2050]	train-mae:1.42753	test-mae:1.43065
[2060]	train-mae:1.42681	test-mae:1.42995
[2070]	train-mae:1.42455	test-mae:1.42765
[2080]	train-mae:1.42341	test-mae:1.42648
[2090]	train-mae:1.42202	test-mae:1.42509
[2100]	train-mae:1.42083	test-mae:1.42391
[2110]	train-mae:1.41977	test-mae:1.42285
[2120]	train-mae:1.41908	test-mae:1.42215
[2130]	train-mae:1.41796	test-mae:1.42104
[2140]	train-mae:1.41677	test-mae:1.41988
[2150]	train-mae:1.41531	test-mae:1.41845
[2160]	train-mae:1.41453	test-mae:1.41768
[2170]	train-mae:1.41345	test-mae:1.41662
[2180]	train-mae:1.41237	test-mae:1.41555
[2190]	train-mae:1.41118	test-mae:

[3920]	train-mae:1.28164	test-mae:1.28483
[3930]	train-mae:1.28061	test-mae:1.28379
[3940]	train-mae:1.28	test-mae:1.28318
[3950]	train-mae:1.27948	test-mae:1.28266
[3960]	train-mae:1.27905	test-mae:1.28223
[3970]	train-mae:1.27871	test-mae:1.2819
[3980]	train-mae:1.27821	test-mae:1.28141
[3990]	train-mae:1.27742	test-mae:1.28063
[4000]	train-mae:1.27633	test-mae:1.27955
[4010]	train-mae:1.27537	test-mae:1.27861
[4020]	train-mae:1.27468	test-mae:1.27791
[4030]	train-mae:1.27355	test-mae:1.27677
[4040]	train-mae:1.27299	test-mae:1.27621
[4050]	train-mae:1.27199	test-mae:1.2752
[4060]	train-mae:1.27117	test-mae:1.27438
[4070]	train-mae:1.27096	test-mae:1.27416
[4080]	train-mae:1.27057	test-mae:1.27377
[4090]	train-mae:1.27015	test-mae:1.27335
[4100]	train-mae:1.26967	test-mae:1.27287
[4110]	train-mae:1.26901	test-mae:1.27221
[4120]	train-mae:1.2687	test-mae:1.27191
[4130]	train-mae:1.26832	test-mae:1.27153
[4140]	train-mae:1.268	test-mae:1.27121
[4150]	train-mae:1.26753	test-mae:1.27074


[5890]	train-mae:1.1992	test-mae:1.2032
[5900]	train-mae:1.19861	test-mae:1.20262
[5910]	train-mae:1.19824	test-mae:1.20225
[5920]	train-mae:1.19725	test-mae:1.20127
[5930]	train-mae:1.19637	test-mae:1.20038
[5940]	train-mae:1.19611	test-mae:1.20013
[5950]	train-mae:1.19525	test-mae:1.19927
[5960]	train-mae:1.19413	test-mae:1.19816
[5970]	train-mae:1.19331	test-mae:1.19735
[5980]	train-mae:1.19174	test-mae:1.19576
[5990]	train-mae:1.19146	test-mae:1.19548
[6000]	train-mae:1.19125	test-mae:1.19528
[6010]	train-mae:1.19092	test-mae:1.19496
[6020]	train-mae:1.19066	test-mae:1.1947
[6030]	train-mae:1.1897	test-mae:1.19376
[6040]	train-mae:1.18914	test-mae:1.19318
[6050]	train-mae:1.18844	test-mae:1.19249
[6060]	train-mae:1.18799	test-mae:1.19202
[6070]	train-mae:1.18737	test-mae:1.19139
[6080]	train-mae:1.18692	test-mae:1.19094
[6090]	train-mae:1.18639	test-mae:1.19039
[6100]	train-mae:1.18592	test-mae:1.18991
[6110]	train-mae:1.18572	test-mae:1.18971
[6120]	train-mae:1.18504	test-mae:1.18

[7860]	train-mae:1.11532	test-mae:1.11939
[7870]	train-mae:1.11517	test-mae:1.11925
[7880]	train-mae:1.11499	test-mae:1.11908
[7890]	train-mae:1.11469	test-mae:1.11878
[7900]	train-mae:1.11445	test-mae:1.11852
[7910]	train-mae:1.11407	test-mae:1.11814
[7920]	train-mae:1.11364	test-mae:1.1177
[7930]	train-mae:1.11354	test-mae:1.1176
[7940]	train-mae:1.11338	test-mae:1.11746
[7950]	train-mae:1.1133	test-mae:1.11738
[7960]	train-mae:1.11301	test-mae:1.11707
[7970]	train-mae:1.11264	test-mae:1.11671
[7980]	train-mae:1.11253	test-mae:1.1166
[7990]	train-mae:1.11192	test-mae:1.11597
[8000]	train-mae:1.11177	test-mae:1.11582
[8010]	train-mae:1.11172	test-mae:1.11577
[8020]	train-mae:1.11122	test-mae:1.11527
[8030]	train-mae:1.11112	test-mae:1.11516
[8040]	train-mae:1.11093	test-mae:1.11498
[8050]	train-mae:1.11063	test-mae:1.11468
[8060]	train-mae:1.11029	test-mae:1.11434
[8070]	train-mae:1.10973	test-mae:1.11381
[8080]	train-mae:1.10944	test-mae:1.11353
[8090]	train-mae:1.10918	test-mae:1.11

[9820]	train-mae:1.05883	test-mae:1.06311
[9830]	train-mae:1.05872	test-mae:1.063
[9840]	train-mae:1.05859	test-mae:1.06287
[9850]	train-mae:1.05849	test-mae:1.06277
[9860]	train-mae:1.0584	test-mae:1.06269
[9870]	train-mae:1.05831	test-mae:1.0626
[9880]	train-mae:1.05811	test-mae:1.06241
[9890]	train-mae:1.05796	test-mae:1.06226
[9900]	train-mae:1.05787	test-mae:1.06217
[9910]	train-mae:1.05766	test-mae:1.06196
[9920]	train-mae:1.05757	test-mae:1.06187
[9930]	train-mae:1.05746	test-mae:1.06175
[9940]	train-mae:1.05734	test-mae:1.06164
[9950]	train-mae:1.05698	test-mae:1.06129
[9960]	train-mae:1.05669	test-mae:1.061
[9970]	train-mae:1.05648	test-mae:1.06079
[9980]	train-mae:1.05603	test-mae:1.06034
[9990]	train-mae:1.05567	test-mae:1.05998
[9999]	train-mae:1.05551	test-mae:1.05981
RMSE: 1.338080


In [25]:
# NOTE(review): this cell does not run as written. Its recorded output is
# "AttributeError: 'Booster' object has no attribute 'evals_result'".
# Further mismatches visible in this notebook:
#   * `pyplot` is not imported in any visible cell;
#   * the training cell names its eval sets 'train' / 'test' and tracks 'mae',
#     so the 'validation_0' / 'validation_1' keys and the 'error' / 'logloss'
#     metrics used below would not exist even if the history were available.
# Presumably the history has to be captured at training time (e.g. through an
# `evals_result` dict handed to xgb.train) - confirm against the XGBoost docs.

# retrieve performance metrics
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss (one curve per eval set)
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

# plot classification error (one curve per eval set)
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

AttributeError: 'Booster' object has no attribute 'evals_result'

In [26]:
# Score every row of the prediction template with the trained booster.
template_matrix = xgb.DMatrix(temp)
prediction = model.predict(template_matrix)

In [27]:
# Display the raw model outputs; the expm1 back-transform is applied in a later cell.
prediction

array([12.386752 , 12.309543 , 11.206735 , ...,  6.304988 ,  6.0002875,
        6.304988 ], dtype=float32)

In [29]:
# Undo the log1p transform of the target.  The booster returns float32, which
# carries only ~7 significant digits; casting to float64 first keeps the
# back-transformed amounts and their per-group sums from being truncated.
# NOTE: this cell rebinds `prediction` and `temp`, so it must not be re-run on
# its own without re-running the prediction cell first.
prediction = np.expm1(prediction.astype(np.float64))
temp['AMT'] = np.round(prediction, 0)

# Rebuild the YYYYMM period key expected by the submission file.
temp['REG_YYMM'] = temp['year'] * 100 + temp['month']

# Sum the row-level amounts up to (period, card region, business category).
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [30]:
# Map the integer codes of the two remaining categorical keys back to their
# original labels using the encoders fitted earlier.
for label_column in ('CARD_SIDO_NM', 'STD_CLSS_NM'):
    temp[label_column] = encoders[label_column].inverse_transform(temp[label_column])

In [31]:
# Fill the submission template: replace its AMT column with the aggregated
# predictions, matched on period, card region and business category.  The left
# join keeps every template row even when no prediction matches it.
merge_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
submission = (
    pd.read_csv('../../Data/Jeju/submission.csv', index_col=0)
    .drop(columns=['AMT'])
    .merge(temp, on=merge_keys, how='left')
)
submission.index.name = 'id'
submission.to_csv('submission/xgboost_tweedie.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,205519300.0
1,202004,강원,골프장 운영업,1753121000.0
2,202004,강원,과실 및 채소 소매업,1096827000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,43618560.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,11119950.0
