In [5]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

## Data Cleansing & Pre-Processing

In [6]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [7]:
world_gdp = pd.read_csv("data/world_gdp.csv")
trade_amount = pd.read_csv("data/Trade in goods and services forecast.csv")

In [8]:
# date
data = pd.read_csv('../../Data/Jeju/201901-202003.csv')
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))

In [9]:
data.REG_YYMM = pd.to_datetime(data.REG_YYMM, format='%Y%m')

In [10]:
world_gdp['Unnamed: 0'] = pd.to_datetime(world_gdp['Unnamed: 0'])
trade_amount['Unnamed: 0'] = pd.to_datetime(trade_amount['Unnamed: 0'])

In [11]:
data = data.drop(['REG_YYMM'], axis=1)

In [57]:
# 데이터 정제
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

# 카테고리 별로 묶어서 AMT를 각각 예측
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = df.groupby(columns).sum().reset_index(drop=False)

In [58]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [None]:
df.info()

## Exploratory Data Analysis

## Feature Engineering & Initial Modeling

In [59]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

In [60]:
train_features['kor_gdp'] = 0
train_features['trade_amount'] = 0

for date, i in zip(world_gdp['Unnamed: 0'], range(len(world_gdp))):
    train_features.loc[(train_features['year'] == date.year) & (train_features['month'] == date.month), 'kor_gdp'] = world_gdp.iloc[i].KOR_gdp
    
for date, i in zip(trade_amount['Unnamed: 0'], range(len(trade_amount))):
    train_features.loc[(train_features['year'] == date.year) & (train_features['month'] == date.month), 'trade_amount'] = trade_amount.iloc[i].Value
    

In [61]:
train_features.trade_amount = np.log1p(train_features['trade_amount'])

## Model Tuning & Evaluation

In [None]:
# Random Forest
model = RandomForestRegressor(n_jobs=-1, random_state=0)
model.fit(train_features, train_target)

In [66]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month])
temp = np.array(temp)

In [67]:
temp = pd.DataFrame(data=temp, columns=['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 
                                        'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month'])

In [68]:
temp['kor_gdp'] = 0
temp['trade_amount'] = 0

for date, i in zip(world_gdp['Unnamed: 0'], range(len(world_gdp))):
    temp.loc[(temp['year'] == date.year) & (temp['month'] == date.month), 'kor_gdp'] = world_gdp.iloc[i].KOR_gdp
    
for date, i in zip(trade_amount['Unnamed: 0'], range(len(trade_amount))):
    temp.loc[(temp['year'] == date.year) & (temp['month'] == date.month), 'trade_amount'] = np.log1p(trade_amount.iloc[i].Value)
    

In [None]:
temp_temp = temp.copy()

In [111]:
# 예측(Random Forest)
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [112]:
# 디코딩 (Random Forest)
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [113]:
# 제출 파일 만들기(Random Forest)
submission = pd.read_csv('../../Data/Jeju/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission.to_csv('submission/kor_gdp_trade_amount_RF.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,474945900.0
1,202004,강원,골프장 운영업,5737034000.0
2,202004,강원,과실 및 채소 소매업,1879838000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,103665800.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,100021200.0


In [35]:
train_features

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,kor_gdp,trade_amount
149245,2,4,2,2,1,2,2019,4,4.118698,27.141631
554956,8,26,11,1,1,2,2020,2,-5.029080,27.121861
918516,14,33,14,4,2,4,2019,5,4.118698,27.141631
425414,7,18,8,2,2,1,2019,10,5.350648,27.157812
640949,9,39,16,5,2,5,2020,1,-5.029080,27.121861
...,...,...,...,...,...,...,...,...,...,...
359783,6,16,9,1,1,1,2019,12,5.350648,27.157812
152315,2,7,11,3,2,3,2020,3,-5.029080,27.121861
963395,15,19,9,5,1,5,2020,2,-5.029080,27.121861
117952,1,31,4,1,1,2,2019,7,1.510299,27.154597


In [37]:
train_target

149245    12.524530
554956    11.399768
918516    20.495010
425414    14.850157
640949    12.762830
            ...    
359783    15.138871
152315    12.345448
963395    13.691082
117952    11.584697
305711    12.944458
Name: AMT, Length: 1057394, dtype: float64

In [69]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [91]:
k = int(len(train_features)*0.8)

In [92]:
"""x_train = train_features[:k]
y_train = train_target[:k]
x_val = train_features[k:]
y_val = train_target[k:]"""

In [98]:
 x_train, x_val, y_train, y_val = train_test_split(train_features, train_target, random_state=42)

In [99]:
train_ds = lgb.Dataset(x_train, label=y_train)
val_ds = lgb.Dataset(x_val, label=y_val)

In [100]:
params = {
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [102]:
model = lgb.train(params,
                  train_ds,
                  5000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.97669
[200]	valid_0's rmse: 1.80236
[300]	valid_0's rmse: 1.67421
[400]	valid_0's rmse: 1.59336
[500]	valid_0's rmse: 1.50983
[600]	valid_0's rmse: 1.45913
[700]	valid_0's rmse: 1.41036
[800]	valid_0's rmse: 1.35388
[900]	valid_0's rmse: 1.31624
[1000]	valid_0's rmse: 1.27966
[1100]	valid_0's rmse: 1.25016
[1200]	valid_0's rmse: 1.22295
[1300]	valid_0's rmse: 1.20152
[1400]	valid_0's rmse: 1.17102
[1500]	valid_0's rmse: 1.14961
[1600]	valid_0's rmse: 1.12764
[1700]	valid_0's rmse: 1.11034
[1800]	valid_0's rmse: 1.09335
[1900]	valid_0's rmse: 1.07771
[2000]	valid_0's rmse: 1.0628
[2100]	valid_0's rmse: 1.04588
[2200]	valid_0's rmse: 1.03221
[2300]	valid_0's rmse: 1.01892
[2400]	valid_0's rmse: 1.00703
[2500]	valid_0's rmse: 0.996135
[2600]	valid_0's rmse: 0.984182
[2700]	valid_0's rmse: 0.975801
[2800]	valid_0's rmse: 0.966017
[2900]	valid_0's rmse: 0.957454
[3000]	valid_0's rmse: 0.949711
[3100]	valid

In [114]:
ax = lgb.plot_metric(train_ds, metric='l2')
plt.show()

TypeError: booster must be dict or LGBMModel.

In [106]:
temp = temp_temp

NameError: name 'temp_temp' is not defined

In [108]:
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [109]:
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [111]:

submission = pd.read_csv('../../Data/Jeju/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission.to_csv('submission/lightgbm_korea_gdp_trade_amount.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,178912800.0
1,202004,강원,골프장 운영업,3430744000.0
2,202004,강원,과실 및 채소 소매업,1350315000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,32225660.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,8352531.0
