In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_files:
    print(path)

/kaggle/input/jeju-dataset/sample_submission.csv
/kaggle/input/jeju-dataset/train.csv
/kaggle/input/jeju-dataset/test.csv
/kaggle/input/jeju-dataset/international_trade.csv


In [2]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn import tree
# from catboost import CatBoostRegressor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.model_selection import KFold, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer

In [3]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Location of the competition files and the seed reused by the model cell.
DATA_PATH = '/kaggle/input/jeju-dataset/'
SEED = 42

# Load the four CSV files in one pass; the order of names matches the targets.
train, test, submission, trade = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in (
        'train.csv',
        'test.csv',
        'sample_submission.csv',
        'international_trade.csv',
    )
)

In [4]:

# Series identifier = first six characters of ID (e.g. 'TG_A_J' from
# 'TG_A_J_20190101'). The vectorized .str accessor avoids a Python-level
# lambda per row and propagates missing IDs as NaN instead of raising.
train['name'] = train['ID'].str[:6]
test['name'] = test['ID'].str[:6]

In [5]:
# Give the supply/price columns ASCII names; columns absent from a frame are
# simply skipped by rename.
train, test = (
    frame.rename(columns={'supply(kg)': 'supply_kg', 'price(원/kg)': 'price'})
    for frame in (train, test)
)


## 로그 변환 1 (price, supply)

In [6]:
# price_original = train['price']
# supply_original = train['supply_kg']

# train['price'] = np.log(train['price'])
# train['supply_kg'] = np.log(train['supply_kg'])

# train.replace([np.inf, -np.inf], 0, inplace=True)

# train.info()

## 시간 Feature

In [7]:
# Parse the timestamp column into datetime64 so the .dt accessor can be used.
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])


In [8]:
# Calendar features derived from the timestamp (train).
train = train.assign(
    year=train['timestamp'].dt.year,
    month=train['timestamp'].dt.month,
    day=train['timestamp'].dt.day,  # planned to be dropped later
    day_of_week=train['timestamp'].dt.dayofweek,
    day_of_year=train['timestamp'].dt.dayofyear,
    week_of_year=train['timestamp'].dt.isocalendar().week,  # ISO week number
)
# train['is_weekend'] = train.day_of_week.apply(lambda x: 1 if x >= 5 else 0)
# train['is_sunday'] = train.day_of_week.apply(lambda x: 1 if x == 6 else 0)

import math

# 연중일 주기 
def cos_year(date):
    """Return the cosine component of the day-of-year cycle for `date`.

    `date` must expose `timetuple()` (datetime.date, datetime.datetime,
    pandas.Timestamp). The cycle length is fixed at 365 days, so day 366 of
    a leap year maps slightly past one full turn.
    """
    days_per_cycle = 365
    ordinal_day = date.timetuple().tm_yday  # 1-based day of the year
    return math.cos(math.tau * ordinal_day / days_per_cycle)

def sin_year(date):
    """Return the sine component of the day-of-year cycle for `date`.

    `date` must expose `timetuple()` (datetime.date, datetime.datetime,
    pandas.Timestamp). The cycle length is fixed at 365 days, so day 366 of
    a leap year maps slightly past one full turn.
    """
    days_per_cycle = 365
    ordinal_day = date.timetuple().tm_yday  # 1-based day of the year
    return math.sin(math.tau * ordinal_day / days_per_cycle)

# Encode the position within the year as a point on the unit circle (train).
train['cos_year'] = train['timestamp'].map(cos_year)
train['sin_year'] = train['timestamp'].map(sin_year)

# 요일 주기 (주간)
def cos_weekday(date):
    """Return the cosine component of the weekly cycle for `date`.

    Uses `date.weekday()` (Monday = 0 ... Sunday = 6) over a 7-day period.
    """
    days_per_week = 7
    weekday_index = date.weekday()
    return math.cos(math.tau * weekday_index / days_per_week)

def sin_weekday(date):
    """Return the sine component of the weekly cycle for `date`.

    Uses `date.weekday()` (Monday = 0 ... Sunday = 6) over a 7-day period.
    """
    days_per_week = 7
    weekday_index = date.weekday()
    return math.sin(math.tau * weekday_index / days_per_week)

# Encode the day of the week as a point on the unit circle (train).
train['cos_dow'] = train['timestamp'].map(cos_weekday)
train['sin_dow'] = train['timestamp'].map(sin_weekday)

# 연중주 주기 
def cos_week_of_year(date):
    """Return the cosine component of the week-of-year cycle for `date`.

    The week is the ISO week number from `date.isocalendar()`. The period is
    fixed at 52, so ISO week 53 maps slightly past one full turn.
    """
    weeks_per_cycle = 52
    _, iso_week, _ = date.isocalendar()
    return math.cos(math.tau * iso_week / weeks_per_cycle)

def sin_week_of_year(date):
    """Return the sine component of the week-of-year cycle for `date`.

    The week is the ISO week number from `date.isocalendar()`. The period is
    fixed at 52, so ISO week 53 maps slightly past one full turn.
    """
    weeks_per_cycle = 52
    _, iso_week, _ = date.isocalendar()
    return math.sin(math.tau * iso_week / weeks_per_cycle)

# Encode the ISO week of the year as a point on the unit circle (train).
train['cos_week_of_year'] = train['timestamp'].map(cos_week_of_year)
train['sin_week_of_year'] = train['timestamp'].map(sin_week_of_year)

In [9]:
# Calendar features derived from the timestamp (test), mirroring the train cell.
test = test.assign(
    year=test['timestamp'].dt.year,
    month=test['timestamp'].dt.month,
    day=test['timestamp'].dt.day,  # planned to be dropped later
    day_of_week=test['timestamp'].dt.dayofweek,
    day_of_year=test['timestamp'].dt.dayofyear,
    week_of_year=test['timestamp'].dt.isocalendar().week,  # ISO week number
)
# test['is_weekend'] = test.day_of_week.apply(lambda x: 1 if x >= 5 else 0)
# test['is_sunday'] = test.day_of_week.apply(lambda x: 1 if x == 6 else 0)

import math

# Cyclical encodings for the test set.
#
# The encoders (cos_year, sin_year, cos_weekday, sin_weekday,
# cos_week_of_year, sin_week_of_year) are already defined in the train
# feature cell above. They are reused here rather than redefined, so there is
# a single definition and train/test features cannot drift apart.

# Day-of-year cycle
test['cos_year'] = test['timestamp'].apply(cos_year)
test['sin_year'] = test['timestamp'].apply(sin_year)

# Day-of-week cycle
test['cos_dow'] = test['timestamp'].apply(cos_weekday)
test['sin_dow'] = test['timestamp'].apply(sin_weekday)

# ISO week-of-year cycle
test['cos_week_of_year'] = test['timestamp'].apply(cos_week_of_year)
test['sin_week_of_year'] = test['timestamp'].apply(sin_week_of_year)

In [10]:

# Add group-level aggregate features, computed on train, to both train and test.
#
# Each aggregate column is named explicitly before merging. Relying on merge
# suffixes only renames a column when the left frame already has one of the
# same name, which leaves train and test with different column names for the
# same feature. With explicit names both frames get identical columns, and the
# later rename of `price` / `supply_kg` on test_ft becomes a harmless no-op.
#
# NOTE(review): the price statistics are computed from train's own target and
# merged back onto the same rows, so on train they include each row's own
# price. Confirm this in-sample leakage is acceptable.

def _group_stat(frame, keys, value_col, agg, out_col):
    """Aggregate `value_col` of `frame` per `keys` group into a lookup table.

    Parameters
    ----------
    frame : pandas.DataFrame providing the key columns and `value_col`.
    keys : list of column names to group by.
    value_col : name of the column to aggregate.
    agg : aggregation understood by pandas (e.g. 'mean', 'median', 'std', 'max').
    out_col : name given to the aggregated column.

    Returns
    -------
    pandas.DataFrame with the key columns plus one column named `out_col`.
    """
    return frame.groupby(keys)[value_col].agg(agg).rename(out_col).reset_index()


name_y_m_dow_keys = ['name', 'year', 'month', 'day_of_week']
corp_m_keys = ['corporation', 'month']

price_name_y_m_dow_avg = _group_stat(
    train, name_y_m_dow_keys, 'price', 'mean', 'price_name_y_m_dow_avg')
train = train.merge(price_name_y_m_dow_avg, on=name_y_m_dow_keys, how='left')
test = test.merge(price_name_y_m_dow_avg, on=name_y_m_dow_keys, how='left')

price_name_y_m_dow_median = _group_stat(
    train, name_y_m_dow_keys, 'price', 'median', 'price_name_y_m_dow_median')
train = train.merge(price_name_y_m_dow_median, on=name_y_m_dow_keys, how='left')
test = test.merge(price_name_y_m_dow_median, on=name_y_m_dow_keys, how='left')

supply_corp_m_std = _group_stat(
    train, corp_m_keys, 'supply_kg', 'std', 'supply_kg_corp_m_std')
train = train.merge(supply_corp_m_std, on=corp_m_keys, how='left')
test = test.merge(supply_corp_m_std, on=corp_m_keys, how='left')

supply_corp_m_max = _group_stat(
    train, corp_m_keys, 'supply_kg', 'max', 'supply_kg_corp_m_max')
train = train.merge(supply_corp_m_max, on=corp_m_keys, how='left')
test = test.merge(supply_corp_m_max, on=corp_m_keys, how='left')


In [11]:
# Re-read the raw trade table so the preprocessing below starts from a clean frame.
trade = pd.read_csv(DATA_PATH + 'international_trade.csv')

In [12]:
# Trade values are reported in thousands of USD; convert them to KRW using a
# fixed exchange rate of 1200 KRW per USD.
USD_PER_REPORTED_UNIT = 1000
KRW_PER_USD = 1200

trade['기간'] = pd.to_datetime(trade['기간'])
trade['year'] = trade['기간'].dt.year
trade['month'] = trade['기간'].dt.month

trade['수출 금액'] = trade['수출 금액'] * USD_PER_REPORTED_UNIT * KRW_PER_USD
trade['수입 금액'] = trade['수입 금액'] * USD_PER_REPORTED_UNIT * KRW_PER_USD

# Trade balance and per-weight unit prices.
trade['무역수지'] = trade['수출 금액'] - trade['수입 금액']
trade['수출 단가'] = trade['수출 금액'] / trade['수출 중량']
trade['수입 단가'] = trade['수입 금액'] / trade['수입 중량']

# A zero weight yields NaN (0/0) or +/-inf (x/0). fillna() only catches NaN,
# so convert the infinities to NaN first; both cases then become 0.
unit_price_cols = ['수출 단가', '수입 단가']
trade[unit_price_cols] = trade[unit_price_cols].replace([np.inf, -np.inf], np.nan)
trade = trade.fillna(0)

# Positional rename to English names; this relies on the column order built above.
trade.columns = ['timestamp','item','exp_supply','exp_revenue','imp_supply','imp_revenue','trade_balance','year','month','exp_price','imp_price']

In [13]:
# Keep only the trade rows for the five target crops (this version excludes
# the '방울다다기 양배추' item) and average them per item and calendar month.
item_codes = {'꽃양배추와 브로콜리(broccoli)':'BC',
              '양배추':'CB',
              '당근':'CR',
              '감귤':'TG',
              '순무':'RD'}

is_target_item = trade['item'].str.contains('감귤|브로콜리|순무|당근|양배추')
trade_items = trade.loc[is_target_item].replace(item_codes)

# The substring filter above also matches '방울다다기 양배추'; remove it.
trade_items = trade_items.loc[trade_items['item'] != '방울다다기 양배추']
trade_items = trade_items.drop(columns='timestamp').reset_index(drop=True)

# pivot_table defaults to the mean of every numeric column per (item, month).
trade_items_month = (
    pd.pivot_table(trade_items, index=['item', 'month'], fill_value=0)
    .reset_index()
    .round(1)
)

# trade_items = pd.pivot_table(trade_items, index=['item','year','month'],fill_value=0).reset_index()
# trade_items = trade_items.round(1)

# * 평균이랑 + max-min평균

In [14]:
# Preview the first rows only instead of rendering the whole table.
trade_items_month.head(10)

Unnamed: 0,item,month,exp_price,exp_revenue,exp_supply,imp_price,imp_revenue,imp_supply,trade_balance,year
0,BC,1,9000.0,480000,46.4,1307.3,902160000,682774.2,-901680000,2021.0
1,BC,2,1057.7,1200000,423.2,1277.8,519360000,403745.2,-518160000,2021.0
2,BC,3,24000.0,1200000,45.5,1331.6,983400000,753128.5,-982200000,2020.5
3,BC,4,3571.4,300000,33.5,1503.9,1327200000,890748.0,-1326900000,2020.5
4,BC,5,18750.0,300000,18.0,1422.4,1027500000,722724.2,-1027200000,2020.5
5,BC,6,0.0,0,12.0,1342.4,682800000,513319.5,-682800000,2020.5
6,BC,7,0.0,0,20.0,1391.0,892800000,642013.2,-892800000,2020.5
7,BC,8,0.0,0,8.2,1441.8,1760100000,1224899.8,-1760100000,2020.5
8,BC,9,0.0,0,6.0,1427.8,2012400000,1407729.0,-2012400000,2020.5
9,BC,10,0.0,0,6.0,1376.7,1297200000,971622.5,-1297200000,2020.5


In [15]:
from numpy.core.numeric import tensordot
# 버전 1 ) item, year, month 기준 채우고 3월 값들은 0으로 대체

# train = pd.merge(train,trade_items,on=['item','year','month'],how='left')
# test = pd.merge(test,trade_items,on=['item','year','month'],how='left')

# # 버전 2 ) item, month 기준 채우기
# The averaged `year` column is meaningless for an (item, month) lookup table.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError once the column is already gone.
trade_items_month = trade_items_month.drop(columns='year', errors='ignore')



## 로그변환 2 - trade 

In [16]:
# trade_items_month['exp_price'] = np.log(trade_items_month['exp_price'])
# trade_items_month['exp_revenue'] = np.log(trade_items_month['exp_revenue'])
# trade_items_month['exp_supply'] = np.log(trade_items_month['exp_supply'])
# trade_items_month['imp_price'] = np.log(trade_items_month['imp_price'])
# trade_items_month['imp_revenue'] = np.log(trade_items_month['imp_revenue'])
# trade_items_month['imp_supply'] = np.log(trade_items_month['imp_supply'])
# trade_items_month['trade_balance'] = trade_items_month['exp_revenue'] - trade_items_month['imp_revenue']
# trade_items_month

# trade_items_month.replace([np.inf, -np.inf], 0, inplace=True)
# trade_items_month.fillna(0, inplace=True)
# trade_items_month

In [17]:
# Attach the monthly trade statistics to every row with the same item and month.
train = train.merge(trade_items_month, on=['item', 'month'], how='left')
test = test.merge(trade_items_month, on=['item', 'month'], how='left')

In [18]:
# Shapes of train and test after merging the trade features.
tuple(frame.shape for frame in (train, test))

((59397, 31), (1092, 29))

In [19]:
# Missing values are currently filled with 0.
# TODO: consider linear interpolation instead -- check DataFrame.interpolate().
train, test = (frame.fillna(0) for frame in (train, test))

## train_ft, test_ft 생성

In [20]:
# Work on copies so the encoded feature frames do not modify train/test.
train_ft, test_ft = train.copy(), test.copy()

tuple(frame.shape for frame in (train_ft, test_ft))

((59397, 31), (1092, 29))

In [21]:
# cat_train = train_ft[['item']]

# cat_train_encoded = pd.get_dummies(cat_train)
# cat_train_encoded = cat_train_encoded.applymap(lambda x: 1 if x != 0 else 0)

# cat_test = test_ft[['item']]

# cat_test_encoded = pd.get_dummies(cat_test)
# cat_test_encoded = cat_test_encoded.applymap(lambda x: 1 if x != 0 else 0)


# train_ft = pd.concat([train_ft,cat_train_encoded],axis=1)
# test_ft = pd.concat([test_ft,cat_test_encoded],axis=1)

In [22]:
from sklearn.preprocessing import LabelEncoder

# Only referenced by the commented-out `name` encoding below.
label_encoder = LabelEncoder()

# train_ft['name'] = label_encoder.fit_transform(train_ft['name'])
# test_ft['name'] = label_encoder.transform(test_ft['name'])

# Fixed integer codes for the categorical columns, defined once and applied
# identically to both frames. Values missing from a mapping are left as-is.
category_codes = {
    'item': {'TG':0,'CR':1,'CB':2,'RD':3,'BC':4},
    'corporation': {'A':0,'B':1,'C':2,'D':3,'E':4,'F':5},
    'location': {'J':0,'S':1},
}

for frame in (train_ft, test_ft):
    for column, codes in category_codes.items():
        frame[column] = frame[column].replace(codes)

In [23]:
# Distinct item codes present in the (un-encoded) train frame.
train['item'].unique()

array(['TG', 'CR', 'CB', 'RD', 'BC'], dtype=object)

In [24]:
# Distinct corporation codes present in the (un-encoded) train frame.
train['corporation'].unique()

array(['A', 'B', 'C', 'D', 'E', 'F'], dtype=object)

In [25]:


# Give the test aggregates the same names they have in train.
# Presumably test has no raw price / supply_kg columns, so the merged group
# statistics kept those bare names there -- verify against the merge cell.
test_ft = test_ft.rename(
    columns={
        'price': 'price_name_y_m_dow_avg',
        'supply_kg': 'supply_kg_corp_m_std',
    }
)

tuple(frame.shape for frame in (train_ft, test_ft))

((59397, 31), (1092, 29))

In [26]:
# Inspect the encoded train feature frame.
train_ft

Unnamed: 0,ID,timestamp,item,corporation,location,supply_kg,price,name,year,month,...,price_name_y_m_dow_median,supply_kg_corp_m_std,supply_kg_corp_m_max,exp_price,exp_revenue,exp_supply,imp_price,imp_revenue,imp_supply,trade_balance
0,TG_A_J_20190101,2019-01-01,0,0,0,0.0,0.0,TG_A_J,2019,1,...,1456.0,31856.341911,242656.0,2691.9,380880000.0,192621.6,0.0,0.0,0.2,380880000.0
1,TG_A_J_20190102,2019-01-02,0,0,0,0.0,0.0,TG_A_J,2019,1,...,1470.0,31856.341911,242656.0,2691.9,380880000.0,192621.6,0.0,0.0,0.2,380880000.0
2,TG_A_J_20190103,2019-01-03,0,0,0,60601.0,1728.0,TG_A_J,2019,1,...,2148.0,31856.341911,242656.0,2691.9,380880000.0,192621.6,0.0,0.0,0.2,380880000.0
3,TG_A_J_20190104,2019-01-04,0,0,0,25000.0,1408.0,TG_A_J,2019,1,...,1477.0,31856.341911,242656.0,2691.9,380880000.0,192621.6,0.0,0.0,0.2,380880000.0
4,TG_A_J_20190105,2019-01-05,0,0,0,32352.0,1250.0,TG_A_J,2019,1,...,1352.0,31856.341911,242656.0,2691.9,380880000.0,192621.6,0.0,0.0,0.2,380880000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,3,5,0,452440.0,468.0,RD_F_J,2023,2,...,500.5,185192.615088,827560.0,1200.0,4800000.0,4000.0,0.0,0.0,2.0,4800000.0
59393,RD_F_J_20230228,2023-02-28,3,5,0,421980.0,531.0,RD_F_J,2023,2,...,503.0,185192.615088,827560.0,1200.0,4800000.0,4000.0,0.0,0.0,2.0,4800000.0
59394,RD_F_J_20230301,2023-03-01,3,5,0,382980.0,574.0,RD_F_J,2023,3,...,574.0,172162.635251,736920.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59395,RD_F_J_20230302,2023-03-02,3,5,0,477220.0,523.0,RD_F_J,2023,3,...,523.0,172162.635251,736920.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Columns that are identifiers rather than features.
drop_cols = ['ID']

# errors='ignore' keeps the cell re-runnable: dropping again after the columns
# are already gone no longer raises KeyError.
train_ft = train_ft.drop(columns=drop_cols, errors='ignore')
test_ft = test_ft.drop(columns=drop_cols, errors='ignore')

In [28]:
# Rename the series identifier column from `name` to `item_id`.
train_ft, test_ft = (
    frame.rename(columns={'name':'item_id'})
    for frame in (train_ft, test_ft)
)

# autogluon

In [29]:
!pip install autogluon

Collecting autogluon
  Obtaining dependency information for autogluon from https://files.pythonhosted.org/packages/2e/42/f26592ecf3dc4e4edcbc5f7f7e3deba25b0681fad2ad04a82d12af17e3a4/autogluon-0.8.2-py3-none-any.whl.metadata
  Downloading autogluon-0.8.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Obtaining dependency information for autogluon.core[all]==0.8.2 from https://files.pythonhosted.org/packages/e0/56/545adb1d388e78591cd7e36de0c8b889c1944de362bdaeec0f31d01890df/autogluon.core-0.8.2-py3-none-any.whl.metadata
  Downloading autogluon.core-0.8.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==0.8.2 (from autogluon)
  Obtaining dependency information for autogluon.features==0.8.2 from https://files.pythonhosted.org/packages/bb/ea/7892719f78a30aee1bf42c4a0540fbae98bfbdf56b85fab79ffc437eb687/autogluon.features-0.8.2-py3-none-any.whl.metadata
  Downloading autogluon.features-0.8.2-py3-none-any.whl.metadata (11 kB)
Collecti

In [None]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor


# Wrap the feature frame for AutoGluon; train_ft carries `item_id` and
# `timestamp` columns, and the recorded log reports 39 daily series.
data = TimeSeriesDataFrame(train_ft)
# Forecast `price` 28 steps ahead, scored with RMSE.
# Per the recorded log, the remaining columns are treated as past covariates.
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price",
    eval_metric="RMSE"
)
# Fix the random seed for reproducibility.
predictor.fit(data, random_seed=SEED)

No path specified. Models will be saved in: "AutogluonModels/ag-20231112_124457/"
TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
AutoGluon will save models to AutogluonModels/ag-20231112_124457/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price'
	past covariates:  ['item', 'corporation', 'location', 'supply_kg', 'year', '

In [None]:
# NOTE(review): presumably retrains the fitted models on all of the data,
# including the validation window -- confirm against the installed AutoGluon version.
predictor.refit_full()

In [None]:
# Generate the forecast for the steps following the training data. Without
# this call `pred` is undefined and the cell fails with NameError on a fresh run.
forecast = predictor.predict(data)

# Keep only the point forecast ('mean') as a flat, 0..N-1 indexed Series.
# NOTE(review): the submission cell assigns these values by position, which
# assumes the forecast rows are in the same (series, date) order as
# sample_submission.csv -- verify before submitting.
pred = forecast.reset_index()['mean']
pred

In [None]:
# Prices cannot be negative; raise any negative forecast to 0.
pred = pred.clip(lower=0)

In [None]:
# Write the forecasts into the submission template (aligned on the row index).
submission = submission.assign(answer=pred)
submission

In [None]:
# Save the submission file without the index column.
submission.to_csv(path_or_buf='submission23_1112.csv', index=False)