# Library Import & Settings

In [1]:
import pandas as pd
import numpy as np
import lightgbm
from tqdm import tqdm
import warnings

In [2]:
# 경고 끄기
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')

# 전처리

## lag_feature 추가 및 기타 전처리

In [3]:
def preprocessing(temp_df, pum, len_lag) :
    # p_lag, q_lag 추가
    for lag in range(1,len_lag+1) :
      temp_df[f'p_lag_{lag}'] = -1
      temp_df[f'q_lag_{lag}'] = -1
      for index in range(lag, len(temp_df)) :
        temp_df.loc[index, f'p_lag_{lag}'] = temp_df[f'{pum}_가격(원/kg)'][index-lag] #1일전, 2일전, ... 가격을 feature로 추가
        temp_df.loc[index, f'q_lag_{lag}'] = temp_df[f'{pum}_거래량(kg)'][index-lag] #1일전, 2일전, ... 거래량을 feature로 추가

    # month 추가
    temp_df['date'] = pd.to_datetime(temp_df['date'])
    temp_df['month'] = temp_df['date'].dt.month

    # 예측 대상(1w,2w,4w) 추가
    for week in ['1_week','2_week','4_week'] :
      temp_df[week] = 0
      n_week = int(week[0])
      for index in range(len(temp_df)) :
        try : temp_df[week][index] = temp_df[f'{pum}_가격(원/kg)'][index+7*n_week]
        except : continue

    # 불필요한 column 제거        
    temp_df = temp_df.drop(['date',f'{pum}_거래량(kg)',f'{pum}_가격(원/kg)'], axis=1)
    
    return temp_df

In [6]:
raw_path = '/Users/cge/Documents/yeardream/프로젝트/기업프로젝트/products_price_predict/data/raw/'
processed_path = '/Users/cge/Documents/yeardream/프로젝트/기업프로젝트/products_price_predict/data/processed/'

tomato = pd.read_csv(processed_path+'tomato.csv')
grape = pd.read_csv(processed_path+'grape.csv')
shinemuscat = pd.read_csv(processed_path+'shinemuscat.csv')

train = pd.read_csv(raw_path+'train.csv')
test = pd.read_csv(raw_path+'test.csv')
private = pd.read_csv(raw_path+'private_data.csv')
submission = pd.read_csv(raw_path+'sample_submission.csv')


print(train.shape)
print(test.shape)
print(private.shape)
print(submission.shape)

(1733, 44)
(37, 44)
(291, 44)
(228, 22)


In [38]:
train = tomato.copy()

unique_kind = ['토마토']

In [32]:
# preprocessing 함수 예시
pum = '토마토'
temp_df = train[['date',f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']]
preprocessing(temp_df, pum, len_lag=28)

Unnamed: 0,p_lag_1,q_lag_1,p_lag_2,q_lag_2,p_lag_3,q_lag_3,p_lag_4,q_lag_4,p_lag_5,q_lag_5,...,p_lag_26,q_lag_26,p_lag_27,q_lag_27,p_lag_28,q_lag_28,month,1_week,2_week,4_week
0,-1,-1.0,-1,-1.0,-1,-1.0,-1,-1.0,-1,-1.0,...,-1,-1.0,-1,-1.0,-1,-1.0,1,2107,2436,2999
1,0,0.0,-1,-1.0,-1,-1.0,-1,-1.0,-1,-1.0,...,-1,-1.0,-1,-1.0,-1,-1.0,1,2111,2473,2966
2,1621,30950.0,0,0.0,-1,-1.0,-1,-1.0,-1,-1.0,...,-1,-1.0,-1,-1.0,-1,-1.0,1,0,0,0
3,0,0.0,1621,30950.0,0,0.0,-1,-1.0,-1,-1.0,...,-1,-1.0,-1,-1.0,-1,-1.0,1,2170,2513,3089
4,1834,291057.0,0,0.0,1621,30950.0,0,0.0,-1,-1.0,...,-1,-1.0,-1,-1.0,-1,-1.0,1,2281,2587,2892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1728,5529,88460.0,5536,108182.0,6004,228581.0,0,0.0,6299,169781.0,...,3140,333618.0,2664,240920.0,2268,280612.0,9,0,0,0
1729,5587,142487.0,5529,88460.0,5536,108182.0,6004,228581.0,0,0.0,...,0,0.0,3140,333618.0,2664,240920.0,9,0,0,0
1730,5596,152575.5,5587,142487.0,5529,88460.0,5536,108182.0,6004,228581.0,...,2970,430335.5,0,0.0,3140,333618.0,9,0,0,0
1731,5262,176807.0,5596,152575.5,5587,142487.0,5529,88460.0,5536,108182.0,...,3155,314110.0,2970,430335.5,0,0.0,9,0,0,0


# 학습

## metric 정의

In [33]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def rmse(week_answer, week_submission):
    answer = week_answer.to_numpy()
    target_idx = np.where(answer!=0)
    true = answer[target_idx]
    pred = week_submission[target_idx]
    score = np.sqrt(mean_squared_error(true, pred))
    
    return score


def at_rmse(pred, dataset):
    y_true = dataset.get_label()
    week_1_answer = y_true[0::3]
    week_2_answer = y_true[1::3]
    week_4_answer = y_true[2::3]
    
    week_1_submission = pred[0::3]
    week_2_submission = pred[1::3]
    week_4_submission = pred[2::3]
    
    score1 = rmse(week_1_answer, week_1_submission)
    score2 = rmse(week_2_answer, week_2_submission)
    score4 = rmse(week_4_answer, week_4_submission)
    
    score = (score1+score2+score4)/3
    
    return 'score', score, False

## 학습 정의

In [34]:
def model_train(x_train, y_train, x_valid, y_valid) :
    params = {'learning_rate': 0.01, 
              'max_depth': 6, 
              'boosting': 'gbdt', 
              'objective': 'regression',  
              'is_training_metric': True, 
              'num_leaves': 100, 
              'feature_fraction': 0.8, 
              'bagging_fraction': 0.8, 
              'bagging_freq': 5, 
              'seed':42,
              'num_threads':8
             }

    model = lightgbm.train(params, 
                   train_set = lightgbm.Dataset(data = x_train, label = y_train),
                   num_boost_round = 10000, 
                   valid_sets = lightgbm.Dataset(data = x_valid, label = y_valid), 
                   init_model = None, 
                   early_stopping_rounds = 100,
                   feval = at_rmse,
                   verbose_eval = False
                    )
    
    return model

In [None]:
model_train()

## 품목 및 품종별 모델 학습

In [39]:
model_dict = {}
split = 28 #validation

for pum in tqdm(unique_kind):
    # 품목 품종별 전처리
    temp_df = train[['date',f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']]
    temp_df = preprocessing(temp_df, pum, len_lag=28)
    
    # 주차별(1,2,4w) 학습
    for week_num in [1,2,4] :
        x = temp_df[temp_df[f'{week_num}_week']>0].iloc[:,:-3]
        y = temp_df[temp_df[f'{week_num}_week']>0][f'{week_num}_week']
        
        #train, test split
        x_train = x[:-split]
        y_train = y[:-split]
        x_valid = x[-split:]
        y_valid = y[-split:]
        
        model_dict[f'{pum}_model_{week_num}'] = model_train(x_train, y_train, x_valid, y_valid)

  0%|                                                               | 0/1 [00:02<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14293
[LightGBM] [Info] Number of data points in the train set: 1442, number of used features: 57
[LightGBM] [Info] Start training from score 2141.410541





AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

# 추론

In [68]:
public_date_list = submission[submission['예측대상일자'].str.contains('2020')]['예측대상일자'].str.split('+').str[0].unique()
# ['2020-09-29', ...]

for date in tqdm(public_date_list) :
    test = pd.read_csv(f'./data/public_data/test_files/test_{date}.csv')
    for pum in unique_pum + unique_kind:
        # 예측기준일에 대해 전처리
        temp_test = pd.DataFrame([{'date' : date}]) #예측기준일
        alldata = pd.concat([train, test, temp_test], sort=False).reset_index(drop=True)
        alldata = alldata[['date', f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']].fillna(0)
        alldata = alldata.iloc[-28:].reset_index(drop=True)
        alldata = preprocessing(alldata, pum, len_lag=28)
        temp_test = alldata.iloc[-1].astype(float)
        
        # 개별 모델을 활용하여 1,2,4주 후 가격 예측
        for week_num in [1,2,4] :
            temp_model = model_dict[f'{pum}_model_{week_num}']
            result = temp_model.predict(temp_test)
            condition = (submission['예측대상일자']==f'{date}+{week_num}week')
            idx = submission[condition].index
            submission.loc[idx, f'{pum}_가격(원/kg)'] = result[0]

100%|██████████| 38/38 [01:58<00:00,  3.13s/it]


In [69]:
submission.to_csv('baseline2_0920.csv',index=False)