In [124]:
from time import time
from joblib import dump, load

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [117]:
# Load the refined train/validation splits and the feature-ranking table
# (the XAI result file) from the local data directory.
train, valid, feature_rankings = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_refined.csv',
        './data/valid_refined.csv',
        './data/xai_result.csv',
    )
)

1.110095739364624


In [4]:
# Inspect the ranking table. Per the rendered output it carries, for each feature,
# an `impurity` value with its rank (`score`) and a `lime` value with its rank
# (`score.1`); `total` appears to be the sum of the two ranks (e.g. 2 + 1 = 3) and
# the rows look sorted by it in ascending order — TODO confirm against the XAI step.
feature_rankings

Unnamed: 0,feature,impurity,score,feature.1,lime,score.1,total
0,Promo,0.1367,2,Promo,855.8936,1,3
1,DayOfWeek_1,0.0335,5,DayOfWeek_1,222.7965,2,7
2,CompetitionOpenElapsedDays,0.1302,3,CompetitionOpenElapsedDays,68.0728,7,10
3,Promo2ElapsedDays,0.0413,4,Promo2ElapsedDays,74.806,6,10
4,CompetitionDistance,0.3928,1,CompetitionDistance,54.9832,10,11
5,Month_12,0.0223,7,Month_12,108.2533,4,11
6,Assortment_c,0.0112,14,Assortment_c,117.1192,3,17
7,Assortment_a,0.0107,15,Assortment_a,102.2983,5,20
8,"PromoInterval_Mar,Jun,Sept,Dec",0.0157,11,"PromoInterval_Mar,Jun,Sept,Dec",59.333,9,20
9,PromoInterval_0,0.0206,8,PromoInterval_0,20.9589,17,25


In [5]:
# Feature names in ranking order (taken from the `feature` column of the ranking table).
features = feature_rankings['feature']

In [45]:
# Take every ranked feature name except the final one, as a plain Python list.
# NOTE(review): this slice leaves out the last entry of the ranking — confirm that
# excluding it here is intentional.
feature_set = features.iloc[:-1].tolist()

In [46]:
# Append the label column last, so that feature_set[:-1] are the model inputs and
# feature_set[-1] is the target.
# Guarded so that re-running this cell cannot append "Sales" a second time: a
# duplicate would push the label into feature_set[:-1] and leak the target into
# the feature matrix.
if "Sales" not in feature_set:
    feature_set.append("Sales")

In [47]:
# Display the assembled column list: the selected feature names followed by the
# "Sales" label as the final element.
feature_set

['Promo',
 'DayOfWeek_1',
 'CompetitionOpenElapsedDays',
 'Promo2ElapsedDays',
 'CompetitionDistance',
 'Month_12',
 'Assortment_c',
 'Assortment_a',
 'PromoInterval_Mar,Jun,Sept,Dec',
 'PromoInterval_0',
 'DayOfWeek_5',
 'SchoolHoliday',
 'DayOfWeek_6',
 'StoreType_c',
 'Year_2014',
 'Year_2015',
 'StoreType_b',
 'Month_6',
 'StoreType_a',
 'Year_2013',
 'Month_4',
 'Month_3',
 'Month_5',
 'Month_8',
 'PromoInterval_Feb,May,Aug,Nov',
 'StateHoliday_0',
 'StoreType_d',
 'Month_10',
 'DayOfWeek_2',
 'DayOfWeek_3',
 'Month_1',
 'Month_7',
 'StateHoliday_b',
 'DayOfWeek_4',
 'DayOfWeek_7',
 'Month_2',
 'Month_9',
 'Month_11',
 'StateHoliday_a',
 'Assortment_b',
 'Sales']

In [50]:
# Prepare the training data: every name but the last is an input column, the last
# name is the label. Convert both splits to NumPy arrays.
*feature_cols, label_col = feature_set
x_train, y_train = train[feature_cols].to_numpy(), train[label_col].to_numpy()
x_valid, y_valid = valid[feature_cols].to_numpy(), valid[label_col].to_numpy()

### RFE(Recursive Feature Elimination) 특징 선택
+ 전체 특징 집합에서 시작해서 특징 순위가 가장 낮은 것부터 순차적으로 제거 및 성능 측정


In [54]:
# Initialise the random-forest regressor (100 bootstrapped trees, fixed seed).
# `criterion` is intentionally left at its default, which is the squared-error
# criterion: the old spelling 'mse' was deprecated in scikit-learn 1.0 and removed
# in 1.2, while the default selects the same criterion on old and new releases.
rfr = RandomForestRegressor(
    n_estimators=100,
    random_state=15,
    bootstrap=True,
)

In [150]:
# Ranking-driven backward feature elimination.
#
# Starting from the complete ranked feature list, iteration i keeps the top
# (len(features) - i) features, trains a fresh random forest on the training
# split, and records the fit time and the validation MAE. The accumulated
# results are written to a CSV after every iteration, so partial results survive
# an interrupted run.

fs_size_lst = []
exec_time_lst = []
mae_lst = []

n_ranked = len(features)
for i in range(n_ranked):
    print('[%dth fs]' % (i))
    # Feature selection: drop the i lowest entries of the ranking, label goes last.
    feature_set = list(features[:n_ranked - i])
    feature_set.append("Sales")
    # NOTE: the recorded size counts the "Sales" label as well (features + 1).
    # Kept as-is because the value is also used in the result file names.
    fs_size_lst.append(len(feature_set))
    print(len(feature_set))

    # Prepare the training data (inputs = all but the last name, label = last name).
    x_train = train[feature_set[:-1]].to_numpy()
    y_train = train[feature_set[-1]].to_numpy()
    x_valid = valid[feature_set[:-1]].to_numpy()
    y_valid = valid[feature_set[-1]].to_numpy()

    # Fresh model per feature subset. `criterion` is left at its default (squared
    # error): the old spelling 'mse' was deprecated in scikit-learn 1.0 and removed
    # in 1.2, while the default selects the same criterion on every release.
    rfr = RandomForestRegressor(
        n_estimators=100,
        random_state=15,
        bootstrap=True,
    )

    # Time only the fit call.
    start = time()
    rfr.fit(x_train, y_train)
    end = time()
    print('Training time = %f' % (end-start))
    exec_time_lst.append(end-start)

    # Validation MAE for this feature subset.
    y_hat = rfr.predict(x_valid)
    loss = mean_absolute_error(y_valid, y_hat)
    mae_lst.append(loss)
    print('Model loss = %f' % (loss))

#     # Save the model (currently disabled)
#     print('dump the model...(./data/' + 'rfr_model_fs_size_' + str(len(feature_set)) + '.csv)')
#     dump(rfr, './data/' + 'rfr_model_fs_size_' + str(len(feature_set)) + '.csv')

    # Save the results gathered so far; each file holds all iterations up to this one.
    df_result = pd.DataFrame({
        'fs_size': fs_size_lst,
        'exec_time': exec_time_lst,
        'mae': mae_lst,
    })
    df_result.to_csv('./data/' + 'exp_result_fs_size_' + str(len(feature_set)) + '.csv' )

[0th fs]
42
Training time = 302.114446
Model loss = 712.448258
dump the model...(./data/rfr_model_fs_size_42.csv)
[1th fs]
41
Training time = 339.183835
Model loss = 712.528562
dump the model...(./data/rfr_model_fs_size_41.csv)
[2th fs]
40
Training time = 331.770581
Model loss = 712.471850
dump the model...(./data/rfr_model_fs_size_40.csv)
[3th fs]
39
Training time = 315.939503
Model loss = 712.362637
dump the model...(./data/rfr_model_fs_size_39.csv)
[4th fs]
38
Training time = 304.002406
Model loss = 715.980223
dump the model...(./data/rfr_model_fs_size_38.csv)
[5th fs]
37
Training time = 295.678168
Model loss = 717.656929
dump the model...(./data/rfr_model_fs_size_37.csv)
[6th fs]
36
Training time = 290.710381
Model loss = 719.700113
dump the model...(./data/rfr_model_fs_size_36.csv)
[7th fs]
35
Training time = 281.493242
Model loss = 721.401326
dump the model...(./data/rfr_model_fs_size_35.csv)
[8th fs]
34
Training time = 278.033372
Model loss = 725.225015
dump the model...(./data/