# 더미변수 생성 및 예측



In [3]:
#필요한 패키지를 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

#jupyter에서 matplotlib을 사용해 만든 graph를 화면에 표시하기 위해 필요
%matplotlib inline 

# Use a Korean-capable font so Hangul text in plots renders correctly.
import os

from matplotlib import font_manager, rc

# The Malgun Gothic file below only exists on Windows. On any other machine
# keep matplotlib's current font instead of aborting the whole notebook on a
# missing file.
_KOREAN_FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(_KOREAN_FONT_PATH):
    font_name = font_manager.FontProperties(fname=_KOREAN_FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    font_name = None
    print(f'Korean font not found at {_KOREAN_FONT_PATH}; Hangul text may not render.')

import warnings

import matplotlib as mpl

# Global plot settings:
#   - axes.unicode_minus=False avoids a broken minus sign on axes that show
#     negative numbers.
#   - figure.figsize sets the default figure size to (12, 12).
mpl.rcParams.update({
    'axes.unicode_minus': False,
    'figure.figsize': (12, 12),
})

# Suppress all warnings from here on.
warnings.filterwarnings(action='ignore')

## 1. 데이터 불러오기

In [4]:
# Load the pre-dummy dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    'data/최종데이터/최종_열삭제후_dummy전.csv',
    index_col=0,
    encoding='utf-8',
)
data

Unnamed: 0,광역시도명,날짜,요일,시간대별 시간,업종명,계절,공휴일,기념일,기온,풍속,습도,PM10,운량,날씨,눈비,강수량,확진자수,주문건수
0,경기도,2019-07-17,수,0,도시락,여름,0,0,22.86,0.54,91.2,79.512,6.4,3,0,0.0,0.0,1
1,경기도,2019-07-17,수,11,도시락,여름,0,0,26.80,1.34,71.6,58.013,9.2,4,0,0.0,0.0,13
2,경기도,2019-07-17,수,12,도시락,여름,0,0,27.26,1.46,71.4,57.240,9.8,4,0,0.0,0.0,14
3,경기도,2019-07-17,수,13,도시락,여름,0,0,27.52,1.48,69.8,60.360,9.8,4,0,0.0,0.0,13
4,경기도,2019-07-17,수,14,도시락,여름,0,0,27.44,1.38,72.6,64.595,10.0,4,1,2.2,0.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221205,서울,2020-09-30,수,19,회,가을,1,0,19.00,2.70,90.0,31.320,10.0,4,1,29.5,30.0,7
221206,서울,2020-09-30,수,20,회,가을,1,0,18.40,1.90,91.0,27.520,5.0,2,1,4.5,30.0,11
221207,서울,2020-09-30,수,21,회,가을,1,0,18.20,0.70,91.0,22.720,7.0,3,0,0.0,30.0,3
221208,서울,2020-09-30,수,22,회,가을,1,0,17.60,1.50,92.0,19.600,6.0,3,0,0.0,30.0,7


## 2. 더미변수 생성

In [5]:

# Columns to one-hot encode, mapped to the prefix used for their dummy columns.
dummy_dict = {
    '요일': 'day',
    '시간대별 시간': 'time',
    '계절': 'season',
    '공휴일': 'holiday',
    '기념일': 'holiday2',
    '날씨': 'climate',
    '눈비': 'rain_snow',
}

def get_all_dummies(data=None, dummy_dict=None, drop_first=False):
    """One-hot encode selected columns of ``data`` and return only the dummies.

    Args:
        data: DataFrame that contains every column named in ``dummy_dict``.
        dummy_dict: Mapping of source column name -> prefix for the generated
            dummy columns. Columns are encoded in the mapping's order.
        drop_first: Forwarded to ``pd.get_dummies``. When True, the first
            level of each encoded column is dropped (k-1 dummies per column).
            The default, False, keeps every level.

    Returns:
        A DataFrame holding the dummy columns of all requested source columns
        side by side, or an empty DataFrame when ``dummy_dict`` is empty.

    Raises:
        ValueError: If ``dummy_dict`` is None, or if ``data`` is None while
            there are columns to encode.
    """
    if dummy_dict is None:
        raise ValueError('dummy_dict is required')
    if not dummy_dict:
        return pd.DataFrame()
    if data is None:
        raise ValueError('data is required')

    # Encode each column once and concatenate a single time, rather than
    # growing a frame inside the loop starting from an empty seed frame.
    encoded = [
        pd.get_dummies(data[column], prefix=prefix, drop_first=drop_first)
        for column, prefix in dummy_dict.items()
    ]
    return pd.concat(encoded, axis=1)



def get_final_data(origin_data=None):
    """Build the modelling table from the raw frame.

    The columns listed in the module-level ``dummy_dict`` are replaced by
    their dummy columns, the '날짜' column is removed, and '주문건수' is
    transformed with ``np.log1p``.

    Args:
        origin_data: DataFrame containing the ``dummy_dict`` columns plus
            '날짜' and '주문건수'.

    Returns:
        A new DataFrame: the remaining original columns followed by the
        dummy columns.
    """
    encoded = get_all_dummies(data=origin_data, dummy_dict=dummy_dict)

    # Remove the raw categorical columns (now represented by dummies) and the
    # date column.
    columns_to_remove = [*dummy_dict, '날짜']
    remaining = origin_data.drop(columns=columns_to_remove)

    # log(1 + x) transform of the target column.
    remaining['주문건수'] = np.log1p(remaining['주문건수'])

    return pd.concat([remaining, encoded], axis=1)
        

In [6]:
# Build the encoded, log-transformed table used by all later cells.
final_df = get_final_data(data)
final_df

Unnamed: 0,광역시도명,업종명,기온,풍속,습도,PM10,운량,강수량,확진자수,주문건수,...,holiday_1,holiday_2,holiday2_0,holiday2_1,climate_1,climate_2,climate_3,climate_4,rain_snow_0,rain_snow_1
0,경기도,도시락,22.86,0.54,91.2,79.512,6.4,0.0,0.0,0.693147,...,0,0,1,0,0,0,1,0,1,0
1,경기도,도시락,26.80,1.34,71.6,58.013,9.2,0.0,0.0,2.639057,...,0,0,1,0,0,0,0,1,1,0
2,경기도,도시락,27.26,1.46,71.4,57.240,9.8,0.0,0.0,2.708050,...,0,0,1,0,0,0,0,1,1,0
3,경기도,도시락,27.52,1.48,69.8,60.360,9.8,0.0,0.0,2.639057,...,0,0,1,0,0,0,0,1,1,0
4,경기도,도시락,27.44,1.38,72.6,64.595,10.0,2.2,0.0,2.079442,...,0,0,1,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221205,서울,회,19.00,2.70,90.0,31.320,10.0,29.5,30.0,2.079442,...,1,0,1,0,0,0,0,1,0,1
221206,서울,회,18.40,1.90,91.0,27.520,5.0,4.5,30.0,2.484907,...,1,0,1,0,0,1,0,0,0,1
221207,서울,회,18.20,0.70,91.0,22.720,7.0,0.0,30.0,1.386294,...,1,0,1,0,0,0,1,0,1,0
221208,서울,회,17.60,1.50,92.0,19.600,6.0,0.0,30.0,2.079442,...,1,0,1,0,0,0,1,0,1,0


## 3. 광역시도, 업종별로 데이터 나누기

In [7]:
# Split the encoded table into one frame per (region, business category).
# seoul_dic holds the '서울' frames, gg_dic the frames of every other region.
seoul_dic = {}
gg_dic = {}

# groupby yields only the (region, category) pairs that actually occur, so no
# empty frame is stored for a category that is missing in one region. It also
# partitions the table once instead of re-filtering the whole frame per pair.
for (area, category), tmp_df in final_df.groupby(['광역시도명', '업종명']):
    # Drop the two grouping columns by name (they are constant within a
    # group) and renumber the rows from 0.
    tmp_df = tmp_df.drop(columns=['광역시도명', '업종명']).reset_index(drop=True)

    if area == '서울':
        seoul_dic[category] = tmp_df
    else:
        gg_dic[category] = tmp_df

In [8]:
from sklearn.model_selection import train_test_split

# Hold-out split (80% train / 20% test) for the Seoul '치킨' subset.
X_features = seoul_dic['치킨'].drop(columns=['주문건수'])
y_target = seoul_dic['치킨']['주문건수']

X_train, X_test, y_train, y_test = train_test_split(
    X_features.values,
    y_target.values,
    test_size=0.2,
    random_state=156,
)

In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

In [10]:

# Candidate regressors compared on every region/category subset.
lr = LinearRegression()
ridge = Ridge()
# The tree ensembles draw random numbers while fitting; seed them (with the
# same seed the train/test split uses) so repeated runs give the same scores.
rf_r = RandomForestRegressor(random_state=156)
gbr = GradientBoostingRegressor(random_state=156)

models = [lr, ridge, rf_r, gbr]


In [11]:

# Hold-out evaluation: fit every candidate model on each Seoul category and
# report its score on a 20% test split.
# `result` maps (category, model class name) -> test score.
result = {}

for key, category_df in seoul_dic.items():
    print('-----------------------------------')
    y_target = category_df['주문건수']
    X_features = category_df.drop('주문건수', axis=1, inplace=False)

    X_train, X_test, y_train, y_test = train_test_split(X_features.values, y_target.values, test_size=0.2, random_state=156)

    for model in models:
        model_name = model.__class__.__name__

        # fit() refits the shared estimator instance for this category.
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        # Keep the score so it can be inspected after the loop, not only read
        # from the printed log.
        result[(key, model_name)] = score
        print(f'{key} {model_name} score : {score:.3f}')
        
        

-----------------------------------
도시락 LinearRegression score : 0.350
도시락 Ridge score : 0.351
도시락 RandomForestRegressor score : 0.422
도시락 GradientBoostingRegressor score : 0.432
-----------------------------------
돈까스/일식 LinearRegression score : 0.673
돈까스/일식 Ridge score : 0.672
돈까스/일식 RandomForestRegressor score : 0.729
돈까스/일식 GradientBoostingRegressor score : 0.687
-----------------------------------
배달전문업체 LinearRegression score : 0.439
배달전문업체 Ridge score : 0.439
배달전문업체 RandomForestRegressor score : 0.780
배달전문업체 GradientBoostingRegressor score : 0.743
-----------------------------------
분식 LinearRegression score : 0.875
분식 Ridge score : 0.874
분식 RandomForestRegressor score : 0.890
분식 GradientBoostingRegressor score : 0.875
-----------------------------------
아시안/양식 LinearRegression score : 0.587
아시안/양식 Ridge score : 0.588
아시안/양식 RandomForestRegressor score : 0.650
아시안/양식 GradientBoostingRegressor score : 0.614
-----------------------------------
야식 LinearRegression score : 0.807
야식 

In [12]:
# Region name -> {category: DataFrame} mapping for that region.
area_dic = {
    '서울': seoul_dic,
    '경기도': gg_dic,
}

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [14]:
# K-fold cross-validation of every model on every region/category subset,
# repeated for several fold counts. Each combination adds one row to `result`
# holding the mean fold score rounded to 3 decimals.
n_splits = [3, 5, 7, 9]
result = pd.DataFrame(columns=['광역시도명','업종명', '모델명', 'fold 수', '스코어'])

for area_key, area in area_dic.items():
    print(f'------------{area_key}------------')

    for key, category_df in area.items():
        print(f'=============={key}============')
        y_target = category_df['주문건수']
        X_features = category_df.drop('주문건수', axis=1, inplace=False)

        for model in models:
            model_name = model.__class__.__name__

            for n in n_splits:
                # Seed the shuffle so the fold assignment, and therefore the
                # reported scores, are repeatable between runs.
                kfold = KFold(n_splits=n, shuffle=True, random_state=156)

                score = cross_val_score(model, X_features, y_target, cv=kfold)
                score = np.round(np.mean(score), 3)

                # Append row by row so partial results are still in `result`
                # if this long-running loop is interrupted.
                result.loc[len(result)] = [area_key, key, model_name, n, score]

                print(f'업종명 : {key} \t model: {model_name} \t Fold : {n} \t cross_val_score : {score}')
        print('')
        

------------서울------------
업종명 : 도시락 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.353
업종명 : 도시락 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.356
업종명 : 도시락 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.354
업종명 : 도시락 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.357
업종명 : 도시락 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.349
업종명 : 도시락 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.358
업종명 : 도시락 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.353
업종명 : 도시락 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.354
업종명 : 도시락 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.397
업종명 : 도시락 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.402
업종명 : 도시락 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.409
업종명 : 도시락 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.4
업종명 : 도시락 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.423
업종명 : 도시락 	 model: GradientBoostingRegres

업종명 : 족발/보쌈 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.876
업종명 : 족발/보쌈 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.837
업종명 : 족발/보쌈 	 model: GradientBoostingRegressor 	 Fold : 5 	 cross_val_score : 0.839
업종명 : 족발/보쌈 	 model: GradientBoostingRegressor 	 Fold : 7 	 cross_val_score : 0.84
업종명 : 족발/보쌈 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.839

업종명 : 중식 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.56
업종명 : 중식 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : -1245065936958.515
업종명 : 중식 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.56
업종명 : 중식 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.559
업종명 : 중식 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.56
업종명 : 중식 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.558
업종명 : 중식 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.559
업종명 : 중식 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.558
업종명 : 중식 	 model: RandomForestRegressor 	 

업종명 : 한식 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.816
업종명 : 한식 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.85
업종명 : 한식 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.853
업종명 : 한식 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.851
업종명 : 한식 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.852
업종명 : 한식 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.8
업종명 : 한식 	 model: GradientBoostingRegressor 	 Fold : 5 	 cross_val_score : 0.8
업종명 : 한식 	 model: GradientBoostingRegressor 	 Fold : 7 	 cross_val_score : 0.802
업종명 : 한식 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.801

업종명 : 회 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.658
업종명 : 회 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.657
업종명 : 회 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.657
업종명 : 회 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.656
업종명 : 회 	 model: Ridge 	 Fo

업종명 : 야식 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.908
업종명 : 야식 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.907
업종명 : 야식 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.907
업종명 : 야식 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.907
업종명 : 야식 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.908
업종명 : 야식 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.927
업종명 : 야식 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.929
업종명 : 야식 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.929
업종명 : 야식 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.929
업종명 : 야식 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.885
업종명 : 야식 	 model: GradientBoostingRegressor 	 Fold : 5 	 cross_val_score : 0.885
업종명 : 야식 	 model: GradientBoostingRegressor 	 Fold : 7 	 cross_val_score : 0.886
업종명 : 야식 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.886

업종명 : 족발/보쌈 	 model: LinearRegression 	 Fold : 3 	 

업종명 : 패스트푸드 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.926

업종명 : 피자 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.911
업종명 : 피자 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.911
업종명 : 피자 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.911
업종명 : 피자 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.911
업종명 : 피자 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.911
업종명 : 피자 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.911
업종명 : 피자 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.911
업종명 : 피자 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.911
업종명 : 피자 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.933
업종명 : 피자 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.934
업종명 : 피자 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.934
업종명 : 피자 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.935
업종명 : 피자 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 

In [16]:
# Write the cross-validation table to disk as a UTF-8 encoded CSV file.
result.to_csv(path_or_buf='data/KFold_result.csv', encoding='utf-8')