# 더미변수 생성 및 예측



In [2]:
#필요한 패키지를 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#jupyter에서 matplotlib을 사용해 만든 graph를 화면에 표시하기 위해 필요
%matplotlib inline 

# Use a Hangul-capable font so Korean labels in plots do not render as broken glyphs.
import os
from matplotlib import font_manager, rc

# The font file below lives under a Windows system directory. When it is
# missing (e.g. on another OS) keep matplotlib's default font instead of
# failing the whole setup cell with a file-not-found error.
_HANGUL_FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
font_name = None
if os.path.isfile(_HANGUL_FONT_PATH):
    font_name = font_manager.FontProperties(fname=_HANGUL_FONT_PATH).get_name()
    rc('font', family=font_name)

# Keep the minus sign on axes and tick labels from rendering as a broken glyph.
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False

# Default figure size for every plot in this notebook.
plt.rcParams["figure.figsize"] = (12, 12)

# NOTE(review): this silences *all* warnings, including pandas/scikit-learn
# deprecation and chained-assignment warnings — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

## 1. 데이터 불러오기

In [3]:
# Load the pre-dummy dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    'data/최종데이터/최종_열삭제후_dummy전.csv',
    index_col=0,
    encoding='utf-8',
)
data

Unnamed: 0,광역시도명,날짜,요일,시간대별 시간,업종명,계절,공휴일,기념일,기온,풍속,습도,PM10,운량,날씨,눈비,강수량,확진자수,주문건수
0,경기도,2019-07-17,수,0,기타,여름,0,0,22.86,0.54,91.2,79.512,6.4,3,0,0.0,0.0,14
1,경기도,2019-07-17,수,1,기타,여름,0,0,22.48,0.52,92.2,57.704,7.2,3,0,0.0,0.0,2
2,경기도,2019-07-17,수,10,기타,여름,0,0,26.36,1.24,75.6,59.775,8.0,3,0,0.0,0.0,3
3,경기도,2019-07-17,수,11,기타,여름,0,0,26.80,1.34,71.6,58.013,9.2,4,0,0.0,0.0,27
4,경기도,2019-07-17,수,12,기타,여름,0,0,27.26,1.46,71.4,57.240,9.8,4,0,0.0,0.0,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176351,서울,2020-09-30,수,19,한식,가을,1,0,19.00,2.70,90.0,31.320,10.0,4,1,29.5,30.0,32
176352,서울,2020-09-30,수,20,한식,가을,1,0,18.40,1.90,91.0,27.520,5.0,2,1,4.5,30.0,19
176353,서울,2020-09-30,수,21,한식,가을,1,0,18.20,0.70,91.0,22.720,7.0,3,0,0.0,30.0,7
176354,서울,2020-09-30,수,22,한식,가을,1,0,17.60,1.50,92.0,19.600,6.0,3,0,0.0,30.0,2


## 2. 더미변수 생성

In [4]:

# Categorical columns to one-hot encode, mapped to the prefix given to the
# generated dummy columns (e.g. '요일' -> 'day_*').
dummy_dict = {
    '요일': 'day',
    '시간대별 시간': 'time',
    '계절': 'season',
    '공휴일': 'holiday',
    '기념일': 'holiday2',
    '날씨': 'climate',
    '눈비': 'rain_snow',
}

def get_all_dummies(data=None, dummy_dict=None):
    """Build one-hot (dummy) columns for every column named in ``dummy_dict``.

    Args:
        data: DataFrame holding the categorical columns to encode.
        dummy_dict: Mapping of source column name -> prefix for the generated
            dummy columns. Columns are encoded in the mapping's iteration
            order.

    Returns:
        A DataFrame containing only the dummy columns, laid out side by side.
        An empty DataFrame is returned when ``dummy_dict`` is empty.

    Raises:
        ValueError: If ``data`` or ``dummy_dict`` is not supplied.
    """
    if data is None or dummy_dict is None:
        raise ValueError("both 'data' and 'dummy_dict' must be provided")

    encoded = [
        pd.get_dummies(data[column], prefix=prefix)
        for column, prefix in dummy_dict.items()
    ]

    if not encoded:
        # pd.concat rejects an empty list; keep the "nothing to encode" result
        # as a plain empty frame.
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame on every column.
    return pd.concat(encoded, axis=1)



def get_final_data(origin_data=None):
    """Assemble the modelling table from the raw frame.

    The columns listed in the module-level ``dummy_dict`` are replaced by
    their one-hot encodings, the '날짜' column is dropped, and '주문건수' is
    replaced by ``log1p`` of its values.

    Args:
        origin_data: Raw DataFrame containing the ``dummy_dict`` columns,
            '날짜' and '주문건수'.

    Returns:
        A new DataFrame: the remaining original columns followed by the
        dummy columns.
    """
    one_hot_part = get_all_dummies(data=origin_data, dummy_dict=dummy_dict)

    # Everything that was encoded, plus the date column, leaves the base frame.
    columns_to_remove = [*dummy_dict, '날짜']
    base_part = origin_data.drop(columns=columns_to_remove)
    base_part['주문건수'] = np.log1p(base_part['주문건수'])

    return pd.concat([base_part, one_hot_part], axis=1)
        

In [5]:
# Build the modelling table: remaining raw columns plus dummy columns, with a
# log1p-transformed '주문건수' target.
final_df = get_final_data(origin_data=data)
final_df

Unnamed: 0,광역시도명,업종명,기온,풍속,습도,PM10,운량,강수량,확진자수,주문건수,...,holiday_1,holiday_2,holiday2_0,holiday2_1,climate_1,climate_2,climate_3,climate_4,rain_snow_0,rain_snow_1
0,경기도,기타,22.86,0.54,91.2,79.512,6.4,0.0,0.0,2.708050,...,0,0,1,0,0,0,1,0,1,0
1,경기도,기타,22.48,0.52,92.2,57.704,7.2,0.0,0.0,1.098612,...,0,0,1,0,0,0,1,0,1,0
2,경기도,기타,26.36,1.24,75.6,59.775,8.0,0.0,0.0,1.386294,...,0,0,1,0,0,0,1,0,1,0
3,경기도,기타,26.80,1.34,71.6,58.013,9.2,0.0,0.0,3.332205,...,0,0,1,0,0,0,0,1,1,0
4,경기도,기타,27.26,1.46,71.4,57.240,9.8,0.0,0.0,3.555348,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176351,서울,한식,19.00,2.70,90.0,31.320,10.0,29.5,30.0,3.496508,...,1,0,1,0,0,0,0,1,0,1
176352,서울,한식,18.40,1.90,91.0,27.520,5.0,4.5,30.0,2.995732,...,1,0,1,0,0,1,0,0,0,1
176353,서울,한식,18.20,0.70,91.0,22.720,7.0,0.0,30.0,2.079442,...,1,0,1,0,0,0,1,0,1,0
176354,서울,한식,17.60,1.50,92.0,19.600,6.0,0.0,30.0,1.098612,...,1,0,1,0,0,0,1,0,1,0


In [6]:
# List the resulting column names to check the generated dummy prefixes.
final_df.columns

Index(['광역시도명', '업종명', '기온', '풍속', '습도', 'PM10', '운량', '강수량', '확진자수', '주문건수',
       'day_금', 'day_목', 'day_수', 'day_월', 'day_일', 'day_토', 'day_화', 'time_0',
       'time_1', 'time_2', 'time_3', 'time_4', 'time_5', 'time_6', 'time_7',
       'time_8', 'time_9', 'time_10', 'time_11', 'time_12', 'time_13',
       'time_14', 'time_15', 'time_16', 'time_17', 'time_18', 'time_19',
       'time_20', 'time_21', 'time_22', 'time_23', 'season_가을', 'season_겨울',
       'season_봄', 'season_여름', 'holiday_0', 'holiday_1', 'holiday_2',
       'holiday2_0', 'holiday2_1', 'climate_1', 'climate_2', 'climate_3',
       'climate_4', 'rain_snow_0', 'rain_snow_1'],
      dtype='object')

## 3. 광역시도, 업종별로 데이터 나누기

In [9]:
# Split the modelling table into one frame per (region, category) pair.
# seoul_dic holds the '서울' frames keyed by category; every other region goes
# into gg_dic. Each stored frame contains only the features and the target.
seoul_dic = {}
gg_dic = {}

group_cols = ['광역시도명', '업종명']

# One groupby pass replaces the nested full-table boolean-mask scans, and a
# (region, category) pair that does not occur in the data no longer produces
# an empty frame.
for (area, category), group in final_df.groupby(group_cols):
    # Drop the grouping columns by name rather than by position, so a column
    # reorder cannot leak them into the features. reset_index returns a new
    # frame instead of mutating a filtered slice in place.
    frame = group.drop(columns=group_cols).reset_index(drop=True)

    if area == '서울':
        seoul_dic[category] = frame
    else:
        gg_dic[category] = frame

In [11]:
from sklearn.model_selection import train_test_split

# Trial split on the Seoul chicken data: 20% held out, fixed seed.
y_target = seoul_dic['치킨']['주문건수']
X_features = seoul_dic['치킨'].drop(columns='주문건수')

X_train, X_test, y_train, y_test = train_test_split(
    X_features.to_numpy(),
    y_target.to_numpy(),
    test_size=0.2,
    random_state=156,
)

In [14]:
# Inspect the feature columns (everything except the '주문건수' target).
X_features.columns

Index(['기온', '풍속', '습도', 'PM10', '운량', '강수량', '확진자수', 'day_금', 'day_목',
       'day_수', 'day_월', 'day_일', 'day_토', 'day_화', 'time_0', 'time_1',
       'time_2', 'time_3', 'time_4', 'time_5', 'time_6', 'time_7', 'time_8',
       'time_9', 'time_10', 'time_11', 'time_12', 'time_13', 'time_14',
       'time_15', 'time_16', 'time_17', 'time_18', 'time_19', 'time_20',
       'time_21', 'time_22', 'time_23', 'season_가을', 'season_겨울', 'season_봄',
       'season_여름', 'holiday_0', 'holiday_1', 'holiday_2', 'holiday2_0',
       'holiday2_1', 'climate_1', 'climate_2', 'climate_3', 'climate_4',
       'rain_snow_0', 'rain_snow_1'],
      dtype='object')

In [9]:
# Same feature-column check as the previous cell.
# NOTE(review): looks like a re-run after a kernel restart — confirm and drop one of the two.
X_features.columns

Index(['기온', '풍속', '습도', 'PM10', '운량', '강수량', '확진자수', 'day_금', 'day_목',
       'day_수', 'day_월', 'day_일', 'day_토', 'day_화', 'time_0', 'time_1',
       'time_2', 'time_3', 'time_4', 'time_5', 'time_6', 'time_7', 'time_8',
       'time_9', 'time_10', 'time_11', 'time_12', 'time_13', 'time_14',
       'time_15', 'time_16', 'time_17', 'time_18', 'time_19', 'time_20',
       'time_21', 'time_22', 'time_23', 'season_가을', 'season_겨울', 'season_봄',
       'season_여름', 'holiday_0', 'holiday_1', 'holiday_2', 'holiday2_0',
       'holiday2_1', 'climate_1', 'climate_2', 'climate_3', 'climate_4',
       'rain_snow_0', 'rain_snow_1'],
      dtype='object')

In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

In [11]:

# Candidate regressors compared on every (region, category) dataset.
# The two tree ensembles use randomness while fitting; seed them (with the
# same value used for the train/test split) so reported scores are repeatable.
RANDOM_STATE = 156

lr = LinearRegression()
ridge = Ridge()
rf_r = RandomForestRegressor(random_state=RANDOM_STATE)
gbr = GradientBoostingRegressor(random_state=RANDOM_STATE)


models = [lr, ridge, rf_r, gbr]


In [12]:

# Hold-out evaluation on the Seoul data: for every category, fit each
# candidate model on an 80% split and score it on the remaining 20%.
# Scores are stored in `result` keyed by (category, model class name).
result = {}

for key, frame in seoul_dic.items():
    print('-----------------------------------')
    y_target = frame['주문건수']
    X_features = frame.drop(columns='주문건수')

    X_train, X_test, y_train, y_test = train_test_split(
        X_features.values, y_target.values, test_size=0.2, random_state=156
    )

    for model in models:

        model.fit(X_train, y_train)
        model_name = model.__class__.__name__

        # For scikit-learn regressors, .score() is the R^2 on the given data.
        score = model.score(X_test, y_test)

        # Keep the score as well as printing it; previously `result` stayed empty.
        result[(key, model_name)] = score
        print(f'{key} {model_name} score : {score:.3f}')
        
        

-----------------------------------
기타 LinearRegression score : 0.671
기타 Ridge score : 0.671
기타 RandomForestRegressor score : 0.749
기타 GradientBoostingRegressor score : 0.686
-----------------------------------
돈까스/일식 LinearRegression score : 0.650
돈까스/일식 Ridge score : 0.650
돈까스/일식 RandomForestRegressor score : 0.735
돈까스/일식 GradientBoostingRegressor score : 0.659
-----------------------------------
분식 LinearRegression score : 0.875
분식 Ridge score : 0.874
분식 RandomForestRegressor score : 0.891
분식 GradientBoostingRegressor score : 0.875
-----------------------------------
야식 LinearRegression score : 0.807
야식 Ridge score : 0.806
야식 RandomForestRegressor score : 0.828
야식 GradientBoostingRegressor score : 0.813
-----------------------------------
족발/보쌈 LinearRegression score : 0.830
족발/보쌈 Ridge score : 0.830
족발/보쌈 RandomForestRegressor score : 0.880
족발/보쌈 GradientBoostingRegressor score : 0.838
-----------------------------------
중식 LinearRegression score : 0.555
중식 Ridge score : 0.555
중식 R

In [13]:
# Map each region name to its per-category frames so both regions can be processed in one loop.
area_dic = {'서울': seoul_dic, '경기도': gg_dic}

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [15]:
# K-fold cross-validation of every model on every (region, category) dataset,
# repeated for several fold counts. One row per combination ends up in `result`.
n_splits = [3, 5, 7, 9]
result_columns = ['광역시도명','업종명', '모델명', 'fold 수', '스코어']

# NOTE(review): these four lists are never filled in this cell; kept only in
# case a later cell references them — confirm and remove.
category_list = []
model_list = []
fold_list = []
score_list = []

# Rows are collected here and turned into a DataFrame once at the end;
# assigning result.loc[i] row by row grows the frame on every insert.
rows = []

for area_key, area in area_dic.items():
    print(f'------------{area_key}------------')

    for key, frame in area.items():
        print(f'=============={key}============')
        y_target = frame['주문건수']
        X_features = frame.drop(columns='주문건수')

        for model in models:
            # Does not depend on the fold count, so compute it once per model.
            model_name = model.__class__.__name__

            for n in n_splits:

                # Seed the shuffle so the folds (and scores) are reproducible.
                kfold = KFold(n_splits=n, shuffle=True, random_state=156)

                fold_scores = cross_val_score(model, X_features, y_target, cv=kfold)
                score = np.round(np.mean(fold_scores), 3)

                rows.append([area_key, key, model_name, n, score])

                print(f'업종명 : {key} \t model: {model_name} \t Fold : {n} \t cross_val_score : {score}')
        print('')

result = pd.DataFrame(rows, columns=result_columns)
        

------------서울------------
업종명 : 기타 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.663
업종명 : 기타 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.663
업종명 : 기타 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.663
업종명 : 기타 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.663
업종명 : 기타 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.663
업종명 : 기타 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.663
업종명 : 기타 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.663
업종명 : 기타 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.662
업종명 : 기타 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.744
업종명 : 기타 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.749
업종명 : 기타 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.748
업종명 : 기타 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.75
업종명 : 기타 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.675
업종명 : 기타 	 model: GradientBoostingRegressor 	 Fold : 

업종명 : 찜탕 	 model: GradientBoostingRegressor 	 Fold : 5 	 cross_val_score : 0.594
업종명 : 찜탕 	 model: GradientBoostingRegressor 	 Fold : 7 	 cross_val_score : 0.595
업종명 : 찜탕 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.598

업종명 : 치킨 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.925
업종명 : 치킨 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.925
업종명 : 치킨 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.924
업종명 : 치킨 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.924
업종명 : 치킨 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.924
업종명 : 치킨 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.924
업종명 : 치킨 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.924
업종명 : 치킨 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.924
업종명 : 치킨 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.941
업종명 : 치킨 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.943
업종명 : 치킨 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score :

업종명 : 분식 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.957
업종명 : 분식 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.958
업종명 : 분식 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.957
업종명 : 분식 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.904
업종명 : 분식 	 model: GradientBoostingRegressor 	 Fold : 5 	 cross_val_score : 0.905
업종명 : 분식 	 model: GradientBoostingRegressor 	 Fold : 7 	 cross_val_score : 0.906
업종명 : 분식 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.905

업종명 : 야식 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.907
업종명 : 야식 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.908
업종명 : 야식 	 model: LinearRegression 	 Fold : 7 	 cross_val_score : 0.907
업종명 : 야식 	 model: LinearRegression 	 Fold : 9 	 cross_val_score : 0.908
업종명 : 야식 	 model: Ridge 	 Fold : 3 	 cross_val_score : 0.908
업종명 : 야식 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.908
업종명 : 야식 	 model: Ridge 	 Fold : 7

업종명 : 패스트푸드 	 model: Ridge 	 Fold : 5 	 cross_val_score : 0.951
업종명 : 패스트푸드 	 model: Ridge 	 Fold : 7 	 cross_val_score : 0.951
업종명 : 패스트푸드 	 model: Ridge 	 Fold : 9 	 cross_val_score : 0.951
업종명 : 패스트푸드 	 model: RandomForestRegressor 	 Fold : 3 	 cross_val_score : 0.962
업종명 : 패스트푸드 	 model: RandomForestRegressor 	 Fold : 5 	 cross_val_score : 0.964
업종명 : 패스트푸드 	 model: RandomForestRegressor 	 Fold : 7 	 cross_val_score : 0.964
업종명 : 패스트푸드 	 model: RandomForestRegressor 	 Fold : 9 	 cross_val_score : 0.964
업종명 : 패스트푸드 	 model: GradientBoostingRegressor 	 Fold : 3 	 cross_val_score : 0.907
업종명 : 패스트푸드 	 model: GradientBoostingRegressor 	 Fold : 5 	 cross_val_score : 0.909
업종명 : 패스트푸드 	 model: GradientBoostingRegressor 	 Fold : 7 	 cross_val_score : 0.909
업종명 : 패스트푸드 	 model: GradientBoostingRegressor 	 Fold : 9 	 cross_val_score : 0.909

업종명 : 한식 	 model: LinearRegression 	 Fold : 3 	 cross_val_score : 0.932
업종명 : 한식 	 model: LinearRegression 	 Fold : 5 	 cross_val_score : 0.932
업종명 : 한

In [16]:
# Save the cross-validation table (the row index is written as the first CSV column).
result.to_csv('data/KFold_result.csv', encoding='utf-8')

In [17]:
# Reload the saved cross-validation table, restoring the first column as the index.
kfold_result = pd.read_csv('data/KFold_result.csv', encoding='utf-8', index_col=0)
kfold_result

Unnamed: 0,광역시도명,업종명,모델명,fold 수,스코어
0,서울,기타,LinearRegression,3,0.663
1,서울,기타,LinearRegression,5,0.663
2,서울,기타,LinearRegression,7,0.663
3,서울,기타,LinearRegression,9,0.663
4,서울,기타,Ridge,3,0.663
...,...,...,...,...,...
347,경기도,한식,RandomForestRegressor,9,0.954
348,경기도,한식,GradientBoostingRegressor,3,0.898
349,경기도,한식,GradientBoostingRegressor,5,0.902
350,경기도,한식,GradientBoostingRegressor,7,0.901


In [18]:
# Display floats with three decimal places in pandas output.
pd.options.display.float_format = '{:.3f}'.format

In [19]:
# Show the combinations whose mean cross-validation score is at most 0.6.
kfold_result.loc[kfold_result['스코어']<=0.6]

Unnamed: 0,광역시도명,업종명,모델명,fold 수,스코어
80,서울,중식,LinearRegression,3,0.557
81,서울,중식,LinearRegression,5,0.56
82,서울,중식,LinearRegression,7,0.558
83,서울,중식,LinearRegression,9,0.557
84,서울,중식,Ridge,3,0.557
85,서울,중식,Ridge,5,0.56
86,서울,중식,Ridge,7,0.557
87,서울,중식,Ridge,9,0.56
96,서울,찜탕,LinearRegression,3,0.558
97,서울,찜탕,LinearRegression,5,0.556
