## 그리드 서치

In [1]:
#Import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Needed so that graphs created with matplotlib are displayed inside Jupyter
%matplotlib inline 

#Workaround for Korean text being rendered as broken glyphs in plots:
#register the font found at the hard-coded (Windows-only) path as the default family
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

#Fix the minus sign being rendered incorrectly when negative numbers are shown on axes, etc.
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False

#Default figure size for every plot in this notebook
plt.rcParams["figure.figsize"] = (12, 12)

#Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset (the file name suggests it is the version before dummy encoding);
# the first CSV column is used as the DataFrame index.
data = pd.read_csv('data/최종데이터/최종_열삭제후_dummy전.csv', encoding='utf-8', index_col=0)
# Bare expression: shows the loaded frame as the cell output.
data

Unnamed: 0,광역시도명,날짜,요일,시간대별 시간,업종명,계절,공휴일,기념일,기온,풍속,습도,PM10,운량,날씨,눈비,강수량,확진자수,주문건수
0,경기도,2019-07-17,수,0,기타,여름,0,0,22.86,0.54,91.2,79.512,6.4,3,0,0.0,0.0,14
1,경기도,2019-07-17,수,1,기타,여름,0,0,22.48,0.52,92.2,57.704,7.2,3,0,0.0,0.0,2
2,경기도,2019-07-17,수,10,기타,여름,0,0,26.36,1.24,75.6,59.775,8.0,3,0,0.0,0.0,3
3,경기도,2019-07-17,수,11,기타,여름,0,0,26.80,1.34,71.6,58.013,9.2,4,0,0.0,0.0,27
4,경기도,2019-07-17,수,12,기타,여름,0,0,27.26,1.46,71.4,57.240,9.8,4,0,0.0,0.0,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176351,서울,2020-09-30,수,19,한식,가을,1,0,19.00,2.70,90.0,31.320,10.0,4,1,29.5,30.0,32
176352,서울,2020-09-30,수,20,한식,가을,1,0,18.40,1.90,91.0,27.520,5.0,2,1,4.5,30.0,19
176353,서울,2020-09-30,수,21,한식,가을,1,0,18.20,0.70,91.0,22.720,7.0,3,0,0.0,30.0,7
176354,서울,2020-09-30,수,22,한식,가을,1,0,17.60,1.50,92.0,19.600,6.0,3,0,0.0,30.0,2


In [3]:

# Categorical source column -> prefix given to the one-hot columns generated
# from it (consumed by get_all_dummies / get_final_data).
dummy_dict = {
    '요일': 'day',
    '시간대별 시간': 'time',
    '계절': 'season',
    '공휴일': 'holiday',
    '기념일': 'holiday2',
    '날씨': 'climate',
    '눈비': 'rain_snow',
}

def get_all_dummies(data=None, dummy_dict=None):
    """One-hot encode the columns named in ``dummy_dict`` and join the results.

    Both arguments are required in practice; the ``None`` defaults are kept
    only so existing keyword-style calls continue to work.

    Args:
        data: DataFrame that contains every column listed as a key of
            ``dummy_dict``.
        dummy_dict: Mapping of source column name -> prefix handed to
            ``pd.get_dummies`` for that column.

    Returns:
        A DataFrame holding only the generated dummy columns, grouped in the
        order the keys appear in ``dummy_dict`` and sharing ``data``'s index.
        An empty mapping yields a column-less frame with ``data``'s index.

    Raises:
        KeyError: If a key of ``dummy_dict`` is not a column of ``data``.
    """
    # Encode every column first and concatenate once: calling pd.concat inside
    # the loop re-copies the accumulated frame on each iteration, and seeding
    # it with an empty DataFrame forces an index union with an empty index.
    encoded_frames = [
        pd.get_dummies(data[column], prefix=prefix)
        for column, prefix in dummy_dict.items()
    ]

    # pd.concat refuses an empty list, so handle "nothing to encode" explicitly.
    if not encoded_frames:
        return pd.DataFrame(index=data.index)

    return pd.concat(encoded_frames, axis=1)



def get_final_data(origin_data=None):
    """Assemble the modelling table from the raw frame.

    The columns named by the module-level ``dummy_dict`` and the '날짜'
    column are removed from ``origin_data``, '주문건수' is replaced by its
    ``log1p`` value, and the one-hot columns produced by ``get_all_dummies``
    are appended to the right of the remaining columns.

    Args:
        origin_data: Raw DataFrame containing the ``dummy_dict`` key columns,
            '날짜' and '주문건수'.

    Returns:
        A new DataFrame; ``origin_data`` itself is not modified.
    """
    encoded_part = get_all_dummies(data=origin_data, dummy_dict=dummy_dict)

    # Everything that was one-hot encoded, plus the date column, is dropped.
    columns_to_remove = list(dummy_dict.keys()) + ['날짜']
    base_part = origin_data.drop(columns_to_remove, axis=1)

    # The target is modelled on the log(1 + x) scale.
    base_part['주문건수'] = np.log1p(base_part['주문건수'])

    return pd.concat([base_part, encoded_part], axis=1)
        

In [4]:
# Build the model-ready frame (log1p target + one-hot columns) from the loaded data.
final_df = get_final_data(origin_data=data)
# Bare expression: shows the resulting frame as the cell output.
final_df

Unnamed: 0,광역시도명,업종명,기온,풍속,습도,PM10,운량,강수량,확진자수,주문건수,...,holiday_1,holiday_2,holiday2_0,holiday2_1,climate_1,climate_2,climate_3,climate_4,rain_snow_0,rain_snow_1
0,경기도,기타,22.86,0.54,91.2,79.512,6.4,0.0,0.0,2.708050,...,0,0,1,0,0,0,1,0,1,0
1,경기도,기타,22.48,0.52,92.2,57.704,7.2,0.0,0.0,1.098612,...,0,0,1,0,0,0,1,0,1,0
2,경기도,기타,26.36,1.24,75.6,59.775,8.0,0.0,0.0,1.386294,...,0,0,1,0,0,0,1,0,1,0
3,경기도,기타,26.80,1.34,71.6,58.013,9.2,0.0,0.0,3.332205,...,0,0,1,0,0,0,0,1,1,0
4,경기도,기타,27.26,1.46,71.4,57.240,9.8,0.0,0.0,3.555348,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176351,서울,한식,19.00,2.70,90.0,31.320,10.0,29.5,30.0,3.496508,...,1,0,1,0,0,0,0,1,0,1
176352,서울,한식,18.40,1.90,91.0,27.520,5.0,4.5,30.0,2.995732,...,1,0,1,0,0,1,0,0,0,1
176353,서울,한식,18.20,0.70,91.0,22.720,7.0,0.0,30.0,2.079442,...,1,0,1,0,0,0,1,0,1,0
176354,서울,한식,17.60,1.50,92.0,19.600,6.0,0.0,30.0,1.098612,...,1,0,1,0,0,0,1,0,1,0


In [45]:
import os
import fnmatch
import joblib
from sklearn.metrics import mean_squared_error
# train_test_split is called below but is not imported by any earlier cell.
from sklearn.model_selection import train_test_split

# Evaluate one saved ridge model per (area, category) pair and collect the
# scores in `result`.
area_list = ['서울', '경기도']
category_list = ['기타', '돈까스/일식', '분식', '야식', '족발/보쌈', '중식', '찜탕', '치킨', '카페/디저트', '패스트푸드', '한식']
result = pd.DataFrame(columns=['광역시도명', '업종명', 'train_r2_score', 'test_r2_score', 'test_rmse'])
i = 0  # next row label in `result`

model_base_path = '../data_analysis/model/'

for area in area_list:
    for category in category_list:

        print(f'================================================= 지역 : {area} =================================================')
        print(f'\t------------------------------------ 업종명 :{category} ------------------------------------')

        # Rows belonging to this area/category only.
        # NOTE(review): this rebinds the notebook-level `data` (the raw frame
        # loaded earlier), so re-running get_final_data(origin_data=data)
        # after this cell will no longer work without reloading the CSV.
        data = final_df.loc[(final_df['광역시도명']==area)&(final_df['업종명']==category)].reset_index(drop=True)

        # The two key columns are constant within the subset, so drop them.
        data.drop(['광역시도명', '업종명'], axis=1, inplace=True)

        y_target = data['주문건수']
        X_features = data.drop('주문건수', axis=1, inplace=False)

        # Fixed random_state makes the split reproducible.
        # NOTE(review): presumably the same split used when the model was
        # fitted; otherwise "test" rows may have been seen in training - confirm.
        X_train, X_test, y_train, y_test = train_test_split(X_features.values, y_target.values, test_size=0.2, random_state=156)

        # '/' cannot appear in a file name, so it is stripped from the category.
        model_name = area + '_' + category.replace('/', '') + '_ridge.pkl'

        model_path = os.path.join(model_base_path, model_name)

        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files that come from a trusted source.
        loaded_model = joblib.load(model_path)

        # score() is R^2 for scikit-learn regressors.
        train_r2 = loaded_model.score(X_train, y_train)
        test_r2 = loaded_model.score(X_test, y_test)
        y_pred = loaded_model.predict(X_test)

        # '주문건수' was log1p-transformed in get_final_data, so this RMSE is
        # on the log1p scale, not in raw order counts.
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f'\n 모델명 : {model_name} \ttrain r^2 : {train_r2:.3f} \ttest r^2 : {test_r2:.3f} \ttest_rmse : {test_rmse:.3f} \n')

        result.loc[i] = [area, category, train_r2, test_r2, test_rmse]
        i += 1
        


	------------------------------------ 업종명 :기타 ------------------------------------

 모델명 : 서울_기타_ridge.pkl 	train r^2 : 0.667 	test r^2 : 0.671 	test_rmse : 0.454 

	------------------------------------ 업종명 :돈까스/일식 ------------------------------------

 모델명 : 서울_돈까스일식_ridge.pkl 	train r^2 : 0.685 	test r^2 : 0.650 	test_rmse : 0.461 

	------------------------------------ 업종명 :분식 ------------------------------------

 모델명 : 서울_분식_ridge.pkl 	train r^2 : 0.890 	test r^2 : 0.875 	test_rmse : 0.350 

	------------------------------------ 업종명 :야식 ------------------------------------

 모델명 : 서울_야식_ridge.pkl 	train r^2 : 0.819 	test r^2 : 0.807 	test_rmse : 0.411 

	------------------------------------ 업종명 :족발/보쌈 ------------------------------------

 모델명 : 서울_족발보쌈_ridge.pkl 	train r^2 : 0.828 	test r^2 : 0.830 	test_rmse : 0.431 

	------------------------------------ 업종명 :중식 ------------------------------------

 모델명 : 서울_중식_ridge.pkl 	train r^2 : 0.568 	test r^2 : 0.555 	test_rmse : 0.519 

In [55]:
import os
import fnmatch
import joblib
from sklearn.metrics import mean_squared_error
# train_test_split is called below but is not imported by any earlier cell.
from sklearn.model_selection import train_test_split

# Evaluate one saved random-forest model per (area, category) pair and
# collect the scores in `result` (replacing any previous `result`).
area_list = ['서울', '경기도']
category_list = ['기타', '돈까스/일식', '분식', '야식', '족발/보쌈', '중식', '찜탕', '치킨', '카페/디저트', '패스트푸드', '한식']
result = pd.DataFrame(columns=['광역시도명', '업종명', 'train_r2_score', 'test_r2_score', 'test_rmse'])
i = 0  # next row label in `result`

model_base_path = '../data_analysis/model/rf/'

for area in area_list:
    for category in category_list:

        print(f'================================================= 지역 : {area} =================================================')
        print(f'\t------------------------------------ 업종명 :{category} ------------------------------------')

        # Rows belonging to this area/category only.
        # NOTE(review): this rebinds the notebook-level `data` (the raw frame
        # loaded earlier), so re-running get_final_data(origin_data=data)
        # after this cell will no longer work without reloading the CSV.
        data = final_df.loc[(final_df['광역시도명']==area)&(final_df['업종명']==category)].reset_index(drop=True)

        # The two key columns are constant within the subset, so drop them.
        data.drop(['광역시도명', '업종명'], axis=1, inplace=True)

        y_target = data['주문건수']
        X_features = data.drop('주문건수', axis=1, inplace=False)

        # Fixed random_state makes the split reproducible.
        # NOTE(review): presumably the same split used when the model was
        # fitted; otherwise "test" rows may have been seen in training - confirm.
        X_train, X_test, y_train, y_test = train_test_split(X_features.values, y_target.values, test_size=0.2, random_state=156)

        # '/' cannot appear in a file name, so it is stripped from the category.
        model_name = area + '_' + category.replace('/', '') + '_rf.pkl'

        model_path = os.path.join(model_base_path, model_name)

        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files that come from a trusted source.
        loaded_model = joblib.load(model_path)

        # score() is R^2 for scikit-learn regressors.
        train_r2 = loaded_model.score(X_train, y_train)
        test_r2 = loaded_model.score(X_test, y_test)
        y_pred = loaded_model.predict(X_test)

        # '주문건수' was log1p-transformed in get_final_data, so this RMSE is
        # on the log1p scale, not in raw order counts.
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f'\n 모델명 : {model_name} \ttrain r^2 : {train_r2:.3f} \ttest r^2 : {test_r2:.3f} \ttest_rmse : {test_rmse:.3f} \n')

        result.loc[i] = [area, category, train_r2, test_r2, test_rmse]
        i += 1
        


	------------------------------------ 업종명 :기타 ------------------------------------

 모델명 : 서울_기타_rf.pkl 	train r^2 : 0.837 	test r^2 : 0.740 	test_rmse : 0.404 

	------------------------------------ 업종명 :돈까스/일식 ------------------------------------

 모델명 : 서울_돈까스일식_rf.pkl 	train r^2 : 0.838 	test r^2 : 0.736 	test_rmse : 0.400 

	------------------------------------ 업종명 :분식 ------------------------------------

 모델명 : 서울_분식_rf.pkl 	train r^2 : 0.924 	test r^2 : 0.848 	test_rmse : 0.386 

	------------------------------------ 업종명 :야식 ------------------------------------

 모델명 : 서울_야식_rf.pkl 	train r^2 : 0.891 	test r^2 : 0.831 	test_rmse : 0.385 

	------------------------------------ 업종명 :족발/보쌈 ------------------------------------

 모델명 : 서울_족발보쌈_rf.pkl 	train r^2 : 0.914 	test r^2 : 0.872 	test_rmse : 0.374 

	------------------------------------ 업종명 :중식 ------------------------------------

 모델명 : 서울_중식_rf.pkl 	train r^2 : 0.798 	test r^2 : 0.688 	test_rmse : 0.435 

	---------------

In [56]:
# Write the scores currently held in `result` to a CSV file in the working
# directory (the file name suggests these are the random-forest scores).
result.to_csv('rf_test_score.csv', encoding='utf-8')

In [59]:
import time

# Capture the starting timestamp.
start = time.time()

# Print the seconds elapsed since `start`; no work runs in between, so the
# value is expected to be (close to) zero.
print("time :", time.time() - start)

time : 0.0


In [67]:
import time
from sklearn.linear_model import Ridge
# train_test_split is called below but is not imported by any earlier cell.
from sklearn.model_selection import train_test_split

# Measure how long preparing the split and fitting a Ridge model takes.
# perf_counter() is the clock intended for measuring short durations;
# time.time() can be too coarse and report 0.0 for fast operations.
start_time = time.perf_counter()

# NOTE(review): `gg_dic` is not defined in any cell visible in this notebook -
# presumably a dict of per-category DataFrames; confirm it exists before running.
y_target = gg_dic['치킨']['주문건수']
X_features = gg_dic['치킨'].drop('주문건수', axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X_features.values, y_target.values, test_size=0.2, random_state=156)

ridge = Ridge(alpha=0.1)

ridge.fit(X_train, y_train)

# Elapsed seconds cover the data preparation above as well as the fit.
print('걸린 시간 : ', time.perf_counter() - start_time)



걸린 시간 :  0.04986691474914551


In [68]:
import time
from sklearn.ensemble import RandomForestRegressor
# train_test_split is called below but is not imported by any earlier cell.
from sklearn.model_selection import train_test_split

# Measure how long preparing the split and fitting a random forest takes.
# perf_counter() is the clock intended for measuring short durations;
# time.time() can be too coarse and report 0.0 for fast operations.
start_time = time.perf_counter()

# NOTE(review): `gg_dic` is not defined in any cell visible in this notebook -
# presumably a dict of per-category DataFrames; confirm it exists before running.
y_target = gg_dic['치킨']['주문건수']
X_features = gg_dic['치킨'].drop('주문건수', axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X_features.values, y_target.values, test_size=0.2, random_state=156)

rf_rg = RandomForestRegressor(max_depth=30, min_samples_leaf=8, min_samples_split=8,
                      n_estimators=150, random_state=156)

rf_rg.fit(X_train, y_train)

# Elapsed seconds cover the data preparation above as well as the fit.
print('걸린 시간 : ', time.perf_counter() - start_time)



걸린 시간 :  7.628971099853516
