## 그리드 서치

In [1]:
#필요한 패키지를 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#jupyter에서 matplotlib을 사용해 만든 graph를 화면에 표시하기 위해 필요
%matplotlib inline 

# Work around Korean (Hangul) text rendering as broken glyphs in plots by
# switching matplotlib's font family to the one stored in malgun.ttf.
# NOTE(review): the font path is Windows-specific; presumably this line fails
# on machines where that file does not exist — confirm before running elsewhere.
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# Keep the minus sign from rendering as a broken glyph when negative numbers
# are shown on graph axes and similar places.
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False

# Default figure size applied to every plot created after this point.
plt.rcParams["figure.figsize"] = (12, 12)

# Suppress every warning raised from here on.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the source CSV; its first column is used as the row index.
data = pd.read_csv(
    'data/최종데이터/최종_열삭제후_dummy전.csv',
    index_col=0,
    encoding='utf-8',
)
# Last expression of the cell, so the notebook renders the frame.
data

Unnamed: 0,광역시도명,날짜,요일,시간대별 시간,업종명,계절,공휴일,기념일,기온,풍속,습도,PM10,운량,날씨,눈비,강수량,확진자수,주문건수
0,경기도,2019-07-17,수,0,기타,여름,0,0,22.86,0.54,91.2,79.512,6.4,3,0,0.0,0.0,14
1,경기도,2019-07-17,수,1,기타,여름,0,0,22.48,0.52,92.2,57.704,7.2,3,0,0.0,0.0,2
2,경기도,2019-07-17,수,10,기타,여름,0,0,26.36,1.24,75.6,59.775,8.0,3,0,0.0,0.0,3
3,경기도,2019-07-17,수,11,기타,여름,0,0,26.80,1.34,71.6,58.013,9.2,4,0,0.0,0.0,27
4,경기도,2019-07-17,수,12,기타,여름,0,0,27.26,1.46,71.4,57.240,9.8,4,0,0.0,0.0,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176351,서울,2020-09-30,수,19,한식,가을,1,0,19.00,2.70,90.0,31.320,10.0,4,1,29.5,30.0,32
176352,서울,2020-09-30,수,20,한식,가을,1,0,18.40,1.90,91.0,27.520,5.0,2,1,4.5,30.0,19
176353,서울,2020-09-30,수,21,한식,가을,1,0,18.20,0.70,91.0,22.720,7.0,3,0,0.0,30.0,7
176354,서울,2020-09-30,수,22,한식,가을,1,0,17.60,1.50,92.0,19.600,6.0,3,0,0.0,30.0,2


In [3]:

# Maps each categorical source column to the prefix given to the indicator
# (dummy) columns generated from it.
dummy_dict = {
    '요일': 'day',
    '시간대별 시간': 'time',
    '계절': 'season',
    '공휴일': 'holiday',
    '기념일': 'holiday2',
    '날씨': 'climate',
    '눈비': 'rain_snow',
}

def get_all_dummies(data=None, dummy_dict=None):
    """One-hot encode the columns of ``data`` named in ``dummy_dict``.

    Args:
        data: DataFrame containing every column that appears as a key of
            ``dummy_dict``. Required despite the ``None`` default.
        dummy_dict: Mapping of source column name -> prefix passed to
            ``pd.get_dummies`` for the indicator columns of that column.
            Required despite the ``None`` default.

    Returns:
        A DataFrame holding only the generated indicator columns, ordered by
        the iteration order of ``dummy_dict``. When ``dummy_dict`` is empty,
        a frame with no columns that carries ``data``'s index.

    Raises:
        KeyError: If a key of ``dummy_dict`` is not a column of ``data``.
    """
    encoded_frames = [
        pd.get_dummies(data[column], prefix=prefix)
        for column, prefix in dummy_dict.items()
    ]

    if not encoded_frames:
        # pd.concat refuses an empty list, so return a column-less frame.
        return pd.DataFrame(index=data.index)

    # Concatenate once: growing a frame inside the loop re-copies everything
    # accumulated so far on each iteration and needlessly aligns the first
    # piece against an empty frame.
    return pd.concat(encoded_frames, axis=1)



def get_final_data(origin_data=None):
    """Build the modelling frame from the raw data.

    The columns listed in the module-level ``dummy_dict`` are replaced by
    their indicator columns, the '날짜' column is removed, and '주문건수' is
    transformed with ``log1p``.

    Args:
        origin_data: Raw DataFrame containing '날짜', '주문건수' and every key
            of ``dummy_dict`` as columns. Required despite the ``None`` default.

    Returns:
        The remaining original columns followed by the indicator columns.
    """
    one_hot = get_all_dummies(data=origin_data, dummy_dict=dummy_dict)

    # Remove the encoded source columns together with the date column.
    columns_to_remove = list(dummy_dict.keys()) + ['날짜']
    remaining = origin_data.drop(columns=columns_to_remove)

    # Replace the target with its log(1 + x) transform.
    remaining['주문건수'] = np.log1p(remaining['주문건수'])

    return pd.concat([remaining, one_hot], axis=1)
        

In [5]:
final_df = get_final_data(origin_data=data)
final_df

Unnamed: 0,광역시도명,업종명,기온,풍속,습도,PM10,운량,강수량,확진자수,주문건수,...,holiday_1,holiday_2,holiday2_0,holiday2_1,climate_1,climate_2,climate_3,climate_4,rain_snow_0,rain_snow_1
0,경기도,기타,22.86,0.54,91.2,79.512,6.4,0.0,0.0,2.708050,...,0,0,1,0,0,0,1,0,1,0
1,경기도,기타,22.48,0.52,92.2,57.704,7.2,0.0,0.0,1.098612,...,0,0,1,0,0,0,1,0,1,0
2,경기도,기타,26.36,1.24,75.6,59.775,8.0,0.0,0.0,1.386294,...,0,0,1,0,0,0,1,0,1,0
3,경기도,기타,26.80,1.34,71.6,58.013,9.2,0.0,0.0,3.332205,...,0,0,1,0,0,0,0,1,1,0
4,경기도,기타,27.26,1.46,71.4,57.240,9.8,0.0,0.0,3.555348,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176351,서울,한식,19.00,2.70,90.0,31.320,10.0,29.5,30.0,3.496508,...,1,0,1,0,0,0,0,1,0,1
176352,서울,한식,18.40,1.90,91.0,27.520,5.0,4.5,30.0,2.995732,...,1,0,1,0,0,1,0,0,0,1
176353,서울,한식,18.20,0.70,91.0,22.720,7.0,0.0,30.0,2.079442,...,1,0,1,0,0,0,1,0,1,0
176354,서울,한식,17.60,1.50,92.0,19.600,6.0,0.0,30.0,1.098612,...,1,0,1,0,0,0,1,0,1,0


In [7]:
# Per-category frames, one dict for Seoul and one for every other area.
seoul_dic = {}
gg_dic = {}

# Grouping columns; they are removed from each stored subset.
# NOTE(review): the original removed them positionally (iloc[:, 2:]); this
# assumes they are the first two columns of final_df — confirm.
group_columns = ['광역시도명', '업종명']

# The category list does not depend on the area, so compute it once.
categories = np.unique(final_df['업종명'])

for area in np.unique(final_df['광역시도명']):
    for category in categories:

        mask = (final_df['광역시도명'] == area) & (final_df['업종명'] == category)
        # Build a fresh frame instead of resetting the index of a slice in
        # place, which is a chained-assignment hazard.
        tmp_df = final_df.loc[mask].drop(columns=group_columns).reset_index(drop=True)

        if area == '서울':
            seoul_dic[category] = tmp_df
        else:
            # Every area other than '서울' is stored here.
            gg_dic[category] = tmp_df

In [33]:
import os

import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


# Hyper-parameter grid searched for every (area, category) Ridge model.
# NOTE(review): per the scikit-learn release notes, Ridge's `normalize`
# option was deprecated in 1.0 and removed in 1.2; on newer versions this grid
# is rejected and scaling has to move into a preprocessing step — confirm the
# installed version before re-running.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
             'normalize': [True, False]}

area_dic = {'서울': seoul_dic, '경기도': gg_dic}

file_path = './model/'
# Create the output directory up front so a missing folder does not abort the
# run only after the first grid search has already finished.
os.makedirs(file_path, exist_ok=True)

for area_key, area in area_dic.items():
    print(f'------------{area_key}------------')

    for key in area.keys():
        print(f'====={key}=====')
        y_target = area[key]['주문건수']
        X_features = area[key].drop('주문건수', axis=1, inplace=False)

        # Hold out 20% of the rows; only the training part is used for the search.
        X_train, X_test, y_train, y_test = train_test_split(
            X_features.values, y_target.values, test_size=0.2, random_state=156)

        # cv is fixed at 9 folds.
        grid_search = GridSearchCV(Ridge(), param_grid, cv=9, return_train_score=True)

        grid_search.fit(X_train, y_train)

        # Save the best model; '/' is stripped from the category name so it
        # can be used in a file name.
        file_name = f"{area_key}_{key.replace('/', '')}_ridge"
        model_path = os.path.join(file_path, file_name + '.pkl')
        joblib.dump(grid_search.best_estimator_, model_path)

        # Store the full cross-validation results as a CSV next to the model.
        tmp_result = pd.DataFrame(grid_search.cv_results_)
        result_path = os.path.join(file_path, file_name + '.csv')

        tmp_result.to_csv(result_path)
                

------------서울------------
=====기타=====
=====돈까스/일식=====
=====분식=====
=====야식=====
=====족발/보쌈=====
=====중식=====
=====찜탕=====
=====치킨=====
=====카페/디저트=====
=====패스트푸드=====
=====한식=====
------------경기도------------
=====기타=====
=====돈까스/일식=====
=====분식=====
=====야식=====
=====족발/보쌈=====
=====중식=====
=====찜탕=====
=====치킨=====
=====카페/디저트=====
=====패스트푸드=====
=====한식=====


In [25]:
from sklearn.model_selection import train_test_split

# Rebuild the train/test split for the '중식' entry of gg_dic, using the same
# test_size and random_state as the grid-search cells.
y_target = gg_dic['중식']['주문건수']
X_features = gg_dic['중식'].drop(columns='주문건수')

X_train, X_test, y_train, y_test = train_test_split(
    X_features.values,
    y_target.values,
    test_size=0.2,
    random_state=156,
)

In [29]:
import os
import fnmatch
import joblib

# Load two previously saved Ridge models from disk, in this order:
# the '야식' model into loaded_model and the '중식' model into lm2.
loaded_model, lm2 = (
    joblib.load(saved_path)
    for saved_path in (
        '../data_analysis/model/경기도_야식_ridge.pkl',
        '../data_analysis/model/경기도_중식_ridge.pkl',
    )
)

In [34]:
# A bare expression that is not the last statement of a cell is never
# displayed, so keep the score in a variable and print it explicitly.
test_score = lm2.score(X_test, y_test)
print(f'test score: {test_score}')

pred = lm2.predict(X_test)
# The target was stored as log1p(count); expm1 maps predictions and truth
# back to the original scale. Shown: the first 10 of each.
np.expm1(pred)[:10], np.expm1(y_test)[:10]

(array([13.24626238, 21.00632252, 27.10891783, 15.08868004, 14.69536454,
        19.42297711, 33.41575118, 25.18651854, 87.24161402, 58.11735418]),
 array([ 21.,   7.,  17.,  41.,  41.,  27.,  21.,  26.,  91., 116.]))

In [54]:
import os

import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid searched for every (area, category) random forest:
# 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20]
}

area_dic = {'서울': seoul_dic, '경기도': gg_dic}

file_path = './model/rf/'
# Create the output directory (including parents) up front so a missing
# folder does not abort the run only after the first grid search has finished.
os.makedirs(file_path, exist_ok=True)

for area_key, area in area_dic.items():
    print(f'------------{area_key}------------')

    for key in area.keys():
        print(f'====={key}=====')
        y_target = area[key]['주문건수']
        X_features = area[key].drop('주문건수', axis=1, inplace=False)

        # Hold out 20% of the rows; only the training part is used for the search.
        X_train, X_test, y_train, y_test = train_test_split(
            X_features.values, y_target.values, test_size=0.2, random_state=156)

        # 81 candidates x 9 folds per model: spread the fits over all cores.
        # The forest's fixed random_state keeps the fitted models reproducible.
        grid_search = GridSearchCV(
            RandomForestRegressor(random_state=156),
            param_grid,
            cv=9,
            return_train_score=True,
            n_jobs=-1,
        )

        grid_search.fit(X_train, y_train)

        # Save the best model; '/' is stripped from the category name so it
        # can be used in a file name.
        file_name = f"{area_key}_{key.replace('/', '')}_rf"
        model_path = os.path.join(file_path, file_name + '.pkl')
        joblib.dump(grid_search.best_estimator_, model_path)

        # Store the full cross-validation results as a CSV next to the model.
        tmp_result = pd.DataFrame(grid_search.cv_results_)
        result_path = os.path.join(file_path, file_name + '.csv')

        tmp_result.to_csv(result_path)
        

------------서울------------
=====기타=====
=====돈까스/일식=====
=====분식=====
=====야식=====
=====족발/보쌈=====
=====중식=====
=====찜탕=====
=====치킨=====
=====카페/디저트=====
=====패스트푸드=====
=====한식=====
------------경기도------------
=====기타=====
=====돈까스/일식=====
=====분식=====
=====야식=====
=====족발/보쌈=====
=====중식=====
=====찜탕=====
=====치킨=====
=====카페/디저트=====
=====패스트푸드=====
=====한식=====
