# 더미변수 생성 및 예측



In [14]:
# Import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Needed so that graphs drawn with matplotlib are displayed inline in Jupyter
%matplotlib inline 

# Workaround for Korean text rendering as broken glyphs in plots
# NOTE(review): the font path below is Windows-specific — confirm before running on another OS
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# Workaround for the minus sign rendering as a broken glyph on axes etc.
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False

# Default figure size for every plot created after this point
plt.rcParams["figure.figsize"] = (12, 12)

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

## 1. 데이터 불러오기

In [15]:
# Load the prediction input (before dummy encoding); the first CSV column becomes the index.
data = pd.read_csv('data/predict/predict_dummy전.csv', encoding='utf-8', index_col=0)
# Display the loaded frame
data

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10,기온,강수량,풍속,습도,운량,확진자수,계절,요일,기념일,공휴일,눈비,날씨
0,서울,2021-03-19,0,116.240,12.50,0.0,0.80,47.0,9.0,118,봄,금,0,0,0,4
1,서울,2021-03-19,1,114.400,12.40,0.0,1.70,49.0,9.0,118,봄,금,0,0,0,4
2,서울,2021-03-19,2,97.360,12.50,0.0,2.40,52.0,10.0,118,봄,금,0,0,0,4
3,서울,2021-03-19,3,96.800,12.40,0.0,2.60,55.0,10.0,118,봄,금,0,0,0,4
4,서울,2021-03-19,4,90.640,12.20,0.0,1.80,55.0,10.0,118,봄,금,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,2021-04-19,19,68.578,15.08,0.0,2.42,55.8,0.0,180,봄,월,0,0,0,1
1532,경기도,2021-04-19,20,67.990,13.62,0.0,2.04,62.0,0.0,180,봄,월,0,0,0,1
1533,경기도,2021-04-19,21,66.196,12.50,0.0,1.80,65.4,1.0,180,봄,월,0,0,0,1
1534,경기도,2021-04-19,22,64.941,11.36,0.0,1.58,71.4,0.0,180,봄,월,0,0,0,1


## 2. 더미변수 생성

In [16]:

# Source categorical column -> prefix used for its generated dummy columns.
# The insertion order here determines the order of the dummy column groups.
dummy_dict = {
    '요일': 'day',
    '측정일시': 'time',
    '계절': 'season',
    '공휴일': 'holiday',
    '기념일': 'holiday2',
    '날씨': 'climate',
    '눈비': 'rain_snow',
}

def get_all_dummies(data=None, dummy_dict=None, categories=None):
    """One-hot encode the columns named in *dummy_dict* and return them side by side.

    Args:
        data: DataFrame holding the categorical source columns.
        dummy_dict: Mapping of source column name -> prefix for the generated
            dummy columns (named ``<prefix>_<value>``). Its iteration order
            sets the order of the column groups in the result.
        categories: Optional mapping of source column name -> sequence of every
            value that column may take. For a listed column, one dummy column
            is produced per listed value (in that order) even when the value
            does not occur in *data*, so the output layout does not depend on
            which values happen to be present. Values in *data* that are not
            listed become all-zero rows for that column group. Columns that
            are not listed are encoded from the values present, as before.

    Returns:
        DataFrame containing only the dummy columns, indexed like *data*;
        an empty DataFrame when *dummy_dict* is empty.
    """
    frames = []
    for column, prefix in dummy_dict.items():
        values = data[column]
        if categories is not None and column in categories:
            # A categorical dtype makes get_dummies emit a column for every
            # declared category, not just the ones observed in this data.
            values = values.astype(
                pd.CategoricalDtype(categories=list(categories[column]))
            )
        frames.append(pd.get_dummies(values, prefix=prefix))

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(frames, axis=1)



def get_predict_data(origin_data=None):
    """Return *origin_data* with its categorical columns swapped for dummy columns.

    The columns listed as keys of the module-level ``dummy_dict`` and the
    '측정날짜' column are removed; the dummy columns produced by
    ``get_all_dummies`` are appended to the right of the remaining columns.
    """
    encoded_columns = list(dummy_dict.keys())

    # Everything that is neither dummy-encoded nor the date column is kept as-is.
    kept_part = origin_data.drop(columns=encoded_columns + ['측정날짜'])
    dummy_part = get_all_dummies(data=origin_data, dummy_dict=dummy_dict)

    return pd.concat([kept_part, dummy_part], axis=1)
        

In [17]:
# Build the model-input frame: categorical columns replaced by their dummy columns.
final_df = get_predict_data(origin_data=data)
# Display the encoded frame
final_df

Unnamed: 0,광역시도명,PM10,기온,강수량,풍속,습도,운량,확진자수,day_금,day_목,...,season_봄,holiday_0,holiday2_0,holiday2_1,climate_1,climate_2,climate_3,climate_4,rain_snow_0,rain_snow_1
0,서울,116.240,12.50,0.0,0.80,47.0,9.0,118,1,0,...,1,1,1,0,0,0,0,1,1,0
1,서울,114.400,12.40,0.0,1.70,49.0,9.0,118,1,0,...,1,1,1,0,0,0,0,1,1,0
2,서울,97.360,12.50,0.0,2.40,52.0,10.0,118,1,0,...,1,1,1,0,0,0,0,1,1,0
3,서울,96.800,12.40,0.0,2.60,55.0,10.0,118,1,0,...,1,1,1,0,0,0,0,1,1,0
4,서울,90.640,12.20,0.0,1.80,55.0,10.0,118,1,0,...,1,1,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,68.578,15.08,0.0,2.42,55.8,0.0,180,0,0,...,1,1,1,0,1,0,0,0,1,0
1532,경기도,67.990,13.62,0.0,2.04,62.0,0.0,180,0,0,...,1,1,1,0,1,0,0,0,1,0
1533,경기도,66.196,12.50,0.0,1.80,65.4,1.0,180,0,0,...,1,1,1,0,1,0,0,0,1,0
1534,경기도,64.941,11.36,0.0,1.58,71.4,0.0,180,0,0,...,1,1,1,0,1,0,0,0,1,0


In [18]:
# Dummy columns for values that do not occur in this data are not generated
# by the encoding step, so they are added here as all-zero columns.
for absent_column in ('season_여름', 'season_가을', 'season_겨울', 'holiday_1', 'holiday_2'):
    final_df[absent_column] = 0

In [19]:
# Display the frame to check the newly added zero-filled columns
final_df

Unnamed: 0,광역시도명,PM10,기온,강수량,풍속,습도,운량,확진자수,day_금,day_목,...,climate_2,climate_3,climate_4,rain_snow_0,rain_snow_1,season_여름,season_가을,season_겨울,holiday_1,holiday_2
0,서울,116.240,12.50,0.0,0.80,47.0,9.0,118,1,0,...,0,0,1,1,0,0,0,0,0,0
1,서울,114.400,12.40,0.0,1.70,49.0,9.0,118,1,0,...,0,0,1,1,0,0,0,0,0,0
2,서울,97.360,12.50,0.0,2.40,52.0,10.0,118,1,0,...,0,0,1,1,0,0,0,0,0,0
3,서울,96.800,12.40,0.0,2.60,55.0,10.0,118,1,0,...,0,0,1,1,0,0,0,0,0,0
4,서울,90.640,12.20,0.0,1.80,55.0,10.0,118,1,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,68.578,15.08,0.0,2.42,55.8,0.0,180,0,0,...,0,0,0,1,0,0,0,0,0,0
1532,경기도,67.990,13.62,0.0,2.04,62.0,0.0,180,0,0,...,0,0,0,1,0,0,0,0,0,0
1533,경기도,66.196,12.50,0.0,1.80,65.4,1.0,180,0,0,...,0,0,0,1,0,0,0,0,0,0
1534,경기도,64.941,11.36,0.0,1.58,71.4,0.0,180,0,0,...,0,0,0,1,0,0,0,0,0,0


In [20]:
# Select the columns in a fixed order, grouped by feature family.
# NOTE(review): presumably this matches the feature order used at training time — confirm.
final_df = final_df[
    ['광역시도명', '기온', '풍속', '습도', 'PM10', '운량', '강수량', '확진자수']
    + ['day_금', 'day_목', 'day_수', 'day_월', 'day_일', 'day_토', 'day_화']
    + [f'time_{hour}' for hour in range(24)]
    + ['season_가을', 'season_겨울', 'season_봄', 'season_여름']
    + ['holiday_0', 'holiday_1', 'holiday_2']
    + ['holiday2_0', 'holiday2_1']
    + ['climate_1', 'climate_2', 'climate_3', 'climate_4']
    + ['rain_snow_0', 'rain_snow_1']
]

In [39]:
# Persist the reordered feature table as a UTF-8 CSV (the index is written as the first column).
final_df.to_csv('data/predict_final.csv', encoding='utf-8')

## 3. 광역시도별로 데이터 나누기

In [22]:

# Split the feature table per region and remove the region label column.
# drop() is chained so it returns a new frame; calling it with inplace=True on
# the filtered slice would mutate a possible copy and raise SettingWithCopyWarning.
seoul = final_df.loc[final_df['광역시도명'] == '서울'].drop('광역시도명', axis=1)

# The Gyeonggi rows get a fresh 0-based index after filtering.
gg = (
    final_df.loc[final_df['광역시도명'] == '경기도']
    .reset_index(drop=True)
    .drop('광역시도명', axis=1)
)

In [23]:
# Show the Seoul feature columns and their order
seoul.columns

Index(['기온', '풍속', '습도', 'PM10', '운량', '강수량', '확진자수', 'day_금', 'day_목',
       'day_수', 'day_월', 'day_일', 'day_토', 'day_화', 'time_0', 'time_1',
       'time_2', 'time_3', 'time_4', 'time_5', 'time_6', 'time_7', 'time_8',
       'time_9', 'time_10', 'time_11', 'time_12', 'time_13', 'time_14',
       'time_15', 'time_16', 'time_17', 'time_18', 'time_19', 'time_20',
       'time_21', 'time_22', 'time_23', 'season_가을', 'season_겨울', 'season_봄',
       'season_여름', 'holiday_0', 'holiday_1', 'holiday_2', 'holiday2_0',
       'holiday2_1', 'climate_1', 'climate_2', 'climate_3', 'climate_4',
       'rain_snow_0', 'rain_snow_1'],
      dtype='object')

In [35]:
import joblib

# NOTE(review): joblib.load unpickles arbitrary objects — only load model files from a trusted source.
loaded_model = joblib.load('../model/서울_분식_ridge.pkl')

# .values passes a bare array, so features are matched to the model by column position only.
pred = loaded_model.predict(seoul.values)

# NOTE(review): presumably the model was fit on a log-transformed target, hence exp() — confirm against the training code.
np.exp(pred)

array([  7.15150242,   4.09725581,   2.87330889,   8.94028958,
         2.59296553,  17.15210659,   2.00956112,   3.69788127,
         2.46557284,   2.10879582,  14.89052932,  63.89807167,
        61.39657961,  48.48812308,  38.72869006,  36.71779529,
        44.04981164,  62.82539529,  88.10774221,  85.89924185,
        58.61188988,  42.213689  ,  24.48495168,  10.8050135 ,
         7.30925422,   4.44174904,   3.15262237,   9.83625027,
         2.86189755,  18.98814056,   2.2402473 ,   4.1605231 ,
         2.8025256 ,   2.60248655,  18.73020635,  81.47305387,
        87.08000169,  63.11593975,  50.76694359,  50.86722147,
        57.92925556,  82.17782509, 112.55401056, 108.84671388,
        73.47807817,  52.61050326,  31.06251138,  13.16288256,
         8.15279929,   4.6791137 ,   3.32080141,  10.36917252,
         2.87309866,  20.22425606,   2.24479635,   4.41115634,
         2.96207449,   2.70826361,  19.34747806,  83.47791847,
        76.67683872,  64.87438774,  48.81496563,  48.32