# 최종 데이터 변수 추가



In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the prediction dataset; the first CSV column is used as the row index.
raw_data = pd.read_csv('data/predict/predict.csv', encoding='utf-8', index_col=0)
# Leave the frame as the cell's last expression so the notebook previews it.
raw_data

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10,기온,강수량,풍속,습도,운량,적설,확진자수
0,서울,2021-03-19,0,116.240,12.50,0.0,0.80,47.0,9.0,0.0,118
1,서울,2021-03-19,1,114.400,12.40,0.0,1.70,49.0,9.0,0.0,118
2,서울,2021-03-19,2,97.360,12.50,0.0,2.40,52.0,10.0,0.0,118
3,서울,2021-03-19,3,96.800,12.40,0.0,2.60,55.0,10.0,0.0,118
4,서울,2021-03-19,4,90.640,12.20,0.0,1.80,55.0,10.0,0.0,118
...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,2021-04-19,19,68.578,15.08,0.0,2.42,55.8,0.0,0.0,180
1532,경기도,2021-04-19,20,67.990,13.62,0.0,2.04,62.0,0.0,0.0,180
1533,경기도,2021-04-19,21,66.196,12.50,0.0,1.80,65.4,1.0,0.0,180
1534,경기도,2021-04-19,22,64.941,11.36,0.0,1.58,71.4,0.0,0.0,180


In [3]:
# Parse the date column from 'YYYY-MM-DD' strings into pandas datetimes so the
# .dt accessor can be used in the cells below.
raw_data['측정날짜'] = pd.to_datetime(raw_data['측정날짜'], format='%Y-%m-%d')

## 1. 계절 변수 추가
* 봄 : 3 ~ 5월
* 여름 : 6 ~ 8월
* 가을 : 9 ~ 11월
* 겨울 : 12 ~ 2월

In [4]:
# Work on a copy of the raw frame and start every row as winter ('겨울');
# the other seasons are assigned by month range in a later cell.
season_df = raw_data.assign(계절='겨울')

In [5]:

# Overwrite the default season for each month range (both bounds inclusive).

# Spring: March through May.
season_df.loc[season_df['측정날짜'].dt.month.between(3, 5), '계절'] = '봄'

# Summer: June through August.
season_df.loc[season_df['측정날짜'].dt.month.between(6, 8), '계절'] = '여름'

# Autumn: September through November.
season_df.loc[season_df['측정날짜'].dt.month.between(9, 11), '계절'] = '가을'


In [6]:
# Preview the frame to confirm the season column ('계절') was filled in.
season_df

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10,기온,강수량,풍속,습도,운량,적설,확진자수,계절
0,서울,2021-03-19,0,116.240,12.50,0.0,0.80,47.0,9.0,0.0,118,봄
1,서울,2021-03-19,1,114.400,12.40,0.0,1.70,49.0,9.0,0.0,118,봄
2,서울,2021-03-19,2,97.360,12.50,0.0,2.40,52.0,10.0,0.0,118,봄
3,서울,2021-03-19,3,96.800,12.40,0.0,2.60,55.0,10.0,0.0,118,봄
4,서울,2021-03-19,4,90.640,12.20,0.0,1.80,55.0,10.0,0.0,118,봄
...,...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,2021-04-19,19,68.578,15.08,0.0,2.42,55.8,0.0,0.0,180,봄
1532,경기도,2021-04-19,20,67.990,13.62,0.0,2.04,62.0,0.0,0.0,180,봄
1533,경기도,2021-04-19,21,66.196,12.50,0.0,1.80,65.4,1.0,0.0,180,봄
1534,경기도,2021-04-19,22,64.941,11.36,0.0,1.58,71.4,0.0,0.0,180,봄


## 2. 요일 변수 추가
* 각 일자의 요일 변수 추가

In [7]:


# Add the day of week as an integer (Monday=0 ... Sunday=6), then preview.
season_df['요일'] = season_df['측정날짜'].dt.weekday
season_df

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10,기온,강수량,풍속,습도,운량,적설,확진자수,계절,요일
0,서울,2021-03-19,0,116.240,12.50,0.0,0.80,47.0,9.0,0.0,118,봄,4
1,서울,2021-03-19,1,114.400,12.40,0.0,1.70,49.0,9.0,0.0,118,봄,4
2,서울,2021-03-19,2,97.360,12.50,0.0,2.40,52.0,10.0,0.0,118,봄,4
3,서울,2021-03-19,3,96.800,12.40,0.0,2.60,55.0,10.0,0.0,118,봄,4
4,서울,2021-03-19,4,90.640,12.20,0.0,1.80,55.0,10.0,0.0,118,봄,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,2021-04-19,19,68.578,15.08,0.0,2.42,55.8,0.0,0.0,180,봄,0
1532,경기도,2021-04-19,20,67.990,13.62,0.0,2.04,62.0,0.0,0.0,180,봄,0
1533,경기도,2021-04-19,21,66.196,12.50,0.0,1.80,65.4,1.0,0.0,180,봄,0
1534,경기도,2021-04-19,22,64.941,11.36,0.0,1.58,71.4,0.0,0.0,180,봄,0


In [8]:
# Map day-of-week integers (0-6, Monday first) to one-character Korean names.
weekday = dict(enumerate(['월', '화', '수', '목', '금', '토', '일']))

# Swap the integer codes in the '요일' column for their Korean names; replace()
# returns a new frame, so season_df keeps the numeric codes.
week_df = season_df.replace({'요일':weekday})

In [9]:
# Preview the frame to confirm the weekday column now holds Korean day names.
week_df

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10,기온,강수량,풍속,습도,운량,적설,확진자수,계절,요일
0,서울,2021-03-19,0,116.240,12.50,0.0,0.80,47.0,9.0,0.0,118,봄,금
1,서울,2021-03-19,1,114.400,12.40,0.0,1.70,49.0,9.0,0.0,118,봄,금
2,서울,2021-03-19,2,97.360,12.50,0.0,2.40,52.0,10.0,0.0,118,봄,금
3,서울,2021-03-19,3,96.800,12.40,0.0,2.60,55.0,10.0,0.0,118,봄,금
4,서울,2021-03-19,4,90.640,12.20,0.0,1.80,55.0,10.0,0.0,118,봄,금
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,경기도,2021-04-19,19,68.578,15.08,0.0,2.42,55.8,0.0,0.0,180,봄,월
1532,경기도,2021-04-19,20,67.990,13.62,0.0,2.04,62.0,0.0,0.0,180,봄,월
1533,경기도,2021-04-19,21,66.196,12.50,0.0,1.80,65.4,1.0,0.0,180,봄,월
1534,경기도,2021-04-19,22,64.941,11.36,0.0,1.58,71.4,0.0,0.0,180,봄,월


## 3. 기념일 변수 추가

* 복날, 국가대표 국제 경기, 블랙데이

In [10]:
# Dates flagged as special occasions. NOTE(review): presumably 2021-04-14 is
# Black Day and 2021-03-25 a national-team match, per the list above - confirm.
anniv = np.asarray(['2021-04-14', '2021-03-25'])

np.shape(anniv)

(2,)

In [11]:
# Start the anniversary flag at 0 for every row; matching dates are set to 1 below.
week_df['기념일'] = 0

In [12]:
# Turn the datetime column back into 'YYYY-MM-DD' strings so it can be matched
# against the string date arrays used in the following cells.
week_df['측정날짜'] = week_df['측정날짜'].dt.strftime('%Y-%m-%d')

In [13]:
# Flag every row whose date is one of the anniversary dates.
week_df.loc[week_df['측정날짜'].isin(anniv), '기념일'] = 1

In [14]:
# Sanity check: list the distinct dates that ended up flagged as anniversaries.
np.unique(week_df.loc[week_df['기념일']==1, '측정날짜'])

array(['2021-03-25', '2021-04-14'], dtype=object)

## 4. 공휴일 변수 추가

* 공휴일 전날은 2, 공휴일 당일은 1, 나머지 0
* 황금연휴 포함

In [15]:
from datetime import datetime, timedelta

# Public-holiday dates, as 'YYYY-MM-DD' strings.
holiday = np.array(['2021-03-01'])

# The day before each holiday (flagged with 2 further down), formatted back to
# 'YYYY-MM-DD' strings in a single expression.
day_holiday = (pd.to_datetime(holiday) - timedelta(days=1)).strftime('%Y-%m-%d')

In [16]:
# Preview the computed holiday-eve dates.
day_holiday

Index(['2021-02-28'], dtype='object')

In [17]:
# Start the holiday flag at 0 for every row; eves (2) and holidays (1) are set below.
week_df['공휴일'] = 0

In [18]:
## Mark the day before each public holiday with 2.
week_df.loc[week_df['측정날짜'].isin(day_holiday), '공휴일'] = 2

In [19]:
# Show the rows flagged as a holiday eve (value 2); the recorded output is an
# empty frame, i.e. no row matched a holiday-eve date.
week_df.loc[week_df['공휴일']==2]

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10,기온,강수량,풍속,습도,운량,적설,확진자수,계절,요일,기념일,공휴일


In [20]:
## Mark the public holidays themselves with 1. This runs after the eve marking,
## so a date that is both a holiday and the eve of another holiday ends up as 1.
week_df.loc[week_df['측정날짜'].isin(holiday), '공휴일'] = 1

In [21]:
# List the distinct dates still flagged as a holiday eve (value 2).
# NOTE(review): this runs right after holidays were set to 1 but filters on 2;
# if the intent was to verify the holiday rows, the filter should be == 1 - confirm.
np.unique(week_df.loc[week_df['공휴일']==2, '측정날짜'])

array([], dtype=object)

In [22]:
# Preview the holiday date array used above.
holiday

array(['2021-03-01'], dtype='<U10')

In [23]:
# Write the frame with the added season, weekday, anniversary and holiday
# columns to CSV; the row index is written too (to_csv's default).
week_df.to_csv('data/predict/predict_변수추가최종.csv', encoding='utf-8')