# prediction 기상데이터 병합 및 일자별 평균


In [1]:
# 필요 라이브러리 import
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
from urllib import parse
import os

### 경기도

In [2]:
data = pd.read_csv('data/predict_raw/대기질측정정보.csv', engine='python')
data.head(3)

Unnamed: 0,시군명,측정소명,설치년도,측정망명,측정일시각,아황산가스농도값(ppm),일산화탄소농도값(ppm),오존농도값(ppm),이산화질소농도값(ppm),미세먼지PM10농도값(μg/m³),미세먼지PM2.5농도값(μg/m³)
0,동두천시,보산동,2006.0,도시대기,2021-04-20 15:00,0.005,0.5,0.102,0.021,122.0,31.0
1,안성시,봉산동,2007.0,도시대기,2021-04-20 15:00,0.004,0.6,0.107,0.021,103.0,55.0
2,경기도,공도읍,2019.0,도시대기,2021-04-20 15:00,0.005,0.6,0.118,0.021,113.0,53.0


In [3]:
## 도시 대기만 추출
data = data.loc[(data['측정망명'] == '도시대기')].reset_index(drop=True)

In [4]:
## 시군구명을 경기도로 통합

data['시군구명'] = '경기도'


## 측정날짜만 추출 
data['측정날짜'] = data['측정일시각'].astype('str').str[:-6]

data['측정일시'] = data['측정일시각'].astype('str').str[-5:-3].replace('24', '00')

In [5]:
# 필요한 데이터만 추출
final_data = data[['측정날짜', '측정일시', '미세먼지PM10농도값(μg/m³)']]

In [6]:
# 컬럼명 재정의
final_data.columns = ['측정날짜', '측정일시', 'PM10']

In [7]:
# 2021-03-19 ~ 2021-04-19일만 추출
final_df = final_data.loc[(final_data['측정날짜']>='2021-03-19')&(final_data['측정날짜']<='2021-04-19')].reset_index(drop=True)

In [8]:
# 지점별로 나눠진 데이터를 groupby 평균

grouped_gg = final_df.groupby(['측정날짜', '측정일시']).mean().reset_index()

grouped_gg = np.round(grouped_gg, 3)

In [9]:
grouped_gg

Unnamed: 0,측정날짜,측정일시,PM10
0,2021-03-19,00,60.076
1,2021-03-19,01,116.343
2,2021-03-19,02,112.648
3,2021-03-19,03,110.524
4,2021-03-19,04,106.438
...,...,...,...
745,2021-04-19,19,68.578
746,2021-04-19,20,67.990
747,2021-04-19,21,66.196
748,2021-04-19,22,64.941


### 병합을 위한 기초 데이터 생성

In [10]:
basic_data = pd.DataFrame(pd.date_range('2021-03-19', '2021-04-20', freq='H'), columns=['날짜'])
basic_data = basic_data.iloc[:-1]

In [11]:
basic_data['날짜'] = basic_data['날짜'].dt.strftime('%Y-%m-%d %H')

In [12]:
basic_df = basic_data['날짜'].str.split(' ', expand=True)
basic_df.columns=['날짜', '시각']

In [13]:
basic_df

Unnamed: 0,날짜,시각
0,2021-03-19,00
1,2021-03-19,01
2,2021-03-19,02
3,2021-03-19,03
4,2021-03-19,04
...,...,...
763,2021-04-19,19
764,2021-04-19,20
765,2021-04-19,21
766,2021-04-19,22


### 기초 날짜 데이터와 경기도 데이터 병합

In [14]:
gg_data = pd.merge(basic_df, grouped_gg, how='left', left_on=['날짜', '시각'], right_on=['측정날짜', '측정일시'])

In [15]:
gg_data.drop(['측정날짜', '측정일시'], axis=1, inplace=True)

In [16]:
gg_data['광역시도명'] = '경기도'

In [17]:
gg_data.isnull().sum()

날짜        0
시각        0
PM10     18
광역시도명     0
dtype: int64

* 18개의 NULL값을 선형적인 관계를 고려하여 채움

In [18]:
final_gg = gg_data.interpolate()

In [19]:
final_gg.columns = ['측정날짜', '측정일시', 'PM10', '광역시도명']
final_gg = final_gg[['광역시도명', '측정날짜', '측정일시', 'PM10']]

In [20]:
final_gg

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10
0,경기도,2021-03-19,00,60.076
1,경기도,2021-03-19,01,116.343
2,경기도,2021-03-19,02,112.648
3,경기도,2021-03-19,03,110.524
4,경기도,2021-03-19,04,106.438
...,...,...,...,...
763,경기도,2021-04-19,19,68.578
764,경기도,2021-04-19,20,67.990
765,경기도,2021-04-19,21,66.196
766,경기도,2021-04-19,22,64.941


### 서울

In [21]:
seoul_data = pd.read_csv('data/predict_raw/서울시 기간별 시간평균 대기환경 정보.csv', engine='python')
seoul_data

Unnamed: 0,측정일시,권역코드,권역명,측정소코드,측정소명,미세먼지 1시간(㎍/㎥),미세먼지 24시간(㎍/㎥),초미세먼지(㎍/㎥),오존(ppm),이산화질소농도(ppm),일산화탄소농도(ppm),아황산가스농도(ppm)
0,202104201700,100,도심권,111121,중구,89,81,40,0.076,0.029,0.5,0.006
1,202104201700,100,도심권,111131,용산구,84,69,42,0.061,0.034,0.5,0.005
2,202104201700,100,도심권,111123,종로구,85,75,38,0.084,0.026,0.5,0.007
3,202104201700,101,서북권,111201,마포구,60,64,25,0.060,0.036,0.4,0.006
4,202104201700,101,서북권,111181,은평구,100,86,40,0.076,0.035,0.5,0.011
...,...,...,...,...,...,...,...,...,...,...,...,...
35845,202102200000,103,서남권,111212,강서구,45,45,26,0.034,0.029,0.5,0.002
35846,202102200000,104,동남권,111261,강남구,46,45,30,0.035,0.024,0.7,0.004
35847,202102200000,104,동남권,111274,강동구,52,50,33,0.034,0.030,0.6,0.003
35848,202102200000,104,동남권,111273,송파구,40,40,21,0.044,0.030,0.6,0.005


In [22]:
# 측정소명 서울로 통합
seoul_data['측정소명'] = '서울'

## 측정 날짜 및 일시 추출
seoul_data['측정날짜'] = seoul_data['측정일시'].astype('str').str[:-4]

seoul_data['측정일시'] = seoul_data['측정일시'].astype('str').str[-4:-2]

In [23]:
## 필요한 데이터만 추출 및 컬럼명 재정의

seoul_data = seoul_data[['측정날짜', '측정일시', '미세먼지 1시간(㎍/㎥)']]
seoul_data.columns = ['측정날짜', '측정일시', 'PM10']

In [24]:
seoul_data['측정날짜'] = pd.to_datetime(seoul_data['측정날짜'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seoul_data['측정날짜'] = pd.to_datetime(seoul_data['측정날짜'])


In [25]:
final_seoul = seoul_data.loc[(seoul_data['측정날짜']>='2021-03-19')&(seoul_data['측정날짜']<='2021-04-19')].reset_index(drop=True)

In [26]:
grouped_seoul = final_seoul.groupby(['측정날짜', '측정일시']).mean().reset_index()
display(grouped_seoul)

Unnamed: 0,측정날짜,측정일시,PM10
0,2021-03-19,00,116.24
1,2021-03-19,01,114.40
2,2021-03-19,02,97.36
3,2021-03-19,03,96.80
4,2021-03-19,04,90.64
...,...,...,...
763,2021-04-19,19,58.32
764,2021-04-19,20,57.84
765,2021-04-19,21,58.32
766,2021-04-19,22,55.76


In [27]:
grouped_seoul['광역시도명'] = '서울'

final_seoul = grouped_seoul[['광역시도명', '측정날짜', '측정일시', 'PM10' ]]
final_seoul['측정날짜'] = final_seoul['측정날짜'].dt.strftime('%Y-%m-%d')

In [28]:
final_seoul

Unnamed: 0,광역시도명,측정날짜,측정일시,PM10
0,서울,2021-03-19,00,116.24
1,서울,2021-03-19,01,114.40
2,서울,2021-03-19,02,97.36
3,서울,2021-03-19,03,96.80
4,서울,2021-03-19,04,90.64
...,...,...,...,...
763,서울,2021-04-19,19,58.32
764,서울,2021-04-19,20,57.84
765,서울,2021-04-19,21,58.32
766,서울,2021-04-19,22,55.76


In [29]:
prediction_final = pd.concat([final_seoul, final_gg], axis=0)

In [30]:
prediction_final.to_csv('data/predict/predict_날씨데이터.csv', encoding='utf-8')