In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import requests
from datetime import datetime, timedelta
import time
import xml.etree.ElementTree as ET

In [2]:
# Load the stored daily air-quality history (one row per 측정일시).
df = pd.read_csv('1211.대기오염미래예측/data/12-23대기오염nan처리.csv')

In [4]:
# Show the most recent stored row; its 측정일시 is the point the fetch below resumes from.
df.tail(1)

Unnamed: 0,측정일시,이산화질소,오존,일산화탄소,아황산,미세,초미세
4303,20231211,0.015,0.0227,0.4,0.0024,7.2,3.0


In [5]:

# Seoul Open API key, kept in a local file so the secret is not stored in the notebook.
with open('keys/api.txt') as file:
    # strip(): a trailing newline in the key file would otherwise be embedded in every request URL.
    road_key = file.read().strip()

# Names of the monitoring stations to query (used as the last path segment of the request URL).
gu_names = [
    "강남구", "강남대로", "강동구", "강변북로", "강북구", "강서구", "공항대로",
    "관악구", "광진구", "구로구", "금천구", "노원구", "도봉구", "도산대로",
    "동대문구", "동작구", "동작대로", "마포구", "서대문구", "서초구", "성동구",
    "성북구", "송파구", "신촌로", "양천구", "영등포구", "영등포로", "용산구",
    "은평구", "정릉로", "종로", "종로구", "중구", "중랑구", "천호대로",
    "청계천로", "한강대로", "홍릉로", "화랑로"
]

# Output column -> XML tag read from each <row> element of the API response.
FIELD_TAGS = {
    "측정일시": "MSRDT_DE",
    "측정소명": "MSRSTE_NM",
    "이산화질소": "NO2",
    "오존": "O3",
    "일산화탄소": "CO",
    "아황산": "SO2",
    "미세": "PM10",
    "초미세": "PM25",
}

MAX_ATTEMPTS = 3        # tries per request before giving up on that request
RETRY_WAIT_SECONDS = 5  # pause between tries


def _parse_rows(xml_data):
    """Return one dict (keyed by the FIELD_TAGS columns) per <row> element.

    A field whose tag is missing from the row, or whose element has no text,
    is stored as None so the row can be discarded later with dropna().

    Raises:
        xml.etree.ElementTree.ParseError: if xml_data is not well-formed XML.
    """
    root = ET.fromstring(xml_data)
    rows = []
    for row in root.findall('.//row'):
        record = {}
        for column, tag in FIELD_TAGS.items():
            element = row.find(tag)
            record[column] = element.text if element is not None else None
        rows.append(record)
    return rows


def _fetch_station_rows(date_str, gu_name):
    """Fetch the daily averages of one station for one yyyymmdd date.

    Only the failing request is retried (up to MAX_ATTEMPTS times), instead of
    restarting the whole crawl. Failures are reported and yield an empty list,
    so one bad station/date does not abort the run.
    """
    url = f"http://openAPI.seoul.go.kr:8088/{road_key}/xml/DailyAverageAirQuality/1/5/{date_str}/{gu_name}"

    result = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            result = requests.get(url, timeout=20)
            break
        except requests.exceptions.RequestException as exc:
            if attempt == MAX_ATTEMPTS:
                # Print only the exception type: the full message contains the URL and therefore the API key.
                print(f"에러: {gu_name} - {date_str} - 요청 실패: {type(exc).__name__}")
                return []
            time.sleep(RETRY_WAIT_SECONDS)

    if result.status_code != 200:
        print(f"에러: {gu_name} - {date_str} - 상태 코드: {result.status_code}")
        return []

    try:
        return _parse_rows(result.text)
    except ET.ParseError:
        print(f"에러: {gu_name} - {date_str} - XML 파싱 실패")
        return []


# Resume from the day after the last date stored in df (측정일시 is yyyymmdd).
start_point = df['측정일시'].tail(1).values
start_point_datetime = datetime.strptime(str(start_point[0]), "%Y%m%d")
start_date = start_point_datetime + timedelta(days=1)

# NOTE(review): the current day is included; confirm the API already serves a
# complete daily average for today, otherwise a partial value gets persisted.
end_date = datetime.now()

# Collect plain records and build the frame once (no concat inside the loop).
n_days = (end_date - start_date).days + 1  # <= 0 when the history is already up to date
fetched_rows = []
for offset in range(n_days):
    date_str = (start_date + timedelta(offset)).strftime("%Y%m%d")
    for gu_name in gu_names:
        fetched_rows.extend(_fetch_station_rows(date_str, gu_name))

# Raw per-station rows; every value is still the text taken from the XML.
# Passing `columns` keeps the schema even when nothing was fetched.
all_data_df1 = pd.DataFrame(fetched_rows, columns=list(FIELD_TAGS))

# Drop station rows with any missing field (None counts as missing), then the station name.
combined_df = all_data_df1.dropna().drop(columns=['측정소명'])

# Date becomes an integer (yyyymmdd), every pollutant a float.
combined_df = combined_df.astype(
    {col: (np.int64 if col == '측정일시' else float) for col in combined_df.columns}
)

if combined_df.empty:
    # Nothing new: leave df and the CSV on disk untouched.
    print("새로 추가할 데이터가 없습니다.")
else:
    # One row per date: mean over all stations.
    combined_df = combined_df.groupby('측정일시', as_index=False).mean()

    # NOTE(review): rows already in the CSV, as displayed in this notebook, carry more
    # decimals (e.g. 오존 0.0227, 미세 7.2) than this rounding keeps; confirm it is intended.
    combined_df = combined_df.round({
        '이산화질소': 3,
        '오존': 3,
        '일산화탄소': 1,
        '아황산': 3,
        '미세': 0,
        '초미세': 0,
    })

    # Append the new days below the history and persist to the same CSV.
    df = pd.concat([df, combined_df], ignore_index=True)

    df.to_csv('1211.대기오염미래예측/data/12-23대기오염nan처리.csv', index= False)

In [20]:
# Show the last row of df after the update cell above.
df.tail(1)

Unnamed: 0,측정일시,이산화질소,오존,일산화탄소,아황산,미세,초미세
4303,20231211,0.015,0.0227,0.4,0.0024,7.2,3.0


In [40]:
# Inspect the column dtypes of the raw fetched frame (values come from XML text).
all_data_df1.dtypes

측정일시     object
측정소명     object
이산화질소    object
오존       object
일산화탄소    object
아황산      object
미세       object
초미세      object
dtype: object

In [41]:
# List the columns of the aggregated frame.
combined_df.columns

Index(['측정일시', '이산화질소', '오존', '일산화탄소', '아황산', '미세', '초미세'], dtype='object')

In [6]:
# Keep only the fetched station rows in which every field is present.
combined_df = all_data_df1.dropna()


In [7]:
# Display the per-station rows that survived dropna().
combined_df

Unnamed: 0,측정일시,측정소명,이산화질소,오존,일산화탄소,아황산,미세,초미세
0,20231212,강남구,0.0176,0.0192,0.29,0.0026,6,2
1,20231212,강남대로,0.0215,0.0137,0.81,0.0021,16,5
2,20231212,강동구,0.0157,0.0194,0.3,0.0024,4,1
3,20231212,강변북로,0.0161,0.0188,0.37,0.0022,6,4
4,20231212,강북구,0.0114,0.0198,0.36,0.002,4,2
...,...,...,...,...,...,...,...,...
355,20231220,천호대로,0.0264,0.0192,0.6,0.0026,53,29
356,20231220,청계천로,0.0164,0.0243,0.57,0.0022,51,31
357,20231220,한강대로,0.0224,0.0206,0.66,0.003,53,35
358,20231220,홍릉로,0.0211,0.0223,0.61,0.0027,47,26


In [8]:
# Remove the station-name column so only the date and the pollutant columns remain.
combined_df = combined_df.drop(columns=['측정소명'])

In [9]:
# Cast in one pass: the date column becomes int64 (yyyymmdd), every other column float.
column_dtypes = {
    name: (np.int64 if name == '측정일시' else float)
    for name in combined_df.columns
}
combined_df = combined_df.astype(column_dtypes)


In [10]:
# Collapse to one row per 측정일시 by averaging every remaining column over that date's rows.
combined_df = combined_df.groupby('측정일시', as_index=False).mean()

In [11]:
# Decimal places kept per pollutant column; columns not listed (측정일시) are left untouched.
round_digits = {
    '이산화질소': 3,
    '오존': 3,
    '일산화탄소': 1,
    '아황산': 3,
    '미세': 0,
    '초미세': 0,
}
combined_df = combined_df.round(round_digits)

In [12]:
# Display the history frame (pandas truncates the rendering for long frames).
df

Unnamed: 0,측정일시,이산화질소,오존,일산화탄소,아황산,미세,초미세
0,20120101,0.031,0.0080,1.17,0.0093,88.3,57.3
1,20120102,0.036,0.0050,1.20,0.0087,100.0,62.7
2,20120103,0.041,0.0063,1.27,0.0073,96.0,63.0
3,20120104,0.024,0.0130,0.83,0.0103,70.3,42.0
4,20120105,0.037,0.0073,0.77,0.0063,53.3,29.3
...,...,...,...,...,...,...,...
4299,20231207,0.026,0.0184,0.50,0.0033,62.2,16.5
4300,20231208,0.029,0.0300,0.56,0.0031,84.2,22.8
4301,20231209,0.027,0.0379,0.61,0.0029,72.3,30.7
4302,20231210,0.025,0.0209,0.67,0.0029,80.2,35.6


In [13]:
# Stack the newly fetched daily averages below the history.
# The fetch cell earlier in this notebook already appends these same dates to df,
# and this cell may also be re-run, so rows whose 측정일시 is already present are
# dropped (first occurrence wins) to keep exactly one row per date.
df = (
    pd.concat([df, combined_df], ignore_index=True)
    .drop_duplicates(subset='측정일시', keep='first')
    .reset_index(drop=True)
)


In [14]:
# Check the last 20 rows to confirm the new days were appended.
df.tail(20)

Unnamed: 0,측정일시,이산화질소,오존,일산화탄소,아황산,미세,초미세
4293,20231201,0.023,0.02,0.45,0.0028,23.8,10.7
4294,20231202,0.034,0.0118,0.58,0.0031,34.2,18.5
4295,20231203,0.023,0.021,0.58,0.0035,38.0,22.9
4296,20231204,0.046,0.0064,0.77,0.0034,42.3,26.1
4297,20231205,0.046,0.0087,0.69,0.0036,43.4,27.0
4298,20231206,0.046,0.0066,0.81,0.0035,66.1,47.0
4299,20231207,0.026,0.0184,0.5,0.0033,62.2,16.5
4300,20231208,0.029,0.03,0.56,0.0031,84.2,22.8
4301,20231209,0.027,0.0379,0.61,0.0029,72.3,30.7
4302,20231210,0.025,0.0209,0.67,0.0029,80.2,35.6


In [5]:
# Convert 측정일시 from yyyymmdd to datetime values.
df['측정일시'] = pd.to_datetime(df['측정일시'], format='%Y%m%d')

In [7]:
# Use the date as the index. Not re-runnable: once 측정일시 is the index, the column
# no longer exists and a second run raises KeyError.
df = df.set_index('측정일시')

In [8]:
# Inspect the resulting DatetimeIndex.
df.index

DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03', '2012-01-04',
               '2012-01-05', '2012-01-06', '2012-01-07', '2012-01-08',
               '2012-01-09', '2012-01-10',
               ...
               '2023-12-02', '2023-12-03', '2023-12-04', '2023-12-05',
               '2023-12-06', '2023-12-07', '2023-12-08', '2023-12-09',
               '2023-12-10', '2023-12-11'],
              dtype='datetime64[ns]', name='측정일시', length=4304, freq=None)

In [None]:
# List the remaining columns of df.
df.columns

Index(['Unnamed: 0', '이산화질소', '오존', '일산화탄소', '아황산', '미세', '초미세'], dtype='object')

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import TimeseriesGenerator

In [45]:
from sklearn.preprocessing import MinMaxScaler

# Split predictors and target: 미세 (PM10) is predicted from the other five pollutants.
features = df[['이산화질소', '오존', '일산화탄소', '아황산', '초미세']]
target = df['미세']

# Scale the predictors to [0, 1].
# NOTE(review): both scalers are fit on the full series; no train/test split is made in this notebook.
scaler_features = MinMaxScaler()
features_scaled = scaler_features.fit_transform(features)

# Scale the target with its own scaler so predictions can be inverse-transformed later.
scaler_target = MinMaxScaler()
target_scaled = scaler_target.fit_transform(target.values.reshape(-1, 1))


In [25]:
from keras.preprocessing.sequence import TimeseriesGenerator

# Window length in rows (days) fed to the model, and number of predictor columns.
n_input = 30
n_features = features.shape[1]
# Each sample is a window of n_input consecutive feature rows paired with the target
# of the row right after the window; batch_size=1 yields one window per step.
generator = TimeseriesGenerator(features_scaled, target_scaled, length=n_input, batch_size=1)


In [47]:
from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense
from keras.models import Sequential

# LSTM regressor: one 50-unit LSTM layer over (n_input, n_features) windows,
# followed by a single-unit dense output; trained with Adam on mean squared error.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(n_input, n_features)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# Early stopping: end training when the training loss has not improved for 10 epochs.
early_stopping = EarlyStopping(monitor='loss', patience=10)


In [48]:
# Train for at most 100 epochs; the EarlyStopping callback may end training sooner.
model.fit(generator, epochs=100, callbacks=[early_stopping])


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x1e757d86190>

In [33]:
# Show the last five rows of the date-indexed frame.
df.tail()

Unnamed: 0_level_0,이산화질소,오존,일산화탄소,아황산,미세,초미세
측정일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-12-07,0.026,0.0184,0.5,0.0033,62.2,16.5
2023-12-08,0.029,0.03,0.56,0.0031,84.2,22.8
2023-12-09,0.027,0.0379,0.61,0.0029,72.3,30.7
2023-12-10,0.025,0.0209,0.67,0.0029,80.2,35.6
2023-12-11,0.015,0.0227,0.4,0.0024,7.2,3.0


In [49]:
from sklearn.metrics import r2_score

# Prepare the evaluation dataset (example).
# This part has to be adapted to the real dataset and situation.
# NOTE(review): the generator below is built from the same features_scaled/target_scaled
# arrays the model was trained on, so the reported R^2 is an in-sample score.
test_generator = TimeseriesGenerator(features_scaled , target_scaled, length=n_input, batch_size=1)

# Predict for every window of the generator.
test_predictions = model.predict(test_generator)

# Bring the predictions back to the original target scale.
test_predictions_inverse = scaler_target.inverse_transform(test_predictions)

# Actual targets on the original scale; the first n_input rows have no full window before them.
actual_target = scaler_target.inverse_transform(target_scaled[n_input:])

# Compute R^2.
r2 = r2_score(actual_target, test_predictions_inverse)
print(f'R^2 Score: {r2}')


R^2 Score: 0.5005735436162553


In [51]:
# Predict from the most recent window: the last n_input scaled feature rows.
x_input = features_scaled[-n_input:]
# Add the batch dimension expected by the model: (1, n_input, n_features).
x_input = x_input.reshape((1, n_input, n_features))
predicted = model.predict(x_input, verbose=0)

# Bring the prediction back to the original target scale.
predicted_inverse = scaler_target.inverse_transform(predicted)
print(predicted_inverse)


[[15.164758]]


In [52]:
# Persist the trained PM10 model; the fitted scalers are not saved by this call.
model.save('model/a/pm10_model.h5')

In [10]:
from keras.models import load_model

# Load the saved PM10 model from disk.
model_pm10 = load_model('model/a/pm10_model.h5')


In [26]:
# Select the last n_input rows of the (unscaled) feature frame.
recent_data = features[-n_input:]

# Scale with the already-fitted feature scaler (transform only, no refit).
recent_data_scaled = scaler_features.transform(recent_data)

# Reshape to the model's input shape: (1, n_input, n_features).
recent_data_scaled = recent_data_scaled.reshape((1, n_input, n_features))

# Predict the value that follows the window with the reloaded model.
predicted_next_day = model_pm10.predict(recent_data_scaled)

# Bring the prediction back to the original target scale.
predicted_next_day_inverse = scaler_target.inverse_transform(predicted_next_day)

# Print the prediction.
print(predicted_next_day_inverse)


[[15.164758]]


In [27]:
from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator

# Second model: 아황산 (SO2) is the target, the other five pollutants are the predictors.
# This rebinds features/target, both scalers, the generator and `model`, replacing the
# objects used for the PM10 model.
features = df[['이산화질소', '오존', '일산화탄소','미세', '초미세']]
target = df['아황산']

# Separate [0, 1] scalers for predictors and target.
scaler_features = MinMaxScaler()
features_scaled = scaler_features.fit_transform(features)

scaler_target = MinMaxScaler()
target_scaled = scaler_target.fit_transform(target.values.reshape(-1, 1))

# Windows of n_input rows, each paired with the target of the row after the window.
n_input = 30
n_features = features.shape[1]
generator = TimeseriesGenerator(features_scaled, target_scaled, length=n_input, batch_size=1)

# LSTM regressor: 50-unit LSTM layer followed by a single-unit dense output.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(n_input, n_features)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# Early stopping: end training when the training loss has not improved for 10 epochs.
early_stopping = EarlyStopping(monitor='loss', patience=10)

# Train for at most 100 epochs.
model.fit(generator, epochs=100, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1fbb60d2c50>

In [28]:
# Predict from the most recent window: the last n_input scaled feature rows.
x_input = features_scaled[-n_input:]
# Add the batch dimension expected by the model: (1, n_input, n_features).
x_input = x_input.reshape((1, n_input, n_features))
predicted = model.predict(x_input, verbose=0)

# Bring the prediction back to the original target scale.
predicted_inverse = scaler_target.inverse_transform(predicted)
print(predicted_inverse)


[[0.00253596]]


In [29]:
# Show the last stored row for comparison with the prediction above.
df.tail(1)

Unnamed: 0_level_0,이산화질소,오존,일산화탄소,아황산,미세,초미세
측정일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-12-11,0.015,0.0227,0.4,0.0024,7.2,3.0


In [30]:
from sklearn.metrics import r2_score

# Prepare the evaluation dataset (example).
# This part has to be adapted to the real dataset and situation.
# NOTE(review): the generator below is built from the same features_scaled/target_scaled
# arrays the model was trained on, so the reported R^2 is an in-sample score.
test_generator = TimeseriesGenerator(features_scaled , target_scaled, length=n_input, batch_size=1)

# Predict for every window of the generator.
test_predictions = model.predict(test_generator)

# Bring the predictions back to the original target scale.
test_predictions_inverse = scaler_target.inverse_transform(test_predictions)

# Actual targets on the original scale; the first n_input rows have no full window before them.
actual_target = scaler_target.inverse_transform(target_scaled[n_input:])

# Compute R^2.
r2 = r2_score(actual_target, test_predictions_inverse)
print(f'R^2 Score: {r2}')


R^2 Score: 0.8293557177563251


In [None]:
#이산화질소 (NO2): 질소 산화물의 일종으로, 주로 자동차 배기가스와 산업 공정에서 발생합니다.

#오존 (O3): 지상 근처에서는 오염물질로 간주되며, 태양광과 자동차 배기가스 등의 화학 반응으로 생성됩니다.

#일산화탄소 (CO): 불완전 연소로 인해 발생하는 가스로, 특히 연료를 태우는 차량에서 많이 배출됩니다.

#아황산 (SO2): 일반적으로 아황산가스라고 불리며, 화석 연료의 연소, 특히 석탄과 석유의 연소로 인해 발생합니다.

#미세먼지 (PM10): 입자 지름이 10 마이크로미터(µm) 이하인 고체 또는 액체 입자입니다.

#초미세먼지 (PM2.5): 입자 지름이 2.5 마이크로미터(µm) 이하인 매우 작은 먼지 입자입니다.

In [31]:
# Persist the trained SO2 model; the fitted scalers are not saved by this call.
model.save('model/a/SO2_model.h5')