In [1]:
import os
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import time
import numpy as np

In [6]:


# Output column name -> XML tag read from each <row> of the API response.
_AIR_QUALITY_FIELDS = {
    "측정일시": "MSRDT_DE",
    "측정소명": "MSRSTE_NM",
    "이산화질소": "NO2",
    "오존": "O3",
    "일산화탄소": "CO",
    "아황산": "SO2",
    "미세": "PM10",
    "초미세": "PM25",
}

# Decimal places kept for each pollutant column after daily averaging.
_AIR_QUALITY_ROUNDING = {
    "이산화질소": 3,
    "오존": 3,
    "일산화탄소": 1,
    "아황산": 3,
    "미세": 0,
    "초미세": 0,
}


def _child_text(row, tag):
    """Return the text of ``row``'s child ``tag``, or None if the child is absent."""
    node = row.find(tag)
    # An empty element has ``.text is None``; keep that as None so the row is
    # later removed by ``dropna`` instead of failing the float conversion.
    return node.text if node is not None else None


def _parse_air_quality_rows(xml_text):
    """Parse an API response body into one dict per ``<row>`` element."""
    root = ET.fromstring(xml_text)
    return [
        {column: _child_text(row, tag) for column, tag in _AIR_QUALITY_FIELDS.items()}
        for row in root.findall('.//row')
    ]


def _get_with_retry(url, max_attempts, retry_delay):
    """GET ``url``; retry on ``requests`` errors and return None if all attempts fail."""
    for attempt in range(1, max_attempts + 1):
        try:
            return requests.get(url, timeout=20)
        except requests.exceptions.RequestException:
            # Only wait when another attempt will actually follow.
            if attempt < max_attempts:
                time.sleep(retry_delay)
    return None


def fetch_air_quality_data(
    csv_path='1211.대기오염미래예측/data/12-23대기오염nan처리.csv',
    key_path='keys/api.txt',
    max_attempts=3,
    retry_delay=5,
):
    """Append missing daily air-quality averages to the stored CSV.

    Reads the CSV at ``csv_path``, takes the date in the last row of the
    '측정일시' column (format YYYYMMDD) and requests every following day up to
    the current time from the ``DailyAverageAirQuality`` endpoint, once per
    station name. Rows with any missing value are dropped, the remaining
    values are averaged per date, rounded, appended to the existing table and
    written back to ``csv_path``.

    Each request is retried individually. If a request still fails after
    ``max_attempts`` tries, fetching stops and the date that was in progress
    is discarded, so a partially averaged day is never written and the next
    run resumes from that date. Non-200 responses are reported and skipped.

    Args:
        csv_path: CSV file that is both read and overwritten.
        key_path: Text file containing the API key.
        max_attempts: Maximum tries per request on a ``requests`` error.
        retry_delay: Seconds to sleep between tries.

    Returns:
        None. Results are written to ``csv_path``; status is printed.
    """
    df = pd.read_csv(csv_path)

    # Strip whitespace so a trailing newline in the key file does not end up
    # inside the request URL.
    with open(key_path) as file:
        road_key = file.read().strip()

    # Station names passed to the API, one request per name and date.
    gu_names = [
        "강남구", "강남대로", "강동구", "강변북로", "강북구", "강서구", "공항대로",
        "관악구", "광진구", "구로구", "금천구", "노원구", "도봉구", "도산대로",
        "동대문구", "동작구", "동작대로", "마포구", "서대문구", "서초구", "성동구",
        "성북구", "송파구", "신촌로", "양천구", "영등포구", "영등포로", "용산구",
        "은평구", "정릉로", "종로", "종로구", "중구", "중랑구", "천호대로",
        "청계천로", "한강대로", "홍릉로", "화랑로"
    ]

    # Resume from the day after the last stored date.
    last_stored = df['측정일시'].iloc[-1]
    start_date = datetime.strptime(str(int(last_stored)), "%Y%m%d") + timedelta(days=1)
    end_date = datetime.now()

    if start_date >= end_date:
        print("데이터가 이미 최신 상태입니다.")
        return

    records = []
    n_days = (end_date - start_date).days + 1
    for single_date in (start_date + timedelta(n) for n in range(n_days)):
        date_str = single_date.strftime("%Y%m%d")
        day_records = []
        fetch_failed = False
        for gu_name in gu_names:
            url = f"http://openAPI.seoul.go.kr:8088/{road_key}/xml/DailyAverageAirQuality/1/5/{date_str}/{gu_name}"
            result = _get_with_retry(url, max_attempts, retry_delay)
            if result is None:
                print(f"에러: {gu_name} - {date_str} - 요청 실패")
                fetch_failed = True
                break
            if result.status_code == 200:
                day_records.extend(_parse_air_quality_rows(result.text))
            else:
                print(f"에러: {gu_name} - {date_str} - 상태 코드: {result.status_code}")
        if fetch_failed:
            # Drop the incomplete date and stop; earlier complete dates are kept.
            break
        records.extend(day_records)

    if not records:
        print("새로 추가할 데이터가 없습니다.")
        return

    # Build the frame once instead of concatenating inside the loop; None
    # values count as missing for dropna.
    new_rows = pd.DataFrame(records).dropna().drop(columns=['측정소명'])
    if new_rows.empty:
        print("새로 추가할 데이터가 없습니다.")
        return

    # The API delivers every value as text: dates become integers, the rest floats.
    new_rows = new_rows.astype(
        {col: (np.int64 if col == '측정일시' else float) for col in new_rows.columns}
    )

    daily_mean = (
        new_rows
        .groupby('측정일시', as_index=False)
        .mean()
        .round(_AIR_QUALITY_ROUNDING)
    )

    # NOTE(review): the notebook's df.tail() output shows an 'Unnamed: 0' column
    # in this CSV; appended rows will hold NaN there — confirm whether that
    # column should be dropped or regenerated.
    df = pd.concat([df, daily_mean], ignore_index=True)
    df.to_csv(csv_path, index=False)

# Run the CSV update right away when this cell executes as the main module.
if __name__ == "__main__":
    fetch_air_quality_data()


데이터가 이미 최신 상태입니다.


In [4]:
# Reload the stored daily air-quality table for inspection.
df = pd.read_csv(
    filepath_or_buffer='1211.대기오염미래예측/data/12-23대기오염nan처리.csv'
)

In [5]:
# Show the five most recent rows (5 is the default row count for tail).
df.tail(n=5)

Unnamed: 0,측정일시,이산화질소,오존,일산화탄소,아황산,미세,초미세
4309,20231217,0.01,0.03,0.4,0.002,15.0,7.0
4310,20231218,0.023,0.02,0.4,0.002,16.0,7.0
4311,20231219,0.038,0.008,0.6,0.002,26.0,15.0
4312,20231220,0.018,0.025,0.5,0.002,44.0,28.0
4313,20231221,0.013,0.025,0.4,0.002,19.0,12.0
